diff --git a/.gitignore b/.gitignore index 77cc897..bad1588 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ cmake-*/ compile* Testing/ CLAUDE.md +dist/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 1247944..2ba2128 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.16) -project(vmsig VERSION 0.2.0 C) +project(vmsig VERSION 0.3.0 LANGUAGES C) set(CMAKE_C_STANDARD 17) set(CMAKE_C_STANDARD_REQUIRED ON) @@ -10,12 +10,11 @@ option(VMSIG_LTO "Enable LTO" OFF) # builds against headers only: the SI calls are hidden behind these flags, and the # stub mode proves the seam without a real VM. option(VMSIG_WITH_VMIE "Link real vmie (libvmie.a, PIC) for armed memctx" OFF) -option(VMSIG_WITH_VMCTL "Link real vmctl (libvmctl.a, PIC) for armed input" OFF) -# ---- Sibling library sources (set these to your local checkouts) ------------ -# Only needed for the armed builds below; the default stub build needs neither. +# ---- Sibling library source (set to your local checkout) -------------------- +# vmie stays an EXTERNAL library (.so/.deb); only needed for the armed memctx build. +# The input driver (vmctl) is ABSORBED in-tree (src/si/input/) — no external flag. 
set(LIBVMIE_PATH "" CACHE PATH "Path to the vmie library sources (for VMSIG_WITH_VMIE)") -set(LIBVMCTL_PATH "" CACHE PATH "Path to the vmctl library sources (for VMSIG_WITH_VMCTL)") find_package(Threads REQUIRED) @@ -29,7 +28,17 @@ add_library(vmsig SHARED src/adapter/input/input.c src/adapter/vmhost/vmhost.c src/control/inproc.c - src/control/socket.c) + src/control/socket.c + src/discovery/slot.c + src/discovery/linux/host_probe.c + src/discovery/discovery.c + # SI input driver (vmctl), absorbed in-tree (host-only: QMP + uinput) + src/si/input/open.c + src/si/input/qmp.c + src/si/input/qmp_driver.c + src/si/input/keymap.c + src/si/input/power.c + src/si/input/linux/uinput_driver.c) target_include_directories(vmsig PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include @@ -38,30 +47,24 @@ target_include_directories(vmsig ${CMAKE_CURRENT_SOURCE_DIR}/src/adapter/include ${CMAKE_CURRENT_SOURCE_DIR}/src/adapter/memctx/include ${CMAKE_CURRENT_SOURCE_DIR}/src/adapter/input/include - ${CMAKE_CURRENT_SOURCE_DIR}/src/adapter/vmhost/include) + ${CMAKE_CURRENT_SOURCE_DIR}/src/adapter/vmhost/include + ${CMAKE_CURRENT_SOURCE_DIR}/src/discovery/include + ${CMAKE_CURRENT_SOURCE_DIR}/src/si/input/include) target_link_libraries(vmsig PRIVATE Threads::Threads) -# armed: the real vmie as a pre-built PIC .a (IMPORTED) — headers and symbols. +# armed: vmie stays an EXTERNAL shared library (.so/.deb) — pre-built, IMPORTED. Both +# libvmsig (armed memctx) and libvgpu-perception link it dynamically (no duplication; the +# package Depends on libvmie). Headers + symbols come from the imported target. 
if(VMSIG_WITH_VMIE) - add_library(vmie STATIC IMPORTED) + add_library(vmie SHARED IMPORTED) set_target_properties(vmie PROPERTIES - IMPORTED_LOCATION ${LIBVMIE_PATH}/.build/libvmie.a + IMPORTED_LOCATION ${LIBVMIE_PATH}/.build/libvmie.so INTERFACE_INCLUDE_DIRECTORIES ${LIBVMIE_PATH}/include) target_link_libraries(vmsig PRIVATE vmie) target_compile_definitions(vmsig PRIVATE VMSIG_WITH_VMIE) endif() -# armed: the real vmctl as a pre-built PIC .a (IMPORTED). -if(VMSIG_WITH_VMCTL) - add_library(vmctl STATIC IMPORTED) - set_target_properties(vmctl PROPERTIES - IMPORTED_LOCATION ${LIBVMCTL_PATH}/.build/libvmctl.a - INTERFACE_INCLUDE_DIRECTORIES ${LIBVMCTL_PATH}/include) - target_link_libraries(vmsig PRIVATE vmctl) - target_compile_definitions(vmsig PRIVATE VMSIG_WITH_VMCTL) -endif() - target_compile_options(vmsig PRIVATE -O2 -Wall -Wextra) if(VMSIG_LTO) target_compile_options(vmsig PRIVATE -flto) @@ -73,6 +76,50 @@ add_executable(vmsig_cli src/cli.c) target_link_libraries(vmsig_cli PRIVATE vmsig) target_compile_options(vmsig_cli PRIVATE -Wall -Wextra) +# ---- vgpu-perception: host-side vgpu Sensor S-lib (absorbed in-tree) --------- +# A SEPARATE shipped library (NOT fused into libvmsig — it is consumed by the shell, not the +# signaling core). Host-only: reads the vgpu shared region from its own RO vmie_mem. Built +# only when armed (needs vmie). The in-guest Windows producer (vgpu-streamer.exe) stays in a +# separate repo and is NOT part of this delivery. 
+if(VMSIG_WITH_VMIE) + add_library(vgpu-perception SHARED + src/si/vgpu-perception/discover.c + src/si/vgpu-perception/sample.c + src/si/vgpu-perception/control.c) + target_include_directories(vgpu-perception + PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src/si/vgpu-perception/include) + target_link_libraries(vgpu-perception PUBLIC vmie) # memmodel.h/win32.h via the vmie target + target_compile_options(vgpu-perception PRIVATE -O2 -Wall -Wextra) + + add_executable(vgpu_perceptiontest src/test/test_perception.c) + target_include_directories(vgpu_perceptiontest PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/si/vgpu-perception/include) + target_link_libraries(vgpu_perceptiontest PRIVATE vgpu-perception) + target_compile_options(vgpu_perceptiontest PRIVATE -O2 -Wall -Wextra) + add_test(NAME vgpu_perception COMMAND vgpu_perceptiontest) + set_tests_properties(vgpu_perception PROPERTIES + ENVIRONMENT "LD_LIBRARY_PATH=${LIBVMIE_PATH}/.build:${CMAKE_BINARY_DIR}") +endif() + +# ---- vmsigd: the management daemon ----------------------------------------- +# Links libvmsig (works in stub or armed; armed memctx needs vmie at runtime). Discovery + +# socket + a coarse per-uid admission policy; serves whatever appears under the watch dir. 
+add_executable(vmsigd + src/daemon/vmsigd.c + src/daemon/config.c + src/daemon/admission.c) +target_link_libraries(vmsigd PRIVATE vmsig Threads::Threads) +target_include_directories(vmsigd PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/daemon/include + ${CMAKE_CURRENT_SOURCE_DIR}/src/discovery/include + ${CMAKE_CURRENT_SOURCE_DIR}/src/core/include) +target_compile_options(vmsigd PRIVATE -O2 -Wall -Wextra) +if(VMSIG_LTO) + target_compile_options(vmsigd PRIVATE -flto) + target_link_options(vmsigd PRIVATE -flto) +endif() + # ---- transfer-context tests (ctest) ----------------------------------------- enable_testing() add_executable(vmsig_test src/test/test_ctx.c) @@ -97,6 +144,43 @@ target_link_libraries(vmsig_mvmtest PRIVATE vmsig) target_compile_options(vmsig_mvmtest PRIVATE -Wall -Wextra) add_test(NAME mvm COMMAND vmsig_mvmtest) +add_executable(vmsig_dyneptest src/test/test_dynep.c) +target_link_libraries(vmsig_dyneptest PRIVATE vmsig Threads::Threads) +target_compile_options(vmsig_dyneptest PRIVATE -Wall -Wextra) +add_test(NAME dynep COMMAND vmsig_dyneptest) + +add_executable(vmsig_rostertest src/test/test_roster.c) +target_link_libraries(vmsig_rostertest PRIVATE vmsig) +target_include_directories(vmsig_rostertest PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/core/include) +target_compile_options(vmsig_rostertest PRIVATE -Wall -Wextra) +add_test(NAME roster COMMAND vmsig_rostertest) + +add_executable(vmsig_slottest src/test/test_slot.c) +target_link_libraries(vmsig_slottest PRIVATE vmsig) +target_include_directories(vmsig_slottest PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/discovery/include) +target_compile_options(vmsig_slottest PRIVATE -Wall -Wextra) +add_test(NAME slot COMMAND vmsig_slottest) + +add_executable(vmsig_discoverytest src/test/test_discovery.c) +target_link_libraries(vmsig_discoverytest PRIVATE vmsig) +target_include_directories(vmsig_discoverytest PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/discovery/include) +target_compile_options(vmsig_discoverytest 
PRIVATE -Wall -Wextra) +add_test(NAME discovery COMMAND vmsig_discoverytest) + +add_executable(vmsig_daemoncfgtest + src/test/test_daemoncfg.c + src/daemon/config.c + src/daemon/admission.c) +target_link_libraries(vmsig_daemoncfgtest PRIVATE vmsig) +target_include_directories(vmsig_daemoncfgtest PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/daemon/include + ${CMAKE_CURRENT_SOURCE_DIR}/src/discovery/include) +target_compile_options(vmsig_daemoncfgtest PRIVATE -Wall -Wextra) +add_test(NAME daemoncfg COMMAND vmsig_daemoncfgtest) + add_executable(vmsig_authztest src/test/test_authz.c) target_link_libraries(vmsig_authztest PRIVATE vmsig) target_compile_options(vmsig_authztest PRIVATE -Wall -Wextra) @@ -105,7 +189,8 @@ add_test(NAME authz COMMAND vmsig_authztest) add_executable(vmsig_memctxtest src/test/test_memctx.c) target_link_libraries(vmsig_memctxtest PRIVATE vmsig Threads::Threads) target_include_directories(vmsig_memctxtest PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/src/core/include) + ${CMAKE_CURRENT_SOURCE_DIR}/src/core/include + ${CMAKE_CURRENT_SOURCE_DIR}/src/adapter/memctx/include) target_compile_options(vmsig_memctxtest PRIVATE -Wall -Wextra) add_test(NAME memctx COMMAND vmsig_memctxtest) @@ -137,3 +222,27 @@ add_test(NAME memwrite COMMAND vmsig_memwritetest) # the demonstrator doubles as an end-to-end seam test (self-terminates rc=0) add_test(NAME cli COMMAND vmsig_cli) + +# ---- install rules (for the .deb stage) ------------------------------------- +option(VMSIG_INSTALL "Generate install() rules (daemon/lib/headers/unit/config)" OFF) +if(VMSIG_INSTALL) + include(GNUInstallDirs) + install(TARGETS vmsigd RUNTIME DESTINATION ${CMAKE_INSTALL_SBINDIR}) + install(TARGETS vmsig LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}) + if(TARGET vgpu-perception) # armed builds ship the host vgpu S-lib alongside + install(TARGETS vgpu-perception LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}) + endif() + # public contracts (signaling + absorbed SI host headers) under include/vmsig/ + 
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/vmsig + FILES_MATCHING PATTERN "vmsig*.h" + PATTERN "vmctl.h" + PATTERN "vgpu_stream.h" + PATTERN "vgpu_perception.h") + install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/packaging/systemd/vmsigd.service + DESTINATION lib/systemd/system) + install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/packaging/tmpfiles/vmsig.conf + DESTINATION lib/tmpfiles.d) + install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/packaging/config/vmsigd.conf + DESTINATION /etc/vmsig) +endif() diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4455490 --- /dev/null +++ b/Makefile @@ -0,0 +1,46 @@ +# vmsig packaging — `make deb` builds the .deb over a `cmake --install` stage. +# Private values are NOT baked into the tree: pass them via the variables below (the +# defaults are neutral placeholders; CI overrides them from vars/secrets). +# +# make deb LIBVMIE_PATH=/path/to/vmie VERSION=1.2.3 \ +# MAINTAINER="Name <email>" DEPENDS="libc6, libvmie0" + +VERSION ?= 0.0.0 +MAINTAINER ?= vmsig packaging +# libvmie0 is vmie's own runtime package (SONAME libvmie.so.0): libvmsig.so and +# libvgpu-perception.so dynamically link it, so it is a HARD runtime dependency. +DEPENDS ?= libc6, libvmie0 +ARCH ?= amd64 +LIBVMIE_PATH ?= + +BUILD_DIR ?= .build-pkg +STAGE ?= $(CURDIR)/dist/stage +DIST ?= $(CURDIR)/dist + +.PHONY: deb clean + +# Armed package: the shipped daemon needs vmie for memctx. vmie stays an external dependency +# (the package Depends on its runtime, libvmie0, by default; override DEPENDS to change it). +deb: + @test -n "$(LIBVMIE_PATH)" || { echo "set LIBVMIE_PATH=/path/to/vmie sources (armed memctx)"; exit 1; } + rm -rf $(STAGE) + cmake -S .
-B $(BUILD_DIR) -DCMAKE_BUILD_TYPE=Release -DVMSIG_INSTALL=ON \ + -DVMSIG_WITH_VMIE=ON -DLIBVMIE_PATH=$(LIBVMIE_PATH) + cmake --build $(BUILD_DIR) -j + DESTDIR=$(STAGE) cmake --install $(BUILD_DIR) --prefix /usr + mkdir -p $(STAGE)/DEBIAN + sed -e 's/@VERSION@/$(VERSION)/' \ + -e 's|@MAINTAINER@|$(MAINTAINER)|' \ + -e 's/@DEPENDS@/$(DEPENDS)/' \ + packaging/deb/control.in > $(STAGE)/DEBIAN/control + cp packaging/deb/conffiles $(STAGE)/DEBIAN/conffiles + install -m 0755 packaging/deb/postinst $(STAGE)/DEBIAN/postinst + install -m 0755 packaging/deb/prerm $(STAGE)/DEBIAN/prerm + # strip inherited setgid from staged dirs (a setgid build tree => dpkg-deb rejects DEBIAN) + find $(STAGE) -type d -exec chmod g-s {} + + mkdir -p $(DIST) + dpkg-deb --root-owner-group --build $(STAGE) $(DIST)/vmsig_$(VERSION)_$(ARCH).deb + @echo "built: $(DIST)/vmsig_$(VERSION)_$(ARCH).deb" + +clean: + rm -rf $(BUILD_DIR) $(DIST) diff --git a/include/vgpu_perception.h b/include/vgpu_perception.h new file mode 100644 index 0000000..7cc648d --- /dev/null +++ b/include/vgpu_perception.h @@ -0,0 +1,270 @@ +#ifndef VGPU_PERCEPTION_H +#define VGPU_PERCEPTION_H + +/* vgpu_perception.h — host-side, read-only perception over the vgpu region. + * + * A pure functional core that builds vgpu semantics ON TOP OF a guest + * address-space root handed in by the caller. It only PERCEIVES: it discovers + * the region by structural invariants, samples frames and reads cursor / + * geometry / lifecycle, and returns SNAPSHOTS (POD values). It never owns + * coherence, never opens RW guest memory, never decides control or behavioural + * timing, never emits events upward. + * + * Where the region lives (the correction that shapes this API): the region is a + * RW shared mapping projected into the USER address space of a producer PROCESS, + * NOT a kernel VA in the System address space. 
So the core is handed a RO win32 + * context (which the caller opened with the System kcr3), enumerates processes + * with proc_list, and finds the region in a process user-AS under that process's + * own cr3 (process.cr3). The System kcr3 is needed ONLY to open the context and + * walk processes; once the region is found, it is always read under the + * producer's process.cr3 (cached in the handle). The handle carries proc_cr3. + * + * What this core does NOT do (by design — those belong to the caller): + * - It does NOT own the vmie_win32 context / vmie_mem: both are BORROWED. The + * caller opens the RO win32 context (its lifetime is tied to the guest + * address-space mapping epoch) and closes it when that mapping goes stale. + * The core never opens or closes either. + * - It does NOT sleep / poll / spawn threads / arm timers: the two-phase + * liveness handshake is two calls; the WAIT between them is the caller's. + * - It does NOT transport frames. Frame transport is the caller's concern; + * the core is a PULL source — the caller takes desc+bytes from + * vgpup_sample_frame and routes them. No sink callback here. + * - It does NOT write control. vgpup_build_control_write only BUILDS the + * desired frame + offsets; the actual write is performed elsewhere, by a + * component that holds read-write access to the region. + * + * Two epochs + producer restart (the caller owns the policy; the core only + * reports facts — this is a flat pull model, no polling from below): + * - Address-space invalidation (new kcr3 / new epoch): the caller closes the + * win32 context, drops the old vgpup_region, opens a fresh context on the + * new epoch and re-discovers (vgpup_open). The old handle is invalid (a + * different address space entirely). + * - vgpu run_epoch advance while the context stays live (session break, same + * process): vgpup_read_status records r->run_epoch; vgpup_run_epoch reports + * it. 
The caller compares and decides whether to reset vgpu state — the + * region/process are unchanged. The core holds no reset policy. + * - Producer process restart (new pid/cr3 under the same live kcr3): the win32 + * context is still valid (kernel alive), but the old handle's proc_cr3 / + * region_gva point at a dead process address space. Symptom: a read under + * r->proc_cr3 returns <0 (the process pages are gone). The core only REPORTS + * this (<0 from a read); the DECISION to re-discover is the caller's — it + * calls vgpup_close(old) + vgpup_open(v) so a fresh proc_list finds the + * restarted producer with its new cr3. + * + * Ownership convention: + * - vmie_win32* v, vmie_mem* m — BORROWED. The caller owns their lifecycle + * (tied to the address-space mapping). The core only reads through them. + * - vgpup_region* — heap-owned by the core (small private state). Create with + * vgpup_open, release with vgpup_close. Closing it does NOT touch v / m. + * + * Conventions (mirror memmodel.h): + * - The System kcr3 opens the RO win32 context; the REGION lives in the USER + * address space of the producer process and is read under its process.cr3 + * (cached in the handle as proc_cr3). A "GVA" is a 64-bit guest VA in that + * process address space. + * - All guest reads go through gva_read into a local copy; no borrowed + * pointer into guest memory ever escapes a seqlock window or this API. + * - Integer returns: 0 success / negative failure for deterministic calls. + * Lossy read calls (sample/cursor/geometry) are tristate: 1 = consistent + * snapshot produced, 0 = no fresh data / writer kept it busy past the retry + * limit / would not fit (a SKIP, never an error — do not block), <0 = a + * hard memory-read error (page gone / process restarted — the caller + * re-discovers; see "Two epochs + producer restart" above). 
+ * + * Example (the caller drives the two-phase liveness and the read loop): + * + * // caller already opened a RO win32 context with the System kcr3: + * vmie_win32* v = caller_ctx; // BORROWED by the core + * vmie_mem* m = vmie_win32_mem(v); // BORROWED; for the generic gva_* + * + * vgpup_region* r = vgpup_open(v); // phase 1: find producer + candidate + * if (!r) { return; } // no region in any process + * + * // phase 2 is the caller's: it waits >= VGPU_HEARTBEAT_PERIOD_MS, then + * uint64_t proc_cr3, region_gva, hb0; + * vgpup_discover_candidate(v, &proc_cr3, &region_gva, &hb0); // (or reuse open's) + * // ... the caller sleeps here, NOT the core ... + * int alive = vgpup_confirm_alive(m, proc_cr3, region_gva, hb0); + * + * // sampling (lossy pull): + * static uint8_t buf[VGPU_SLOT_STRIDE]; + * vgpup_frame_info fi; + * if (vgpup_sample_frame(r, m, buf, sizeof buf, &fi) == 1) { + * // route fi.desc + buf[0..fi.bytes) to the chosen transport + * } + * + * vgpup_close(r); // frees core state only; v / m stay with the caller + */ + +#include <stddef.h> +#include <stdint.h> + +#include "vgpu_stream.h" /* region ABI: producer/control types, slot geometry */ +#include "win32.h" /* vmie_win32*, proc_list, process, vmie_win32_mem; + * pulls in memmodel.h for vmie_mem / gva_* — the + * producer is found via proc_list under the System + * kcr3, then the region is read under process.cr3 */ + +/* Opaque found vgpu region in a producer's user address space. Heap-owned by the + * core; holds only small private state (proc_cr3, region/ctrl/ring GVA, last + * frame_id, last run_epoch). It does NOT own v / m — those are passed back in on + * every read.
*/ +typedef struct vgpup_region vgpup_region; + +/* ---- handle / lifecycle (the core does NOT own the win32 context) --------- */ + +/* Phase-1 discover + bind: enumerate processes (proc_list) over the BORROWED RO + * win32 context v, scan each process user-AS by structural invariants, snapshot + * hb0, and build a handle carrying the producer's proc_cr3 + region/ctrl/ring + * GVA. v is BORROWED — the core reads through it but never closes it (its + * lifetime is the caller's, tied to the address-space mapping epoch). Returns a + * heap-owned vgpup_region*, or NULL if no region is found in any process. + * Liveness is NOT + * yet proven: the caller must call vgpup_confirm_alive after waiting + * >= VGPU_HEARTBEAT_PERIOD_MS. Sampling before confirmation is allowed (lossy); + * "producer alive" is true only after a positive confirm. + * + * If a later read returns <0, the producer process may have restarted (its + * pages are gone): the caller re-discovers via vgpup_close(r) + vgpup_open(v). */ +vgpup_region* vgpup_open(vmie_win32* v); + +/* Release ONLY the core state. Does NOT touch v / m — the caller closes those + * (their lifetime is the caller's). Safe on NULL. */ +void vgpup_close(vgpup_region* r); + +/* ---- two-phase discovery (the WAIT belongs to the caller) ----------------- */ + +/* Phase 1: find a producer and a candidate region in its user-AS (no liveness). + * Walks proc_list over v and, for each process, scans its user-AS under + * process.cr3 by structural invariants. On the first hit writes the producer's + * cr3 to *out_proc_cr3, the region base GVA to *out_region_gva and the heartbeat + * snapshot to *out_hb0, and returns 0. Returns <0 if no candidate is found in + * any process or a read fails. Pure; does NOT wait. Needs v for proc_list. */ +int vgpup_discover_candidate(vmie_win32* v, uint64_t* out_proc_cr3, + uint64_t* out_region_gva, uint64_t* out_hb0); + +/* Phase 2: confirm liveness. 
The caller calls this >= VGPU_HEARTBEAT_PERIOD_MS + * after phase 1. Re-reads heartbeat at region_gva under proc_cr3 and returns 1 + * if it advanced (alive producer), 0 if it did not tick (dead / not the region), + * <0 on a read error. Takes vmie_mem* m (== vmie_win32_mem(v)) and proc_cr3 — + * the win32 surface is no longer needed here, only gva_read. Pure; does NOT + * wait — the inter-phase delay is the caller's. */ +int vgpup_confirm_alive(vmie_mem* m, uint64_t proc_cr3, + uint64_t region_gva, uint64_t hb0); + +/* ---- snapshots (POD values; read under their seqlock discipline) ---------- */ + +/* Snapshot of the last published frame's descriptor (read under seq[slot]). */ +typedef struct { + uint32_t width, height, stride, format; + uint64_t frame_id; + uint64_t timestamp_ns; +} vgpup_frame_desc; + +/* Result of a frame sample: the descriptor plus the count of bytes copied into + * the caller's buffer (== height*stride, tight). */ +typedef struct { + vgpup_frame_desc desc; + size_t bytes; +} vgpup_frame_info; + +/* Cursor snapshot (read under the cursor_seq acquire gate). seq lets the caller + * tell "cursor idle" from "producer stopped reporting". */ +typedef struct { + uint32_t seq; /* cursor_seq observed for this snapshot */ + uint32_t visible; /* 1 = shown, 0 = hidden */ + int32_t x, y; /* unpacked from cursor_pos (signed) */ + uint16_t hot_x, hot_y; /* unpacked from cursor_hotspot */ + uint16_t glyph_w, glyph_h; /* unpacked from cursor_glyph */ + uint32_t id; /* VGPU_CURSOR_ID_* */ +} vgpup_cursor; + +/* Display-geometry snapshot (read under the geom_seq seqlock). */ +typedef struct { + int32_t virt_x, virt_y; + uint32_t virt_w, virt_h; + int32_t cap_x, cap_y; + uint32_t dpi, refresh_mhz; +} vgpup_geometry; + +/* Lifecycle / status snapshot (cold line; single naturally-aligned atomic + * fields, no seqlock — "fresh enough" by the lossy contract). 
*/ +typedef struct { + uint64_t heartbeat; + uint32_t run_epoch; + uint32_t status; /* VGPU_ST_* */ + uint32_t backend; /* VGPU_BK_* */ + uint32_t error_code; + uint32_t applied_fps; + uint32_t supported_formats; + uint32_t ctrl_ack; + uint32_t full_frame_ack; + uint64_t content_change_ns; +} vgpup_status; + +/* ---- read API (lossy; seqlock discipline lives inside) -------------------- * + * All read functions read under r->proc_cr3 (the producer's cr3, cached in the + * handle at discovery). m is a BORROWED vmie_mem* (== vmie_win32_mem(v)); the + * cr3 is NOT in the signature — it travels in the handle. A <0 return is a hard + * memory-read error: the producer process may have restarted, so the caller + * re-discovers (see "Two epochs + producer restart" in the file header). */ + +/* Sample the latest frame. Seqlock-reads latest/seq[slot]/desc, copies the slot + * bytes out of the RING via gva_read, then re-checks seq[slot] in one window. + * dst is the caller's buffer, cap its capacity. Returns 1 = a fresh frame was + * copied (info filled), 0 = no new frame / writer busy past the retry limit / + * frame would not fit cap (lossy SKIP, not an error), <0 = a memory-read error. + * "Fresh" dedups by frame_id: a frame_id <= the last sampled one returns 0. */ +int vgpup_sample_frame(vgpup_region* r, vmie_mem* m, + uint8_t* dst, size_t cap, vgpup_frame_info* info); + +/* Read the cursor under the cursor_seq acquire gate. 1 = consistent snapshot, + * 0 = writer busy past the retry limit, <0 = read error. */ +int vgpup_read_cursor(vgpup_region* r, vmie_mem* m, vgpup_cursor* out); + +/* Read display geometry under the geom_seq seqlock. Returns as read_cursor. */ +int vgpup_read_geometry(vgpup_region* r, vmie_mem* m, vgpup_geometry* out); + +/* Read the cold-line status/lifecycle. 0 = success, <0 = read error. The single + * atomic fields carry no seqlock; the snapshot is "fresh enough" (lossy). 
*/ +int vgpup_read_status(vgpup_region* r, vmie_mem* m, vgpup_status* out); + +/* The run_epoch from the last vgpup_read_status — a session-break detector for + * the caller while the address space stays live. The core only reports the raw + * value; it holds no reset policy (what to reset is the caller's decision). */ +uint32_t vgpup_run_epoch(const vgpup_region* r); + +/* ---- control-write — SEAM ONLY (this never writes) ------------------------ */ + +/* Desired control-block value (host-RW fields). The caller builds it and later + * forwards it to the writer; the actual gva_write is performed elsewhere, by the + * component that holds read-write access to the region. */ +typedef struct { + uint32_t desired_state; /* VGPU_CMD_* */ + uint32_t target_fps; /* 0 = producer default */ + uint32_t draw_cursor; /* 0/1 */ + uint32_t full_frame_req; /* edge counter (caller bumps vs the previous) */ +} vgpup_control_intent; + +/* Build a control frame WITHOUT writing: fill a vgpu_control_t image from `in`, + * and report the control-block GVA plus the offset/length of the significant + * field range, so an external read-write writer can perform an atomic write + * under the ctrl_gen seqlock. This NEVER touches guest memory (the RO fd would + * not allow it anyway). ctrl_gen is left zero here: the writer owns it under the + * seqlock. The significant range is desired_state .. full_frame_req; + * consumer_tick/attached carry separate heartbeat/intent semantics and are NOT + * part of this intent. + * out_frame — filled vgpu_control_t (significant fields from `in`) + * out_ctrl_gva — control-block GVA (region base + VGPU_CONTROL_OFFSET). This + * GVA is valid in the PRODUCER's user address space: the + * external write MUST be performed under r->proc_cr3, NOT the + * System kcr3. + * out_off — offset of the first significant field (offsetof desired_state) + * out_len — length of the significant range (through full_frame_req) + * Returns 0 on success, <0 if r is NULL. 
The write itself is performed + * elsewhere; there is no live gva_write here and there must not be. */ +int vgpup_build_control_write(vgpup_region* r, const vgpup_control_intent* in, + vgpu_control_t* out_frame, uint64_t* out_ctrl_gva, + uint32_t* out_off, uint32_t* out_len); + +#endif /* VGPU_PERCEPTION_H */ diff --git a/include/vgpu_stream.h b/include/vgpu_stream.h new file mode 100644 index 0000000..87cbe06 --- /dev/null +++ b/include/vgpu_stream.h @@ -0,0 +1,169 @@ +#ifndef VGPU_STREAM_H +#define VGPU_STREAM_H +#include <stdint.h> +#include <stddef.h> /* offsetof */ +#include <stdalign.h> /* alignas */ +#include <assert.h> /* static_assert */ + +/* ===== Geometry — single source of truth (bare ABI, both ends agree) ===== */ +#define VGPU_PAGE 4096u +#define VGPU_SLOT_COUNT 3u +#define VGPU_SLOT_STRIDE (32u * 1024u * 1024u) +#define VGPU_RING_OFFSET (2u * 1024u * 1024u) +#define VGPU_PRODUCER_OFFSET 0u +#define VGPU_CONTROL_OFFSET VGPU_PAGE +#define VGPU_REGION_BYTES (VGPU_RING_OFFSET + (uint64_t)VGPU_SLOT_COUNT * VGPU_SLOT_STRIDE) +#define VGPU_MAX_WIDTH 3840u +#define VGPU_MAX_HEIGHT 2160u +#define VGPU_HEARTBEAT_PERIOD_MS 250u /* producer ticks heartbeat >= 4 Hz always */ +#define VGPU_LATEST_NONE 0xFFFFFFFFu + +static_assert((uint64_t)VGPU_MAX_WIDTH * VGPU_MAX_HEIGHT * 4u <= VGPU_SLOT_STRIDE, + "max-mode tight BGRA must fit one slot"); + +/* enum values travel as uint32 wire-values (not as enum fields → no width instability) */ +enum { VGPU_FMT_BGRA8888 = 0 }; +enum { VGPU_ST_INIT=0, VGPU_ST_CAPTURING=1, VGPU_ST_PAUSED=2, VGPU_ST_STOPPED=3, VGPU_ST_ERROR=4 }; +enum { VGPU_BK_NONE=0, VGPU_BK_NVFBC=1, VGPU_BK_DDA=2, VGPU_BK_GDI=3 }; +enum { VGPU_CMD_STOP=0, VGPU_CMD_RUN=1, VGPU_CMD_PAUSE=2 }; +/* cursor shape identity (wire-uint32); UNKNOWN=0 → custom/unrecognized glyph */ +enum { VGPU_CURSOR_ID_UNKNOWN=0, VGPU_CURSOR_ID_ARROW=1, VGPU_CURSOR_ID_IBEAM=2, + VGPU_CURSOR_ID_WAIT=3, VGPU_CURSOR_ID_CROSS=4, VGPU_CURSOR_ID_HAND=5, + VGPU_CURSOR_ID_SIZENS=6, VGPU_CURSOR_ID_SIZEWE=7, VGPU_CURSOR_ID_SIZENWSE=8, +
VGPU_CURSOR_ID_SIZENESW=9, VGPU_CURSOR_ID_SIZEALL=10, VGPU_CURSOR_ID_NO=11, + VGPU_CURSOR_ID_APPSTARTING=12 }; + +/* ===== Per-slot descriptor (under hot.seq[slot]) ===== */ +typedef struct { + uint32_t width; /* pixels */ + uint32_t height; /* pixels */ + uint32_t stride; /* bytes/row; INVARIANT: == width*4 (tight) */ + uint32_t format; /* VGPU_FMT_* */ + uint64_t frame_id; /* == producer.frame_id at publish time */ + uint64_t timestamp_ns; /* capture time, monotonic */ +} vgpu_desc_t; +static_assert(sizeof(vgpu_desc_t) == 32, "desc layout"); +static_assert(offsetof(vgpu_desc_t, width) == 0, "desc.width"); +static_assert(offsetof(vgpu_desc_t, height) == 4, "desc.height"); +static_assert(offsetof(vgpu_desc_t, stride) == 8, "desc.stride"); +static_assert(offsetof(vgpu_desc_t, format) == 12, "desc.format"); +static_assert(offsetof(vgpu_desc_t, frame_id) == 16, "desc.frame_id"); +static_assert(offsetof(vgpu_desc_t, timestamp_ns) == 24, "desc.timestamp_ns"); + +/* ===== Producer block (host-RO): hot publish line + cold status line ===== */ +typedef struct { + /* --- hot publish line --- */ + alignas(64) + uint32_t latest; /* index of last; VGPU_LATEST_NONE until 1st frame */ + uint32_t _r0; + uint64_t frame_id; /* monotonic frame counter (8-aligned) */ + uint32_t seq[VGPU_SLOT_COUNT]; /* per-slot seqlock: even=stable, odd=writing */ + uint32_t _r1; + vgpu_desc_t desc[VGPU_SLOT_COUNT]; /* self-describing slots */ + + /* --- cold status line --- */ + alignas(64) + uint64_t heartbeat; /* monotonic; ticks always (even STOPPED/PAUSED) */ + uint32_t run_epoch; /* +1 per start (session break for host) */ + uint32_t status; /* VGPU_ST_* */ + uint32_t backend; /* VGPU_BK_* */ + uint32_t error_code; /* 0=none; else fatal detail */ + uint32_t applied_fps; /* publish-rate cap the producer actually applies; + actual rate may be lower on static content or + backend limits — host measures real fps from + desc.timestamp_ns */ + uint32_t supported_formats; /* bitmask (1u< + +/* vmctl.h 
— public API for a QEMU VM Input layer (actuator): input injection + + * power/lifecycle actuation. One handle; the input driver is selected + * declaratively through vmctl_config. OS-agnostic surface. */ + +typedef struct vmctl vmctl_t; /* opaque handle */ + +/* ===== Input drivers + open ===== */ +typedef enum { + VMCTL_DRIVER_QMP, /* QMP input-send-event (no guest driver required) */ + VMCTL_DRIVER_UINPUT /* host uinput source; optional passthrough into guest */ + /* via QEMU virtio-input-host-pci (Linux). uinput != virtio. */ +} vmctl_driver; + +#define VMCTL_PTR_ABS 1 /* uinput: absolute tablet */ +#define VMCTL_PTR_REL 2 /* uinput: relative mouse */ +#define VMCTL_PTR_BOTH 3 /* uinput: two devices A=abs B=rel */ + +typedef struct { + unsigned bustype; /* HID bus type, e.g. 0x0003 (USB) */ + unsigned vendor; /* vendor id */ + unsigned product; /* product id */ + unsigned version; /* device version */ + const char* name; /* device name; library copies it */ +} vmctl_uinput_id; + +typedef struct { + vmctl_driver driver; + const char* qmp_path; /* QMP unix socket; required for QMP, optional (passthrough) for UINPUT */ + const char* input_bus; /* virtio-input-host-pci bus "pci.0" for passthrough; "" = none */ + int ptr_mode; /* UINPUT VMCTL_PTR_*; 0 for QMP */ + const vmctl_uinput_id* uinput_id; /* UINPUT only; NULL = built-in defaults */ +} vmctl_config; + +vmctl_t* vmctl_open (const vmctl_config* cfg); /* NULL on error */ +void vmctl_close(vmctl_t* v); /* safe on NULL */ + +/* ===== Input constants ===== */ +#define VMCTL_ABS_MAX 32767 /* abs coordinates 0..VMCTL_ABS_MAX */ +#define VMCTL_AXIS_X 0 +#define VMCTL_AXIS_Y 1 +#define VMCTL_SCROLL_V 0 /* vertical */ +#define VMCTL_SCROLL_H 1 /* horizontal */ +#define VMCTL_BTN_LEFT 0 +#define VMCTL_BTN_RIGHT 1 +#define VMCTL_BTN_MIDDLE 2 +#define VMCTL_BTN_SIDE 3 +#define VMCTL_BTN_EXTRA 4 +#define VMCTL_BTN_FORWARD 5 +#define VMCTL_BTN_BACK 6 +#define VMCTL_BTN_TASK 7 + +#define VMCTL_KEY_CODE_MAX 0x2ff /* highest 
supported evdev key code (inclusive) */ +#define VMCTL_KEYS_SNAPSHOT_BYTES ((VMCTL_KEY_CODE_MAX + 1) / 8) /* bytes for vmctl_keys_snapshot */ + +/* ===== Event batch (value-type, stack; build ONLY via builders — ev[] is not API) ===== */ +#define VMCTL_BATCH_MAX 64 +typedef struct { + int kind; /* internal event-kind code; set by builders */ + int code; /* axis / button / evdev-code (per kind) */ + int value; /* abs-value / rel-delta / down(0|1) */ + double scroll; /* scroll magnitude (scroll only) */ +} vmctl_event; +typedef struct { vmctl_event ev[VMCTL_BATCH_MAX]; int count; } vmctl_batch; + +void vmctl_batch_init (vmctl_batch* b); +void vmctl_batch_abs (vmctl_batch* b, int axis, int value); +void vmctl_batch_rel (vmctl_batch* b, int axis, int delta); +void vmctl_batch_btn (vmctl_batch* b, int btn, int down); +void vmctl_batch_key (vmctl_batch* b, int evdev_code, int down); +void vmctl_batch_scroll(vmctl_batch* b, int axis, double value); +int vmctl_batch_send (vmctl_t* v, vmctl_batch* b); /* one round-trip; 0=ok, -1=err */ + +/* ===== Single events (wrappers over a 1-event batch) ===== */ +int vmctl_abs (vmctl_t* v, int axis, int value); /* 0..VMCTL_ABS_MAX */ +int vmctl_rel (vmctl_t* v, int axis, int delta); +int vmctl_btn (vmctl_t* v, int btn, int down); /* VMCTL_BTN_* */ +int vmctl_key (vmctl_t* v, int evdev_code, int down); /* Linux KEY_* */ +int vmctl_scroll(vmctl_t* v, int axis, double value); /* VMCTL_SCROLL_* */ + +/* ===== Held-state receipt (read-only) ===== + * "held" = key/button state as THIS handle last actuated it, not guest truth. + * It is the actuator's record of its own last output (sensing the guest belongs + * to the sensors layer, not here). Updated only after a successful send; the + * send path NEVER reads this map (no dedup, no auto-release, no autorepeat). 
*/ + +int vmctl_key_held (vmctl_t* v, int evdev_code); /* Linux KEY_*; 1=down 0=up */ +int vmctl_btn_held (vmctl_t* v, int btn); /* VMCTL_BTN_*; 1=down 0=up */ +int vmctl_keys_snapshot(vmctl_t* v, unsigned char* bits, size_t nbytes); + /* copy key down-bits (EVIOCGKEY-style); + returns bytes written, -1 on bad args */ +unsigned vmctl_btns_snapshot(vmctl_t* v); /* VMCTL_BTN_* down-bits as a mask (bits 0..7) */ + +/* ===== Power/lifecycle actuation (requires a QMP connection; -1 if there is none) ===== */ +int vmctl_powerdown(vmctl_t* v); /* system_powerdown (ACPI soft-off) */ +int vmctl_reset (vmctl_t* v); /* system_reset */ +int vmctl_wakeup (vmctl_t* v); /* system_wakeup (from S3/S4) */ +int vmctl_pause (vmctl_t* v); /* stop */ +int vmctl_resume (vmctl_t* v); /* cont */ + +/* Transfer sequencing/context belongs to signaling; timing and decisions to + * control; reading VM state to sensors. Here, in the Input layer, only atomic + * actuation. */ + +#endif /* VMCTL_H */ diff --git a/include/vmsig_control.h b/include/vmsig_control.h index 548b81e..4318e12 100644 --- a/include/vmsig_control.h +++ b/include/vmsig_control.h @@ -37,6 +37,9 @@ typedef struct { #define VMSIG_CAP_MEMWRITE 0x100u /* CMD_MEMWRITE: atomic write-signaled mutation of shared guest memory * (separate from the freed CAP_MEMREAD bit — read != write; fresh bit * avoids stale-grant aliasing to this privileged cap). */ +#define VMSIG_CAP_ROSTER 0x200u /* SUBSCRIPTION to the VM roster (UP VMSIG_EV_ROSTER): which VMs occupy + * which endpoints, by name/state. Distinct from CAP_OBSERVE — this is + * host-wide inventory enumeration, not observing one VM's content. 
*/ typedef struct { uint32_t principal; /* id for auditing (uid/token) */ diff --git a/include/vmsig_core.h b/include/vmsig_core.h index 01c04a6..ec81052 100644 --- a/include/vmsig_core.h +++ b/include/vmsig_core.h @@ -51,10 +51,20 @@ void vmsig_core_set_arb_policy(vmsig_core* c, vmsig_arb_policy cb, void* /* Register an adapter for VM `endpoint`: open(cfg,endpoint) -> attach(...), * enroll each yielded fd into epoll and into the dispatch table fd->(adapter,cookie). - * Returns the adapter id (>=0) or -1. */ + * Returns the adapter id (>=0) or -1. Runtime-safe: may be called AFTER vmsig_core_run + * has started, from a loop-thread callback (e.g. a discovery SLOT_SOURCE), to hot-plug + * a VM's adapters; a freed adapter slot is reused so churn does not exhaust the table. */ int vmsig_core_add_adapter(vmsig_core* c, const vmsig_adapter_ops* ops, const void* cfg, uint32_t endpoint); +/* Request runtime detach of EVERY adapter currently attached to `endpoint` (the whole + * VM trio). Deferred: the teardown (epoch settle + SEAM_DOWN + lease release + epoll DEL + * + ops->close) runs after the current event batch, like core_request_drop for controls. + * Safe to call from a loop-thread callback (e.g. inotify discovery). No-op if endpoint + * is not attached or >= 64. The composing of the trio at attach is the caller's job + * (3x add_adapter); detach is by endpoint so the caller needs no per-adapter ids. */ +void vmsig_core_detach_endpoint(vmsig_core* c, uint32_t endpoint); + /* Attach a control endpoint (in-process or socket) with a GRANT (capability set). * grant == NULL => default-deny (poller inert). The core sees only the neutral * vtable + grant + (opt.) fd. Returns the control id (>=0) or -1. 
*/ diff --git a/include/vmsig_event.h b/include/vmsig_event.h index fe20876..69f7133 100644 --- a/include/vmsig_event.h +++ b/include/vmsig_event.h @@ -63,6 +63,11 @@ typedef enum { /* --- UP: cursor (vgpu sensor; emitted by the vgpu-perception shell-as-control) --- */ VMSIG_EV_CURSOR_STATE = 37, /* cursor position/visibility; inln=vmsig_cursor; cap OBSERVE|INPUT */ + /* --- UP: VM roster (inventory coherence; daemon-originated, source=CORE) --- */ + VMSIG_EV_ROSTER = 38, /* which VM occupies this endpoint: inln=vmsig_roster + * {vmid,state,action,name}, endpoint in the header; retained + * per-endpoint + replayed to late subscribers; cap ROSTER */ + /* --- UP: input/lifecycle ack (INPUT seam) --- */ VMSIG_EV_ACT_ACK = 48, /* down-command completed (ok/err) */ VMSIG_EV_VM_LIFECYCLE = 49, /* power/lifecycle state report */ diff --git a/include/vmsig_roster.h b/include/vmsig_roster.h new file mode 100644 index 0000000..71ae11e --- /dev/null +++ b/include/vmsig_roster.h @@ -0,0 +1,40 @@ +#ifndef VMSIG_ROSTER_H +#define VMSIG_ROSTER_H +#include + +/* vmsig_roster.h — NEUTRAL inventory-coherence contract. + * + * The signaling daemon owns the discovery namespace and assigns each running VM a stable + * ENDPOINT slot [0,64). The roster is the per-endpoint datum "which VM currently occupies + * this slot, by what name, in what coarse lifecycle state". It is published as an UP event + * VMSIG_EV_ROSTER (source=CORE), retained per endpoint and replayed to a late subscriber — + * exactly like the MEMCTX datum, but carrying identity rather than an address-space handle. + * + * This is COHERENCE of shared state (the endpoint roster is shared across all controls), + * NOT perception and NOT access-brokering. A consumer decodes it WITHOUT any host/Proxmox + * knowledge: `endpoint` rides in the event header (ev->endpoint), the rest in inln[48]. + * CAP_ROSTER gates RECEIVING the datum (subscription), not access — access stays OS-DAC. */ + +/* Roster transition (entry->action). 
*/ +enum { + VMSIG_ROSTER_ATTACH = 0, /* endpoint is now occupied by `vmid` */ + VMSIG_ROSTER_DETACH = 1, /* endpoint vacated (the slot bit is being released) */ + VMSIG_ROSTER_UPDATE = 2 /* same vmid on the slot; state and/or name changed */ +}; + +/* roster->flags bits */ +#define VMSIG_ROSTER_NAME_TRUNC 0x1u /* the VM name did not fit and was truncated */ + +#define VMSIG_ROSTER_NAME_MAX 32 /* inline, NUL-terminated, truncated name */ + +/* The roster datum, carried inline (inln[48]). `endpoint` is NOT here — it is the event + * header's ev->endpoint (where every event carries it, and what the wire serializes). */ +typedef struct { + uint32_t vmid; /* host VM id (e.g. Proxmox vmid 100..1e9) — does NOT fit endpoint */ + uint32_t state; /* coarse lifecycle: VMSIG_VM_* (vmsig_event.h), from the host plane */ + uint32_t action; /* VMSIG_ROSTER_ATTACH/DETACH/UPDATE */ + uint32_t flags; /* VMSIG_ROSTER_* (e.g. NAME_TRUNC) */ + char name[VMSIG_ROSTER_NAME_MAX]; /* NUL-terminated, truncated display name */ +} vmsig_roster; /* 4+4+4+4+32 = 48 — exactly inln[48] */ + +#endif /* VMSIG_ROSTER_H */ diff --git a/packaging/config/vmsigd.conf b/packaging/config/vmsigd.conf new file mode 100644 index 0000000..da83c3c --- /dev/null +++ b/packaging/config/vmsigd.conf @@ -0,0 +1,27 @@ +# vmsigd.conf — vmsig management daemon configuration. +# Installed as a dpkg conffile (operator edits are preserved across upgrades). + +# Control listener the consumer dials. '@' prefix => abstract namespace. +socket = /run/vmsig/vmsigd.sock + +# Discovery namespace: a VM is managed iff its guest-RAM backing file appears here as +# vm--ram. The daemon owns this directory (created at boot via tmpfiles). +watch = /dev/shm/vmsig + +# Inventory source of truth (read on demand; not watched) and the QMP socket directory. +pve_conf = /etc/pve/qemu-server +qmp_dir = /var/run/qemu-server + +# vmid<->endpoint slot persistence (tmpfs; re-derived per daemon restart). "" => off. 
+slots = /dev/shm/vmsig/.slots + +# ---- Admission policy: one [grant uid=N] stanza per local uid. ----------------------- +# Entitlements are COARSE (the control enforces per-user caps behind the grant). `vmids` +# is either `*` (all VMs) or a list of vmids; it is translated to endpoint bits at connect +# time. `caps` is a comma list of: observe,input,lifecycle,power,vm,memctx,memwrite,roster. +# +# Example (edit before enabling the service): +# [grant uid=0] +# vmids = * +# caps = observe,input,lifecycle,power,vm,memctx,memwrite,roster +# arb_prio = 100 diff --git a/packaging/deb/conffiles b/packaging/deb/conffiles new file mode 100644 index 0000000..f861c43 --- /dev/null +++ b/packaging/deb/conffiles @@ -0,0 +1 @@ +/etc/vmsig/vmsigd.conf diff --git a/packaging/deb/control.in b/packaging/deb/control.in new file mode 100644 index 0000000..cb6d033 --- /dev/null +++ b/packaging/deb/control.in @@ -0,0 +1,13 @@ +Package: vmsig +Version: @VERSION@ +Section: admin +Priority: optional +Architecture: amd64 +Depends: @DEPENDS@ +Maintainer: @MAINTAINER@ +Description: VM signaling coherence daemon and host SI libraries + vmsig serves a unix-socket control plane over the signaling layer for the VMs it + discovers: lifecycle/state, coherent guest address-space context handoff, and arbitrated + input and memory-write actuation. Ships the daemon (vmsigd), the signaling library, the + host-side vgpu perception library, and a systemd unit. Configured via + /etc/vmsig/vmsigd.conf. 
diff --git a/packaging/deb/postinst b/packaging/deb/postinst new file mode 100755 index 0000000..9a46051 --- /dev/null +++ b/packaging/deb/postinst @@ -0,0 +1,19 @@ +#!/bin/sh +set -e +case "$1" in +configure) + ldconfig || true + mkdir -p /etc/vmsig + chmod 0640 /etc/vmsig/vmsigd.conf 2>/dev/null || true # carries the uid->grant policy + mkdir -p /dev/shm/vmsig && chmod 0755 /dev/shm/vmsig # also (re)created at boot via tmpfiles + if [ -d /run/systemd/system ]; then + systemctl daemon-reload || true + systemd-tmpfiles --create /usr/lib/tmpfiles.d/vmsig.conf || true + systemctl enable vmsigd.service || true # enable, but do NOT start + fi + echo "vmsig: review the [grant] policy in /etc/vmsig/vmsigd.conf, then: systemctl start vmsigd" >&2 + ;; +abort-upgrade|abort-remove|abort-deconfigure) + ;; +esac +exit 0 diff --git a/packaging/deb/prerm b/packaging/deb/prerm new file mode 100755 index 0000000..ebe1654 --- /dev/null +++ b/packaging/deb/prerm @@ -0,0 +1,12 @@ +#!/bin/sh +set -e +case "$1" in +remove|deconfigure) + if [ -d /run/systemd/system ]; then + systemctl disable --now vmsigd.service || true + fi + ;; +upgrade|failed-upgrade) + ;; +esac +exit 0 diff --git a/packaging/systemd/vmsigd.service b/packaging/systemd/vmsigd.service new file mode 100644 index 0000000..b2fdb22 --- /dev/null +++ b/packaging/systemd/vmsigd.service @@ -0,0 +1,41 @@ +[Unit] +Description=vmsig VM signaling coherence daemon +# No host/VM is named here: the daemon serves whatever appears under its watch dir. +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +# root: reads QEMU-owned /dev/shm RAM backings, dials per-VM QMP, reads /etc/pve (OS-DAC). +# The security boundary is the per-uid grant, not the process uid; per-VM isolation, if +# required, is the deployment's job (process-per-VM), not this daemon's. 
+User=root +ExecStart=/usr/sbin/vmsigd +Restart=on-failure +RestartSec=2 + +# systemd creates and owns /run/vmsig (the control socket dir) and cleans it on stop. +RuntimeDirectory=vmsig +RuntimeDirectoryMode=0755 + +StandardOutput=journal +StandardError=journal + +# ---- hardening: contain a root daemon by namespace/capability, not by uid ---- +NoNewPrivileges=true +ProtectSystem=strict +ReadWritePaths=/dev/shm/vmsig /run/vmsig +ReadOnlyPaths=/etc/pve /var/run/qemu-server +ProtectHome=true +PrivateTmp=true +RestrictAddressFamilies=AF_UNIX +CapabilityBoundingSet= +AmbientCapabilities= +ProtectKernelTunables=true +ProtectKernelModules=true +ProtectControlGroups=true +RestrictRealtime=true +LockPersonality=true + +[Install] +WantedBy=multi-user.target diff --git a/packaging/tmpfiles/vmsig.conf b/packaging/tmpfiles/vmsig.conf new file mode 100644 index 0000000..5d3608c --- /dev/null +++ b/packaging/tmpfiles/vmsig.conf @@ -0,0 +1,3 @@ +# /dev/shm is tmpfs (wiped on reboot): (re)create the discovery namespace before the unit. +# Type Path Mode UID GID Age Argument +d /dev/shm/vmsig 0755 root root - diff --git a/src/adapter/input/include/input.h b/src/adapter/input/include/input.h index 25b3703..f777281 100644 --- a/src/adapter/input/include/input.h +++ b/src/adapter/input/include/input.h @@ -1,10 +1,10 @@ #ifndef VMSIG_INPUT_H #define VMSIG_INPUT_H -/* Private config of the input adapter (vmctl). cfg==NULL => stub mode. Armed mode - * (VMSIG_WITH_VMCTL) opens vmctl_open() and actuates for real. Injection is ALWAYS - * uinput (orphaned host uinput + external QEMU input-linux). qmp_path is kept for the - * SERVICE path (power/lifecycle via vmctl QMP), not for input injection. */ +/* Private config of the input adapter (vmctl, in-tree at src/si/input/). cfg==NULL or + * stub!=0 => stub mode (ack without actuation). stub==0 opens vmctl_open() and actuates for + * real. 
Injection is ALWAYS uinput (orphaned host uinput + external QEMU input-linux); + * qmp_path is kept for the SERVICE path (power/lifecycle via vmctl QMP), not for injection. */ typedef struct { int stub; const char* qmp_path; /* for power/lifecycle (vmctl QMP); NOT input injection */ diff --git a/src/adapter/input/input.c b/src/adapter/input/input.c index c267297..2e18f49 100644 --- a/src/adapter/input/input.c +++ b/src/adapter/input/input.c @@ -3,19 +3,17 @@ * Mechanism (recommended): vmctl is a blocking QMP round-trip; we run it on a * worker thread, completion ack via a completion-eventfd. The uinput path is a * local instantaneous write; when armed it would be done inline (see comment in submit). - * Real actuation is under VMSIG_WITH_VMCTL; otherwise the stub acks (spine without a VM). */ + * Real actuation when cfg.stub==0 (vmctl opened); otherwise the stub acks (spine without a VM). + * vmctl is the in-tree input driver (src/si/input/, absorbed); cfg.stub gates opening it. */ #include "vmsig_adapter.h" #include "adapter_util.h" #include "input.h" +#include "vmctl.h" #include #include #include #include -#ifdef VMSIG_WITH_VMCTL -#include "vmctl.h" -#endif - /* POD request/result of the worker. 
*/ typedef struct { int cmd; /* 0 = input event, 1 = lifecycle */ @@ -40,9 +38,7 @@ struct vmsig_adapter { vmsig_emit emit; vmsig_worker* worker; const char* qmp_path; /* borrowed from cfg (valid through attach); SERVICE power/lifecycle */ -#ifdef VMSIG_WITH_VMCTL - vmctl_t* vmctl; -#endif + vmctl_t* vmctl; /* NULL in stub mode (cfg.stub) — no actuator opened */ }; static int input_job(void* user, const void* reqp, void* resp) { @@ -53,7 +49,6 @@ static int input_job(void* user, const void* reqp, void* resp) { rs->corr = rq->corr; rs->origin = rq->origin; rs->noack = rq->noack; -#ifdef VMSIG_WITH_VMCTL if (a->vmctl) { int r = -1; if (rq->cmd == 0) { @@ -87,9 +82,8 @@ static int input_job(void* user, const void* reqp, void* resp) { rs->ok = (r == 0); return r; } -#endif (void)a; - rs->ok = 1; /* stub: ack without actuation */ + rs->ok = 1; /* stub: ack without actuation (vmctl not opened) */ return 0; } @@ -109,7 +103,6 @@ static int in_attach(vmsig_adapter* a, const vmsig_emit* emit, vmsig_fd_reg* reg a->worker = vmsig_worker_new(input_job, a, 1, 64); /* QMP is a serial channel, cap 64 */ if (!a->worker) return -1; -#ifdef VMSIG_WITH_VMCTL if (!a->stub) { /* armed: open the actuator. Injection is ALWAYS uinput (orphaned host uinput + external * QEMU input-linux). PTR_BOTH gives both pointer forms a device (A=abs tablet, B=rel @@ -125,7 +118,6 @@ static int in_attach(vmsig_adapter* a, const vmsig_emit* emit, vmsig_fd_reg* reg a->vmctl = vmctl_open(&vcfg); if (!a->vmctl) { vmsig_worker_free(a->worker); a->worker = NULL; return -1; } } -#endif reg[0].fd = vmsig_worker_evfd(a->worker); reg[0].epoll_events = EPOLLIN; @@ -166,7 +158,6 @@ static int in_submit(vmsig_adapter* a, const vmsig_event* ev) { * actuate — nothing to hold). 
*/ vmsig_input_held h; memset(&h, 0, sizeof h); -#ifdef VMSIG_WITH_VMCTL if (a->vmctl) { const uint32_t capn = (uint32_t)(sizeof h.ent / sizeof h.ent[0]); unsigned char bits[VMCTL_KEYS_SNAPSHOT_BYTES]; @@ -184,7 +175,6 @@ static int in_submit(vmsig_adapter* a, const vmsig_event* ev) { else h.flags |= VMSIG_INPUT_HELD_TRUNC; } } -#endif vmsig_event up; memset(&up, 0, sizeof up); up.kind = VMSIG_EV_INPUT_HELD; up.source = VMSIG_SRC_INPUT; up.dir = VMSIG_DIR_UP; @@ -223,9 +213,7 @@ static int in_submit(vmsig_adapter* a, const vmsig_event* ev) { static void in_close(vmsig_adapter* a) { if (!a) return; vmsig_worker_free(a->worker); -#ifdef VMSIG_WITH_VMCTL if (a->vmctl) vmctl_close(a->vmctl); -#endif free(a); } diff --git a/src/adapter/memctx/include/memctx.h b/src/adapter/memctx/include/memctx.h index 28ad5b9..588bfb2 100644 --- a/src/adapter/memctx/include/memctx.h +++ b/src/adapter/memctx/include/memctx.h @@ -8,8 +8,10 @@ typedef struct { int stub; /* 1 => synthetic kcr3/RO-fd (spine without a VM) */ const char* ram_path; /* armed: path to guest RAM backing (NOT published outward) */ uint64_t low; /* below-4G split (vmie_win32_open / locator.low) */ - int ro_fd; /* >=0 => infra supplied a pre-sealed RO-fd (policy); */ - /* <0 => default: open(ram_path, O_RDONLY) / stub-memfd */ + int ro_fd; /* >=0 => infra hands a pre-sealed RO-fd (policy); OWNERSHIP */ + /* TRANSFERS to the adapter (closed in close()) — the */ + /* caller dups first if it must keep its own copy. 
*/ + /* <0 => default: open(ram_path, O_RDONLY) / stub-memfd */ } vmsig_memctx_cfg; /* Max SRC bytes per atomic gva_write (bounds the worker POD slot; mc_req header + src diff --git a/src/adapter/memctx/memctx.c b/src/adapter/memctx/memctx.c index f7dad37..250a267 100644 --- a/src/adapter/memctx/memctx.c +++ b/src/adapter/memctx/memctx.c @@ -85,7 +85,7 @@ struct vmsig_adapter { int stub; const char* ram_path; /* armed: RAM-backing path (NOT published outward) */ uint64_t low; - int cfg_ro_fd; /* >=0 => infra-sealed RO-fd (policy); <0 => default */ + int cfg_ro_fd; /* >=0 => infra-sealed RO-fd (owned by adapter, closed in mc_close); <0 => default */ vmsig_emit emit; int registered; /* register_memctx already called */ vmsig_worker* worker; /* off-loop bootstrap + atomic writes */ @@ -398,7 +398,11 @@ static void mc_close(vmsig_adapter* a) { if (a->win) vmie_win32_close(a->win); /* AFTER worker join: no in-flight gva_write */ #endif if (a->stub_fd >= 0) close(a->stub_fd); - /* cfg_ro_fd belongs to the infrastructure (the open caller) — do NOT close it. */ + /* ro_fd ownership transferred to the adapter at open(): close it here so a re-grant + * (detach + re-attach with a fresh infra ro_fd) does not leak the prior one. Infra + * that must keep its own copy dups before handing it in — symmetric to the holder + * side, which dups the borrowed RO-fd it receives. */ + if (a->cfg_ro_fd >= 0) close(a->cfg_ro_fd); free(a); } diff --git a/src/core/core.c b/src/core/core.c index 0dcd0dd..4700e0c 100644 --- a/src/core/core.c +++ b/src/core/core.c @@ -68,10 +68,22 @@ vmsig_core* vmsig_core_new(vmsig_ctx* ctx) { int vmsig_core_add_adapter(vmsig_core* c, const vmsig_adapter_ops* ops, const void* cfg, uint32_t endpoint) { - if (!c || !ops || c->nadapters >= VMSIG_MAX_ADAPTERS) return -1; + if (!c || !ops) return -1; + + /* Reuse a reaped (inactive) adapter entry so runtime detach/re-attach churn does + * not exhaust the fixed table; otherwise grow up to the ceiling. 
*/ + int id = -1; + for (int i = 0; i < c->nadapters; i++) + if (!c->adapters[i].active) { id = i; break; } + if (id < 0) { + if (c->nadapters >= VMSIG_MAX_ADAPTERS) return -1; + id = c->nadapters++; + } + core_adapter_ent* e = &c->adapters[id]; + uint16_t gen = e->gen; /* generation survives the memset below */ vmsig_adapter* a = ops->open(cfg, endpoint); - if (!a) return -1; + if (!a) return -1; /* entry stays inactive (reusable) */ vmsig_emit emit = { core_emit_up, core_register_memctx, core_unregister_memctx, c }; vmsig_fd_reg reg[VMSIG_ADAPTER_FDS]; @@ -80,23 +92,48 @@ int vmsig_core_add_adapter(vmsig_core* c, const vmsig_adapter_ops* ops, int n = ops->attach(a, &emit, reg, VMSIG_ADAPTER_FDS); if (n < 0) { ops->close(a); return -1; } + memset(e, 0, sizeof *e); + e->ops = ops; + e->a = a; + e->endpoint = endpoint; + e->active = 1; + e->gen = (uint16_t)(gen + 1); + e->nslot = 0; + for (int i = 0; i < n; i++) { uint32_t events = reg[i].epoll_events ? reg[i].epoll_events : (uint32_t)EPOLLIN; core_slot* s = core_register_fd(c, reg[i].fd, events, SLOT_ADAPTER); - if (!s) { ops->close(a); return -1; } + if (!s) { + /* roll back: deregister the fds enrolled so far, then close + free the entry. */ + for (int k = 0; k < e->nslot; k++) { + epoll_ctl(c->epfd, EPOLL_CTL_DEL, e->slots[k]->fd, NULL); + e->slots[k]->role = SLOT_DEAD; + } + ops->close(a); + e->active = 0; e->a = NULL; e->nslot = 0; + return -1; + } s->ops = ops; s->adapter = a; s->cookie = reg[i].cookie; + if (e->nslot < VMSIG_ADAPTER_FDS) e->slots[e->nslot++] = s; } - - int id = c->nadapters; - c->adapters[c->nadapters].ops = ops; - c->adapters[c->nadapters].a = a; - c->adapters[c->nadapters].endpoint = endpoint; - c->nadapters++; return id; } +/* Request runtime detach of every adapter on `endpoint` (deferred reap after the batch, + * mirrors core_request_drop). The teardown itself (epoch settle, SEAM_DOWN, lease release, + * epoll DEL, ops->close) runs in core_reap_adapters on the loop thread. 
*/ +void vmsig_core_detach_endpoint(vmsig_core* c, uint32_t endpoint) { + if (!c || endpoint >= 64) return; + int any = 0; + for (int i = 0; i < c->nadapters; i++) { + core_adapter_ent* e = &c->adapters[i]; + if (e->active && e->endpoint == endpoint) { e->reap = 1; any = 1; } + } + if (any) core_wake(c); +} + int vmsig_core_add_control(vmsig_core* c, const vmsig_control_ops* ops, void* ctl, const vmsig_grant* grant) { if (!c || !ops) return -1; @@ -137,6 +174,7 @@ int vmsig_core_add_control(vmsig_core* c, const vmsig_control_ops* ops, void* ct * this control is qualified). For a control added BEFORE the first publication, * the cell is not yet valid — it receives MEMCTX via the normal multicast in pump_up. */ core_memctx_replay(c, id); + core_roster_replay(c, id); /* late subscriber: retained VM roster (CAP_ROSTER) */ return id; /* ncontrols already bumped when picking id (on growth); reuse does not grow it */ } @@ -205,7 +243,8 @@ void vmsig_core_free(vmsig_core* c) { * FIRST: their close stops off-loop workers and unregisters their seams (e.g. * memctx) BEFORE destruction. */ for (int i = 0; i < c->nadapters; i++) - if (c->adapters[i].ops->close) c->adapters[i].ops->close(c->adapters[i].a); + if (c->adapters[i].active && c->adapters[i].ops->close) + c->adapters[i].ops->close(c->adapters[i].a); for (int i = 0; i < c->ncontrols; i++) if (c->controls[i].active && c->controls[i].ops->close) c->controls[i].ops->close(c->controls[i].ctl); diff --git a/src/core/include/core_internal.h b/src/core/include/core_internal.h index b0f961b..5bb330a 100644 --- a/src/core/include/core_internal.h +++ b/src/core/include/core_internal.h @@ -1,6 +1,7 @@ #ifndef VMSIG_CORE_INTERNAL_H #define VMSIG_CORE_INTERNAL_H #include "vmsig_core.h" +#include "vmsig_roster.h" #include /* Private internals of the epoll core. 
Each registered fd carries a @@ -41,6 +42,11 @@ typedef struct { const vmsig_adapter_ops* ops; vmsig_adapter* a; uint32_t endpoint; + int active; /* 0 = free/reaped slot (reusable) */ + int reap; /* deferred runtime detach requested */ + uint16_t gen; /* +1 on each (re)use (ABA guard / debug) */ + core_slot* slots[VMSIG_ADAPTER_FDS]; /* epoll slots we registered */ + int nslot; } core_adapter_ent; @@ -57,6 +63,15 @@ typedef struct { vmsig_memctx_reg reg; /* valid when registered */ } core_memctx_cell; +/* ===== Retained VM roster (inventory coherence; daemon-published) ===== + * One value snapshot per endpoint: the last published roster datum. Simpler than the + * MEMCTX cell — roster carries no fd and no borrowed buffer, so the cell is pure POD and + * delivery is the ordinary broadcast (no re-describe / re-share). valid=0 on DETACH. */ +typedef struct { + int valid; /* a roster entry is published for this endpoint */ + vmsig_roster entry; /* last published {vmid,state,action,name} (by value) */ +} core_roster_cell; + /* ===== Lease layer (arbitration of exclusive ownership of destructive resources) ===== * One cell per (endpoint, lease-class): who owns it (origin) + a snapshot of arb_prio at * acquisition time. owner=0 => free. The snapshot (rather than the live grant) makes the @@ -108,6 +123,7 @@ struct vmsig_core { uint32_t epoch[64]; /* per-endpoint VM session epoch */ core_memctx_cell memctx[64]; /* per-endpoint retained context */ + core_roster_cell roster[64]; /* per-endpoint retained roster */ core_lease_cell lease[64][VMSIG_LEASE_CLASSES]; /* lease per (endpoint, class) */ vmsig_arb_policy arb_cb; /* preemption policy (NULL=default) */ @@ -150,6 +166,14 @@ void core_memctx_route(vmsig_core* c, const vmsig_event* trigger); * defined in loop.c). 
*/ void core_memctx_replay(vmsig_core* c, int ctl_id); +/* ===== VM roster (inventory coherence; defined in loop.c alongside the memctx seam) ===== */ +/* Publish a roster transition for `endpoint`: retain the datum (valid=0 on DETACH) and + * broadcast VMSIG_EV_ROSTER to qualified subscribers (CAP_ROSTER + source + endpoint). */ +void core_roster_publish(vmsig_core* c, uint32_t endpoint, const vmsig_roster* entry); + +/* Replay the retained roster to a single (late) subscriber (from vmsig_core_add_control). */ +void core_roster_replay(vmsig_core* c, int ctl_id); + /* Bump the endpoint's epoch on a destructive lifecycle transition: epoch++, invalidate * the retain cell, emit MEMCTX_INVALIDATED, request re-bootstrap from the adapter. * Observed by the core in pump_up on UP VM_LIFECYCLE (defined in loop.c). */ diff --git a/src/core/linux/loop.c b/src/core/linux/loop.c index d2eb072..f01ef81 100644 --- a/src/core/linux/loop.c +++ b/src/core/linux/loop.c @@ -105,6 +105,7 @@ static uint32_t source_mask_for_lease_class(int cls) { * input actor (INPUT); otherwise CAP_OBSERVE (frames/SEAM/generic). The grant_allows_up * gate checks intersection, so OBSERVE|INPUT means "either of the two". */ static uint32_t cap_for_up(const vmsig_event* ev) { + if (ev->kind == VMSIG_EV_ROSTER) return VMSIG_CAP_ROSTER; /* host-wide inventory */ if (ev->kind == VMSIG_EV_CURSOR_STATE) return VMSIG_CAP_OBSERVE | VMSIG_CAP_INPUT; return (ev->source == VMSIG_SRC_MEMCTX) ? 
VMSIG_CAP_MEMCTX : VMSIG_CAP_OBSERVE; } @@ -129,7 +130,7 @@ static core_adapter_ent* core_find_adapter(vmsig_core* c, uint32_t endpoint, vmsig_source source) { for (int i = 0; i < c->nadapters; i++) { core_adapter_ent* e = &c->adapters[i]; - if (e->ops->source == source && e->endpoint == endpoint) return e; + if (e->active && e->ops->source == source && e->endpoint == endpoint) return e; } return NULL; } @@ -316,6 +317,22 @@ void core_lease_reap_control(vmsig_core* c, int ctl_id) { } } +/* Release ALL lease classes held on `endpoint` (from endpoint detach, BEFORE the adapters + * close). Symmetric to core_lease_reap_control but keyed by endpoint, not owner: when a VM + * disappears its leases must not survive to auto-transfer onto whatever VM later reuses the + * same endpoint bit. The owner principal is recorded for the audit. */ +static void core_lease_reap_endpoint(vmsig_core* c, uint32_t endpoint) { + if (endpoint >= 64) return; + for (int cls = 0; cls < VMSIG_LEASE_CLASSES; cls++) { + core_lease_cell* cell = &c->lease[endpoint][cls]; + if (!cell->owner) continue; + uint32_t principal = lease_owner_principal(c, cell->owner); + cell->owner = 0; cell->owner_prio = 0; + vmsig_audit a = { VMSIG_AUDIT_LEASE_RECLAIMED, principal, endpoint, (uint32_t)cls, 0 }; + core_audit(c, &a); + } +} + /* DOWN emit from a control: enforcement against THIS control's grant. */ int core_emit_down(void* token, vmsig_event* ev) { core_down_ctx* d = token; @@ -472,7 +489,57 @@ void core_memctx_replay(vmsig_core* c, int ctl_id) { } } -void core_epoch_bump(vmsig_core* c, uint32_t endpoint) { +/* ===== VM roster (inventory coherence): retain + broadcast + replay-to-late ===== * + * Mirrors the MEMCTX retain cell, but the datum is a pure inline POD (no fd, no borrowed + * buffer): delivery is the ordinary broadcast (ops->deliver), with NO interception in + * pump_up. 
Publish is SYNCHRONOUS (like core_memctx_route) so a control gets the datum + * exactly once: current subscribers via this broadcast, a late one via core_roster_replay. */ +static void core_roster_build(uint32_t ep, const vmsig_roster* r, vmsig_event* ev) { + memset(ev, 0, sizeof *ev); + ev->kind = VMSIG_EV_ROSTER; ev->source = VMSIG_SRC_CORE; ev->dir = VMSIG_DIR_UP; + ev->prio = VMSIG_PRIO_URGENT; ev->endpoint = ep; + ev->payload.flags = VMSIG_PL_INLINE; + memcpy(ev->inln, r, sizeof *r); +} + +void core_roster_publish(vmsig_core* c, uint32_t endpoint, const vmsig_roster* entry) { + if (!c || endpoint >= 64 || !entry) return; + core_roster_cell* cell = &c->roster[endpoint]; + cell->entry = *entry; + /* DETACH clears the retained datum (a vacated slot is not replayed to a late subscriber), + * but the DETACH event is still broadcast to current subscribers so they drop the VM. */ + cell->valid = (entry->action != VMSIG_ROSTER_DETACH); + + vmsig_event ev; + core_roster_build(endpoint, entry, &ev); + for (int i = 0; i < c->ncontrols; i++) { + core_control_ent* e = &c->controls[i]; + if (!e->active || !e->ops->deliver) continue; + if (grant_allows_up(&e->grant, &ev) && sub_match(&e->sub, &ev)) + e->ops->deliver(e->ctl, &ev); + } +} + +void core_roster_replay(vmsig_core* c, int ctl_id) { + if (!c || ctl_id < 0 || ctl_id >= c->ncontrols) return; + core_control_ent* e = &c->controls[ctl_id]; + if (!e->active || !e->ops->deliver) return; + for (uint32_t ep = 0; ep < 64; ep++) { + core_roster_cell* cell = &c->roster[ep]; + if (!cell->valid) continue; + vmsig_event ev; + core_roster_build(ep, &cell->entry, &ev); + if (grant_allows_up(&e->grant, &ev) && sub_match(&e->sub, &ev)) + e->ops->deliver(e->ctl, &ev); + } +} + +/* Bump the endpoint epoch and broadcast MEMCTX_INVALIDATED to holders. When `rebootstrap` + * is set, ask the adapter to re-bootstrap (it re-emits MEMCTX{epoch+1} when ready) — the + * normal destructive-lifecycle path. 
On endpoint TEARDOWN (detach) `rebootstrap` is 0: the + * adapter is about to be closed, so kicking a re-bootstrap on a worker we are joining would + * be wasted; holders still settle via the INVALIDATED broadcast + the bumped epoch. */ +static void core_epoch_invalidate_emit(vmsig_core* c, uint32_t endpoint, int rebootstrap) { if (endpoint >= 64) return; c->epoch[endpoint]++; core_memctx_cell* cell = &c->memctx[endpoint]; @@ -486,11 +553,14 @@ void core_epoch_bump(vmsig_core* c, uint32_t endpoint) { memcpy(up.inln, &inv, sizeof inv); core_emit_up(c, &up); /* broadcast to holders (CAP_MEMCTX gate) */ - /* request re-bootstrap from the adapter: it re-emits MEMCTX{epoch+1} when ready. */ - if (cell->registered && cell->reg.invalidate) + if (rebootstrap && cell->registered && cell->reg.invalidate) cell->reg.invalidate(cell->reg.ctx, c->epoch[endpoint]); } +void core_epoch_bump(vmsig_core* c, uint32_t endpoint) { + core_epoch_invalidate_emit(c, endpoint, 1); /* destructive lifecycle: re-bootstrap */ +} + /* UP: drain the context queue and dispatch to subscribed controls */ static void pump_up(vmsig_core* c) { vmsig_event ev; @@ -575,6 +645,46 @@ static void core_reap(vmsig_core* c) { } } +/* Deferred reap of runtime-detached adapters (after the batch). Two passes: + * 1) per-endpoint coherence settle ONCE: release leases + bump epoch / broadcast + * MEMCTX_INVALIDATED (no re-bootstrap — we are tearing down). Done while the memctx + * cell is still registered. + * 2) per-adapter teardown: SEAM_DOWN (close is silent on administrative detach), epoll + * DEL + mark slots dead (so the loop never dispatches a half-closed adapter), then + * ops->close (joins the worker, closes the SI handle AFTER the join). + * Deferred (reap flag set elsewhere) so no live slot is flipped to DEAD inside the batch. 
*/ +static void core_reap_adapters(vmsig_core* c) { + uint64_t settled = 0; /* endpoints already coherence-settled this pass */ + for (int i = 0; i < c->nadapters; i++) { + core_adapter_ent* e = &c->adapters[i]; + if (!e->reap || !e->active) continue; + uint32_t ep = e->endpoint; + if (ep < 64 && !(settled & (1ull << ep))) { + settled |= (1ull << ep); + core_lease_reap_endpoint(c, ep); + core_epoch_invalidate_emit(c, ep, 0); /* settle holders; no re-bootstrap */ + } + } + for (int i = 0; i < c->nadapters; i++) { + core_adapter_ent* e = &c->adapters[i]; + if (!e->reap || !e->active) continue; + + vmsig_event sd; + memset(&sd, 0, sizeof sd); + sd.kind = VMSIG_EV_SEAM_DOWN; sd.source = e->ops->source; sd.dir = VMSIG_DIR_UP; + sd.prio = VMSIG_PRIO_URGENT; sd.endpoint = e->endpoint; + core_emit_up(c, &sd); + + for (int k = 0; k < e->nslot; k++) { + if (!e->slots[k]) continue; + epoll_ctl(c->epfd, EPOLL_CTL_DEL, e->slots[k]->fd, NULL); + e->slots[k]->role = SLOT_DEAD; + } + if (e->ops->close) e->ops->close(e->a); + e->a = NULL; e->nslot = 0; e->active = 0; e->reap = 0; + } +} + int vmsig_core_run(vmsig_core* c) { if (!c) return -1; struct epoll_event evs[VMSIG_MAX_EVENTS]; @@ -609,6 +719,7 @@ int vmsig_core_run(vmsig_core* c) { pump_up(c); pump_down(c); core_reap(c); + core_reap_adapters(c); } return 0; } diff --git a/src/daemon/admission.c b/src/daemon/admission.c new file mode 100644 index 0000000..ff27076 --- /dev/null +++ b/src/daemon/admission.c @@ -0,0 +1,37 @@ +/* admission.c — vmsigd coarse admission policy (see vmsigd_admission.h). Translates a uid to + * a vmsig_grant, resolving entitled vmids to live endpoint bits via the discovery slot map. 
*/
+#define _GNU_SOURCE
+#include "vmsigd_admission.h"
+#include "discovery.h"      /* vmsig_discovery_slot_of_vmid */
+#include <string.h>
+
+static const vmsigd_grant_rule* rule_for_uid(const vmsigd_config* cfg, uint32_t uid) {
+    for (int i = 0; i < cfg->ngrants; i++)
+        if (cfg->grants[i].uid == uid) return &cfg->grants[i];
+    return NULL;
+}
+
+vmsig_grant vmsigd_policy(uint32_t uid, uint32_t pid, void* ud) {
+    (void)pid;
+    vmsigd_admission* a = ud;
+    vmsig_grant g;
+    memset(&g, 0, sizeof g);
+
+    const vmsigd_grant_rule* r = (a && a->cfg) ? rule_for_uid(a->cfg, uid) : NULL;
+    if (!r) return g;                       /* no stanza => empty grant => REJECT */
+
+    g.principal   = uid;
+    g.source_mask = 0xFFFFFFFFu;            /* coarse: control enforces source finer behind us */
+    g.cap_mask    = r->cap_mask;
+    g.arb_prio    = r->arb_prio;
+
+    if (r->all_vms) {
+        g.endpoint_mask = ~0ull;            /* covers all current + future endpoints */
+    } else {
+        for (int i = 0; i < r->nvmids; i++) {
+            int ep = a->disc ? vmsig_discovery_slot_of_vmid(a->disc, r->vmids[i]) : -1;
+            if (ep >= 0 && ep < 64) g.endpoint_mask |= (1ull << ep);
+        }
+    }
+    return g;
+}
diff --git a/src/daemon/config.c b/src/daemon/config.c
new file mode 100644
index 0000000..58ca0da
--- /dev/null
+++ b/src/daemon/config.c
@@ -0,0 +1,159 @@
+/* config.c — vmsigd config parser (see vmsigd.h). INI-ish: `key = value` globals + repeated
+ * `[grant uid=N]` stanzas. Pure libc; no core/vmie dependency (unit-testable in any build). */
+#define _GNU_SOURCE
+#include "vmsigd.h"
+#include "vmsig_control.h"   /* VMSIG_CAP_* */
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+void vmsigd_config_defaults(vmsigd_config* c) {
+    memset(c, 0, sizeof *c);
+    snprintf(c->socket, sizeof c->socket, "%s", "/run/vmsig/vmsigd.sock");
+    snprintf(c->watch, sizeof c->watch, "%s", "/dev/shm/vmsig");
+    snprintf(c->pve_conf, sizeof c->pve_conf, "%s", "/etc/pve/qemu-server");
+    snprintf(c->qmp_dir, sizeof c->qmp_dir, "%s", "/var/run/qemu-server");
+    snprintf(c->slots, sizeof c->slots, "%s", "/dev/shm/vmsig/.slots");
+}
+
+uint32_t vmsigd_caps_from_str(const char* s) {
+    static const struct { const char* k; uint32_t bit; } map[] = {
+        { "observe", VMSIG_CAP_OBSERVE },
+        { "input", VMSIG_CAP_INPUT },
+        { "lifecycle", VMSIG_CAP_LIFECYCLE },
+        { "power", VMSIG_CAP_POWER },
+        { "vm", VMSIG_CAP_VM },
+        { "memctx", VMSIG_CAP_MEMCTX },
+        { "memwrite", VMSIG_CAP_MEMWRITE },
+        { "roster", VMSIG_CAP_ROSTER },
+    };
+    uint32_t mask = 0;
+    while (s && *s) {
+        while (*s == ',' || *s == ' ' || *s == '\t') s++;
+        const char* w = s;
+        while (*s && *s != ',' && *s != ' ' && *s != '\t') s++;
+        size_t len = (size_t)(s - w);
+        for (size_t i = 0; i < sizeof map / sizeof map[0]; i++)
+            if (len == strlen(map[i].k) && strncmp(w, map[i].k, len) == 0) { mask |= map[i].bit; break; }
+    }
+    return mask;
+}
+
+/* Trim leading/trailing whitespace in place; returns the trimmed start. */
+static char* trim(char* s) {
+    while (*s == ' ' || *s == '\t' || *s == '\r') s++;
+    char* e = s + strlen(s);
+    while (e > s && (e[-1] == ' ' || e[-1] == '\t' || e[-1] == '\r' || e[-1] == '\n')) *--e = 0;
+    return s;
+}
+
+static void set_path(char* dst, size_t cap, const char* v) { snprintf(dst, cap, "%s", v); }
+
+static void parse_vmids(vmsigd_grant_rule* g, const char* v) {
+    g->all_vms = 0; g->nvmids = 0;
+    if (strchr(v, '*')) { g->all_vms = 1; return; }
+    while (*v) {
+        while (*v == ',' || *v == ' ' || *v == '\t') v++;
+        if (*v < '0' || *v > '9') { if (*v) v++; continue; }
+        uint32_t id = (uint32_t)strtoul(v, NULL, 10);
+        while (*v >= '0' && *v <= '9') v++;
+        if (id && g->nvmids < VMSIGD_MAX_VMIDS) g->vmids[g->nvmids++] = id;
+    }
+}
+
+/* Parse the decimal uid that follows "uid=" in a `[grant uid=N]` header. Returns 0 and stores
+ * the value on success; -1 when no digits follow or the value does not fit in 32 bits (the
+ * caller then opens no stanza instead of silently binding the rule to uid 0). */
+static int parse_uid(const char* s, uint32_t* out) {
+    while (*s == ' ' || *s == '\t') s++;
+    if (*s < '0' || *s > '9') return -1;         /* empty, signed or non-numeric */
+    errno = 0;
+    unsigned long v = strtoul(s, NULL, 10);
+    if (errno != 0 || v > 0xFFFFFFFFul) return -1;
+    *out = (uint32_t)v;
+    return 0;
+}
+
+int vmsigd_config_parse_buf(vmsigd_config* c, const char* buf) {
+    if (!c || !buf) return -1;
+    char* copy = strdup(buf);
+    if (!copy) return -1;
+
+    vmsigd_grant_rule* cur = NULL;     /* current [grant] stanza, or NULL for globals */
+    char* save = NULL;
+    for (char* line = strtok_r(copy, "\n", &save); line; line = strtok_r(NULL, "\n", &save)) {
+        char* p = trim(line);
+        if (!*p || *p == '#' || *p == ';') continue;
+
+        if (*p == '[') {
+            cur = NULL;
+            /* [grant uid=N] — a header with a missing/malformed uid (or past the table limit)
+             * opens no stanza: fail closed rather than attach its keys to uid 0. */
+            char* u = strstr(p, "uid=");
+            uint32_t uid = 0;
+            if (u && c->ngrants < VMSIGD_MAX_GRANTS && parse_uid(u + 4, &uid) == 0) {
+                cur = &c->grants[c->ngrants++];
+                memset(cur, 0, sizeof *cur);
+                cur->uid = uid;
+            }
+            continue;
+        }
+
+        char* eq = strchr(p, '=');
+        if (!eq) continue;
+        *eq = 0;
+        char* key = trim(p);
+        char* val = trim(eq + 1);
+
+        if (cur) {
+            if (!strcmp(key, "vmids")) parse_vmids(cur, val);
+            else if (!strcmp(key, "caps")) cur->cap_mask = vmsigd_caps_from_str(val);
+            else if (!strcmp(key, "arb_prio")) cur->arb_prio = (uint32_t)strtoul(val, NULL, 10);
+        } else {
+            if (!strcmp(key, "socket")) set_path(c->socket, sizeof c->socket, val);
+            else if (!strcmp(key, "watch")) set_path(c->watch, sizeof c->watch, val);
+            else if (!strcmp(key, "pve_conf")) set_path(c->pve_conf, sizeof c->pve_conf, val);
+            else if (!strcmp(key, "qmp_dir")) set_path(c->qmp_dir, sizeof c->qmp_dir, val);
+            else if (!strcmp(key, "slots")) set_path(c->slots, sizeof c->slots, val);
+        }
+    }
+    free(copy);
+    return 0;
+}
+
+/* Read `path` (bounded by a 16 KiB buffer) and parse it over the defaults already in `c`.
+ * Returns the parse result, or -1 when the file cannot be opened/read or is larger than the
+ * buffer: a silently truncated file could cut a value mid-token (e.g. a vmid), so an
+ * oversized file is rejected whole and `c` is left untouched. */
+int vmsigd_config_parse_file(vmsigd_config* c, const char* path) {
+    if (!c || !path) return -1;
+    int fd = open(path, O_RDONLY | O_CLOEXEC);
+    if (fd < 0) return -1;
+    char buf[16 * 1024];
+    size_t got = 0;
+    int rc = 0;
+    while (got < sizeof buf - 1) {
+        ssize_t n = read(fd, buf + got, sizeof buf - 1 - got);
+        if (n < 0) {
+            if (errno == EINTR) continue;        /* interrupted: retry, not an error */
+            rc = -1;
+            break;
+        }
+        if (n == 0) break;
+        got += (size_t)n;
+    }
+    if (rc == 0 && got == sizeof buf - 1) {
+        /* Buffer full: if one more byte can be read, the file does not fit. */
+        char extra;
+        ssize_t n;
+        do { n = read(fd, &extra, 1); } while (n < 0 && errno == EINTR);
+        if (n != 0) rc = -1;
+    }
+    close(fd);
+    if (rc != 0) return -1;
+    buf[got] = 0;
+    return vmsigd_config_parse_buf(c, buf);
+}
diff --git a/src/daemon/include/vmsigd.h b/src/daemon/include/vmsigd.h
new file mode 100644
index 0000000..bb2694a
--- /dev/null
+++ b/src/daemon/include/vmsigd.h
@@ -0,0 +1,47 @@
+#ifndef VMSIGD_H
+#define VMSIGD_H
+#include <stdint.h>
+
+/* vmsigd.h — private config model of the vmsig daemon.
+ *
+ * The daemon owns the /dev/shm/vmsig discovery namespace and serves a unix-socket control
+ * plane over the signaling layer for the VMs discovered there. Its only policy is a COARSE
+ * admission grant per uid (SISC: signaling is not a fine-grained access broker — the control
+ * enforces per-user caps behind the grant). Entitlements are expressed in vmid terms and
+ * translated to an endpoint_mask at connect time against the live slot map.
*/ + +#define VMSIGD_MAX_GRANTS 64 +#define VMSIGD_MAX_VMIDS 64 +#define VMSIGD_PATH_MAX 256 + +typedef struct { + uint32_t uid; + int all_vms; /* `vmids = *` */ + uint32_t vmids[VMSIGD_MAX_VMIDS]; + int nvmids; + uint32_t cap_mask; /* VMSIG_CAP_* (from `caps =` keywords) */ + uint32_t arb_prio; +} vmsigd_grant_rule; + +typedef struct { + char socket[VMSIGD_PATH_MAX]; /* control listener ('@' => abstract) */ + char watch[VMSIGD_PATH_MAX]; /* discovery dir (/dev/shm/vmsig) */ + char pve_conf[VMSIGD_PATH_MAX]; /* /etc/pve/qemu-server */ + char qmp_dir[VMSIGD_PATH_MAX]; /* /var/run/qemu-server */ + char slots[VMSIGD_PATH_MAX]; /* slot persistence ("" => off) */ + vmsigd_grant_rule grants[VMSIGD_MAX_GRANTS]; + int ngrants; +} vmsigd_config; + +/* Populate with built-in defaults. */ +void vmsigd_config_defaults(vmsigd_config* c); + +/* Parse the INI-ish config (globals + repeated [grant uid=N] stanzas) over the defaults + * already in `c`. Unknown keys are ignored. Returns 0, or -1 on open/usage error. */ +int vmsigd_config_parse_file(vmsigd_config* c, const char* path); +int vmsigd_config_parse_buf (vmsigd_config* c, const char* buf); /* same, from memory (tests) */ + +/* Translate a comma/space-separated cap keyword list to a VMSIG_CAP_* mask. */ +uint32_t vmsigd_caps_from_str(const char* s); + +#endif /* VMSIGD_H */ diff --git a/src/daemon/include/vmsigd_admission.h b/src/daemon/include/vmsigd_admission.h new file mode 100644 index 0000000..c41d754 --- /dev/null +++ b/src/daemon/include/vmsigd_admission.h @@ -0,0 +1,21 @@ +#ifndef VMSIGD_ADMISSION_H +#define VMSIGD_ADMISSION_H +#include "vmsigd.h" +#include "vmsig_control.h" /* vmsig_grant */ + +struct vmsig_discovery; + +/* Admission context handed to the socket listener as policy `ud`. The config is read-only at + * connect time; the live discovery resolves entitled vmids to their current endpoint bits. 
*/
+typedef struct {
+    const vmsigd_config*    cfg;
+    struct vmsig_discovery* disc;
+} vmsigd_admission;
+
+/* vmsig_socket_policy: uid from SO_PEERCRED -> a coarse grant. No matching [grant uid=N]
+ * stanza => empty grant (the listener rejects). `vmids = *` => endpoint_mask covers all 64;
+ * a vmid list resolves each currently-attached vmid to its endpoint bit (an unbound entitled
+ * vmid contributes no bit yet — the peer learns liveness via the roster). */
+vmsig_grant vmsigd_policy(uint32_t uid, uint32_t pid, void* ud);
+
+#endif /* VMSIGD_ADMISSION_H */
diff --git a/src/daemon/vmsigd.c b/src/daemon/vmsigd.c
new file mode 100644
index 0000000..6f3c0b4
--- /dev/null
+++ b/src/daemon/vmsigd.c
@@ -0,0 +1,172 @@
+/* vmsigd.c — the vmsig management daemon.
+ *
+ * Owns the /dev/shm/vmsig discovery namespace and serves a unix-socket control plane over the
+ * signaling layer for the VMs found there. It wires nothing VM-specific: discovery hot-plugs
+ * each VM's adapter trio and publishes the roster; the daemon only supplies the loop, the
+ * discovery roots, the control socket, and a coarse per-uid admission policy.
+ *
+ * Real input/memctx actuation needs an armed library build (memctx -> vmie). A stub build
+ * still runs (socket/admission/discovery machinery), but memctx will not bootstrap.
+ *
+ * Usage: vmsigd [--config PATH] [--socket S] [--watch DIR] [--pve-conf DIR] [--qmp-dir DIR]
+ *               [--slots PATH] [--foreground]
+ * precedence: argv > environment (VMSIGD_*) > config file > built-in defaults. */
+#define _GNU_SOURCE
+#include "vmsig.h"
+#include "vmsig_socket.h"
+#include "discovery.h"
+#include "core_internal.h"   /* core_add_source (in-repo daemon, intimate with the core) */
+#include "vmsigd.h"
+#include "vmsigd_admission.h"
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/signalfd.h>
+#include <unistd.h>
+
+static vmsig_core*   g_core;
+static vmsigd_config g_cfg;
+static char          g_cfg_path[VMSIGD_PATH_MAX];
+
+/* Audit trace: admissions/denials, lease and memctx grants — on the loop thread, to stderr
+ * (systemd routes stderr to the journal). */
+static void on_audit(void* ud, const vmsig_audit* a) {
+    (void)ud;
+    static const char* k[] = {
+        "ADMIT", "REJECT", "DOWN_DENIED", "LEASE_GRANTED", "LEASE_DENIED",
+        "LEASE_REVOKED", "LEASE_RECLAIMED", "MEMCTX_GRANTED"
+    };
+    /* Bound the lookup by the table itself, so a kind this table does not know (negative or
+     * added later) prints "?" instead of indexing past the array. */
+    const char* name = ((size_t)a->kind < sizeof k / sizeof k[0]) ? k[a->kind] : "?";
+    fprintf(stderr, "vmsigd: audit %-14s principal=%u ep=%u cmd=%u detail=%u\n",
+            name, a->principal, a->endpoint, a->cmd, a->detail);
+}
+
+/* Signals arrive as fd readiness (signalfd) on the loop thread — no async-handler hazards.
+ * TERM/INT => graceful stop; HUP => reload ONLY the admission table from the config file
+ * (paths/socket/adapters are untouched; already-connected grants are not retroactively
+ * changed — a peer reconnects to pick up a changed entitlement). A reload that cannot be
+ * performed is logged and leaves the live table as it was. */
+static void on_signal(void* user, uint32_t events) {
+    (void)events;
+    int sfd = *(int*)user;
+    struct signalfd_siginfo si;
+    while (read(sfd, &si, sizeof si) == (ssize_t)sizeof si) {
+        if (si.ssi_signo == SIGINT || si.ssi_signo == SIGTERM) {
+            vmsig_core_stop(g_core);
+        } else if (si.ssi_signo == SIGHUP) {
+            vmsigd_config fresh;
+            vmsigd_config_defaults(&fresh);
+            if (!g_cfg_path[0]) {
+                fprintf(stderr, "vmsigd: SIGHUP ignored: no config path to reload\n");
+            } else if (vmsigd_config_parse_file(&fresh, g_cfg_path) != 0) {
+                fprintf(stderr, "vmsigd: reload of %s failed; keeping current grant rules\n", g_cfg_path);
+            } else {
+                memcpy(g_cfg.grants, fresh.grants, sizeof g_cfg.grants);
+                g_cfg.ngrants = fresh.ngrants;          /* swap admission table only */
+                fprintf(stderr, "vmsigd: reloaded %d grant rule(s)\n", g_cfg.ngrants);
+            }
+        }
+    }
+}
+
+static const char* arg_val(int argc, char** argv, int* i) {
+    char* a = argv[*i];
+    char* eq = strchr(a, '=');
+    if (eq) return eq + 1;
+    if (*i + 1 < argc) { (*i)++; return argv[*i]; }
+    return "";
+}
+
+/* True when `arg` is exactly option `name` or its `name=value` form. A plain prefix test would
+ * also accept unrelated options that merely start with `name` (e.g. "--watchdog"). */
+static int opt_is(const char* arg, const char* name) {
+    size_t n = strlen(name);
+    return strncmp(arg, name, n) == 0 && (arg[n] == '\0' || arg[n] == '=');
+}
+
+static void apply_env(vmsigd_config* c) {
+    const char* v;
+    if ((v = getenv("VMSIGD_SOCKET"))) snprintf(c->socket, sizeof c->socket, "%s", v);
+    if ((v = getenv("VMSIGD_WATCH"))) snprintf(c->watch, sizeof c->watch, "%s", v);
+    if ((v = getenv("VMSIGD_PVE_CONF"))) snprintf(c->pve_conf, sizeof c->pve_conf, "%s", v);
+    if ((v = getenv("VMSIGD_QMP_DIR"))) snprintf(c->qmp_dir, sizeof c->qmp_dir, "%s", v);
+    if ((v = getenv("VMSIGD_SLOTS"))) snprintf(c->slots, sizeof c->slots, "%s", v);
+}
+
+int main(int argc, char** argv) {
+    /* config path: argv --config > env > default. */
+    const char* cfg_path = getenv("VMSIGD_CONFIG");
+    if (!cfg_path) cfg_path = "/etc/vmsig/vmsigd.conf";
+    for (int i = 1; i < argc; i++)
+        if (opt_is(argv[i], "--config")) { cfg_path = arg_val(argc, argv, &i); }
+
+    vmsigd_config_defaults(&g_cfg);
+    vmsigd_config_parse_file(&g_cfg, cfg_path);   /* missing file => defaults (not fatal) */
+    snprintf(g_cfg_path, sizeof g_cfg_path, "%s", cfg_path);
+    apply_env(&g_cfg);
+
+    for (int i = 1; i < argc; i++) {
+        char* a = argv[i];
+        if (opt_is(a, "--config")) { (void)arg_val(argc, argv, &i); }
+        else if (opt_is(a, "--socket")) snprintf(g_cfg.socket, sizeof g_cfg.socket, "%s", arg_val(argc, argv, &i));
+        else if (opt_is(a, "--watch")) snprintf(g_cfg.watch, sizeof g_cfg.watch, "%s", arg_val(argc, argv, &i));
+        else if (opt_is(a, "--pve-conf")) snprintf(g_cfg.pve_conf, sizeof g_cfg.pve_conf, "%s", arg_val(argc, argv, &i));
+        else if (opt_is(a, "--qmp-dir")) snprintf(g_cfg.qmp_dir, sizeof g_cfg.qmp_dir, "%s", arg_val(argc, argv, &i));
+        else if (opt_is(a, "--slots")) snprintf(g_cfg.slots, sizeof g_cfg.slots, "%s", arg_val(argc, argv, &i));
+        else if (!strcmp(a, "--foreground")) { /* default; systemd Type=simple */ }
+        else if (!strcmp(a, "-h") || !strcmp(a, "--help")) {
+            fprintf(stderr, "usage: %s [--config P][--socket S][--watch D][--pve-conf D]"
+                            "[--qmp-dir D][--slots P][--foreground]\n", argv[0]);
+            return 0;
+        } else {
+            fprintf(stderr, "vmsigd: ignoring unknown argument '%s'\n", a);
+        }
+    }
+
+    /* Signals via signalfd, serviced on the loop thread. SIGPIPE ignored (dead-peer writes). */
+    signal(SIGPIPE, SIG_IGN);
+    sigset_t mask;
+    sigemptyset(&mask);
+    sigaddset(&mask, SIGINT); sigaddset(&mask, SIGTERM); sigaddset(&mask, SIGHUP);
+    if (sigprocmask(SIG_BLOCK, &mask, NULL) != 0) { perror("vmsigd: sigprocmask"); return 1; }
+    int sfd = signalfd(-1, &mask, SFD_NONBLOCK | SFD_CLOEXEC);
+    if (sfd < 0) { perror("vmsigd: signalfd"); return 1; }
+
+    vmsig_ctx* ctx = vmsig_ctx_new();
+    if (!ctx) { fprintf(stderr, "vmsigd: ctx_new failed\n"); close(sfd); return 1; }
+    g_core = vmsig_core_new(ctx);
+    if (!g_core) { fprintf(stderr, "vmsigd: core_new failed\n"); vmsig_ctx_free(ctx); close(sfd); return 1; }
+    vmsig_core_set_audit(g_core, on_audit, NULL);
+
+    if (core_add_source(g_core, sfd, on_signal, &sfd, NULL) != 0) {
+        fprintf(stderr, "vmsigd: signal source registration failed\n");
+        vmsig_core_free(g_core); vmsig_ctx_free(ctx); close(sfd); return 1;
+    }
+
+    vmsig_discovery* disc = vmsig_discovery_new(
+        g_core, g_cfg.watch, g_cfg.pve_conf, g_cfg.qmp_dir,
+        g_cfg.slots[0] ? g_cfg.slots : NULL, NULL, NULL);
+    if (!disc) {
+        fprintf(stderr, "vmsigd: discovery_new(%s) failed\n", g_cfg.watch);
+        vmsig_core_free(g_core); vmsig_ctx_free(ctx); close(sfd); return 1;
+    }
+
+    vmsigd_admission adm = { &g_cfg, disc };
+    if (vmsig_socket_attach(g_core, g_cfg.socket, vmsigd_policy, &adm) != 0) {
+        fprintf(stderr, "vmsigd: socket_attach(%s) failed\n", g_cfg.socket);
+        vmsig_core_free(g_core); vmsig_ctx_free(ctx); close(sfd); return 1;
+    }
+
+    fprintf(stderr, "vmsigd: serving %s (watch=%s pve=%s qmp=%s) %d grant rule(s)\n",
+            g_cfg.socket, g_cfg.watch, g_cfg.pve_conf, g_cfg.qmp_dir, g_cfg.ngrants);
+    int rc = vmsig_core_run(g_core);
+    fprintf(stderr, "vmsigd: loop exit rc=%d\n", rc);
+
+    vmsig_core_free(g_core);   /* reaps discovery (source on_free) + closes the socket listener */
+    vmsig_ctx_free(ctx);
+    close(sfd);
+    return rc;
+}
diff --git a/src/discovery/discovery.c b/src/discovery/discovery.c
new file mode 100644
index 0000000..8d7138b
--- /dev/null
+++ b/src/discovery/discovery.c
@@ -0,0 +1,363 @@
+/* discovery.c —
runtime VM discovery state machine (see discovery.h). + * + * Single-threaded on the loop thread (inotify + timer sources via core_add_source). On a + * "vm--ram" file appearing it corroborates the candidate (host-probe seam), assigns a + * stable endpoint slot, hot-plugs the trio (sink), and publishes the roster; on the file + * disappearing it tears the endpoint down and publishes a roster DETACH. QMP-not-up-yet is a + * transient retry driven by a timerfd (no busy-wait); config errors / stale files drop. */ +#define _GNU_SOURCE +#include "discovery.h" +#include "slot.h" +#include "core_internal.h" /* core_roster_publish */ +#include "memctx.h" /* vmsig_memctx_cfg */ +#include "vmhost.h" /* vmsig_vmhost_cfg */ +#include "input.h" /* vmsig_input_cfg */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DISC_PATH_MAX 256 +#define DISC_RETRY_MAX 40 /* give up after ~tens of seconds of QMP-not-up */ +#define DISC_BACKOFF_BASE 50000000ull /* 50 ms */ +#define DISC_BACKOFF_CAP 2000000000ull /* 2 s */ + +typedef enum { CAND_FREE = 0, CAND_PROBING, CAND_ATTACHED } cand_state; + +typedef struct { + cand_state state; + uint32_t vmid; + int endpoint; /* -1 until attached */ + int attempts; + uint64_t next_probe_ns; /* monotonic deadline for the next retry */ + vmsig_host_facts facts; /* probe working copy */ +} cand_ent; + +struct vmsig_discovery { + vmsig_core* core; + char watch_dir[DISC_PATH_MAX]; + char slots_path[DISC_PATH_MAX]; + int persist; + vmsig_host_probe probe; + vmsig_discovery_sink sink; + int ifd; /* inotify */ + int wd; + int tfd; /* retry timerfd */ + slot_table slots; + cand_ent cand[VMSIG_SLOT_COUNT]; + /* Stable per-endpoint home for the adapter cfg strings (ram_path/qmp_path): the adapters + * keep pointers, and detach is deferred, so this must outlive the candidate. Overwritten + * only on the NEXT attach to the endpoint, which never races a still-open prior adapter. 
*/ + vmsig_host_facts ep_facts[VMSIG_SLOT_COUNT]; +}; + +static uint64_t now_ns(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec; +} + +static uint64_t backoff_ns(int attempts) { + uint64_t b = DISC_BACKOFF_BASE << (attempts < 6 ? attempts : 6); + return b > DISC_BACKOFF_CAP ? DISC_BACKOFF_CAP : b; +} + +/* Parse exactly "vm--ram" -> vmid; 0 if it does not match. */ +static uint32_t parse_vmid(const char* name) { + if (strncmp(name, "vm-", 3) != 0) return 0; + const char* p = name + 3; + if (*p < '0' || *p > '9') return 0; + uint64_t v = 0; + while (*p >= '0' && *p <= '9') { v = v * 10 + (uint64_t)(*p - '0'); p++; if (v > 0xFFFFFFFFull) return 0; } + if (strcmp(p, "-ram") != 0) return 0; + return (uint32_t)v; +} + +static cand_ent* cand_find(vmsig_discovery* d, uint32_t vmid) { + for (int i = 0; i < VMSIG_SLOT_COUNT; i++) + if (d->cand[i].state != CAND_FREE && d->cand[i].vmid == vmid) return &d->cand[i]; + return NULL; +} + +static cand_ent* cand_alloc(vmsig_discovery* d, uint32_t vmid) { + cand_ent* e = cand_find(d, vmid); + if (e) return e; + for (int i = 0; i < VMSIG_SLOT_COUNT; i++) + if (d->cand[i].state == CAND_FREE) { + memset(&d->cand[i], 0, sizeof d->cand[i]); + d->cand[i].vmid = vmid; d->cand[i].endpoint = -1; + return &d->cand[i]; + } + return NULL; /* 64-candidate ceiling */ +} + +/* Arm the retry timer to the soonest pending probe, or disarm if none pending. */ +static void rearm_timer(vmsig_discovery* d) { + uint64_t soonest = 0; int any = 0; + for (int i = 0; i < VMSIG_SLOT_COUNT; i++) + if (d->cand[i].state == CAND_PROBING && d->cand[i].next_probe_ns) { + if (!any || d->cand[i].next_probe_ns < soonest) soonest = d->cand[i].next_probe_ns; + any = 1; + } + struct itimerspec its; + memset(&its, 0, sizeof its); + if (any) { + uint64_t now = now_ns(); + uint64_t dt = soonest > now ? 
soonest - now : 1000000ull; /* >=1ms */ + its.it_value.tv_sec = (time_t)(dt / 1000000000ull); + its.it_value.tv_nsec = (long)(dt % 1000000000ull); + } + timerfd_settime(d->tfd, 0, &its, NULL); /* it_value 0 => disarm */ +} + +static void publish_roster(vmsig_discovery* d, uint32_t ep, uint32_t vmid, uint32_t state, + uint32_t action, const char* name) { + vmsig_roster r; + memset(&r, 0, sizeof r); + r.vmid = vmid; r.state = state; r.action = action; + if (name) { + size_t n = strlen(name); + if (n >= VMSIG_ROSTER_NAME_MAX) { n = VMSIG_ROSTER_NAME_MAX - 1; r.flags |= VMSIG_ROSTER_NAME_TRUNC; } + memcpy(r.name, name, n); + } + core_roster_publish(d->core, ep, &r); +} + +static void cand_drop(cand_ent* c) { + c->state = CAND_FREE; c->vmid = 0; c->endpoint = -1; c->attempts = 0; c->next_probe_ns = 0; +} + +static void do_attach(vmsig_discovery* d, cand_ent* c) { + int ep = slot_alloc(&d->slots, c->vmid); + if (ep < 0) { + fprintf(stderr, "vmsig discovery: no free endpoint for vmid %u (64-VM ceiling)\n", c->vmid); + cand_drop(c); + return; + } + d->ep_facts[ep] = c->facts; /* stable home for cfg strings the adapters keep */ + if (d->sink.attach(d->sink.ud, d->core, c->vmid, (uint32_t)ep, &d->ep_facts[ep]) != 0) { + slot_free(&d->slots, c->vmid); + fprintf(stderr, "vmsig discovery: attach failed for vmid %u\n", c->vmid); + cand_drop(c); + return; + } + c->state = CAND_ATTACHED; c->endpoint = ep; + publish_roster(d, (uint32_t)ep, c->vmid, (uint32_t)c->facts.vm_state, VMSIG_ROSTER_ATTACH, + c->facts.name); + if (d->persist) slot_save(&d->slots, d->slots_path); +} + +static void do_detach(vmsig_discovery* d, cand_ent* c) { + int ep = c->endpoint; + if (ep >= 0) { + publish_roster(d, (uint32_t)ep, c->vmid, VMSIG_VM_SHUTDOWN, VMSIG_ROSTER_DETACH, + c->facts.name); + d->sink.detach(d->sink.ud, d->core, c->vmid, (uint32_t)ep); /* deferred teardown */ + slot_free(&d->slots, c->vmid); /* bit vacated (ordered) */ + if (d->persist) slot_save(&d->slots, d->slots_path); + /* 
ep_facts[ep] is intentionally NOT cleared: the deferred adapter reap still reads the + * cfg strings; it is overwritten on the next attach to this endpoint. */ + } + cand_drop(c); +} + +static void try_probe(vmsig_discovery* d, cand_ent* c) { + d->probe.config(&d->probe, c->vmid, &c->facts); + if (!c->facts.ok) { cand_drop(c); return; } /* not ours / no share=on */ + + d->probe.live(&d->probe, &c->facts); + if (c->facts.retry) { + if (++c->attempts > DISC_RETRY_MAX) { + fprintf(stderr, "vmsig discovery: vmid %u QMP never came up, giving up\n", c->vmid); + cand_drop(c); + return; + } + c->next_probe_ns = now_ns() + backoff_ns(c->attempts); + rearm_timer(d); + return; + } + if (!c->facts.ok) { cand_drop(c); return; } /* stale: file present, VM dead/unparsable */ + + do_attach(d, c); +} + +static void on_file_appear(vmsig_discovery* d, uint32_t vmid) { + cand_ent* c = cand_alloc(d, vmid); + if (!c) { fprintf(stderr, "vmsig discovery: candidate table full, vmid %u ignored\n", vmid); return; } + if (c->state == CAND_ATTACHED) return; /* already live (duplicate event) */ + if (c->state == CAND_FREE) { c->state = CAND_PROBING; c->attempts = 0; } + c->next_probe_ns = 0; + try_probe(d, c); +} + +static void on_file_gone(vmsig_discovery* d, uint32_t vmid) { + cand_ent* c = cand_find(d, vmid); + if (!c) return; + if (c->state == CAND_ATTACHED) do_detach(d, c); + else cand_drop(c); /* was still probing */ +} + +/* ---- loop sources ------------------------------------------------------------ */ + +static void on_inotify(void* user, uint32_t events) { + (void)events; + vmsig_discovery* d = user; + char buf[4096] __attribute__((aligned(__alignof__(struct inotify_event)))); + for (;;) { + ssize_t n = read(d->ifd, buf, sizeof buf); + if (n <= 0) { if (n < 0 && errno == EINTR) continue; break; } + for (char* p = buf; p < buf + n; ) { + struct inotify_event* ev = (struct inotify_event*)p; + if (ev->len) { + uint32_t vmid = parse_vmid(ev->name); + if (vmid) { + if (ev->mask & 
(IN_CREATE | IN_MOVED_TO | IN_CLOSE_WRITE)) on_file_appear(d, vmid); + else if (ev->mask & (IN_DELETE | IN_MOVED_FROM)) on_file_gone(d, vmid); + } + } + p += sizeof(struct inotify_event) + ev->len; + } + } +} + +static void on_timer(void* user, uint32_t events) { + (void)events; + vmsig_discovery* d = user; + uint64_t v; + while (read(d->tfd, &v, sizeof v) == (ssize_t)sizeof v) { /* drain */ } + uint64_t now = now_ns(); + for (int i = 0; i < VMSIG_SLOT_COUNT; i++) { + cand_ent* c = &d->cand[i]; + if (c->state == CAND_PROBING && c->next_probe_ns && c->next_probe_ns <= now) + try_probe(d, c); + } + rearm_timer(d); +} + +static void bootstrap_scan(vmsig_discovery* d) { + DIR* dir = opendir(d->watch_dir); + if (!dir) return; + struct dirent* de; + while ((de = readdir(dir)) != NULL) { + uint32_t vmid = parse_vmid(de->d_name); + if (vmid) on_file_appear(d, vmid); + } + closedir(dir); + /* GC persisted-but-not-live slots: a vmid bound in .slots with no live file (it died while + * the daemon was down) keeps its bit pinned; free it so the ceiling is not leaked. 
*/ + for (int e = 0; e < VMSIG_SLOT_COUNT; e++) { + uint32_t vmid = d->slots.ent[e].vmid; + if (!vmid) continue; + cand_ent* c = cand_find(d, vmid); + if (!c || c->state != CAND_ATTACHED) slot_free(&d->slots, vmid); + } + if (d->persist) slot_save(&d->slots, d->slots_path); +} + +/* ---- default sink: wire the core adapter trio ------------------------------- */ + +static int default_attach(void* ud, vmsig_core* core, uint32_t vmid, uint32_t endpoint, + const vmsig_host_facts* f) { + (void)ud; (void)vmid; + vmsig_memctx_cfg mc; memset(&mc, 0, sizeof mc); + mc.stub = 0; mc.ram_path = f->ram_path; mc.low = f->low; mc.ro_fd = -1; + vmsig_vmhost_cfg vh; memset(&vh, 0, sizeof vh); + vh.stub = 0; vh.qmp_path = f->qmp_path; + vmsig_input_cfg in; memset(&in, 0, sizeof in); + in.stub = 0; in.qmp_path = NULL; /* input is uinput; power/lifecycle via the vmhost seam */ + + if (vmsig_core_add_adapter(core, vmsig_memctx_ops(), &mc, endpoint) < 0) goto fail; + if (vmsig_core_add_adapter(core, vmsig_vmhost_ops(), &vh, endpoint) < 0) goto fail; + if (vmsig_core_add_adapter(core, vmsig_input_ops(), &in, endpoint) < 0) goto fail; + return 0; +fail: + vmsig_core_detach_endpoint(core, endpoint); /* roll back any partial trio (deferred) */ + return -1; +} + +static void default_detach(void* ud, vmsig_core* core, uint32_t vmid, uint32_t endpoint) { + (void)ud; (void)vmid; + vmsig_core_detach_endpoint(core, endpoint); +} + +/* ---- lifecycle --------------------------------------------------------------- */ + +void vmsig_discovery_free(void* user) { + vmsig_discovery* d = user; + if (!d) return; + if (d->ifd >= 0) close(d->ifd); + if (d->tfd >= 0) close(d->tfd); + free(d); +} + +vmsig_discovery* vmsig_discovery_new(vmsig_core* core, + const char* watch_dir, const char* pve_conf, + const char* qmp_dir, const char* slots_path, + const vmsig_host_probe* probe, + const vmsig_discovery_sink* sink) { + if (!core || !watch_dir) return NULL; + vmsig_discovery* d = calloc(1, sizeof *d); + if (!d) 
return NULL; + d->core = core; + d->ifd = d->tfd = d->wd = -1; + snprintf(d->watch_dir, sizeof d->watch_dir, "%s", watch_dir); + if (slots_path && *slots_path) { + snprintf(d->slots_path, sizeof d->slots_path, "%s", slots_path); + d->persist = 1; + } + for (int i = 0; i < VMSIG_SLOT_COUNT; i++) d->cand[i].endpoint = -1; + + if (probe) d->probe = *probe; + else d->probe = host_probe_proxmox(d->watch_dir, + pve_conf ? pve_conf : "/etc/pve/qemu-server", + qmp_dir ? qmp_dir : "/var/run/qemu-server"); + if (sink) d->sink = *sink; + else { d->sink.attach = default_attach; d->sink.detach = default_detach; d->sink.ud = NULL; } + + slot_load(&d->slots, d->persist ? d->slots_path : NULL); + + d->ifd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC); + if (d->ifd < 0) { vmsig_discovery_free(d); return NULL; } + d->wd = inotify_add_watch(d->ifd, d->watch_dir, + IN_CREATE | IN_MOVED_TO | IN_DELETE | IN_MOVED_FROM | IN_CLOSE_WRITE | IN_ONLYDIR); + /* a missing watch dir is not fatal: the dir may be created later; bootstrap finds nothing. */ + + d->tfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK | TFD_CLOEXEC); + if (d->tfd < 0) { vmsig_discovery_free(d); return NULL; } + + /* The inotify source owns the discovery lifetime (on_free frees ifd+tfd+d); the timer + * source shares the handle with on_free=NULL. */ + if (core_add_source(core, d->ifd, on_inotify, d, vmsig_discovery_free) != 0) { + vmsig_discovery_free(d); return NULL; + } + if (core_add_source(core, d->tfd, on_timer, d, NULL) != 0) { + /* ifd already enrolled with the on_free; closing here would double-free at core_free. + * Leave it to core_free to reap. Return NULL to signal partial failure is not clean. 
*/ + return NULL; + } + + bootstrap_scan(d); + rearm_timer(d); + return d; +} + +int vmsig_discovery_slot_of_vmid(vmsig_discovery* d, uint32_t vmid) { + if (!d) return -1; + return slot_lookup(&d->slots, vmid); +} + +/* ---- TEST-ONLY hooks: drive the state machine deterministically (no inotify/timer) ---- */ +void vmsig_discovery_feed(vmsig_discovery* d, uint32_t vmid, int present) { + if (present) on_file_appear(d, vmid); else on_file_gone(d, vmid); +} +void vmsig_discovery_tick(vmsig_discovery* d) { /* force a re-probe of every probing candidate */ + for (int i = 0; i < VMSIG_SLOT_COUNT; i++) + if (d->cand[i].state == CAND_PROBING) try_probe(d, &d->cand[i]); +} diff --git a/src/discovery/include/discovery.h b/src/discovery/include/discovery.h new file mode 100644 index 0000000..9c655f9 --- /dev/null +++ b/src/discovery/include/discovery.h @@ -0,0 +1,46 @@ +#ifndef VMSIG_DISCOVERY_H +#define VMSIG_DISCOVERY_H +#include "vmsig_core.h" +#include "host_probe.h" + +/* discovery.h — runtime VM discovery (private to the discovery module). + * + * Watches a tmpfs trigger dir for "vm--ram" files, corroborates each candidate via the + * host-probe seam, assigns a stable endpoint slot, hot-plugs the VM (sink), and publishes the + * roster. The state machine + slot allocation are decoupled from actuation by a sink seam, so + * the orchestration is unit-testable without armed adapters. */ + +typedef struct vmsig_discovery vmsig_discovery; + +/* Actuation seam: bring a discovered VM up / tear it down. Default (NULL) wires the core + * adapter trio (memctx+vmhost+input via vmsig_core_add_adapter) and detach_endpoint. A test + * injects a recording sink to verify the state machine without real adapters. Roster publish + * is owned by discovery (not the sink): ATTACH after a successful attach, DETACH before tear-down. 
*/ +typedef struct { + int (*attach)(void* ud, vmsig_core* core, uint32_t vmid, uint32_t endpoint, + const vmsig_host_facts* f); /* 0 = up, -1 = failed (slot freed) */ + void (*detach)(void* ud, vmsig_core* core, uint32_t vmid, uint32_t endpoint); + void* ud; +} vmsig_discovery_sink; + +/* Create discovery over `core`. `watch_dir` (e.g. /dev/shm/vmsig) is scanned once and + * inotify-watched. `probe` NULL => default Proxmox probe over (watch_dir, pve_conf, qmp_dir); + * `sink` NULL => default core trio; `slots_path` NULL => no persistence. Registers the inotify + * + retry-timer loop sources and runs a bootstrap scan. The core owns the lifetime (freed at + * vmsig_core_free via the source on_free). NULL on error. */ +vmsig_discovery* vmsig_discovery_new(vmsig_core* core, + const char* watch_dir, const char* pve_conf, + const char* qmp_dir, const char* slots_path, + const vmsig_host_probe* probe, + const vmsig_discovery_sink* sink); + +/* Resolve vmid -> endpoint for the admission policy (WS4); -1 if not currently attached. */ +int vmsig_discovery_slot_of_vmid(vmsig_discovery* d, uint32_t vmid); + +/* TEST-ONLY: drive a file appear(present=1)/gone(present=0) directly, bypassing inotify; and + * force a re-probe of every probing candidate, bypassing the retry timer. Lets the state + * machine be unit-tested deterministically without threads/timers. */ +void vmsig_discovery_feed(vmsig_discovery* d, uint32_t vmid, int present); +void vmsig_discovery_tick(vmsig_discovery* d); + +#endif /* VMSIG_DISCOVERY_H */ diff --git a/src/discovery/include/host_probe.h b/src/discovery/include/host_probe.h new file mode 100644 index 0000000..2537257 --- /dev/null +++ b/src/discovery/include/host_probe.h @@ -0,0 +1,48 @@ +#ifndef VMSIG_HOST_PROBE_H +#define VMSIG_HOST_PROBE_H +#include + +/* host_probe.h — the platform-coupled discovery seam (private to the discovery module). 
+ *
+ * This is the ONLY surface that knows the host's config convention (/etc/pve/qemu-server),
+ * the QMP socket path convention, and the `info mtree` text. It produces a NEUTRAL facts
+ * struct; discovery.c consumes ONLY that and never names a path convention. A non-Proxmox
+ * host (or a unit test) injects its own vmsig_host_probe with the same two-stage contract. */
+
+#define VMSIG_HF_NAME_MAX 32
+#define VMSIG_HF_PATH_MAX 128
+
+typedef struct {
+    uint32_t vmid;
+    char     name[VMSIG_HF_NAME_MAX];      /* host VM name (truncated) */
+    char     ram_path[VMSIG_HF_PATH_MAX];  /* guest-RAM backing file (the trigger) */
+    char     qmp_path[VMSIG_HF_PATH_MAX];  /* QMP socket ('@' prefix => abstract) */
+    uint64_t cfg_ram_bytes;                /* RAM size from host config (sanity) */
+    uint64_t low;                          /* below-4G split (memctx locator); 0=unknown */
+    int      vm_state;                     /* VMSIG_VM_* from the liveness oracle */
+    int      share_on;                     /* memory-backend share=on verified */
+    int      ok;                           /* 1 => all fail-closed gates passed (attach) */
+    int      retry;                        /* 1 => transient (QMP not up yet) — back off */
+} vmsig_host_facts;
+
+/* Two-stage probe. Stage 1 reads host config (cheap, local). Stage 2 corroborates liveness
+ * and derives `low` (QMP round-trip, bounded). Splitting them lets the state machine treat
+ * "config error" (permanent, drop) apart from "QMP not up yet" (transient, retry). */
+typedef struct vmsig_host_probe {
+    /* Populate paths + name + cfg_ram_bytes + share_on from host config; stat the RAM file.
+     * Sets out->ok=0 on any permanent gate failure (no share=on, missing/oversized file).
+     * Returns 0 when `out` was populated, -1 on a usage error. */
+    int (*config)(const struct vmsig_host_probe* p, uint32_t vmid, vmsig_host_facts* out);
+    /* Corroborate liveness + derive `low` via QMP. Mutates `io`: sets vm_state, low, ok; or
+     * retry=1 (QMP not reachable yet) / ok=0 (stale: file present but VM dead / unparsable).
+     * The default implementation always returns 0 and reports the outcome through `io`. */
+    int (*live)(const struct vmsig_host_probe* p, vmsig_host_facts* io);
+    void* ud;                              /* implementation-private; handed back via `p` */
+} vmsig_host_probe;
+
+/* The default Proxmox probe over (watch_dir, pve_conf). `qmp_dir` is the QMP socket dir
+ * (Proxmox: /var/run/qemu-server, socket "<qmp_dir>/<vmid>.qmp"). The returned struct
+ * references the path strings by pointer — the caller keeps them alive. The default
+ * implementation stores them in one function-static record, so every probe returned by
+ * this function shares the paths of the most recent call. */
+vmsig_host_probe host_probe_proxmox(const char* watch_dir, const char* pve_conf,
+                                    const char* qmp_dir);
+
+#endif /* VMSIG_HOST_PROBE_H */
diff --git a/src/discovery/include/slot.h b/src/discovery/include/slot.h
new file mode 100644
index 0000000..77c1e8c
--- /dev/null
+++ b/src/discovery/include/slot.h
@@ -0,0 +1,49 @@
+#ifndef VMSIG_SLOT_H
+#define VMSIG_SLOT_H
+#include
+
+/* slot.h — vmid <-> endpoint allocator (private to the discovery module).
+ *
+ * The signaling core addresses VMs by an ENDPOINT bit in a 64-bit mask (endpoint < 64). A
+ * Proxmox vmid (100..1e9) does NOT fit 6 bits, so the binding is a PINNED table, not a pure
+ * function: a vmid keeps the SAME endpoint across VM restarts (so a control's endpoint_mask
+ * stays coherent), and the table is persisted so a daemon restart re-derives the same map.
+ *
+ * Bit reuse is a coherence event, not a silent alias: a freed bit is handed to a DIFFERENT
+ * vmid only AFTER the roster DETACH for the old occupant has been published. The discovery
+ * loop is single-threaded and publishes DETACH synchronously before any later attach, so the
+ * ordering itself enforces this — the allocator only needs to never double-assign a live bit. */
+
+#define VMSIG_SLOT_COUNT 64
+
+typedef struct {
+    uint32_t vmid;                         /* 0 => slot free */
+} slot_ent;
+
+typedef struct {
+    slot_ent ent[VMSIG_SLOT_COUNT];        /* index == endpoint number */
+    uint64_t used_mask;                    /* mirror: bit e set <=> ent[e].vmid != 0 */
+} slot_table;
+
+/* Reset to all-free. */
+void slot_init(slot_table* t);
+
+/* Endpoint pinned to `vmid`, or -1 if `vmid` is not bound (or 0).
*/ +int slot_lookup(const slot_table* t, uint32_t vmid); + +/* Pin `vmid` to a stable endpoint. Idempotent: if `vmid` is already bound, returns its + * existing endpoint. Otherwise assigns the lowest free bit. Returns the endpoint [0,64), + * or -1 if `vmid`==0 or the table is full (the 64-VM ceiling). */ +int slot_alloc(slot_table* t, uint32_t vmid); + +/* Release the slot bound to `vmid` (no-op if not bound). */ +void slot_free(slot_table* t, uint32_t vmid); + +/* Persist the table to `path` atomically (tmp + rename), mode 0600. 0 / -1. */ +int slot_save(const slot_table* t, const char* path); + +/* Load the table from `path`. On a missing/corrupt file, initializes empty and returns 0 + * (a fresh start is valid). -1 only on a hard error. */ +int slot_load(slot_table* t, const char* path); + +#endif /* VMSIG_SLOT_H */ diff --git a/src/discovery/linux/host_probe.c b/src/discovery/linux/host_probe.c new file mode 100644 index 0000000..f71ab48 --- /dev/null +++ b/src/discovery/linux/host_probe.c @@ -0,0 +1,244 @@ +/* host_probe.c — the default Proxmox host-probe (see host_probe.h). The ONLY TU that knows + * /etc/pve/qemu-server, the QMP socket path convention, and `info mtree`. Pure libc + + * AF_UNIX + files; no vmie/vmctl. config() is cheap+local; live() does a bounded blocking + * QMP round-trip (query-status + info mtree) and is fail-closed: anything it cannot confirm + * leaves ok=0 (the VM is not brought up rather than guessed). */ +#define _GNU_SOURCE +#include "host_probe.h" +#include "vmsig_event.h" /* VMSIG_VM_* */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct { + const char* watch_dir; /* /dev/shm/vmsig */ + const char* pve_conf; /* /etc/pve/qemu-server */ + const char* qmp_dir; /* /var/run/qemu-server */ +} hp_cfg; + +/* ---- /etc/pve config (stage 1) ----------------------------------------------- */ + +/* Read a whole small file into a heap buffer (NUL-terminated). 
NULL on error/oversize. */ +static char* read_file(const char* path, size_t cap) { + int fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) return NULL; + char* buf = malloc(cap + 1); + if (!buf) { close(fd); return NULL; } + size_t got = 0; + for (;;) { + ssize_t n = read(fd, buf + got, cap - got); + if (n < 0) { if (errno == EINTR) continue; free(buf); close(fd); return NULL; } + if (n == 0) break; + got += (size_t)n; + if (got >= cap) break; + } + close(fd); + buf[got] = 0; + return buf; +} + +/* Value of a top-level "key:" line (Proxmox ini), copied trimmed into out. 1 if found. */ +static int conf_val(const char* conf, const char* key, char* out, size_t cap) { + size_t klen = strlen(key); + const char* p = conf; + while (p && *p) { + const char* line = p; + const char* nl = strchr(p, '\n'); + size_t llen = nl ? (size_t)(nl - line) : strlen(line); + if (llen > klen && strncmp(line, key, klen) == 0 && line[klen] == ':') { + const char* v = line + klen + 1; + while (*v == ' ' || *v == '\t') v++; + size_t vlen = (size_t)((line + llen) - v); + while (vlen && (v[vlen-1] == ' ' || v[vlen-1] == '\t' || v[vlen-1] == '\r')) vlen--; + if (vlen >= cap) vlen = cap - 1; + memcpy(out, v, vlen); out[vlen] = 0; + return 1; + } + p = nl ? 
nl + 1 : NULL; + } + return 0; +} + +static int hp_config(const struct vmsig_host_probe* p, uint32_t vmid, vmsig_host_facts* out) { + const hp_cfg* c = p->ud; + memset(out, 0, sizeof *out); + out->vmid = vmid; + + snprintf(out->ram_path, sizeof out->ram_path, "%s/vm-%u-ram", c->watch_dir, vmid); + snprintf(out->qmp_path, sizeof out->qmp_path, "%s/%u.qmp", c->qmp_dir, vmid); + + char path[VMSIG_HF_PATH_MAX + 32]; + snprintf(path, sizeof path, "%s/%u.conf", c->pve_conf, vmid); + char* conf = read_file(path, 64 * 1024); + if (!conf) { out->ok = 0; return 0; } /* no host config => not a known VM */ + + char tmp[VMSIG_HF_NAME_MAX]; + if (conf_val(conf, "name", out->name, sizeof out->name) == 0) + snprintf(out->name, sizeof out->name, "vm-%u", vmid); + if (conf_val(conf, "memory", tmp, sizeof tmp)) + out->cfg_ram_bytes = (uint64_t)strtoull(tmp, NULL, 10) * 1024ull * 1024ull; + + /* share=on is mandatory: without it the host mmap is a private copy, not guest RAM. */ + out->share_on = (strstr(conf, "share=on") != NULL) ? 1 : 0; + free(conf); + + out->ok = out->share_on ? 
1 : 0; /* config-level pass; liveness is stage 2 */ + return 0; +} + +/* ---- QMP liveness + mtree low (stage 2) -------------------------------------- */ + +static int qmp_connect(const char* path) { + int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (fd < 0) return -1; + struct timeval tv = { .tv_sec = 0, .tv_usec = 250000 }; /* 250ms bound on each recv */ + setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof tv); + setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof tv); + + struct sockaddr_un a; + memset(&a, 0, sizeof a); + a.sun_family = AF_UNIX; + size_t n = strlen(path); + socklen_t alen; + if (path[0] == '@') { /* abstract namespace */ + if (n > sizeof a.sun_path) { close(fd); return -1; } + a.sun_path[0] = 0; + memcpy(a.sun_path + 1, path + 1, n - 1); + alen = (socklen_t)(offsetof(struct sockaddr_un, sun_path) + n); + } else { + if (n >= sizeof a.sun_path) { close(fd); return -1; } + memcpy(a.sun_path, path, n); + alen = (socklen_t)(offsetof(struct sockaddr_un, sun_path) + n + 1); + } + if (connect(fd, (struct sockaddr*)&a, alen) < 0) { close(fd); return -1; } + return fd; +} + +/* Read ONE '\n'-terminated QMP message into buf (QMP frames each JSON object on a line; + * an HMP string return keeps its newlines escaped, so it is still a single line). 1 / 0 / -1. */ +static int qmp_read_line(int fd, char* buf, size_t cap, size_t* out_len) { + size_t got = 0; + while (got + 1 < cap) { + ssize_t r = read(fd, buf + got, cap - 1 - got); + if (r < 0) { if (errno == EINTR) continue; return -1; } /* timeout/error */ + if (r == 0) return (got > 0) ? 1 : 0; + got += (size_t)r; + char* nl = memchr(buf, '\n', got); + if (nl) { *out_len = got; buf[got] = 0; return 1; } + } + *out_len = got; buf[got] = 0; + return 1; /* line longer than cap: truncated but usable for our scans */ +} + +/* Read messages until one carries "return"/"error", skipping async "event"s. 1 if a return, + * 0 if an error/closed, -1 on transport error. The matched message is left in buf. 
*/ +static int qmp_await_return(int fd, char* buf, size_t cap) { + for (int i = 0; i < 64; i++) { + size_t len = 0; + int r = qmp_read_line(fd, buf, cap, &len); + if (r <= 0) return r; + if (strstr(buf, "\"error\"")) return 0; + if (strstr(buf, "\"return\"")) return 1; + /* greeting {"QMP":...} or async {"event":...} -> keep reading */ + } + return -1; +} + +static int qmp_cmd(int fd, const char* json, char* buf, size_t cap) { + size_t n = strlen(json); + if (write(fd, json, n) != (ssize_t)n) return -1; + return qmp_await_return(fd, buf, cap); +} + +/* Map a QEMU query-status "status" word to VMSIG_VM_*. Alive = running|paused. */ +static int qmp_status_word(const char* buf) { + const char* s = strstr(buf, "\"status\""); + if (!s) return VMSIG_VM_UNKNOWN; + s = strchr(s, ':'); if (!s) return VMSIG_VM_UNKNOWN; + s = strchr(s, '"'); if (!s) return VMSIG_VM_UNKNOWN; + s++; + if (!strncmp(s, "running", 7)) return VMSIG_VM_RUNNING; + if (!strncmp(s, "paused", 6)) return VMSIG_VM_PAUSED; + if (!strncmp(s, "prelaunch", 9)) return VMSIG_VM_PAUSED; + if (!strncmp(s, "shutdown", 8)) return VMSIG_VM_SHUTDOWN; + if (!strncmp(s, "guest-panicked", 14) || !strncmp(s, "internal-error", 14)) + return VMSIG_VM_CRASHED; + return VMSIG_VM_UNKNOWN; +} + +/* Derive the below-4G split from `info mtree` text: the size of the RAM region whose guest + * physical range starts at address 0. Standard QEMU split-RAM layout puts low RAM at + * [0, low) and high RAM above 4G at file offset @low. FAIL-CLOSED: 0 if not found. + * NOTE: parses HMP text (not a stable QMP schema) — verify against real `info mtree` output. */ +static uint64_t mtree_low(const char* ret) { + /* The return is a JSON string; lines inside are escaped "\n". Scan for the GPA-0 ram run: + * " 0000000000000000- (prio N, ram): ..." 
*/ + const char* p = ret; + while ((p = strstr(p, "0000000000000000-")) != NULL) { + const char* end_hex = p + 17; /* 16 zeros + '-' */ + char* stop = NULL; + unsigned long long end = strtoull(end_hex, &stop, 16); + /* the descriptor after the range must mark it RAM (not the i/o "system" root) */ + const char* tail = stop ? stop : end_hex; + const char* nl = strstr(tail, "\\n"); + const char* lim = nl ? nl : (tail + 64); + int is_ram = 0; + for (const char* q = tail; q < lim && *q; q++) + if (!strncmp(q, "ram)", 4)) { is_ram = 1; break; } + if (is_ram && end > 0 && end != ~0ull) return end + 1ull; /* [0, end] => low=end+1 */ + p = end_hex; + } + return 0; +} + +static int hp_live(const struct vmsig_host_probe* p, vmsig_host_facts* io) { + (void)p; + io->retry = 0; + int fd = qmp_connect(io->qmp_path); + if (fd < 0) { io->retry = 1; io->ok = 0; return 0; } /* QMP not up yet => transient */ + + char* buf = malloc(256 * 1024); + if (!buf) { close(fd); io->retry = 1; io->ok = 0; return 0; } + + int alive = 0; + if (qmp_cmd(fd, "{\"execute\":\"qmp_capabilities\"}\n", buf, 256 * 1024) == 1 && + qmp_cmd(fd, "{\"execute\":\"query-status\"}\n", buf, 256 * 1024) == 1) { + io->vm_state = qmp_status_word(buf); + alive = (io->vm_state == VMSIG_VM_RUNNING || io->vm_state == VMSIG_VM_PAUSED); + } else { + io->retry = 1; /* handshake failed mid-way => transient */ + } + + if (alive) { + if (qmp_cmd(fd, + "{\"execute\":\"human-monitor-command\"," + "\"arguments\":{\"command-line\":\"info mtree -f\"}}\n", buf, 256 * 1024) == 1) { + io->low = mtree_low(buf); + } + } + + free(buf); + close(fd); + + /* fail-closed: alive AND a parsed split => bring up; else not (stale / unparsable). */ + io->ok = (alive && io->low != 0) ? 
1 : 0; + return 0; +} + +vmsig_host_probe host_probe_proxmox(const char* watch_dir, const char* pve_conf, + const char* qmp_dir) { + static hp_cfg cfg; /* single daemon-wide probe; paths are process-lifetime strings */ + cfg.watch_dir = watch_dir; + cfg.pve_conf = pve_conf; + cfg.qmp_dir = qmp_dir; + vmsig_host_probe p = { hp_config, hp_live, &cfg }; + return p; +} diff --git a/src/discovery/slot.c b/src/discovery/slot.c new file mode 100644 index 0000000..c8b7257 --- /dev/null +++ b/src/discovery/slot.c @@ -0,0 +1,91 @@ +/* slot.c — vmid <-> endpoint allocator (see slot.h). Pure logic + a tiny pointer-free + * on-disk format; no core dependency. */ +#define _GNU_SOURCE +#include "slot.h" +#include +#include +#include +#include +#include +#include + +void slot_init(slot_table* t) { + memset(t, 0, sizeof *t); +} + +int slot_lookup(const slot_table* t, uint32_t vmid) { + if (!vmid) return -1; + for (int e = 0; e < VMSIG_SLOT_COUNT; e++) + if (t->ent[e].vmid == vmid) return e; + return -1; +} + +int slot_alloc(slot_table* t, uint32_t vmid) { + if (!vmid) return -1; + int e = slot_lookup(t, vmid); + if (e >= 0) return e; /* idempotent pin */ + /* lowest free bit: ffsll of the complement (1-based; 0 => none free) */ + int b = __builtin_ffsll((long long)~t->used_mask); + if (b == 0) return -1; /* table full (64-VM ceiling) */ + e = b - 1; + t->ent[e].vmid = vmid; + t->used_mask |= (1ull << e); + return e; +} + +void slot_free(slot_table* t, uint32_t vmid) { + int e = slot_lookup(t, vmid); + if (e < 0) return; + t->ent[e].vmid = 0; + t->used_mask &= ~(1ull << e); +} + +/* ---- persistence: magic + version + 64 * uint32 vmid (native byte order, tmpfs-local) ---- */ +#define SLOT_MAGIC 0x534C4F54u /* "SLOT" */ +#define SLOT_VERSION 1u + +typedef struct { + uint32_t magic; + uint32_t version; + uint32_t vmid[VMSIG_SLOT_COUNT]; +} slot_blob; + +int slot_save(const slot_table* t, const char* path) { + if (!path) return -1; + slot_blob b; + memset(&b, 0, sizeof b); + b.magic = 
SLOT_MAGIC; b.version = SLOT_VERSION; + for (int e = 0; e < VMSIG_SLOT_COUNT; e++) b.vmid[e] = t->ent[e].vmid; + + char tmp[512]; + int n = snprintf(tmp, sizeof tmp, "%s.tmp", path); + if (n < 0 || (size_t)n >= sizeof tmp) return -1; + + int fd = open(tmp, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0600); + if (fd < 0) return -1; + ssize_t w = write(fd, &b, sizeof b); + int rc = (w == (ssize_t)sizeof b) ? 0 : -1; + if (close(fd) != 0) rc = -1; + if (rc == 0 && rename(tmp, path) != 0) rc = -1; + if (rc != 0) unlink(tmp); + return rc; +} + +int slot_load(slot_table* t, const char* path) { + slot_init(t); + if (!path) return 0; + int fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) return 0; /* no file => fresh start (valid) */ + slot_blob b; + ssize_t r = read(fd, &b, sizeof b); + close(fd); + if (r != (ssize_t)sizeof b || b.magic != SLOT_MAGIC || b.version != SLOT_VERSION) { + slot_init(t); /* corrupt/old => fresh start */ + return 0; + } + for (int e = 0; e < VMSIG_SLOT_COUNT; e++) { + t->ent[e].vmid = b.vmid[e]; + if (b.vmid[e]) t->used_mask |= (1ull << e); + } + return 0; +} diff --git a/src/si/input/include/driver.h b/src/si/input/include/driver.h new file mode 100644 index 0000000..8dfd5ee --- /dev/null +++ b/src/si/input/include/driver.h @@ -0,0 +1,39 @@ +#ifndef VMCTL_DRIVER_H +#define VMCTL_DRIVER_H +#include "vmctl.h" +#include "qmp.h" + +/* driver.h — input-driver vtable, the concrete vmctl handle, and the shared + * event-kind enum. The event kind is the SINGLE source of truth that every + * driver switches on (never on magic numbers). 
*/ + +typedef enum { + VMCTL_EV_ABS, VMCTL_EV_REL, VMCTL_EV_BTN, VMCTL_EV_KEY, VMCTL_EV_SCROLL +} vmctl_ev_kind; + +typedef struct { + int (*send)(vmctl_t* v, const vmctl_batch* b); /* deliver an input batch */ + void (*close)(vmctl_t* v); /* release driver resources */ +} vmctl_driver_ops; + +struct vmctl { + vmctl_driver_ops ops; + vmctl_driver driver; + qmp_conn* qmp; /* control channel; NULL if none */ + int ui_fd_a; /* uinput driver: device A; -1 for QMP */ + int ui_fd_b; /* uinput driver: device B (BOTH); -1 */ + int ptr_mode; /* uinput driver: VMCTL_PTR_*; 0 for QMP */ + + /* Held-state receipt: key/btn down-bits as THIS handle last actuated them + * (not guest truth). Written only after a successful send in + * vmctl_batch_send; the send path never reads them. Zero-initialised by + * calloc at open = all up. Single-threaded (one handle owner): no locks. */ + unsigned char keys_held[VMCTL_KEYS_SNAPSHOT_BYTES]; /* evdev-indexed key down-bits */ + unsigned btns_held; /* VMCTL_BTN_* 0..7 down-bits */ +}; + +/* driver factories (called from open.c per cfg->driver) */ +vmctl_t* vmctl_open_qmp_driver (const vmctl_config* cfg); +vmctl_t* vmctl_open_uinput_driver(const vmctl_config* cfg); + +#endif /* VMCTL_DRIVER_H */ diff --git a/src/si/input/include/keymap.h b/src/si/input/include/keymap.h new file mode 100644 index 0000000..448836a --- /dev/null +++ b/src/si/input/include/keymap.h @@ -0,0 +1,18 @@ +#ifndef VMCTL_KEYMAP_H +#define VMCTL_KEYMAP_H +#include + +/* keymap.h — the single source of truth for keyboard keys. One descriptor maps + * a Linux evdev code to a QEMU QKeyCode name. Both the QMP and uinput drivers + * derive everything from this table. */ + +/* NOTE: named vmctl_keymap, not vmctl_key — the public API uses the ordinary + * identifier vmctl_key for the key-injection function (include/vmctl.h), and a + * typedef would collide with it. 
*/ +typedef struct { int evdev; const char* qcode; } vmctl_keymap; +extern const vmctl_keymap VMCTL_KEYS[]; /* sorted by evdev (for bsearch) */ +extern const int VMCTL_KEYS_LEN; + +const char* vmctl_evdev_to_qcode(int evdev); /* NULL if absent */ + +#endif /* VMCTL_KEYMAP_H */ diff --git a/src/si/input/include/qmp.h b/src/si/input/include/qmp.h new file mode 100644 index 0000000..f403e7d --- /dev/null +++ b/src/si/input/include/qmp.h @@ -0,0 +1,14 @@ +#ifndef VMCTL_QMP_H +#define VMCTL_QMP_H +#include + +/* qmp.h — minimal QMP client over an AF_UNIX socket: connect (with capability + * negotiation), disconnect, and synchronous command execution. */ + +typedef struct qmp_conn qmp_conn; + +qmp_conn* qmp_connect(const char* sock_path); /* connect + qmp_capabilities; NULL on error */ +void qmp_disconnect(qmp_conn* c); +int qmp_exec(qmp_conn* c, const char* cmd, char* resp, size_t cap); /* 0=return, -1=error */ + +#endif /* VMCTL_QMP_H */ diff --git a/src/si/input/keymap.c b/src/si/input/keymap.c new file mode 100644 index 0000000..da97e32 --- /dev/null +++ b/src/si/input/keymap.c @@ -0,0 +1,115 @@ +/* keymap.c — the single source of truth for keyboard keys. VMCTL_KEYS maps + * Linux evdev codes to QEMU QKeyCode names (sorted by evdev for bsearch); + * vmctl_evdev_to_qcode is the sole lookup, consumed by the QMP driver. 
*/ + +#include "keymap.h" + +#include +#include + +const vmctl_keymap VMCTL_KEYS[] = { + { KEY_ESC, "esc" }, + { KEY_1, "1" }, + { KEY_2, "2" }, + { KEY_3, "3" }, + { KEY_4, "4" }, + { KEY_5, "5" }, + { KEY_6, "6" }, + { KEY_7, "7" }, + { KEY_8, "8" }, + { KEY_9, "9" }, + { KEY_0, "0" }, + { KEY_MINUS, "minus" }, + { KEY_EQUAL, "equal" }, + { KEY_BACKSPACE, "backspace" }, + { KEY_TAB, "tab" }, + { KEY_Q, "q" }, + { KEY_W, "w" }, + { KEY_E, "e" }, + { KEY_R, "r" }, + { KEY_T, "t" }, + { KEY_Y, "y" }, + { KEY_U, "u" }, + { KEY_I, "i" }, + { KEY_O, "o" }, + { KEY_P, "p" }, + { KEY_LEFTBRACE, "bracket_left" }, + { KEY_RIGHTBRACE, "bracket_right" }, + { KEY_ENTER, "ret" }, + { KEY_LEFTCTRL, "ctrl" }, + { KEY_A, "a" }, + { KEY_S, "s" }, + { KEY_D, "d" }, + { KEY_F, "f" }, + { KEY_G, "g" }, + { KEY_H, "h" }, + { KEY_J, "j" }, + { KEY_K, "k" }, + { KEY_L, "l" }, + { KEY_SEMICOLON, "semicolon" }, + { KEY_APOSTROPHE, "apostrophe" }, + { KEY_GRAVE, "grave_accent" }, + { KEY_LEFTSHIFT, "shift" }, + { KEY_BACKSLASH, "backslash" }, + { KEY_Z, "z" }, + { KEY_X, "x" }, + { KEY_C, "c" }, + { KEY_V, "v" }, + { KEY_B, "b" }, + { KEY_N, "n" }, + { KEY_M, "m" }, + { KEY_COMMA, "comma" }, + { KEY_DOT, "dot" }, + { KEY_SLASH, "slash" }, + { KEY_RIGHTSHIFT, "shift_r" }, + { KEY_LEFTALT, "alt" }, + { KEY_SPACE, "spc" }, + { KEY_CAPSLOCK, "caps_lock" }, + { KEY_F1, "f1" }, + { KEY_F2, "f2" }, + { KEY_F3, "f3" }, + { KEY_F4, "f4" }, + { KEY_F5, "f5" }, + { KEY_F6, "f6" }, + { KEY_F7, "f7" }, + { KEY_F8, "f8" }, + { KEY_F9, "f9" }, + { KEY_F10, "f10" }, + { KEY_NUMLOCK, "num_lock" }, + { KEY_SCROLLLOCK, "scroll_lock" }, + { KEY_102ND, "less" }, + { KEY_F11, "f11" }, + { KEY_F12, "f12" }, + { KEY_RIGHTCTRL, "ctrl_r" }, + { KEY_SYSRQ, "print" }, + { KEY_RIGHTALT, "alt_r" }, + { KEY_HOME, "home" }, + { KEY_UP, "up" }, + { KEY_PAGEUP, "pgup" }, + { KEY_LEFT, "left" }, + { KEY_RIGHT, "right" }, + { KEY_END, "end" }, + { KEY_DOWN, "down" }, + { KEY_PAGEDOWN, "pgdn" }, + { KEY_INSERT, "insert" }, + 
{ KEY_DELETE, "delete" }, + { KEY_POWER, "power" }, + { KEY_PAUSE, "pause" }, + { KEY_LEFTMETA, "meta_l" }, + { KEY_RIGHTMETA, "meta_r" }, + { KEY_SLEEP, "sleep" }, + { KEY_WAKEUP, "wake" }, +}; + +const int VMCTL_KEYS_LEN = (int)(sizeof VMCTL_KEYS / sizeof VMCTL_KEYS[0]); + +static int key_cmp(const void* a, const void* b) { + return ((const vmctl_keymap*)a)->evdev - ((const vmctl_keymap*)b)->evdev; +} + +const char* vmctl_evdev_to_qcode(int evdev) { + vmctl_keymap k = { .evdev = evdev, .qcode = NULL }; + const vmctl_keymap* e = bsearch(&k, VMCTL_KEYS, (size_t)VMCTL_KEYS_LEN, + sizeof VMCTL_KEYS[0], key_cmp); + return e ? e->qcode : NULL; +} diff --git a/src/si/input/linux/uinput_driver.c b/src/si/input/linux/uinput_driver.c new file mode 100644 index 0000000..d33ff5b --- /dev/null +++ b/src/si/input/linux/uinput_driver.c @@ -0,0 +1,274 @@ +/* uinput_driver.c — Linux uinput input driver (host source) plus optional + * passthrough into the guest. TWO distinct layers, not to be confused: + * + * (1) uinput — the host side: the library creates a /dev/input/eventN node + * and writes struct input_event into it on the hot path (uinput_driver_send). + * + * (2) virtio-input-host-pci — a QEMU device that forwards that host evdev node + * into the guest. It is an OPTIONAL setup step performed over QMP at open + * (device_add) and undone at close (device_del). It is NOT a per-event + * mechanism and lives entirely in the hotplug helpers below. + * + * uinput != virtio. Without qmp_path/input_bus the uinput device is created + * orphaned (an external layer may forward it). The driver switches on + * vmctl_ev_kind (never on magic numbers). */ + +#include "driver.h" +#include "keymap.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* HID identity of the synthesized device (values preserved — behaviour unchanged). 
*/ +#define HWID_BUS 0x0003 +#define HWID_VENDOR 0x046D +#define HWID_PRODUCT 0xC52B +#define HWID_VERSION 0x0111 +#define HWID_NAME_A "VMInput-A" +#define HWID_NAME_B "VMInput-B" + +/* Hotplug device ids for virtio-input-host-pci passthrough. */ +#define PLUG_ID_A "vmctl-a" +#define PLUG_ID_B "vmctl-b" + +static const uint16_t BTN_CODES[8] = { + 0x110, 0x111, 0x112, 0x113, 0x114, 0x115, 0x116, 0x117 +}; + +static void emit(int fd, uint16_t type, uint16_t code, int32_t val) { + struct input_event e = {.type = type, .code = code, .value = val}; + ssize_t r = write(fd, &e, sizeof e); + (void)r; +} + +static void syn(int fd) { emit(fd, EV_SYN, SYN_REPORT, 0); } + +static int uinput_create(int rel_motion, const vmctl_uinput_id* id, const char* name, char evdev[64]) { + int fd = open("/dev/uinput", O_RDWR | O_CLOEXEC); + if (fd < 0) return -1; + + ioctl(fd, UI_SET_EVBIT, EV_SYN); + + ioctl(fd, UI_SET_EVBIT, EV_KEY); + /* Keyboard keybits come from the single source of truth: every key in + * VMCTL_KEYS, so a key in the table always works through uinput too. 
*/ + for (int i = 0; i < VMCTL_KEYS_LEN; i++) + ioctl(fd, UI_SET_KEYBIT, VMCTL_KEYS[i].evdev); + for (int b = 0; b < 8; b++) + ioctl(fd, UI_SET_KEYBIT, (int)BTN_CODES[b]); + + ioctl(fd, UI_SET_EVBIT, EV_REL); + ioctl(fd, UI_SET_RELBIT, REL_WHEEL); + ioctl(fd, UI_SET_RELBIT, REL_HWHEEL); + if (rel_motion) { + ioctl(fd, UI_SET_RELBIT, REL_X); + ioctl(fd, UI_SET_RELBIT, REL_Y); + } + + if (!rel_motion) { + ioctl(fd, UI_SET_EVBIT, EV_ABS); + ioctl(fd, UI_SET_ABSBIT, ABS_X); + ioctl(fd, UI_SET_ABSBIT, ABS_Y); + + struct uinput_abs_setup ax; + memset(&ax, 0, sizeof ax); + ax.code = ABS_X; + ax.absinfo.minimum = 0; + ax.absinfo.maximum = VMCTL_ABS_MAX; + ioctl(fd, UI_ABS_SETUP, &ax); + ax.code = ABS_Y; + ioctl(fd, UI_ABS_SETUP, &ax); + } + + struct uinput_setup us; + memset(&us, 0, sizeof us); + us.id.bustype = (uint16_t)id->bustype; + us.id.vendor = (uint16_t)id->vendor; + us.id.product = (uint16_t)id->product; + us.id.version = (uint16_t)id->version; + strncpy(us.name, name, sizeof us.name - 1); + + if (ioctl(fd, UI_DEV_SETUP, &us) < 0 || ioctl(fd, UI_DEV_CREATE) < 0) { + close(fd); + return -1; + } + + char sysname[64] = {0}; + evdev[0] = '\0'; + if (ioctl(fd, UI_GET_SYSNAME(sizeof sysname), sysname) >= 0) + snprintf(evdev, 64, "/dev/input/%s", sysname); + + if (!evdev[0]) { + ioctl(fd, UI_DEV_DESTROY); + close(fd); + return -1; + } + + return fd; +} + +/* ===== virtio-input-host-pci passthrough (layer 2, optional, QMP setup) ===== */ + +static int qmp_plug(qmp_conn* qmp, const char* bus, const char* evdev, const char* id) { + char cmd[512], resp[1024]; + snprintf(cmd, sizeof cmd, + "{\"execute\":\"device_del\",\"arguments\":{\"id\":\"%s\"}}", id); + qmp_exec(qmp, cmd, resp, sizeof resp); + + snprintf(cmd, sizeof cmd, + "{\"execute\":\"device_add\",\"arguments\":{" + "\"driver\":\"virtio-input-host-pci\"," + "\"id\":\"%s\"," + "\"evdev\":\"%s\"," + "\"bus\":\"%s\"}}", + id, evdev, bus); + return qmp_exec(qmp, cmd, resp, sizeof resp); +} + +static void 
qmp_unplug(qmp_conn* qmp, const char* id) { + char cmd[256], resp[1024]; + snprintf(cmd, sizeof cmd, + "{\"execute\":\"device_del\",\"arguments\":{\"id\":\"%s\"}}", id); + qmp_exec(qmp, cmd, resp, sizeof resp); +} + +/* ===== hot path (layer 1, uinput write) ===== */ + +static int uinput_driver_send(vmctl_t* v, const vmctl_batch* b) { + int fd_a = v->ui_fd_a; + int fd_b = v->ui_fd_b; + int both = (fd_b >= 0); + + for (int i = 0; i < b->count; i++) { + int code = b->ev[i].code; + int value = b->ev[i].value; + double scl = b->ev[i].scroll; + + switch ((vmctl_ev_kind)b->ev[i].kind) { + case VMCTL_EV_ABS: + if (v->ptr_mode == VMCTL_PTR_REL) return -1; + emit(fd_a, EV_ABS, code == VMCTL_AXIS_X ? ABS_X : ABS_Y, value); + syn(fd_a); + break; + case VMCTL_EV_REL: { + if (!both && v->ptr_mode == VMCTL_PTR_ABS) return -1; + int fd = both ? fd_b : fd_a; + emit(fd, EV_REL, code == VMCTL_AXIS_X ? REL_X : REL_Y, value); + syn(fd); + break; + } + case VMCTL_EV_BTN: + if (code < 0 || code >= 8) return -1; + emit(fd_a, EV_KEY, BTN_CODES[code], value); + syn(fd_a); + break; + case VMCTL_EV_KEY: + emit(fd_a, EV_KEY, (uint16_t)code, value); + syn(fd_a); + break; + case VMCTL_EV_SCROLL: + emit(fd_a, EV_REL, code == VMCTL_SCROLL_V ? 
REL_WHEEL : REL_HWHEEL, (int32_t)scl); + syn(fd_a); + break; + default: + return -1; + } + } + return 0; +} + +static void uinput_driver_close(vmctl_t* v) { + if (v->qmp) { + qmp_unplug(v->qmp, PLUG_ID_A); + if (v->ui_fd_b >= 0) qmp_unplug(v->qmp, PLUG_ID_B); + qmp_disconnect(v->qmp); + } + if (v->ui_fd_a >= 0) { ioctl(v->ui_fd_a, UI_DEV_DESTROY); close(v->ui_fd_a); } + if (v->ui_fd_b >= 0) { ioctl(v->ui_fd_b, UI_DEV_DESTROY); close(v->ui_fd_b); } +} + +vmctl_t* vmctl_open_uinput_driver(const vmctl_config* cfg) { + vmctl_t* v = calloc(1, sizeof *v); + if (!v) return NULL; + v->driver = VMCTL_DRIVER_UINPUT; + v->ui_fd_a = -1; + v->ui_fd_b = -1; + + /* HID identity: NULL config selects the built-in defaults verbatim; a + * non-NULL config supplies all numeric fields literally (zeros included). */ + const vmctl_uinput_id DEFAULT_ID = { + HWID_BUS, HWID_VENDOR, HWID_PRODUCT, HWID_VERSION, HWID_NAME_A + }; + const vmctl_uinput_id* id = cfg->uinput_id ? cfg->uinput_id : &DEFAULT_ID; + + /* Base name: caller's non-empty name, else NULL = use default A/B names. */ + const char* base = (cfg->uinput_id && cfg->uinput_id->name && cfg->uinput_id->name[0]) + ? cfg->uinput_id->name : NULL; + + /* A/B suffix is added by the library only when two devices are created + * (VMCTL_PTR_BOTH) and only over a caller-supplied base name. */ + char name_a[UINPUT_MAX_NAME_SIZE]; + char name_b[UINPUT_MAX_NAME_SIZE]; + const char* dev_a = base ? 
base : HWID_NAME_A; + const char* dev_b = HWID_NAME_B; + if (cfg->ptr_mode == VMCTL_PTR_BOTH && base) { + int base_max = (int)(sizeof name_a - 1 /*NUL*/ - 2 /*"-A"*/); + snprintf(name_a, sizeof name_a, "%.*s-A", base_max, base); + snprintf(name_b, sizeof name_b, "%.*s-B", base_max, base); + dev_a = name_a; + dev_b = name_b; + } + + char evdev_a[64], evdev_b[64]; + int rel_a = (cfg->ptr_mode == VMCTL_PTR_REL); + v->ui_fd_a = uinput_create(rel_a, id, dev_a, evdev_a); + if (v->ui_fd_a < 0) { free(v); return NULL; } + + if (cfg->ptr_mode == VMCTL_PTR_BOTH) { + v->ui_fd_b = uinput_create(1, id, dev_b, evdev_b); + if (v->ui_fd_b < 0) { + ioctl(v->ui_fd_a, UI_DEV_DESTROY); + close(v->ui_fd_a); + free(v); + return NULL; + } + } + + if (cfg->qmp_path) { + v->qmp = qmp_connect(cfg->qmp_path); + if (!v->qmp) { + if (v->ui_fd_b >= 0) { ioctl(v->ui_fd_b, UI_DEV_DESTROY); close(v->ui_fd_b); } + ioctl(v->ui_fd_a, UI_DEV_DESTROY); + close(v->ui_fd_a); + free(v); + return NULL; + } + if (cfg->input_bus && cfg->input_bus[0]) { + if (qmp_plug(v->qmp, cfg->input_bus, evdev_a, PLUG_ID_A) < 0) { + uinput_driver_close(v); + free(v); + return NULL; + } + if (cfg->ptr_mode == VMCTL_PTR_BOTH) { + if (qmp_plug(v->qmp, cfg->input_bus, evdev_b, PLUG_ID_B) < 0) { + qmp_unplug(v->qmp, PLUG_ID_A); + uinput_driver_close(v); + free(v); + return NULL; + } + } + } + } + + v->ops.send = uinput_driver_send; + v->ops.close = uinput_driver_close; + v->ptr_mode = cfg->ptr_mode; + return v; +} diff --git a/src/si/input/open.c b/src/si/input/open.c new file mode 100644 index 0000000..3a9e109 --- /dev/null +++ b/src/si/input/open.c @@ -0,0 +1,156 @@ +/* open.c — handle lifecycle and the input batch API. vmctl_open dispatches to a + * driver factory by cfg->driver; vmctl_close releases via ops.close. The batch + * builders set vmctl_event.kind (the single event-kind code that drivers read), + * and the single-event wrappers are thin batches of one. 
*/ + +#include "driver.h" + +#include <stdlib.h> +#include <string.h> + +vmctl_t* vmctl_open(const vmctl_config* cfg) { + if (!cfg) return NULL; + switch (cfg->driver) { + case VMCTL_DRIVER_QMP: return vmctl_open_qmp_driver(cfg); + case VMCTL_DRIVER_UINPUT: return vmctl_open_uinput_driver(cfg); + default: return NULL; + } +} + +void vmctl_close(vmctl_t* v) { + if (!v) return; + v->ops.close(v); + free(v); +} + +/* ===== Batch builders ===== */ + +void vmctl_batch_init(vmctl_batch* b) { + b->count = 0; +} + +void vmctl_batch_abs(vmctl_batch* b, int axis, int value) { + if (b->count >= VMCTL_BATCH_MAX) return; + vmctl_event* e = &b->ev[b->count++]; + e->kind = VMCTL_EV_ABS; e->code = axis; e->value = value; e->scroll = 0.0; +} + +void vmctl_batch_rel(vmctl_batch* b, int axis, int delta) { + if (b->count >= VMCTL_BATCH_MAX) return; + vmctl_event* e = &b->ev[b->count++]; + e->kind = VMCTL_EV_REL; e->code = axis; e->value = delta; e->scroll = 0.0; +} + +void vmctl_batch_btn(vmctl_batch* b, int btn, int down) { + if (b->count >= VMCTL_BATCH_MAX) return; + vmctl_event* e = &b->ev[b->count++]; + e->kind = VMCTL_EV_BTN; e->code = btn; e->value = down; e->scroll = 0.0; +} + +void vmctl_batch_key(vmctl_batch* b, int evdev_code, int down) { + if (b->count >= VMCTL_BATCH_MAX) return; + vmctl_event* e = &b->ev[b->count++]; + e->kind = VMCTL_EV_KEY; e->code = evdev_code; e->value = down; e->scroll = 0.0; +} + +void vmctl_batch_scroll(vmctl_batch* b, int axis, double value) { + if (b->count >= VMCTL_BATCH_MAX) return; + vmctl_event* e = &b->ev[b->count++]; + e->kind = VMCTL_EV_SCROLL; e->code = axis; e->value = 0; e->scroll = value; +} + +int vmctl_batch_send(vmctl_t* v, vmctl_batch* b) { + if (b->count == 0) return 0; + int rc = v->ops.send(v, b); + if (rc != 0) return rc; /* not sent = not recorded; never touch the receipt */ + + /* Record the actuated key/btn down-bits (write-only; the send path above + * never reads this map). abs/rel/scroll have no held state. 
*/ + for (int i = 0; i < b->count; i++) { + const vmctl_event* e = &b->ev[i]; + int down = e->value ? 1 : 0; + switch (e->kind) { + case VMCTL_EV_KEY: { + int code = e->code; + if (code < 0 || code > VMCTL_KEY_CODE_MAX) break; /* out of range: ignore */ + unsigned char mask = (unsigned char)(1u << (code & 7)); + if (down) v->keys_held[code >> 3] |= mask; + else v->keys_held[code >> 3] &= (unsigned char)~mask; + break; + } + case VMCTL_EV_BTN: { + int btn = e->code; + if (btn < 0 || btn >= 8) break; /* out of range: ignore */ + unsigned mask = 1u << btn; + if (down) v->btns_held |= mask; + else v->btns_held &= ~mask; + break; + } + default: break; /* abs/rel/scroll: no-op for receipt */ + } + } + return rc; +} + +/* ===== Single-event wrappers ===== */ + +int vmctl_abs(vmctl_t* v, int axis, int value) { + vmctl_batch b; + vmctl_batch_init(&b); + vmctl_batch_abs(&b, axis, value); + return vmctl_batch_send(v, &b); +} + +int vmctl_rel(vmctl_t* v, int axis, int delta) { + vmctl_batch b; + vmctl_batch_init(&b); + vmctl_batch_rel(&b, axis, delta); + return vmctl_batch_send(v, &b); +} + +int vmctl_btn(vmctl_t* v, int btn, int down) { + vmctl_batch b; + vmctl_batch_init(&b); + vmctl_batch_btn(&b, btn, down); + return vmctl_batch_send(v, &b); +} + +int vmctl_key(vmctl_t* v, int evdev_code, int down) { + vmctl_batch b; + vmctl_batch_init(&b); + vmctl_batch_key(&b, evdev_code, down); + return vmctl_batch_send(v, &b); +} + +int vmctl_scroll(vmctl_t* v, int axis, double value) { + vmctl_batch b; + vmctl_batch_init(&b); + vmctl_batch_scroll(&b, axis, value); + return vmctl_batch_send(v, &b); +} + +/* ===== Held-state receipt (read-only) ===== + * Reads of the actuator's own last output; never mutate driver state. The + * in-range predicate matches the write path in vmctl_batch_send. 
*/ + +int vmctl_key_held(vmctl_t* v, int evdev_code) { + if (!v || evdev_code < 0 || evdev_code > VMCTL_KEY_CODE_MAX) return 0; + return (v->keys_held[evdev_code >> 3] >> (evdev_code & 7)) & 1; +} + +int vmctl_btn_held(vmctl_t* v, int btn) { + if (!v || btn < 0 || btn >= 8) return 0; + return (int)((v->btns_held >> btn) & 1u); +} + +int vmctl_keys_snapshot(vmctl_t* v, unsigned char* bits, size_t nbytes) { + if (!v || !bits) return -1; + size_t n = nbytes < VMCTL_KEYS_SNAPSHOT_BYTES ? nbytes : VMCTL_KEYS_SNAPSHOT_BYTES; + memcpy(bits, v->keys_held, n); + return (int)n; +} + +unsigned vmctl_btns_snapshot(vmctl_t* v) { + if (!v) return 0; + return v->btns_held; +} diff --git a/src/si/input/power.c b/src/si/input/power.c new file mode 100644 index 0000000..c31f406 --- /dev/null +++ b/src/si/input/power.c @@ -0,0 +1,18 @@ +/* power.c — QMP power/lifecycle actuation. This plane is orthogonal to the + * input driver and always rides the shared QMP channel; every entry returns -1 + * when there is no connection. */ + +#include "driver.h" + +/* QMP responses are small; a stack buffer suffices. */ +static int qmp_simple(vmctl_t* v, const char* cmd) { + if (!v->qmp) return -1; + char resp[1024]; + return qmp_exec(v->qmp, cmd, resp, sizeof resp); +} + +int vmctl_powerdown(vmctl_t* v) { return qmp_simple(v, "{\"execute\":\"system_powerdown\"}"); } +int vmctl_reset (vmctl_t* v) { return qmp_simple(v, "{\"execute\":\"system_reset\"}"); } +int vmctl_wakeup (vmctl_t* v) { return qmp_simple(v, "{\"execute\":\"system_wakeup\"}"); } +int vmctl_pause (vmctl_t* v) { return qmp_simple(v, "{\"execute\":\"stop\"}"); } +int vmctl_resume (vmctl_t* v) { return qmp_simple(v, "{\"execute\":\"cont\"}"); } diff --git a/src/si/input/qmp.c b/src/si/input/qmp.c new file mode 100644 index 0000000..50113ea --- /dev/null +++ b/src/si/input/qmp.c @@ -0,0 +1,113 @@ +/* qmp.c — AF_UNIX QMP client: connect + capability handshake, line-based recv + * with a poll timeout, and synchronous command execution. 
*/ + +#include "qmp.h" + +#include <poll.h> +#include <stdlib.h> +#include <string.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <unistd.h> + +#define QMP_TIMEOUT_MS 5000 +#define QMP_BUF_SIZE 4096 + +struct qmp_conn { + int fd; +}; + +static int recv_line(int fd, char* buf, size_t cap) { + size_t n = 0; + while (n + 1 < cap) { + struct pollfd pfd = { .fd = fd, .events = POLLIN }; + if (poll(&pfd, 1, QMP_TIMEOUT_MS) <= 0) return -1; + char c; + if (read(fd, &c, 1) != 1) return -1; + buf[n++] = c; + if (c == '\n') break; + } + buf[n] = '\0'; + return (int)n; +} + +static int send_all(int fd, const char* s, size_t len) { + while (len > 0) { + ssize_t w = write(fd, s, len); + if (w <= 0) return -1; + s += w; + len -= (size_t)w; + } + return 0; +} + +qmp_conn* qmp_connect(const char* sock_path) { + int fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) return NULL; + + struct sockaddr_un addr; + memset(&addr, 0, sizeof addr); + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, sock_path, sizeof addr.sun_path - 1); + + if (connect(fd, (struct sockaddr*)&addr, sizeof addr) < 0) { + close(fd); + return NULL; + } + + char buf[QMP_BUF_SIZE]; + if (recv_line(fd, buf, sizeof buf) < 0) { + close(fd); + return NULL; + } + + const char* cap_cmd = "{\"execute\":\"qmp_capabilities\"}\r\n"; + if (send_all(fd, cap_cmd, strlen(cap_cmd)) < 0) { + close(fd); + return NULL; + } + + if (recv_line(fd, buf, sizeof buf) < 0) { + close(fd); + return NULL; + } + + qmp_conn* c = malloc(sizeof *c); + if (!c) { + close(fd); + return NULL; + } + c->fd = fd; + return c; +} + +void qmp_disconnect(qmp_conn* c) { + if (!c) return; + close(c->fd); + free(c); +} + +int qmp_exec(qmp_conn* c, const char* cmd, char* resp, size_t cap) { + size_t cmdlen = strlen(cmd); + if (send_all(c->fd, cmd, cmdlen) < 0) return -1; + if (send_all(c->fd, "\r\n", 2) < 0) return -1; + + char line[QMP_BUF_SIZE]; + for (;;) { + if (recv_line(c->fd, line, sizeof line) < 0) return -1; + if (strstr(line, "\"return\"")) { + if (resp && cap > 0) { + strncpy(resp, line, cap - 1); + 
resp[cap - 1] = '\0'; + } + return 0; + } + if (strstr(line, "\"error\"")) { + if (resp && cap > 0) { + strncpy(resp, line, cap - 1); + resp[cap - 1] = '\0'; + } + return -1; + } + } +} diff --git a/src/si/input/qmp_driver.c b/src/si/input/qmp_driver.c new file mode 100644 index 0000000..7f9d8bc --- /dev/null +++ b/src/si/input/qmp_driver.c @@ -0,0 +1,94 @@ +/* qmp_driver.c — QMP input driver: serialises an input batch into a single + * input-send-event command and sends it in one round-trip. No guest driver is + * required. Switches on vmctl_ev_kind (never on magic numbers). */ + +#include "driver.h" +#include "keymap.h" + +#include <stdio.h> +#include <stdlib.h> + +static const char* btn_names[] = { + "left", "right", "middle", "side", "extra", "forward", "back", "task" +}; +#define BTN_NAMES_LEN ((int)(sizeof btn_names / sizeof btn_names[0])) + +static int qmp_driver_send(vmctl_t* v, const vmctl_batch* b) { + char json[8192]; + int pos = 0; + + pos += snprintf(json + pos, (int)sizeof json - pos, + "{\"execute\":\"input-send-event\",\"arguments\":{\"events\":["); + + for (int i = 0; i < b->count; i++) { + if (i > 0) + pos += snprintf(json + pos, (int)sizeof json - pos, ","); + + int code = b->ev[i].code; + int value = b->ev[i].value; + double scl = b->ev[i].scroll; + + switch ((vmctl_ev_kind)b->ev[i].kind) { + case VMCTL_EV_ABS: + pos += snprintf(json + pos, (int)sizeof json - pos, + "{\"type\":\"abs\",\"data\":{\"axis\":\"%s\",\"value\":%d}}", + code == VMCTL_AXIS_X ? "x" : "y", value); + break; + case VMCTL_EV_REL: + pos += snprintf(json + pos, (int)sizeof json - pos, + "{\"type\":\"rel\",\"data\":{\"axis\":\"%s\",\"value\":%d}}", + code == VMCTL_AXIS_X ? "x" : "y", value); + break; + case VMCTL_EV_BTN: + if (code < 0 || code >= BTN_NAMES_LEN) return -1; + pos += snprintf(json + pos, (int)sizeof json - pos, + "{\"type\":\"btn\",\"data\":{\"button\":\"%s\",\"down\":%s}}", + btn_names[code], value ? 
"true" : "false"); + break; + case VMCTL_EV_KEY: { + const char* qcode = vmctl_evdev_to_qcode(code); + if (!qcode) return -1; + pos += snprintf(json + pos, (int)sizeof json - pos, + "{\"type\":\"key\",\"data\":{\"key\":{\"type\":\"qcode\"," + "\"data\":\"%s\"},\"down\":%s}}", + qcode, value ? "true" : "false"); + break; + } + case VMCTL_EV_SCROLL: + pos += snprintf(json + pos, (int)sizeof json - pos, + "{\"type\":\"scl\",\"data\":{\"axis\":\"%s\",\"value\":%g}}", + code == VMCTL_SCROLL_V ? "vertical" : "horizontal", scl); + break; + default: + return -1; + } + } + + pos += snprintf(json + pos, (int)sizeof json - pos, "]}}"); + + char resp[4096]; + return qmp_exec(v->qmp, json, resp, sizeof resp); +} + +static void qmp_driver_close(vmctl_t* v) { + qmp_disconnect(v->qmp); +} + +vmctl_t* vmctl_open_qmp_driver(const vmctl_config* cfg) { + qmp_conn* qmp = qmp_connect(cfg->qmp_path); + if (!qmp) return NULL; + + vmctl_t* v = calloc(1, sizeof *v); + if (!v) { + qmp_disconnect(qmp); + return NULL; + } + v->driver = VMCTL_DRIVER_QMP; + v->qmp = qmp; + v->ui_fd_a = -1; + v->ui_fd_b = -1; + v->ptr_mode = 0; + v->ops.send = qmp_driver_send; + v->ops.close = qmp_driver_close; + return v; +} diff --git a/src/si/vgpu-perception/control.c b/src/si/vgpu-perception/control.c new file mode 100644 index 0000000..1df3e3a --- /dev/null +++ b/src/si/vgpu-perception/control.c @@ -0,0 +1,39 @@ +/* control.c — control-write SEAM ONLY (this never writes guest memory). + * + * The actual write is performed elsewhere, by a component that holds read-write + * access to the region; this only builds the desired vgpu_control_t image from + * the intent and computes the GVA + offset/length of the significant field range + * for that atomic write under the ctrl_gen seqlock. There is no gva_write here + * and there must not be — the source is a RO fd that would fault on a store anyway. 
+ * + * The reported out_ctrl_gva is a GVA in the PRODUCER's user address space + * (region base + VGPU_CONTROL_OFFSET, cached as r->ctrl_gva): the external write + * MUST be performed under r->proc_cr3, NOT the System kcr3. + */ + +#include "perception-internal.h" + +int vgpup_build_control_write(vgpup_region* r, const vgpup_control_intent* in, + vgpu_control_t* out_frame, uint64_t* out_ctrl_gva, + uint32_t* out_off, uint32_t* out_len) +{ + if (!r || !in || !out_frame || !out_ctrl_gva || !out_off || !out_len) { return -1; } + + /* Fill the desired control image. ctrl_gen stays 0: the writer owns it under + * the seqlock. consumer_tick/attached carry separate heartbeat/intent + * semantics and are not part of this intent. */ + memset(out_frame, 0, sizeof *out_frame); + out_frame->desired_state = in->desired_state; + out_frame->target_fps = in->target_fps; + out_frame->draw_cursor = in->draw_cursor; + out_frame->full_frame_req = in->full_frame_req; + + *out_ctrl_gva = r->ctrl_gva; /* region base + VGPU_CONTROL_OFFSET (cached) */ + + /* Significant range: desired_state .. full_frame_req (contiguous in the ABI), + * i.e. offsetof(desired_state) through the end of full_frame_req. */ + *out_off = (uint32_t)offsetof(vgpu_control_t, desired_state); + *out_len = (uint32_t)(offsetof(vgpu_control_t, full_frame_req) + sizeof(uint32_t) + - offsetof(vgpu_control_t, desired_state)); + return 0; +} diff --git a/src/si/vgpu-perception/discover.c b/src/si/vgpu-perception/discover.c new file mode 100644 index 0000000..68b2055 --- /dev/null +++ b/src/si/vgpu-perception/discover.c @@ -0,0 +1,170 @@ +/* discover.c — process discovery + user-AS region scan (NO magic) + handle. + * + * The region is a RW shared mapping projected into the USER address space of a + * producer PROCESS — NOT a kernel VA in the System address space. 
So discovery + * works by PROCESS: enumerate processes (proc_list) over the RO win32 context, + * and for each one scan its user-AS under process.cr3 in [USER_MIN, USER_MAX] + * for a contiguous RW run >= VGPU_REGION_BYTES, read the producer block at its + * base, and accept it iff the whole structural-invariant table holds. The System + * kcr3 is needed only to open the context and walk processes (the caller already + * baked it into v); the region itself is always read under the producer's cr3. + * + * There is NO magic field in the ABI and the owner forbids inventing one. The + * discriminator is the cheap RW-run filter + the invariant table + two-phase + * heartbeat liveness — and the inter-phase WAIT is the caller's (the core never + * sleeps). Discovery is STRUCTURAL: never filtered by process.name. + * + * Layering: the win32 dependency (proc_list, vmie_win32_mem) lives ONLY in this + * file, in the per-process loop. The per-cr3 scan (vgpup_scan_user_as_for_region) + * is pure gva_* so it stays win32-agnostic and unit-testable under a synthetic + * cr3. A <0 read after binding means the producer process may have restarted + * (its pages are gone); the core only reports it — re-discovery is the caller's. + */ + +#include <stdlib.h> + +#include "perception-internal.h" + +/* How many region runs to ask for per process when probing its user-AS. A user + * address space has many runs; this is generous, and the scan early-exits on the + * first accepted candidate anyway. */ +#define VGPUP_MAX_REGIONS 256 + +/* How many processes to enumerate. proc_list stops at this; raising it would see + * more, but a producer is an ordinary user process well within this bound. */ +#define VGPUP_MAX_PROCS 512 + +/* Read the producer block at `region_gva` under `cr3` into *out (one gva_read of + * the whole block). 0 on success, <0 on read error. 
*/ +static int read_producer_block(vmie_mem* m, uint64_t cr3, uint64_t region_gva, + vgpu_producer_t* out) +{ + return gva_read(m, (uintptr_t)cr3, (uintptr_t)region_gva, out, sizeof *out) < 0 ? -1 : 0; +} + +/* Scan ONE process user-AS (steps 3–5) under `cr3`: walk the RW runs in + * [USER_MIN, USER_MAX] and, for each contiguous run >= VGPU_REGION_BYTES, test + * the producer block at the run base against the invariant table. On the first + * accepted candidate write its base GVA + heartbeat snapshot and return 0; + * <0 if none is found / a read fails. Pure gva_* — no proc_list, no win32. + * + * Adjacent same-protection runs are coalesced: gva_regions reports VA-contiguous + * runs, but a region can land as one run or as touching neighbours, so we extend + * a running span while the next run starts exactly where the current one ends. + * The window [USER_MIN, USER_MAX] lies in one canonical half, as gva_regions + * requires. The RW filter (VR_R|VR_W) matches the shared mapping's protection + * and is cheap — it reads region metadata, not the 98 MiB of region bytes. 
*/ +int vgpup_scan_user_as_for_region(vmie_mem* m, uint64_t cr3, + uint64_t* out_region_gva, uint64_t* out_hb0) +{ + vregion runs[VGPUP_MAX_REGIONS]; + int n, i; + + if (!m || !out_region_gva || !out_hb0) { return -1; } + + n = gva_regions(m, (uintptr_t)cr3, USER_MIN, USER_MAX, VR_R | VR_W, runs, VGPUP_MAX_REGIONS); + if (n < 0) { return -1; } + if (n > VGPUP_MAX_REGIONS) { n = VGPUP_MAX_REGIONS; } /* truncated; probe what we got */ + + for (i = 0; i < n; ++i) { + uint64_t span_base = runs[i].va; + uint64_t span_len = runs[i].len; + int j = i; + + /* coalesce adjacent RW runs into one contiguous span */ + while (j + 1 < n && runs[j + 1].va == runs[j].va + runs[j].len) { + span_len += runs[j + 1].len; + ++j; + } + + if (span_len >= VGPU_REGION_BYTES) { + vgpu_producer_t p; + if (read_producer_block(m, cr3, span_base, &p) == 0 && + vgpup_invariants_hold(&p)) { + *out_region_gva = span_base; + *out_hb0 = p.heartbeat; + return 0; + } + } + } + return -1; +} + +/* Phase 1: enumerate processes and scan each one's user-AS for the region. The + * win32 dependency is confined here: vmie_win32_mem(v) for the generic gva_*, + * proc_list(v, skip_system=1, ...) to drop PEB-less System/kernel-only entries + * (a producer is never one). On the first process that yields a candidate write + * its proc_cr3 + region base GVA + heartbeat snapshot and return 0; <0 if no + * process yields one or proc_list / the context is not ready. 
*/ +int vgpup_discover_candidate(vmie_win32* v, uint64_t* out_proc_cr3, + uint64_t* out_region_gva, uint64_t* out_hb0) +{ + process procs[VGPUP_MAX_PROCS]; + vmie_mem* m; + int np, i; + + if (!v || !out_proc_cr3 || !out_region_gva || !out_hb0) { return -1; } + + m = vmie_win32_mem(v); + if (!m) { return -1; } + + np = proc_list(v, /*skip_system=*/1, procs, VGPUP_MAX_PROCS); + if (np < 0) { return -1; } + if (np > VGPUP_MAX_PROCS) { np = VGPUP_MAX_PROCS; } /* truncated; probe what we got */ + + for (i = 0; i < np; ++i) { + uint64_t region_gva = 0, hb0 = 0; + if (vgpup_scan_user_as_for_region(m, procs[i].cr3, &region_gva, &hb0) == 0) { + *out_proc_cr3 = procs[i].cr3; + *out_region_gva = region_gva; + *out_hb0 = hb0; + return 0; + } + } + return -1; +} + +/* Phase 2: re-read heartbeat at region_gva under proc_cr3 and report whether it + * advanced. The caller must have waited >= VGPU_HEARTBEAT_PERIOD_MS since phase + * 1. <0 here can also mean the producer process restarted (pages gone). */ +int vgpup_confirm_alive(vmie_mem* m, uint64_t proc_cr3, + uint64_t region_gva, uint64_t hb0) +{ + uint64_t hb_now; + if (!m) { return -1; } + if (gva_read(m, (uintptr_t)proc_cr3, + (uintptr_t)region_gva + offsetof(vgpu_producer_t, heartbeat), + &hb_now, sizeof hb_now) < 0) { + return -1; + } + return (hb_now - hb0) > 0u ? 1 : 0; +} + +vgpup_region* vgpup_open(vmie_win32* v) +{ + uint64_t proc_cr3 = 0, region_gva = 0, hb0 = 0; + vgpup_region* r; + + if (vgpup_discover_candidate(v, &proc_cr3, &region_gva, &hb0) != 0) { return NULL; } + + r = (vgpup_region*)calloc(1, sizeof *r); + if (!r) { return NULL; } + + r->proc_cr3 = proc_cr3; + r->region_gva = region_gva; + r->ctrl_gva = region_gva + VGPU_CONTROL_OFFSET; + r->ring_gva = region_gva + VGPU_RING_OFFSET; + r->last_frame_id = 0; + r->run_epoch = 0; + return r; +} + +void vgpup_close(vgpup_region* r) +{ + free(r); /* core state only; v / m belong to the caller */ +} + +uint32_t vgpup_run_epoch(const vgpup_region* r) +{ + return r ? 
r->run_epoch : 0u; +} diff --git a/src/si/vgpu-perception/include/perception-internal.h b/src/si/vgpu-perception/include/perception-internal.h new file mode 100644 index 0000000..ceea25f --- /dev/null +++ b/src/si/vgpu-perception/include/perception-internal.h @@ -0,0 +1,152 @@ +#ifndef VGPU_PERCEPTION_INTERNAL_H +#define VGPU_PERCEPTION_INTERNAL_H + +/* perception-internal.h — private consumer-side helpers (NOT a public surface). + * + * Holds the core's private state type, the consumer-side seqlock read discipline + * (the mirror of the producer's atomic-shim accessors, but an independent body — + * we read into local copies via gva_read, never sharing producer code), the + * structural-invariant validator table used by discovery, and the bit unpackers + * for the packed cursor fields. Included only by the perception TUs. + * + * Consumer seqlock discipline: every guest read goes through gva_read into a + * local copy, so the compiler cannot reorder a data read across the seq read — + * each gva_read is an opaque call. We still bump the seq read into its own + * gva_read and treat odd seq / changed seq as "writer in flight → retry". + */ + +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +#include "vgpu_stream.h" +#include "memmodel.h" +#include "vgpu_perception.h" + +/* Bounded seqlock retry. Producer windows are short (a single slot publish), so + * a small count suffices; spinning longer would be a behavioural timing choice + * (control's job), which does not belong in the sensor. Exhausted → lossy skip. */ +#define VGPUP_SEQLOCK_RETRIES 8u + +/* Private core state. Owns nothing of the address space — only where the region + * lives (in the producer's user-AS, keyed by proc_cr3) and the last-seen + * monotonic markers for dedup / session-break. 
*/ +struct vgpup_region { + uint64_t proc_cr3; /* producer process cr3 — key to its user-AS */ + uint64_t region_gva; /* producer-block GVA == region base */ + uint64_t ctrl_gva; /* region_gva + VGPU_CONTROL_OFFSET (cached) */ + uint64_t ring_gva; /* region_gva + VGPU_RING_OFFSET (cached) */ + uint64_t last_frame_id; /* dedup: only frames with a greater id are "fresh" */ + uint32_t run_epoch; /* last run_epoch seen via vgpup_read_status */ +}; + +/* Per-cr3 user-AS region scan (discovery steps 3–5 for ONE address space): scan + * gva_regions over [USER_MIN, USER_MAX] under `cr3` for a contiguous RW run of + * >= VGPU_REGION_BYTES, read the producer block at its base, and accept it iff + * the structural-invariant table holds. On the first hit writes the region base + * GVA to *out_region_gva and the heartbeat snapshot to *out_hb0 and returns 0; + * <0 if none is found / a read fails. Pure gva_* (no proc_list / win32) so it is + * testable under a synthetic cr3; vgpup_discover_candidate calls it per process. */ +int vgpup_scan_user_as_for_region(vmie_mem* m, uint64_t cr3, + uint64_t* out_region_gva, uint64_t* out_hb0); + +/* ---- seqlock primitives -------------------------------------------------- */ + +static inline int vgpup_seq_is_writing(uint32_t seq) { return (seq & 1u) != 0u; } + +/* Read one 32-bit seq field at `gva` into *out under `cr3` (the producer's + * user-AS cr3). 0 on success, <0 on read error. */ +static inline int vgpup_read_seq(vmie_mem* m, uintptr_t cr3, uint64_t gva, + uint32_t* out) +{ + return gva_read(m, cr3, (uintptr_t)gva, out, sizeof *out) < 0 ? 
-1 : 0; +} + +/* ---- packed-field unpackers (cursor line) -------------------------------- */ + +static inline int32_t vgpup_cursor_x(uint64_t pos) { return (int32_t)(uint32_t)(pos & 0xFFFFFFFFu); } +static inline int32_t vgpup_cursor_y(uint64_t pos) { return (int32_t)(uint32_t)(pos >> 32); } +static inline uint16_t vgpup_lo16(uint32_t v) { return (uint16_t)(v & 0xFFFFu); } +static inline uint16_t vgpup_hi16(uint32_t v) { return (uint16_t)(v >> 16); } + +/* ---- structural-invariant validator (discovery, BY TABLE — no magic) ------ + * + * Discovery has no magic field in the ABI (the owner forbids one). The + * discriminator is the conjunction of structural invariants derived from the + * ABI bounds in vgpu_stream.h, plus the two-phase heartbeat liveness handled by + * the caller. The predicates run cheap→costly with early exit; each takes a + * decoded producer-block snapshot and returns 1 (holds) / 0 (rejects). */ + +typedef int (*vgpup_inv_fn)(const vgpu_producer_t* p); + +/* Is `latest` a valid slot index, or the legitimate "no frame yet" sentinel? + * latest == NONE is NOT a rejection (a freshly-started region has no frame). */ +static inline int vgpup_inv_latest_in_range(const vgpu_producer_t* p) +{ + return p->latest == VGPU_LATEST_NONE || p->latest < VGPU_SLOT_COUNT; +} + +/* If a frame is published, its slot seq must be even (stable, not mid-write). */ +static inline int vgpup_inv_latest_seq_stable(const vgpu_producer_t* p) +{ + if (p->latest == VGPU_LATEST_NONE) { return 1; } + return !vgpup_seq_is_writing(p->seq[p->latest]); +} + +/* If a frame is published, its descriptor must be a tight BGRA frame within the + * ABI dimension bounds. 
*/ +static inline int vgpup_inv_latest_desc_valid(const vgpu_producer_t* p) +{ + const vgpu_desc_t* d; + if (p->latest == VGPU_LATEST_NONE) { return 1; } + d = &p->desc[p->latest]; + if (d->format != VGPU_FMT_BGRA8888) { return 0; } + if (d->width == 0u || d->width > VGPU_MAX_WIDTH) { return 0; } + if (d->height == 0u || d->height > VGPU_MAX_HEIGHT) { return 0; } + if (d->stride != d->width * 4u) { return 0; } + return 1; +} + +/* Cold-line status enum must be in the ABI range. */ +static inline int vgpup_inv_status_in_range(const vgpu_producer_t* p) +{ + return p->status <= VGPU_ST_ERROR; +} + +/* Cold-line backend enum must be in the ABI range. */ +static inline int vgpup_inv_backend_in_range(const vgpu_producer_t* p) +{ + return p->backend <= VGPU_BK_GDI; +} + +/* The producer must advertise the one wire format we consume. */ +static inline int vgpup_inv_supports_bgra(const vgpu_producer_t* p) +{ + return (p->supported_formats & (1u << VGPU_FMT_BGRA8888)) != 0u; +} + +/* The invariant table, cheap→costly. A candidate is accepted (phase 1) iff + * every predicate holds; the table is the single discriminator, no scattered + * ifs and no hardcoded numbers (all bounds come from vgpu_stream.h). */ +static const vgpup_inv_fn VGPUP_INVARIANTS[] = { + vgpup_inv_latest_in_range, + vgpup_inv_status_in_range, + vgpup_inv_backend_in_range, + vgpup_inv_supports_bgra, + vgpup_inv_latest_seq_stable, + vgpup_inv_latest_desc_valid, +}; +#define VGPUP_INVARIANT_COUNT (sizeof(VGPUP_INVARIANTS) / sizeof(VGPUP_INVARIANTS[0])) + +/* Run the whole invariant table over a decoded producer-block snapshot. + * Returns 1 if every predicate holds, 0 on the first rejection. 
*/ +static inline int vgpup_invariants_hold(const vgpu_producer_t* p) +{ + size_t i; + for (i = 0; i < VGPUP_INVARIANT_COUNT; ++i) { + if (!VGPUP_INVARIANTS[i](p)) { return 0; } + } + return 1; +} + +#endif /* VGPU_PERCEPTION_INTERNAL_H */ diff --git a/src/si/vgpu-perception/sample.c b/src/si/vgpu-perception/sample.c new file mode 100644 index 0000000..812d6ca --- /dev/null +++ b/src/si/vgpu-perception/sample.c @@ -0,0 +1,228 @@ +/* sample.c — consumer seqlock reads: frame sampling, cursor, geometry, status. + * + * Every guest read goes through gva_read into a local copy; we never hold a + * gva_ptr across a seqlock window (it is borrowed and not atomic for re-check). + * The discipline is the mirror of the producer's publish order in atomic-shim.h, + * but an independent body — this is consumer code, not shared producer code. + * + * Lossy by contract: when a writer keeps a window busy past VGPUP_SEQLOCK_RETRIES + * we return 0 (skip), never block. Blocking longer would be behavioural timing + * (control's concern), which has no place in the sensor. + * + * All reads go under r->proc_cr3 (the producer's user-AS cr3, cached in the + * handle at discovery), NOT the System kcr3. A <0 from any gva_read means a page + * is gone — the producer process may have restarted; we propagate <0 and the + * caller re-discovers (see vgpu_perception.h "Two epochs + producer restart"). + */ + +#include "perception-internal.h" +#include <stdio.h> /* TEMP debug (revert): stderr skip-reason trace */ + +/* Read one cold-line / packed field at producer offset `off` into dst under the + * producer's user-AS cr3. */ +static int read_field(vmie_mem* m, uintptr_t cr3, uint64_t region_gva, + size_t off, void* dst, size_t n) +{ + return gva_read(m, cr3, (uintptr_t)region_gva + off, dst, n) < 0 ? 
-1 : 0; +} + +int vgpup_sample_frame(vgpup_region* r, vmie_mem* m, + uint8_t* dst, size_t cap, vgpup_frame_info* info) +{ + unsigned attempt; + static unsigned long _dc = 0; /* TEMP debug: 1/240 call gate */ + int _dbg = ((_dc++ % 240u) == 0u); + + if (!r || !m || !dst || !info) { return -1; } + + for (attempt = 0; attempt < VGPUP_SEQLOCK_RETRIES; ++attempt) { + uint32_t latest = 0, seq_before = 0, seq_after = 0; + vgpu_desc_t d; + uint64_t slot_gva, seq_gva, desc_gva; + size_t frame_bytes; + + /* latest (acquire-equivalent: its own read) */ + if (read_field(m, r->proc_cr3, r->region_gva, + offsetof(vgpu_producer_t, latest), &latest, sizeof latest) < 0) { + if (_dbg) fprintf(stderr, "VGPUP_DBG ret=-1 latest-read-fail\n"); + return -1; + } + if (latest == VGPU_LATEST_NONE || latest >= VGPU_SLOT_COUNT) { + if (_dbg) fprintf(stderr, "VGPUP_DBG ret=0 A latest=%u\n", latest); + return 0; + } + + seq_gva = r->region_gva + offsetof(vgpu_producer_t, seq) + (uint64_t)latest * sizeof(uint32_t); + desc_gva = r->region_gva + offsetof(vgpu_producer_t, desc) + (uint64_t)latest * sizeof(vgpu_desc_t); + + if (vgpup_read_seq(m, r->proc_cr3, seq_gva, &seq_before) < 0) { return -1; } + if (vgpup_seq_is_writing(seq_before)) { + if (_dbg) fprintf(stderr, "VGPUP_DBG cont B att=%u latest=%u seqB=%u (writing)\n", attempt, latest, seq_before); + continue; /* writer in slot */ + } + + if (gva_read(m, (uintptr_t)r->proc_cr3, (uintptr_t)desc_gva, &d, sizeof d) < 0) { return -1; } + + /* dedup by frame_id: nothing newer than what we already sampled */ + if (d.frame_id <= r->last_frame_id) { + if (_dbg) fprintf(stderr, "VGPUP_DBG ret=0 C dedup dfid=%llu last=%llu\n", + (unsigned long long)d.frame_id, (unsigned long long)r->last_frame_id); + return 0; + } + + /* descriptor sanity within the read window (tight BGRA, bounded dims) */ + if (d.format != VGPU_FMT_BGRA8888 || d.stride != d.width * 4u || + d.width == 0u || d.width > VGPU_MAX_WIDTH || + d.height == 0u || d.height > VGPU_MAX_HEIGHT) { + 
if (_dbg) fprintf(stderr, "VGPUP_DBG cont D torn att=%u w=%u h=%u s=%u f=%u\n", + attempt, d.width, d.height, d.stride, d.format); + continue; /* likely a torn read; retry */ + } + + frame_bytes = (size_t)d.height * d.stride; + if (frame_bytes > VGPU_SLOT_STRIDE) { return 0; } /* impossible-large → skip */ + if (frame_bytes > cap) { + if (_dbg) fprintf(stderr, "VGPUP_DBG ret=0 F fbytes=%zu cap=%zu\n", frame_bytes, cap); + return 0; /* would not fit → lossy drop */ + } + + slot_gva = r->ring_gva + (uint64_t)latest * VGPU_SLOT_STRIDE; + if (gva_read(m, (uintptr_t)r->proc_cr3, (uintptr_t)slot_gva, dst, frame_bytes) < 0) { + if (_dbg) fprintf(stderr, "VGPUP_DBG ret=-1 G slot-read-fail latest=%u fbytes=%zu\n", latest, frame_bytes); + return -1; + } + + /* re-check the slot seq: unchanged and still even → snapshot consistent */ + if (vgpup_read_seq(m, r->proc_cr3, seq_gva, &seq_after) < 0) { return -1; } + if (seq_after != seq_before || vgpup_seq_is_writing(seq_after)) { + if (_dbg) fprintf(stderr, "VGPUP_DBG cont H att=%u latest=%u seqB=%u seqA=%u\n", + attempt, latest, seq_before, seq_after); + continue; /* the slot was rewritten under us — retry */ + } + + info->desc.width = d.width; + info->desc.height = d.height; + info->desc.stride = d.stride; + info->desc.format = d.format; + info->desc.frame_id = d.frame_id; + info->desc.timestamp_ns = d.timestamp_ns; + info->bytes = frame_bytes; + + r->last_frame_id = d.frame_id; + return 1; + } + if (_dbg) fprintf(stderr, "VGPUP_DBG ret=0 I retry-exhaust (%u attempts all busy)\n", VGPUP_SEQLOCK_RETRIES); + return 0; /* writer kept the slot busy past the retry limit — skip */ +} + +int vgpup_read_cursor(vgpup_region* r, vmie_mem* m, vgpup_cursor* out) +{ + unsigned attempt; + + if (!r || !m || !out) { return -1; } + + /* The producer bumps cursor_seq LAST (acquire), so we read the cursor line + * first and gate on cursor_seq being even and unchanged across the window. 
*/ + for (attempt = 0; attempt < VGPUP_SEQLOCK_RETRIES; ++attempt) { + uint32_t seq_before = 0, seq_after = 0; + uint32_t visible = 0, hotspot = 0, glyph = 0, id = 0; + uint64_t pos = 0; + + if (vgpup_read_seq(m, r->proc_cr3, r->region_gva + offsetof(vgpu_producer_t, cursor_seq), + &seq_before) < 0) { return -1; } + if (vgpup_seq_is_writing(seq_before)) { continue; } + + if (read_field(m, r->proc_cr3, r->region_gva, offsetof(vgpu_producer_t, cursor_visible), &visible, sizeof visible) < 0 || + read_field(m, r->proc_cr3, r->region_gva, offsetof(vgpu_producer_t, cursor_pos), &pos, sizeof pos) < 0 || + read_field(m, r->proc_cr3, r->region_gva, offsetof(vgpu_producer_t, cursor_hotspot), &hotspot, sizeof hotspot) < 0 || + read_field(m, r->proc_cr3, r->region_gva, offsetof(vgpu_producer_t, cursor_glyph), &glyph, sizeof glyph) < 0 || + read_field(m, r->proc_cr3, r->region_gva, offsetof(vgpu_producer_t, cursor_id), &id, sizeof id) < 0) { + return -1; + } + + if (vgpup_read_seq(m, r->proc_cr3, r->region_gva + offsetof(vgpu_producer_t, cursor_seq), + &seq_after) < 0) { return -1; } + if (seq_after != seq_before || vgpup_seq_is_writing(seq_after)) { continue; } + + out->seq = seq_after; + out->visible = visible; + out->x = vgpup_cursor_x(pos); + out->y = vgpup_cursor_y(pos); + out->hot_x = vgpup_lo16(hotspot); + out->hot_y = vgpup_hi16(hotspot); + out->glyph_w = vgpup_lo16(glyph); + out->glyph_h = vgpup_hi16(glyph); + out->id = id; + return 1; + } + return 0; +} + +int vgpup_read_geometry(vgpup_region* r, vmie_mem* m, vgpup_geometry* out) +{ + unsigned attempt; + + if (!r || !m || !out) { return -1; } + + for (attempt = 0; attempt < VGPUP_SEQLOCK_RETRIES; ++attempt) { + uint32_t seq_before = 0, seq_after = 0; + int32_t virt_x = 0, virt_y = 0, cap_x = 0, cap_y = 0; + uint32_t virt_w = 0, virt_h = 0, dpi = 0, refresh_mhz = 0; + + if (vgpup_read_seq(m, r->proc_cr3, r->region_gva + offsetof(vgpu_producer_t, geom_seq), + &seq_before) < 0) { return -1; } + if 
(vgpup_seq_is_writing(seq_before)) { continue; } + + if (read_field(m, r->proc_cr3, r->region_gva, offsetof(vgpu_producer_t, virt_x), &virt_x, sizeof virt_x) < 0 || + read_field(m, r->proc_cr3, r->region_gva, offsetof(vgpu_producer_t, virt_y), &virt_y, sizeof virt_y) < 0 || + read_field(m, r->proc_cr3, r->region_gva, offsetof(vgpu_producer_t, virt_w), &virt_w, sizeof virt_w) < 0 || + read_field(m, r->proc_cr3, r->region_gva, offsetof(vgpu_producer_t, virt_h), &virt_h, sizeof virt_h) < 0 || + read_field(m, r->proc_cr3, r->region_gva, offsetof(vgpu_producer_t, cap_x), &cap_x, sizeof cap_x) < 0 || + read_field(m, r->proc_cr3, r->region_gva, offsetof(vgpu_producer_t, cap_y), &cap_y, sizeof cap_y) < 0 || + read_field(m, r->proc_cr3, r->region_gva, offsetof(vgpu_producer_t, dpi), &dpi, sizeof dpi) < 0 || + read_field(m, r->proc_cr3, r->region_gva, offsetof(vgpu_producer_t, refresh_mhz), &refresh_mhz, sizeof refresh_mhz) < 0) { + return -1; + } + + if (vgpup_read_seq(m, r->proc_cr3, r->region_gva + offsetof(vgpu_producer_t, geom_seq), + &seq_after) < 0) { return -1; } + if (seq_after != seq_before || vgpup_seq_is_writing(seq_after)) { continue; } + + out->virt_x = virt_x; + out->virt_y = virt_y; + out->virt_w = virt_w; + out->virt_h = virt_h; + out->cap_x = cap_x; + out->cap_y = cap_y; + out->dpi = dpi; + out->refresh_mhz = refresh_mhz; + return 1; + } + return 0; +} + +int vgpup_read_status(vgpup_region* r, vmie_mem* m, vgpup_status* out) +{ + vgpu_producer_t p; + + if (!r || !m || !out) { return -1; } + + /* Cold line: single naturally-aligned atomic fields with no seqlock. Read + * the whole producer block once and pick the cold fields — "fresh enough" + * by the lossy contract. 
*/ + if (gva_read(m, (uintptr_t)r->proc_cr3, (uintptr_t)r->region_gva, &p, sizeof p) < 0) { return -1; } + + out->heartbeat = p.heartbeat; + out->run_epoch = p.run_epoch; + out->status = p.status; + out->backend = p.backend; + out->error_code = p.error_code; + out->applied_fps = p.applied_fps; + out->supported_formats = p.supported_formats; + out->ctrl_ack = p.ctrl_ack; + out->full_frame_ack = p.full_frame_ack; + out->content_change_ns = p.content_change_ns; + + r->run_epoch = p.run_epoch; /* feed the session-break detector */ + return 0; +} diff --git a/src/test/test_daemoncfg.c b/src/test/test_daemoncfg.c new file mode 100644 index 0000000..549e41d --- /dev/null +++ b/src/test/test_daemoncfg.c @@ -0,0 +1,121 @@ +/* test_daemoncfg.c — vmsigd config parser + admission policy (WS4). Config parse is pure; + * admission is exercised against a live discovery (fake probe + recording sink) so the + * vmid->endpoint resolution at connect time is verified end-to-end without armed adapters. */ +#define _GNU_SOURCE +#include "vmsig.h" +#include "discovery.h" +#include "vmsigd.h" +#include "vmsigd_admission.h" +#include +#include +#include +#include +#include + +static int g_fail = 0; +#define CHECK(cond, msg) do { if (!(cond)) { printf(" FAIL: %s\n", (msg)); g_fail = 1; } } while (0) + +static void test_config(void) { + printf("test_config\n"); + const char* sample = + "# vmsigd config\n" + "socket = /run/foo.sock\n" + "watch = /dev/shm/vmsig\n" + "pve_conf = /etc/pve/qemu-server\n" + "\n" + "[grant uid=0]\n" + "vmids = *\n" + "caps = observe,input,memctx,roster\n" + "arb_prio = 100\n" + "[grant uid=1000]\n" + "vmids = 101, 102\n" + "caps = observe\n" + "arb_prio = 50\n"; + vmsigd_config c; vmsigd_config_defaults(&c); + CHECK(vmsigd_config_parse_buf(&c, sample) == 0, "parse ok"); + CHECK(strcmp(c.socket, "/run/foo.sock") == 0, "global socket override"); + CHECK(strcmp(c.qmp_dir, "/var/run/qemu-server") == 0, "default qmp_dir retained"); + CHECK(c.ngrants == 2, "two grant 
stanzas"); + + CHECK(c.grants[0].uid == 0 && c.grants[0].all_vms, "grant0 uid=0 vmids=*"); + CHECK(c.grants[0].cap_mask == + (VMSIG_CAP_OBSERVE | VMSIG_CAP_INPUT | VMSIG_CAP_MEMCTX | VMSIG_CAP_ROSTER), + "grant0 caps parsed"); + CHECK(c.grants[0].arb_prio == 100, "grant0 arb_prio"); + + CHECK(c.grants[1].uid == 1000 && !c.grants[1].all_vms && c.grants[1].nvmids == 2 && + c.grants[1].vmids[0] == 101 && c.grants[1].vmids[1] == 102, "grant1 vmid list"); + CHECK(c.grants[1].cap_mask == VMSIG_CAP_OBSERVE, "grant1 caps"); + CHECK(c.grants[1].arb_prio == 50, "grant1 arb_prio"); +} + +/* ---- fake probe + recording sink (attach vmids to slots without armed adapters) ---- */ +typedef struct { int dummy; } fakeprobe; +static int fp_config(const vmsig_host_probe* p, uint32_t vmid, vmsig_host_facts* out) { + (void)p; memset(out, 0, sizeof *out); out->vmid = vmid; out->share_on = 1; out->ok = 1; + snprintf(out->name, sizeof out->name, "win-%u", vmid); + return 0; +} +static int fp_live(const vmsig_host_probe* p, vmsig_host_facts* io) { + (void)p; io->retry = 0; io->ok = 1; io->vm_state = VMSIG_VM_RUNNING; io->low = 0x80000000ull; + return 0; +} +static int rs_attach(void* ud, vmsig_core* core, uint32_t vmid, uint32_t ep, + const vmsig_host_facts* f) { (void)ud;(void)core;(void)vmid;(void)ep;(void)f; return 0; } +static void rs_detach(void* ud, vmsig_core* core, uint32_t vmid, uint32_t ep) { + (void)ud;(void)core;(void)vmid;(void)ep; +} + +static void test_admission(void) { + printf("test_admission\n"); + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + + fakeprobe fpd; + vmsig_host_probe probe = { fp_config, fp_live, &fpd }; + vmsig_discovery_sink sink = { rs_attach, rs_detach, NULL }; + char dir[] = "/tmp/vmsig_adm.XXXXXX"; CHECK(mkdtemp(dir) != NULL, "temp dir"); + vmsig_discovery* disc = vmsig_discovery_new(core, dir, NULL, NULL, NULL, &probe, &sink); + CHECK(disc != NULL, "discovery created"); + + vmsig_discovery_feed(disc, 101, 1); /* -> ep0 */ 
+ vmsig_discovery_feed(disc, 102, 1); /* -> ep1 */ + + vmsigd_config c; vmsigd_config_defaults(&c); + vmsigd_config_parse_buf(&c, + "[grant uid=0]\nvmids=*\ncaps=observe,input,memctx,roster\narb_prio=100\n" + "[grant uid=1000]\nvmids=101,102\ncaps=observe\narb_prio=50\n" + "[grant uid=1001]\nvmids=999\ncaps=observe\narb_prio=10\n"); + vmsigd_admission adm = { &c, disc }; + + /* uid 0: all_vms => full mask */ + vmsig_grant g0 = vmsigd_policy(0, 0, &adm); + CHECK(g0.endpoint_mask == ~0ull, "uid0 (vmids=*) covers all endpoints"); + CHECK(g0.cap_mask == (VMSIG_CAP_OBSERVE | VMSIG_CAP_INPUT | VMSIG_CAP_MEMCTX | VMSIG_CAP_ROSTER), + "uid0 caps"); + CHECK(g0.arb_prio == 100 && g0.principal == 0, "uid0 prio/principal"); + + /* uid 1000: vmids 101,102 attached at ep0,ep1 => bits 0,1 */ + vmsig_grant g1 = vmsigd_policy(1000, 0, &adm); + CHECK(g1.endpoint_mask == ((1ull << 0) | (1ull << 1)), "uid1000 resolved to ep0,ep1 bits"); + CHECK(g1.cap_mask == VMSIG_CAP_OBSERVE && g1.arb_prio == 50, "uid1000 caps/prio"); + + /* uid 1001: vmid 999 not attached => no bits (peer learns via roster / reconnect) */ + vmsig_grant g2 = vmsigd_policy(1001, 0, &adm); + CHECK(g2.endpoint_mask == 0, "uid1001 unbound vmid => no endpoint bit yet"); + + /* unknown uid: empty grant => reject */ + vmsig_grant g3 = vmsigd_policy(4242, 0, &adm); + CHECK(g3.cap_mask == 0 && g3.endpoint_mask == 0, "unknown uid => empty grant (reject)"); + + vmsig_core_free(core); + vmsig_ctx_free(ctx); + rmdir(dir); +} + +int main(void) { + test_config(); + test_admission(); + printf("daemoncfg tests: %s\n", g_fail ? "FAIL" : "PASS"); + return g_fail ? 1 : 0; +} diff --git a/src/test/test_discovery.c b/src/test/test_discovery.c new file mode 100644 index 0000000..4428cb1 --- /dev/null +++ b/src/test/test_discovery.c @@ -0,0 +1,198 @@ +/* test_discovery.c — discovery state machine (WS3), driven deterministically via the TEST + * hooks (no inotify/timer/threads). 
A fake host-probe controls config/live verdicts; a + * recording sink captures attach/detach; a CAP_ROSTER subscriber captures the published + * roster. Covers: appear->attach(slot+roster), duplicate, gone->detach(roster+free), bit + * reuse, config-fail drop, stale drop, and the retry-then-attach path. */ +#define _GNU_SOURCE +#include "vmsig.h" +#include "vmsig_roster.h" +#include "discovery.h" /* pulls host_probe.h */ +#include +#include +#include +#include +#include +#include + +static int g_fail = 0; +#define CHECK(cond, msg) do { if (!(cond)) { printf(" FAIL: %s\n", (msg)); g_fail = 1; } } while (0) + +/* ---- fake host-probe ---- */ +typedef struct { int config_ok; int live_mode; int live_calls; } fakeprobe; +/* live_mode: 0=ok, 1=stale(dead, no retry), 2=retry-once-then-ok */ + +static int fp_config(const vmsig_host_probe* p, uint32_t vmid, vmsig_host_facts* out) { + fakeprobe* f = p->ud; + memset(out, 0, sizeof *out); + out->vmid = vmid; + snprintf(out->name, sizeof out->name, "win-%u", vmid); + snprintf(out->ram_path, sizeof out->ram_path, "/tmp/vm-%u-ram", vmid); + snprintf(out->qmp_path, sizeof out->qmp_path, "/tmp/%u.qmp", vmid); + out->cfg_ram_bytes = 4ull << 30; + out->share_on = f->config_ok; + out->ok = f->config_ok; + return 0; +} +static int fp_live(const vmsig_host_probe* p, vmsig_host_facts* io) { + fakeprobe* f = p->ud; + io->retry = 0; + f->live_calls++; + if (f->live_mode == 1) { io->ok = 0; io->vm_state = VMSIG_VM_SHUTDOWN; return 0; } + if (f->live_mode == 2 && f->live_calls == 1) { io->retry = 1; io->ok = 0; return 0; } + io->ok = 1; io->vm_state = VMSIG_VM_RUNNING; io->low = 0x80000000ull; + return 0; +} + +/* ---- recording sink ---- */ +typedef struct { + int n_attach, n_detach; + uint32_t la_vmid, la_ep, ld_vmid, ld_ep; +} recsink; +static int rs_attach(void* ud, vmsig_core* core, uint32_t vmid, uint32_t ep, + const vmsig_host_facts* f) { + (void)core; (void)f; + recsink* s = ud; s->n_attach++; s->la_vmid = vmid; s->la_ep = ep; + 
return 0; +} +static void rs_detach(void* ud, vmsig_core* core, uint32_t vmid, uint32_t ep) { + (void)core; + recsink* s = ud; s->n_detach++; s->ld_vmid = vmid; s->ld_ep = ep; +} + +/* ---- roster subscriber ---- */ +typedef struct { int attach, detach; uint32_t last_vmid, last_ep, last_action; char last_name[32]; } robs; +static int rob_on_ev(void* u, const vmsig_event* ev) { + robs* r = u; + if (ev->kind != VMSIG_EV_ROSTER) return 0; + const vmsig_roster* e = (const vmsig_roster*)ev->inln; + r->last_vmid = e->vmid; r->last_ep = ev->endpoint; r->last_action = e->action; + snprintf(r->last_name, sizeof r->last_name, "%s", e->name); + if (e->action == VMSIG_ROSTER_ATTACH) r->attach++; + else if (e->action == VMSIG_ROSTER_DETACH) r->detach++; + return 0; +} + +static void test_discovery(void) { + printf("test_discovery\n"); + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + + robs ro; memset(&ro, 0, sizeof ro); + vmsig_inproc_cfg cfg; memset(&cfg, 0, sizeof cfg); + cfg.on_event = rob_on_ev; cfg.user = &ro; + void* ctl = vmsig_inproc_control_new(&cfg); + vmsig_grant g; memset(&g, 0, sizeof g); + g.principal = 1; g.endpoint_mask = ~0ull; g.source_mask = 0xFFFFFFFFu; g.cap_mask = VMSIG_CAP_ROSTER; + vmsig_core_add_control(core, vmsig_inproc_control_ops(), ctl, &g); + + fakeprobe fp; memset(&fp, 0, sizeof fp); fp.config_ok = 1; fp.live_mode = 0; + vmsig_host_probe probe = { fp_config, fp_live, &fp }; + recsink rs; memset(&rs, 0, sizeof rs); + vmsig_discovery_sink sink = { rs_attach, rs_detach, &rs }; + + char dir[] = "/tmp/vmsig_disc.XXXXXX"; + CHECK(mkdtemp(dir) != NULL, "temp watch dir created"); + vmsig_discovery* d = vmsig_discovery_new(core, dir, NULL, NULL, NULL, &probe, &sink); + CHECK(d != NULL, "discovery created"); + + /* 1) appear 101 -> attach ep0 + roster ATTACH */ + vmsig_discovery_feed(d, 101, 1); + CHECK(rs.n_attach == 1 && rs.la_vmid == 101 && rs.la_ep == 0, "101 attached on ep0 (sink)"); + CHECK(ro.attach == 1 && 
ro.last_vmid == 101 && ro.last_ep == 0 && + ro.last_action == VMSIG_ROSTER_ATTACH, "roster ATTACH 101 ep0"); + CHECK(strcmp(ro.last_name, "win-101") == 0, "roster carried the VM name"); + CHECK(vmsig_discovery_slot_of_vmid(d, 101) == 0, "slot_of_vmid(101)==0"); + + /* 2) appear 102 -> ep1 */ + vmsig_discovery_feed(d, 102, 1); + CHECK(rs.n_attach == 2 && rs.la_vmid == 102 && rs.la_ep == 1, "102 attached on ep1"); + + /* duplicate appear 101 -> ignored */ + vmsig_discovery_feed(d, 101, 1); + CHECK(rs.n_attach == 2, "duplicate appear ignored"); + + /* 3) gone 101 -> detach + roster DETACH + slot freed */ + vmsig_discovery_feed(d, 101, 0); + CHECK(rs.n_detach == 1 && rs.ld_vmid == 101 && rs.ld_ep == 0, "101 detached (sink)"); + CHECK(ro.detach == 1 && ro.last_action == VMSIG_ROSTER_DETACH && ro.last_vmid == 101, + "roster DETACH 101"); + CHECK(vmsig_discovery_slot_of_vmid(d, 101) == -1, "slot freed after detach"); + + /* 4) appear 103 -> reuse freed ep0 */ + vmsig_discovery_feed(d, 103, 1); + CHECK(rs.la_ep == 0 && rs.la_vmid == 103, "103 reuses freed ep0 (lowest free)"); + + /* 5) config-fail -> drop */ + fp.config_ok = 0; + int n = rs.n_attach; + vmsig_discovery_feed(d, 999, 1); + CHECK(rs.n_attach == n, "config-fail vmid dropped (no attach)"); + fp.config_ok = 1; + + /* 6) stale (file present, VM dead) -> drop */ + fp.live_mode = 1; + n = rs.n_attach; + vmsig_discovery_feed(d, 105, 1); + CHECK(rs.n_attach == n, "stale VM dropped (no attach)"); + fp.live_mode = 0; + + /* 7) retry-then-ok: first probe retries, tick re-probes and attaches */ + fp.live_mode = 2; fp.live_calls = 0; + n = rs.n_attach; + vmsig_discovery_feed(d, 104, 1); + CHECK(rs.n_attach == n, "retry: not attached on first probe"); + CHECK(vmsig_discovery_slot_of_vmid(d, 104) == -1, "retry: no slot yet"); + vmsig_discovery_tick(d); + CHECK(rs.n_attach == n + 1 && rs.la_vmid == 104, "retry: attached after re-probe"); + + vmsig_core_free(core); + vmsig_ctx_free(ctx); + rmdir(dir); +} + +/* Bootstrap path: 
files already present when discovery starts are picked up by the REAL + * readdir + parse_vmid scan (not the test feed hook); junk names are ignored. */ +static void touch(const char* dir, const char* name) { + char path[512]; + snprintf(path, sizeof path, "%s/%s", dir, name); + int fd = open(path, O_CREAT | O_WRONLY | O_CLOEXEC, 0600); + if (fd >= 0) close(fd); +} +static void rm(const char* dir, const char* name) { + char path[512]; + snprintf(path, sizeof path, "%s/%s", dir, name); + unlink(path); +} + +static void test_bootstrap(void) { + printf("test_bootstrap\n"); + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + + fakeprobe fp; memset(&fp, 0, sizeof fp); fp.config_ok = 1; fp.live_mode = 0; + vmsig_host_probe probe = { fp_config, fp_live, &fp }; + recsink rs; memset(&rs, 0, sizeof rs); + vmsig_discovery_sink sink = { rs_attach, rs_detach, &rs }; + + char dir[] = "/tmp/vmsig_boot.XXXXXX"; + CHECK(mkdtemp(dir) != NULL, "temp dir"); + touch(dir, "vm-200-ram"); /* valid trigger */ + touch(dir, "notavm"); /* ignored */ + touch(dir, "vm-bad-ram"); /* non-numeric => ignored */ + + vmsig_discovery* d = vmsig_discovery_new(core, dir, NULL, NULL, NULL, &probe, &sink); + CHECK(d != NULL, "discovery created"); + CHECK(rs.n_attach == 1 && rs.la_vmid == 200, "bootstrap scan attached ONLY vm-200 (real parse)"); + CHECK(vmsig_discovery_slot_of_vmid(d, 200) == 0, "200 pinned to ep0 via bootstrap"); + + vmsig_core_free(core); + vmsig_ctx_free(ctx); + rm(dir, "vm-200-ram"); rm(dir, "notavm"); rm(dir, "vm-bad-ram"); rmdir(dir); +} + +int main(void) { + test_discovery(); + test_bootstrap(); + printf("discovery tests: %s\n", g_fail ? "FAIL" : "PASS"); + return g_fail ? 
1 : 0; +} diff --git a/src/test/test_dynep.c b/src/test/test_dynep.c new file mode 100644 index 0000000..8cb49fd --- /dev/null +++ b/src/test/test_dynep.c @@ -0,0 +1,100 @@ +/* test_dynep.c — runtime hot-plug of a VM endpoint (WS1): a discovery-style consumer + * attaches an adapter trio, then detaches it and re-attaches it on the SAME endpoint + * while the loop is running. Proves: + * - vmsig_core_add_adapter works AFTER vmsig_core_run started (from a loop-thread cb); + * - vmsig_core_detach_endpoint tears the trio down (deferred reap) and bumps the epoch, + * broadcasting MEMCTX_INVALIDATED so a holder settles; + * - re-attaching the same endpoint publishes MEMCTX at the strictly-higher epoch. + * All driven from the holder callbacks, which run on the loop thread (single-threaded + * with the pumps), so attach/detach are issued mid-loop exactly as discovery will. */ +#define _GNU_SOURCE +#include "vmsig.h" +#include +#include +#include + +static int g_fail = 0; +#define CHECK(cond, msg) do { \ + if (!(cond)) { printf(" FAIL: %s\n", (msg)); g_fail = 1; } \ +} while (0) + +typedef struct { + vmsig_core* core; + uint32_t ep; + int memctx; /* MEMCTX received */ + int invalidated; /* MEMCTX_INVALIDATED received */ + uint32_t last_epoch; /* epoch of the last MEMCTX */ + int phase; /* 0: pre-detach, 1: detached, 2: reattached */ + int ticks; /* vmhost watchdog ticks (failsafe) */ +} dyn; + +/* Re-attach the trio (vmhost watchdog + memctx) on the same endpoint, mid-loop, from the + * INVALIDATED delivery — exactly the discovery "file reappeared" path. 
*/ +static void reattach_trio(dyn* d) { + vmsig_core_add_adapter(d->core, vmsig_vmhost_ops(), NULL, d->ep); + vmsig_core_add_adapter(d->core, vmsig_memctx_ops(), NULL, d->ep); +} + +static int dyn_on_ev(void* u, const vmsig_event* ev) { + dyn* d = u; + if (ev->kind == VMSIG_EV_VM_LIFECYCLE) d->ticks++; + else if (ev->kind == VMSIG_EV_MEMCTX_INVALIDATED) { + d->invalidated++; + if (d->phase == 1) { d->phase = 2; reattach_trio(d); } + } + if (d->ticks > 60) vmsig_core_stop(d->core); /* failsafe */ + return 0; +} + +static int dyn_on_memctx(void* u, const vmsig_event* ev, int fd) { + dyn* d = u; + const vmsig_memctx* m = (const vmsig_memctx*)ev->inln; + (void)fd; /* core closes the borrowed RO-fd after this call */ + d->memctx++; + d->last_epoch = m->epoch; + if (d->phase == 0 && m->epoch == 0) { + d->phase = 1; + vmsig_core_detach_endpoint(d->core, d->ep); /* deferred reap -> bump -> INVALIDATED */ + } else if (d->phase == 2 && m->epoch >= 1) { + vmsig_core_stop(d->core); /* re-attached context observed: done */ + } + return 0; +} + +static void test_dynep(void) { + printf("test_dynep\n"); + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + + dyn d; memset(&d, 0, sizeof d); + d.core = core; d.ep = 0; + + vmsig_inproc_cfg cfg; memset(&cfg, 0, sizeof cfg); + cfg.on_event = dyn_on_ev; cfg.on_memctx = dyn_on_memctx; cfg.user = &d; + void* ctl = vmsig_inproc_control_new(&cfg); + + vmsig_grant g; memset(&g, 0, sizeof g); + g.principal = 1; g.endpoint_mask = 1ull << 0; g.source_mask = 0xFFFFFFFFu; + g.cap_mask = VMSIG_CAP_MEMCTX | VMSIG_CAP_OBSERVE; + vmsig_core_add_control(core, vmsig_inproc_control_ops(), ctl, &g); + + /* initial trio on ep0, pre-run (vmhost watchdog ticks the loop + memctx publishes). 
*/ + CHECK(vmsig_core_add_adapter(core, vmsig_vmhost_ops(), NULL, 0) >= 0, "add vmhost ep0"); + CHECK(vmsig_core_add_adapter(core, vmsig_memctx_ops(), NULL, 0) >= 0, "add memctx ep0"); + + vmsig_core_run(core); + + CHECK(d.memctx >= 2, "MEMCTX received before AND after re-attach"); + CHECK(d.invalidated >= 1, "MEMCTX_INVALIDATED delivered on detach"); + CHECK(d.last_epoch >= 1, "epoch advanced across detach/re-attach"); + CHECK(d.phase == 2, "reached the re-attached phase"); + + vmsig_core_free(core); + vmsig_ctx_free(ctx); +} + +int main(void) { + test_dynep(); + printf("dynep tests: %s\n", g_fail ? "FAIL" : "PASS"); + return g_fail ? 1 : 0; +} diff --git a/src/test/test_memctx.c b/src/test/test_memctx.c index 9780706..116f992 100644 --- a/src/test/test_memctx.c +++ b/src/test/test_memctx.c @@ -16,6 +16,7 @@ #include "vmsig.h" #include "vmsig_socket.h" /* vmsig_wire, vmsig_socket_attach */ #include "core_internal.h" /* core_emit_up (synthetic lifecycle injection) */ +#include "memctx.h" /* vmsig_memctx_cfg (infra ro_fd ownership test) */ #include #include #include @@ -328,12 +329,77 @@ static void test_socket(void) { vmsig_ctx_free(ctx); } +/* ---- 6. ro_fd ownership: an infra-supplied RO-fd is closed by the adapter --- * + * Regression for the latent leak: cfg.ro_fd ownership transfers to the adapter at + * open(); mc_close() must close it, so a re-grant (detach + re-attach with a fresh + * infra ro_fd) does not leak the prior one. Only DUPS leave outward (one per share), + * so the original stays open across the run and is reaped at adapter close. 
*/ +#ifndef MFD_CLOEXEC +#include +#include +static int memfd_create(const char* name, unsigned int flags) { + return (int)syscall(SYS_memfd_create, name, flags); +} +#endif +#ifndef MFD_ALLOW_SEALING +#define MFD_ALLOW_SEALING 0x0002U +#endif +#ifndef F_ADD_SEALS +#define F_ADD_SEALS (1024 + 9) +#define F_SEAL_SHRINK 0x0002 +#define F_SEAL_GROW 0x0004 +#endif +#ifndef F_SEAL_FUTURE_WRITE +#define F_SEAL_FUTURE_WRITE 0x0010 +#endif + +static int make_ro_backing(uint32_t size) { + int fd = memfd_create("vmsig_test_ro", MFD_CLOEXEC | MFD_ALLOW_SEALING); + if (fd < 0) fd = memfd_create("vmsig_test_ro", MFD_CLOEXEC); + if (fd < 0) return -1; + if (ftruncate(fd, (off_t)size) != 0) { close(fd); return -1; } + (void)fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_FUTURE_WRITE); + return fd; +} + +static void test_ro_fd_ownership(void) { + printf("test_ro_fd_ownership\n"); + int ro = make_ro_backing(0x10000u); /* >= the stub low so the holder can mmap */ + CHECK(ro >= 0, "created an RO backing fd"); + if (ro < 0) return; + + vmsig_ctx* ctx = vmsig_ctx_new(); + vmsig_core* core = vmsig_core_new(ctx); + + holder h; memset(&h, 0, sizeof h); + h.core = core; h.is_driver = 1; h.expect_ep = 0; h.stop_epoch = -1; + add_holder(core, &h, VMSIG_CAP_MEMCTX, 0xFFFFFFFFu, 1ull << 0); + + /* stub kcr3 (no VM) but a REAL infra ro_fd handed in for the RO share path. 
*/ + vmsig_memctx_cfg mc; memset(&mc, 0, sizeof mc); + mc.stub = 1; mc.ram_path = NULL; mc.low = 0; mc.ro_fd = ro; + CHECK(vmsig_core_add_adapter(core, vmsig_memctx_ops(), &mc, 0) >= 0, "add memctx (infra ro_fd)"); + + vmsig_core_run(core); + + CHECK(h.memctx >= 1, "holder received MEMCTX over the infra ro_fd"); + CHECK(h.ro_ok, "infra ro_fd re-shared and mmaps PROT_READ"); + CHECK(fcntl(ro, F_GETFD) >= 0, "infra ro_fd still open before close (no premature close)"); + + vmsig_core_free(core); /* mc_close closes the owned cfg_ro_fd */ + vmsig_ctx_free(ctx); + + CHECK(fcntl(ro, F_GETFD) == -1, "infra ro_fd closed by mc_close after free (no leak)"); + if (fcntl(ro, F_GETFD) >= 0) close(ro); /* belt-and-braces if the assert failed */ +} + int main(void) { test_multicast(); test_epoch(); test_retain(); test_multivm(); test_socket(); + test_ro_fd_ownership(); printf("memctx tests: %s\n", g_fail ? "FAIL" : "PASS"); return g_fail ? 1 : 0; } diff --git a/src/test/test_perception.c b/src/test/test_perception.c new file mode 100644 index 0000000..8da8653 --- /dev/null +++ b/src/test/test_perception.c @@ -0,0 +1,279 @@ +/* test_perception.c — table-driven invariant predicates + per-cr3 user-AS scan. + * + * Two layers (no proc_list / win32 — that path needs a real Windows kernel + * bring-up and is covered by an out-of-tree integration run, not this unit): + * 1) Invariant predicates as a TABLE of cases over a synthesized producer + * block (pure, no vmie): valid / latest==NONE / torn odd seq / non-BGRA / + * stride!=width*4 / dims out of range — each asserts accept-vs-reject. 
+ * 2) Per-cr3 user-AS scan + sampling under a SYNTHETIC cr3: lay out a real + * region per vgpu_stream.h in a memfd, build a minimal x86-64 identity page + * table (2 MiB large pages) that maps the region at a USER VA (the region + * really lives in a producer's user-AS), open it RO via vmie_mem_from_ro_fd, + * and run vgpup_scan_user_as_for_region + a two-phase heartbeat liveness + * check, then construct a handle (proc_cr3 = synth cr3) and run the real + * frame/cursor/geometry/status reads and the control-write seam under it. + * (cr3 0 over a flat image cannot translate — gva_* needs real page tables — + * so we synthesize them; this exercises the actual translation path the + * caller will use.) The win32 proc_list wrapper is deliberately NOT exercised + * here: vgpup_scan_user_as_for_region is the pure per-cr3 core it calls. + * + * Exit 0 on all-pass; nonzero on the first failure. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include + +#include "perception-internal.h" + +static int g_fail; + +#define CHECK(cond, msg) do { \ + if (!(cond)) { fprintf(stderr, "FAIL: %s (%s:%d)\n", (msg), __FILE__, __LINE__); ++g_fail; } \ +} while (0) + +/* ---- layer 1: invariant predicate table ---------------------------------- */ + +/* Build a baseline VALID producer block (one published BGRA frame in slot 0). 
*/ +static void make_valid_producer(vgpu_producer_t* p) +{ + memset(p, 0, sizeof *p); + p->latest = 0; + p->frame_id = 1; + p->seq[0] = 2; /* even = stable */ + p->desc[0].width = 1920; + p->desc[0].height = 1080; + p->desc[0].stride = 1920 * 4; + p->desc[0].format = VGPU_FMT_BGRA8888; + p->desc[0].frame_id = 1; + p->status = VGPU_ST_CAPTURING; + p->backend = VGPU_BK_DDA; + p->supported_formats = (1u << VGPU_FMT_BGRA8888); + p->heartbeat = 42; +} + +typedef struct { + const char* name; + void (*mutate)(vgpu_producer_t*); + int expect; /* expected vgpup_invariants_hold result */ +} inv_case; + +static void mut_none(vgpu_producer_t* p) { (void)p; } +static void mut_latest_none(vgpu_producer_t* p) { p->latest = VGPU_LATEST_NONE; } +static void mut_latest_oob(vgpu_producer_t* p) { p->latest = VGPU_SLOT_COUNT; } +static void mut_seq_odd(vgpu_producer_t* p) { p->seq[0] = 3; } +static void mut_not_bgra(vgpu_producer_t* p) { p->desc[0].format = 7; } +static void mut_bad_stride(vgpu_producer_t* p) { p->desc[0].stride = 1920 * 4 + 1; } +static void mut_width_zero(vgpu_producer_t* p) { p->desc[0].width = 0; } +static void mut_width_huge(vgpu_producer_t* p) { p->desc[0].width = VGPU_MAX_WIDTH + 1; } +static void mut_height_huge(vgpu_producer_t* p) { p->desc[0].height = VGPU_MAX_HEIGHT + 1; } +static void mut_status_oob(vgpu_producer_t* p) { p->status = VGPU_ST_ERROR + 1; } +static void mut_backend_oob(vgpu_producer_t* p) { p->backend = VGPU_BK_GDI + 1; } +static void mut_no_bgra_support(vgpu_producer_t* p) { p->supported_formats = 0; } + +static const inv_case INV_CASES[] = { + { "valid", mut_none, 1 }, + { "latest==NONE", mut_latest_none, 1 }, /* no frame yet, still valid */ + { "latest out of range", mut_latest_oob, 0 }, + { "torn odd seq", mut_seq_odd, 0 }, + { "non-BGRA format", mut_not_bgra, 0 }, + { "stride != width*4", mut_bad_stride, 0 }, + { "width == 0", mut_width_zero, 0 }, + { "width too large", mut_width_huge, 0 }, + { "height too large", mut_height_huge, 0 }, + { 
"status out of range", mut_status_oob, 0 }, + { "backend out of range", mut_backend_oob, 0 }, + { "BGRA not supported", mut_no_bgra_support, 0 }, +}; + +static void run_invariant_table(void) +{ + size_t i; + for (i = 0; i < sizeof(INV_CASES) / sizeof(INV_CASES[0]); ++i) { + vgpu_producer_t p; + int got; + make_valid_producer(&p); + INV_CASES[i].mutate(&p); + got = vgpup_invariants_hold(&p); + CHECK(got == INV_CASES[i].expect, INV_CASES[i].name); + } +} + +/* ---- layer 2: per-cr3 user-AS scan + sampling over a real RO vmie_mem ------ */ + +/* x86-64 paging entry flags for the synthetic identity table. */ +#define PTE_P 0x1u /* present */ +#define PTE_RW 0x2u /* writable */ +#define PTE_US 0x4u /* user-accessible (the region is in a user-AS) */ +#define PTE_PS 0x80u /* page size (2 MiB leaf at PD level) */ +#define LARGE_PAGE (2ull * 1024 * 1024) + +/* Build a minimal identity page table mapping [0, span) of the image at user VA + * `base` using 2 MiB large pages, with the PML4/PDPT/PD pages laid out right + * after the region in the same image. Every level carries US so the run reports + * VR_W|VR_U (a real user-AS mapping). Returns the cr3 (PML4 GPA). The mapped VA + * range fits one PD (covers up to 1 GiB), which is plenty for the region. 
*/
+static uint64_t build_identity_table(uint8_t* img, uint64_t region_bytes,
+                                     uint64_t base, uint64_t span)
+{
+    const uint64_t pml4_gpa = region_bytes; /* one page each, after region */
+    const uint64_t pdpt_gpa = region_bytes + 0x1000;
+    const uint64_t pd_gpa = region_bytes + 0x2000;
+    uint64_t* pml4 = (uint64_t*)(img + pml4_gpa);
+    uint64_t* pdpt = (uint64_t*)(img + pdpt_gpa);
+    uint64_t* pd = (uint64_t*)(img + pd_gpa);
+    const unsigned pml4i = (unsigned)((base >> 39) & 0x1ffu); /* VA bits 47:39 */
+    const unsigned pdpti = (unsigned)((base >> 30) & 0x1ffu); /* VA bits 38:30 */
+    const unsigned pdi0 = (unsigned)((base >> 21) & 0x1ffu);  /* VA bits 29:21: first PD slot */
+    uint64_t mapped = 0;
+    unsigned k = 0;
+
+    pml4[pml4i] = pdpt_gpa | PTE_P | PTE_RW | PTE_US;
+    pdpt[pdpti] = pd_gpa | PTE_P | PTE_RW | PTE_US;
+    while (mapped < span && pdi0 + k < 512u) { /* stop at the PD's last slot: never write past the image */
+        pd[pdi0 + k] = mapped | PTE_P | PTE_RW | PTE_US | PTE_PS; /* VA base+k*2M → GPA mapped */
+        mapped += LARGE_PAGE;
+        ++k;
+    }
+    return pml4_gpa;
+}
+
+/* Layer-2 smoke: synthetic memfd guest image -> user-AS scan, liveness, read API, control seam. */
+static void run_flat_smoke(void)
+{
+    const uint64_t region_bytes = VGPU_REGION_BYTES;
+    /* region rounded up to a 2 MiB boundary for the large-page identity map */
+    const uint64_t mapped_span = (region_bytes + LARGE_PAGE - 1) & ~(LARGE_PAGE - 1);
+    const size_t total_bytes = (size_t)region_bytes + 0x3000; /* + PML4/PDPT/PD */
+    /* a USER VA, 2 MiB-aligned, within [USER_MIN, USER_MAX] — the region lives in
+     * a producer's user address space, so we map it there (not at a kernel VA). */
+    const uint64_t base_va = 0x0000000010000000ull;
+    const uint32_t w = 64, h = 32;
+    const size_t frame_bytes = (size_t)w * h * 4u;
+    const uint8_t marker = 0xA5; /* recognizable fill byte for the slot-0 frame */
+    uint8_t* img;
+    uint64_t cr3;
+    vmie_mem* m;
+    vgpu_producer_t p;
+
+    int fd = memfd_create("vgpu-region", 0);
+    CHECK(fd >= 0, "memfd_create");
+    if (fd < 0) { return; }
+    if (ftruncate(fd, (off_t)total_bytes) != 0) { CHECK(0, "ftruncate"); close(fd); return; }
+
+    img = mmap(NULL, total_bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    CHECK(img != MAP_FAILED, "mmap");
+    if (img == MAP_FAILED) { close(fd); return; }
+
+    /* lay out a valid producer block with one BGRA frame in slot 0 (at GPA 0) */
+    make_valid_producer(&p);
+    p.desc[0].width = w;
+    p.desc[0].height = h;
+    p.desc[0].stride = w * 4u;
+    memcpy(img + VGPU_PRODUCER_OFFSET, &p, sizeof p);
+
+    /* fill the slot-0 frame bytes in the RING with a recognizable marker */
+    memset(img + VGPU_RING_OFFSET + 0 * VGPU_SLOT_STRIDE, marker, frame_bytes);
+
+    /* synthesize an identity table mapping the region at base_va, then open RO */
+    cr3 = build_identity_table(img, region_bytes, base_va, mapped_span);
+    m = vmie_mem_from_ro_fd(fd, total_bytes);
+    CHECK(m != NULL, "vmie_mem_from_ro_fd");
+    if (!m) { munmap(img, total_bytes); close(fd); return; }
+
+    /* per-cr3 user-AS scan: candidate found at the user VA with hb0 == 42 */
+    {
+        uint64_t rgva = 0xdead, hb0 = 0, hb = 43; /* hb: the advanced heartbeat written below */
+        int rc = vgpup_scan_user_as_for_region(m, cr3, &rgva, &hb0);
+        CHECK(rc == 0, "scan_user_as rc");
+        CHECK(rgva == base_va, "scan_user_as region gva");
+        CHECK(hb0 == 42, "scan_user_as hb0");
+
+        /* two-phase liveness: not alive until heartbeat advances (tick lands in the copied block) */
+        CHECK(vgpup_confirm_alive(m, cr3, rgva, hb0) == 0, "confirm not-yet-alive");
+        memcpy(img + VGPU_PRODUCER_OFFSET + offsetof(vgpu_producer_t, heartbeat), &hb, sizeof hb);
+        CHECK(vgpup_confirm_alive(m, cr3, rgva, hb0) == 1, "confirm alive after tick");
+    }
+
+    /* construct a handle directly (the proc_list/win32 path is not unit-testable;
+     * proc_cr3 is the synthetic cr3 here) and exercise the read API + control seam */
+    {
+        vgpup_region rr;
+        vgpup_region* r = &rr;
+        uint8_t* dst = malloc(frame_bytes);
+        vgpup_frame_info fi;
+        vgpup_cursor cur;
+        vgpup_geometry geo;
+        vgpup_status st;
+
+        memset(&rr, 0, sizeof rr);
+        rr.proc_cr3 = cr3;
+        rr.region_gva = base_va;
+        rr.ctrl_gva = base_va + VGPU_CONTROL_OFFSET;
+        rr.ring_gva = base_va + VGPU_RING_OFFSET;
+
+        CHECK(dst != NULL, "malloc dst");
+        if (dst == NULL) { vmie_mem_close(m); munmap(img, total_bytes); close(fd); return; }
+
+        int rc = vgpup_sample_frame(r, m, dst, frame_bytes, &fi);
+        CHECK(rc == 1, "sample_frame fresh");
+        if (rc == 1) {
+            CHECK(fi.desc.width == w && fi.desc.height == h, "sample dims");
+            CHECK(fi.bytes == frame_bytes, "sample bytes");
+            CHECK(dst[0] == marker && dst[frame_bytes - 1] == marker, "sample content");
+        }
+
+        /* same frame_id → no fresh frame (dedup) */
+        CHECK(vgpup_sample_frame(r, m, dst, frame_bytes, &fi) == 0, "sample dedup");
+
+        /* too-small buffer → lossy drop (0), not error */
+        CHECK(vgpup_sample_frame(r, m, dst, 1, &fi) == 0, "sample tiny-cap");
+
+        CHECK(vgpup_read_cursor(r, m, &cur) == 1, "read_cursor");
+        CHECK(vgpup_read_geometry(r, m, &geo) == 1, "read_geometry");
+        CHECK(vgpup_read_status(r, m, &st) == 0, "read_status");
+        CHECK(st.status == VGPU_ST_CAPTURING, "status value");
+        CHECK(st.heartbeat == 43, "status heartbeat");
+        CHECK(vgpup_run_epoch(r) == st.run_epoch, "run_epoch accessor");
+
+        /* control-write seam: builds frame + offsets, writes nothing */
+        {
+            vgpup_control_intent in = { VGPU_CMD_RUN, 60, 1, 7 };
+            vgpu_control_t frame;
+            uint64_t ctrl_gva = 0;
+            uint32_t off = 0, len = 0;
+            memset(&frame, 0, sizeof frame); /* so "ctrl_gen untouched" compares against a known 0 */
+            int crc = vgpup_build_control_write(r, &in, &frame, &ctrl_gva, &off, &len);
+            CHECK(crc == 0, "build_control_write rc");
+            CHECK(frame.desired_state == VGPU_CMD_RUN, "control desired_state");
+            CHECK(frame.target_fps == 60, "control target_fps");
+            CHECK(frame.full_frame_req == 7, "control full_frame_req");
+            CHECK(frame.ctrl_gen == 0, "control ctrl_gen untouched");
+            CHECK(ctrl_gva == base_va + VGPU_CONTROL_OFFSET, "control gva");
+            CHECK(off == offsetof(vgpu_control_t, desired_state), "control off");
+            CHECK(len == offsetof(vgpu_control_t, full_frame_req) + sizeof(uint32_t)
+                         - offsetof(vgpu_control_t, desired_state), "control len");
+        }
+
+        free(dst);
+    }
+
+    vmie_mem_close(m); /* the TEST owns vmie_mem here (it is the caller) */
+    munmap(img, total_bytes);
+    close(fd);
+}
+
+int main(void) /* exit status 1 when any CHECK failed, 0 otherwise */
+{
+    run_invariant_table();
+    run_flat_smoke();
+    if (g_fail) {
+        fprintf(stderr, "%d check(s) failed\n", g_fail);
+        return 1;
+    }
+    printf("all checks passed\n");
+    return 0;
+}
diff --git a/src/test/test_roster.c b/src/test/test_roster.c
new file mode 100644
index 0000000..6a7e3f1
--- /dev/null
+++ b/src/test/test_roster.c
@@ -0,0 +1,96 @@
+/* test_roster.c — VM roster inventory coherence (WS2): VMSIG_EV_ROSTER publish, CAP_ROSTER
+ * gating, endpoint_mask scoping, retained-replay to a late subscriber, and DETACH clearing
+ * the retained datum. Publish/replay are synchronous (no fd), so the loop is not run: the
+ * inproc deliver fires the subscriber callback inline.
*/
+#define _GNU_SOURCE
+#include "vmsig.h"
+#include "core_internal.h" /* core_roster_publish */
+#include <stdint.h> /* uint32_t, uint64_t */
+#include <stdio.h>  /* printf, snprintf */
+#include <string.h> /* memcpy, memset, strcmp */
+
+static int g_fail = 0; /* set to 1 by any failed CHECK; becomes the process exit status */
+#define CHECK(cond, msg) do { \
+    if (!(cond)) { printf(" FAIL: %s\n", (msg)); g_fail = 1; } \
+} while (0)
+
+typedef struct { /* what one subscriber observed: delivery count + fields of the last roster event */
+    int count;
+    uint32_t ep, vmid, state, action;
+    char name[VMSIG_ROSTER_NAME_MAX];
+} robs;
+
+static int rob_on_ev(void* u, const vmsig_event* ev) { /* on_event hook: record roster events into the robs at `u` */
+    robs* r = u;
+    if (ev->kind != VMSIG_EV_ROSTER) return 0; /* ignore every other event kind */
+    const vmsig_roster* e = (const vmsig_roster*)ev->inln;
+    r->count++;
+    r->ep = ev->endpoint; r->vmid = e->vmid; r->state = e->state; r->action = e->action;
+    memcpy(r->name, e->name, sizeof r->name); r->name[sizeof r->name - 1] = '\0'; /* strcmp-safe */
+    return 0;
+}
+
+static int add_robs(vmsig_core* core, robs* r, uint32_t cap, uint64_t epmask) { /* returns vmsig_core_add_control's result */
+    vmsig_inproc_cfg cfg; memset(&cfg, 0, sizeof cfg);
+    cfg.on_event = rob_on_ev; cfg.user = r;
+    void* ctl = vmsig_inproc_control_new(&cfg); /* NOTE(review): not checked for NULL — confirm the API contract */
+    vmsig_grant g; memset(&g, 0, sizeof g);
+    g.principal = 9; g.endpoint_mask = epmask; g.source_mask = 0xFFFFFFFFu; g.cap_mask = cap;
+    return vmsig_core_add_control(core, vmsig_inproc_control_ops(), ctl, &g);
+}
+
+static void publish(vmsig_core* core, uint32_t ep, uint32_t vmid, uint32_t state,
+                    uint32_t action, const char* name) { /* build a zeroed roster entry and publish it on `ep` */
+    vmsig_roster e; memset(&e, 0, sizeof e);
+    e.vmid = vmid; e.state = state; e.action = action;
+    snprintf(e.name, sizeof e.name, "%s", name); /* truncates and always NUL-terminates */
+    core_roster_publish(core, ep, &e);
+}
+
+static void test_roster(void) {
+    printf("test_roster\n");
+    vmsig_ctx* ctx = vmsig_ctx_new();
+    vmsig_core* core = vmsig_core_new(ctx);
+
+    robs a, b, cc; memset(&a,0,sizeof a); memset(&b,0,sizeof b); memset(&cc,0,sizeof cc);
+    add_robs(core, &a, VMSIG_CAP_ROSTER, ~0ull); /* all endpoints, can see roster */
+    add_robs(core, &b, VMSIG_CAP_OBSERVE, ~0ull); /* no CAP_ROSTER -> denied */
+    add_robs(core, &cc, VMSIG_CAP_ROSTER, 1ull << 0); /* scoped to ep0 only */
+
+    /* ATTACH ep0 */
+    publish(core, 0, 1001, VMSIG_VM_RUNNING, VMSIG_ROSTER_ATTACH, "win-1001");
+    CHECK(a.count == 1 && a.ep == 0 && a.vmid == 1001 && a.action == VMSIG_ROSTER_ATTACH,
+          "A (CAP_ROSTER) received ATTACH ep0");
+    CHECK(strcmp(a.name, "win-1001") == 0, "A: name carried inline");
+    CHECK(b.count == 0, "B without CAP_ROSTER does NOT receive roster");
+    CHECK(cc.count == 1, "C scoped to ep0 received ep0 ATTACH");
+
+    /* ATTACH ep1 */
+    publish(core, 1, 1002, VMSIG_VM_RUNNING, VMSIG_ROSTER_ATTACH, "win-1002");
+    CHECK(a.count == 2 && a.ep == 1 && a.vmid == 1002, "A received ATTACH ep1");
+    CHECK(cc.count == 1, "C scoped to ep0 does NOT receive ep1 (endpoint_mask filter)");
+
+    /* late subscriber D: replay of the retained roster (ep0 + ep1) on add_control */
+    robs d; memset(&d, 0, sizeof d);
+    add_robs(core, &d, VMSIG_CAP_ROSTER, ~0ull);
+    CHECK(d.count == 2, "late subscriber D replayed BOTH retained roster entries");
+
+    /* DETACH ep0: current subscribers see it; the retained datum is cleared */
+    publish(core, 0, 1001, VMSIG_VM_SHUTDOWN, VMSIG_ROSTER_DETACH, "win-1001");
+    CHECK(a.count == 3 && a.ep == 0 && a.action == VMSIG_ROSTER_DETACH, "A received DETACH ep0");
+
+    /* late subscriber E after DETACH: replay yields ONLY ep1 (ep0 cleared) */
+    robs e; memset(&e, 0, sizeof e);
+    add_robs(core, &e, VMSIG_CAP_ROSTER, ~0ull);
+    CHECK(e.count == 1 && e.ep == 1 && e.vmid == 1002,
+          "late subscriber E replayed only the live ep1 (detached ep0 not retained)");
+
+    vmsig_core_free(core);
+    vmsig_ctx_free(ctx);
+}
+
+int main(void) {
+    test_roster();
+    printf("roster tests: %s\n", g_fail ? "FAIL" : "PASS");
+    return g_fail ? 1 : 0;
+}
diff --git a/src/test/test_slot.c b/src/test/test_slot.c
new file mode 100644
index 0000000..6f15f48
--- /dev/null
+++ b/src/test/test_slot.c
@@ -0,0 +1,79 @@
+/* test_slot.c — vmid<->endpoint allocator (WS3): pin/idempotence, lowest-free-bit, free +
+ * reuse, full-table, and persistence round-trip.
*/
+#define _GNU_SOURCE
+#include "slot.h"
+#include <stdint.h> /* uint32_t */
+#include <stdio.h>  /* printf */
+#include <stdlib.h> /* mkstemp */
+#include <unistd.h> /* close, unlink */
+
+static int g_fail = 0; /* set to 1 by any failed CHECK; becomes the process exit status */
+#define CHECK(cond, msg) do { if (!(cond)) { printf(" FAIL: %s\n", (msg)); g_fail = 1; } } while (0)
+
+static void test_alloc(void) { /* pin/idempotence, lowest-free-bit order, free + reuse, vmid 0 rejection */
+    printf("test_slot_alloc\n");
+    slot_table t; slot_init(&t);
+
+    CHECK(slot_lookup(&t, 1001) == -1, "unbound vmid => -1");
+    int a = slot_alloc(&t, 1001);
+    CHECK(a == 0, "first alloc => lowest bit 0");
+    CHECK(slot_alloc(&t, 1001) == 0, "alloc is idempotent (pin same slot)");
+    CHECK(slot_lookup(&t, 1001) == 0, "lookup returns the pinned slot");
+
+    int b = slot_alloc(&t, 1002);
+    CHECK(b == 1, "second vmid => next free bit 1");
+
+    slot_free(&t, 1001);
+    CHECK(slot_lookup(&t, 1001) == -1, "freed vmid => -1");
+    int c = slot_alloc(&t, 1003);
+    CHECK(c == 0, "freed bit 0 reused by a new vmid (lowest free)");
+    CHECK(slot_alloc(&t, 1002) == 1, "the other binding survived the free/reuse");
+
+    CHECK(slot_alloc(&t, 0) == -1, "vmid 0 rejected");
+}
+
+static void test_full(void) { /* fill the table, hit the ceiling, then free one bit and reuse it */
+    printf("test_slot_full\n");
+    slot_table t; slot_init(&t);
+    for (uint32_t i = 0; i < VMSIG_SLOT_COUNT; i++)
+        CHECK(slot_alloc(&t, 1000 + i) == (int)i, "fill all 64 slots in order");
+    CHECK(slot_alloc(&t, 9999) == -1, "65th vmid => -1 (ceiling)");
+    CHECK(slot_alloc(&t, 1000) == 0, "an already-bound vmid still resolves when full");
+    slot_free(&t, 1030); /* vmid 1000+30 was bound to bit 30 above; assumes VMSIG_SLOT_COUNT > 30 */
+    CHECK(slot_alloc(&t, 9999) == 30, "after a free, the freed bit is available");
+}
+
+static void test_persist(void) { /* save/load round-trip through a unique temp file, then the missing-file case */
+    printf("test_slot_persist\n");
+    char path[] = "/tmp/vmsig_slot_test.XXXXXX";
+    int fd = mkstemp(path);
+    if (fd < 0) { CHECK(0, "mkstemp"); return; } /* no temp file was created: nothing to save to */
+    close(fd);
+
+    slot_table t; slot_init(&t);
+    slot_alloc(&t, 1001); /* bit 0 */
+    slot_alloc(&t, 700); /* bit 1 */
+    slot_free(&t, 1001);
+    slot_alloc(&t, 900); /* reuses bit 0 */
+    CHECK(slot_save(&t, path) == 0, "save ok");
+
+    slot_table u; slot_init(&u);
+    CHECK(slot_load(&u, path) == 0, "load ok");
+    CHECK(slot_lookup(&u, 900) == 0, "persisted: 900 on bit 0");
+    CHECK(slot_lookup(&u, 700) == 1, "persisted: 700 on bit 1");
+    CHECK(slot_lookup(&u, 1001) == -1, "persisted: freed 1001 absent");
+    CHECK(slot_alloc(&u, 111) == 2, "loaded table: next free bit is 2"); /* must skip occupied bits */
+
+    CHECK(unlink(path) == 0, "unlink temp file"); /* the unique path now doubles as the missing file */
+    slot_table v; slot_init(&v);
+    CHECK(slot_load(&v, path) == 0, "missing file => fresh start (0)");
+    CHECK(slot_alloc(&v, 1) == 0, "fresh table allocates bit 0");
+}
+
+int main(void) {
+    test_alloc();
+    test_full();
+    test_persist();
+    printf("slot tests: %s\n", g_fail ? "FAIL" : "PASS");
+    return g_fail ? 1 : 0;
+}