vgpu in-guest producer in-tree, release CI, flexible vmie discovery

- src/si/vgpu-stream: in-guest vgpu producer built as a Windows cross-compiled target (if(WIN32))
- .gitea: release workflow — cross-build the agent and build/publish the deb against system vmie
- cmake/makefile: resolve vmie from a source tree (LIBVMIE_PATH) or installed libvmie-dev
2026-06-26 04:36:37 +03:00 · 2026-06-22 18:35:12 +03:00
parent 9bde398b6c
commit bd8b966017
31 changed files with 2393 additions and 8 deletions
@@ -0,0 +1,52 @@
+#ifndef VGPU_ATOMIC_SHIM_H
+#define VGPU_ATOMIC_SHIM_H
+
+/* atomic-shim.h — x86-TSO memory-order accessors (arch, not OS).
+ *
+ * The shared region type deliberately contains NO _Atomic members: the consumer
+ * maps the region as raw bytes, so synchronization lives entirely in the
+ * producer-side accessors defined here. The implementation is per-compiler and
+ * is never exposed in the contract.
+ *
+ * On x86_64 every naturally-aligned MOV up to 8 bytes is atomic and stores are
+ * already release / loads already acquire at the hardware level; the only things
+ * we must prevent are (1) compiler reordering across the sync point and
+ * (2) store-buffer visibility delay between the data writes and the publish
+ * store, for which an explicit SFENCE is used at publish boundaries.
+ */
+
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+
+#include <intrin.h>
+/* Barrier primitives (MSVC build).
+ * vgpu_compiler_barrier(): wraps the _ReadWriteBarrier() intrinsic, which
+ *   constrains the compiler only: it limits reordering of memory accesses
+ *   across the call.
+ * vgpu_sfence(): issues a store fence through the _mm_sfence() intrinsic; the
+ *   file header reserves it for publish boundaries.
+ * NOTE(review): Microsoft documents _ReadWriteBarrier as deprecated — keep
+ *   that in mind when the toolchain is bumped.
+ * NOTE(review): on x86, ordinary write-back stores are not reordered with one
+ *   another; SFENCE mainly matters for non-temporal / write-combining stores.
+ *   Confirm which stores on the publish path actually need it. */
+static inline void vgpu_compiler_barrier(void) { _ReadWriteBarrier(); }
+static inline void vgpu_sfence(void) { _mm_sfence(); }
+/* Release-store of a 32-bit word (MSVC build).
+ * Statement order is the contract: the compiler barrier comes first so the
+ * optimizer cannot sink earlier memory accesses below the store, then the
+ * value is written through the volatile pointer. No hardware fence is issued
+ * here; the file header relies on x86 stores already being release at the
+ * hardware level.
+ *   p: destination word (assumed naturally aligned, as the file header
+ *      requires for atomicity — confirm at call sites).
+ *   v: value to store. */
+static inline void vgpu_store_release32(volatile uint32_t* p, uint32_t v) {
+    _ReadWriteBarrier();
+    *p = v;
+}
+/* Acquire-load of a 32-bit word (MSVC build).
+ * Statement order is the contract: the volatile load happens first, then the
+ * compiler barrier keeps later memory accesses from being hoisted above it.
+ * No hardware fence is issued here; the file header relies on x86 loads
+ * already being acquire at the hardware level.
+ *   p: source word (assumed naturally aligned, as the file header requires
+ *      for atomicity — confirm at call sites).
+ * Returns the value read from *p. */
+static inline uint32_t vgpu_load_acquire32(const volatile uint32_t* p) {
+    uint32_t v = *p;
+    _ReadWriteBarrier();
+    return v;
+}
+
+#else /* gcc / mingw / clang */
+/* Barrier primitives (gcc / mingw / clang build).
+ * vgpu_compiler_barrier(): an empty asm statement with a "memory" clobber. It
+ *   emits no instruction, but the compiler may not move memory accesses
+ *   across it.
+ * vgpu_sfence(): emits an SFENCE instruction; its "memory" clobber makes it a
+ *   compiler barrier as well. The file header reserves it for publish
+ *   boundaries.
+ * NOTE(review): on x86, ordinary write-back stores are not reordered with one
+ *   another; SFENCE mainly matters for non-temporal / write-combining stores.
+ *   Confirm which stores on the publish path actually need it. */
+static inline void vgpu_compiler_barrier(void) { __asm__ __volatile__("" ::: "memory"); }
+static inline void vgpu_sfence(void) { __asm__ __volatile__("sfence" ::: "memory"); }
+/* Release-store of a 32-bit word (gcc / mingw / clang build).
+ * Delegates to the __atomic_store_n builtin with __ATOMIC_RELEASE ordering, so
+ * both compiler and hardware ordering come from the builtin rather than from
+ * hand-placed barriers as in the MSVC branch. The pointee is a plain
+ * (non-_Atomic) uint32_t, matching the "no _Atomic in the shared region" rule
+ * in the file header.
+ *   p: destination word (assumed naturally aligned — confirm at call sites).
+ *   v: value to store. */
+static inline void vgpu_store_release32(volatile uint32_t* p, uint32_t v) {
+    __atomic_store_n(p, v, __ATOMIC_RELEASE);
+}
+/* Acquire-load of a 32-bit word (gcc / mingw / clang build).
+ * Delegates to the __atomic_load_n builtin with __ATOMIC_ACQUIRE ordering, so
+ * both compiler and hardware ordering come from the builtin rather than from
+ * hand-placed barriers as in the MSVC branch.
+ *   p: source word (assumed naturally aligned — confirm at call sites).
+ * Returns the value read from *p. */
+static inline uint32_t vgpu_load_acquire32(const volatile uint32_t* p) {
+    return __atomic_load_n(p, __ATOMIC_ACQUIRE);
+}
+
+#endif
+
+#endif /* VGPU_ATOMIC_SHIM_H */
@@ -0,0 +1,28 @@
+#ifndef VGPU_CAPTURE_H
+#define VGPU_CAPTURE_H
+
+/* capture.h — extension seam for capture backends.
+ * A backend produces desktop frames and submits them to the presenter. This
+ * header is OS-agnostic: it names backends through an opaque vgpu_ctx* and a
+ * uniform start contract. A platform layer defines vgpu_ctx and any private
+ * backend plumbing (see src/stream/win32/capture-win32.h). A future Linux layer
+ * implements the same seam against its own vgpu_ctx + region/sync/clock. */
+
+/* Opaque runtime context, defined by the platform layer (win32: ctx.h).
+ * Only the forward declaration lives here: code that includes this header can
+ * pass a vgpu_ctx* around but cannot see or touch its members. */
+typedef struct vgpu_ctx vgpu_ctx;
+
+/* Start a capture backend. Returns 1 on success; on success the backend has
+ * spawned its capture thread(s) (which received ctx) and set ctx->backend /
+ * ctx->draw_cursor_cap. The submit contract: each captured desktop frame is
+ * handed to the presenter via vgpu_present_submit().
+ *   ctx: platform runtime context handed to the backend and its thread(s).
+ *   fps: requested capture rate — presumably frames per second; TODO confirm
+ *        how a backend treats 0 or out-of-range values.
+ * NOTE(review): the failure return value is not stated here (presumably 0),
+ * nor is the state of ctx after a failed start — confirm against the backends
+ * and the caller. */
+typedef int (*capture_start_fn)(vgpu_ctx* ctx, int fps);
+
+typedef struct {              /* one entry of the capture backend table */
+    const char*      name;    /* backend name — presumably the key used when selecting by env; confirm */
+    capture_start_fn start;   /* start routine for this backend (see capture_start_fn) */
+} capture_backend;
+
+/* Data-driven backend table; the entry point selects by env or availability.
+ *   count: presumably an out-parameter that receives the number of entries —
+ *          TODO confirm, including whether NULL is accepted.
+ * Returns a pointer to const entries, so callers must not modify them. The
+ * table's lifetime and ordering are not shown here (presumably static storage
+ * in priority order — confirm against the definition). */
+const capture_backend* capture_backends(int* count);
+
+#endif /* VGPU_CAPTURE_H */
@@ -0,0 +1,88 @@
+#ifndef VGPU_STREAM_ENGINE_H
+#define VGPU_STREAM_ENGINE_H
+
+/* stream.h — OS-agnostic streaming protocol over the shared contract.
+ * Declares the neutral region-view handle (resolved contract pointers) and the
+ * seqlock publish / control-reconcile API. No platform headers: the engine
+ * operates purely on the contract; a platform layer (e.g. src/stream/win32/)
+ * builds the region and hands its pointers in as a vgpu_region_view. */
+
+#include <stdint.h>
+#include "vgpu_stream.h"   /* contract: producer/control types, slot geometry */
+
+/* Neutral view of the live contract: the three resolved blocks the engine
+ * publishes into / reconciles against. The platform region owns the backing
+ * memory; this is a borrowed view (no ownership), so it must not be used once
+ * the platform layer has released the region.
+ * The engine API takes this as const vgpu_region_view*: that const covers only
+ * the three pointers stored here, not the blocks they point at, which the
+ * publish functions write through. */
+typedef struct {
+    vgpu_producer_t* producer;  /* producer block (type declared in vgpu_stream.h) */
+    vgpu_control_t*  control;   /* control block (type declared in vgpu_stream.h) */
+    uint8_t*         ring;      /* frame ring as raw bytes; slot geometry comes from vgpu_stream.h */
+} vgpu_region_view;
+
+/* Resolved view of the control block after a clean generation read.
+ * This is a by-value snapshot filled by vgpu_control_read(); it does not alias
+ * the shared region. Field meanings marked "presumably" are inferred from
+ * names and sibling comments only — confirm against vgpu_stream.h. */
+typedef struct {
+    uint32_t gen;            /* even generation that was read (for ctrl_ack) */
+    uint32_t desired_state;  /* VGPU_CMD_* */
+    uint32_t target_fps;     /* requested frame rate — presumably frames per second */
+    uint32_t draw_cursor;    /* whether the producer should composite the cursor into frames (see vgpu_publish_cursor) */
+    uint32_t full_frame_req; /* full-frame request value — presumably echoed via vgpu_publish_full_frame_ack() */
+    uint32_t consumer_tick;  /* presumably a host liveness counter — confirm */
+    uint32_t attached;       /* presumably nonzero while a consumer is attached — confirm */
+} vgpu_control_view;
+
+/* Seqlock-publish a tight BGRA frame into the next ring slot.
+ * Clamps by SLOT_STRIDE (rejects frames that do not fit). Writes desc[],
+ * bumps frame_id, release-stores latest. Returns 0 on publish, 1 if dropped
+ * (frame too large for a slot).
+ *   rv:           borrowed region view to publish into.
+ *   tight_bgra:   source pixels — "tight" presumably means no row padding
+ *                 (width * 4 bytes per row); TODO confirm.
+ *   width/height: frame dimensions, presumably in pixels.
+ *   timestamp_ns: frame timestamp in nanoseconds (clock source not shown here).
+ * NOTE(review): the return convention is the opposite of vgpu_control_read(),
+ * where 1 means success — check call sites that test the result as a boolean.
+ * NOTE(review): "clamps" and "rejects" describe different behaviour; the
+ * return contract implies an oversized frame is dropped whole — confirm that
+ * no partial copy happens. */
+int vgpu_publish_frame(const vgpu_region_view* rv, const uint8_t* tight_bgra,
+                       uint32_t width, uint32_t height, uint64_t timestamp_ns);
+
+/* Read control block under its generation seqlock (bounded retry). Returns 1
+ * on a clean read (view filled), 0 if the writer kept it busy past the limit.
+ *   rv:  borrowed region view whose control block is read.
+ *   out: receives the snapshot on a clean read. Its contents after a 0 return
+ *        are not specified here — treat them as invalid until confirmed.
+ * Note that 1 means success here, whereas vgpu_publish_frame() returns 0 on
+ * success. */
+int vgpu_control_read(const vgpu_region_view* rv, vgpu_control_view* out);
+
+/* Echo the applied generation back to the host.
+ *   rv:  borrowed region view to publish into.
+ *   gen: generation being acknowledged; vgpu_control_view.gen is documented as
+ *        kept for this purpose, so pass the value from the snapshot that was
+ *        just applied. */
+void vgpu_publish_ctrl_ack(const vgpu_region_view* rv, uint32_t gen);
+
+/* Status / lifecycle helpers (cold line).
+ * Each takes the borrowed region view. The value encodings (status, backend,
+ * error codes) are not defined in this header — presumably in vgpu_stream.h;
+ * confirm there before adding new values. Per-function notes marked
+ * "presumably" are inferred from names only. */
+void vgpu_set_status(const vgpu_region_view* rv, uint32_t status);
+void vgpu_set_backend(const vgpu_region_view* rv, uint32_t backend);
+void vgpu_set_error(const vgpu_region_view* rv, uint32_t error_code);
+void vgpu_set_applied_fps(const vgpu_region_view* rv, uint32_t fps);        /* presumably the rate actually in effect, cf. control target_fps */
+void vgpu_bump_run_epoch(const vgpu_region_view* rv);                       /* presumably once per producer (re)start — confirm */
+void vgpu_tick_heartbeat(const vgpu_region_view* rv);                       /* presumably a producer liveness counter — confirm */
+void vgpu_publish_full_frame_ack(const vgpu_region_view* rv, uint32_t req); /* presumably echoes control full_frame_req — confirm */
+
+/* Publish the on-screen cursor position (host-RO). Position is sensor data and is
+ * reported independent of control.draw_cursor (host may draw its own overlay even when the
+ * producer does not composite the cursor). x,y are screen coords (signed; multi-monitor may
+ * be negative); visible!=0 when the cursor is shown. Packs x|y into one 8-aligned 64-bit
+ * field (single atomic store) and bumps cursor_seq last.
+ * Ordering: when the shape is published too, vgpu_publish_cursor_shape() must be called
+ * first, because this call is the one that bumps cursor_seq and so gates the cursor line.
+ * NOTE(review): the packing order of x and y inside the 64-bit field is not stated here —
+ * the host must take it from the contract header. */
+void vgpu_publish_cursor(const vgpu_region_view* rv, int32_t x, int32_t y, uint32_t visible);
+
+/* Publish Tier-1 cursor shape data (host-RO), written under the same cursor_seq gate as
+ * vgpu_publish_cursor: call this BEFORE vgpu_publish_cursor so the position publish bumps
+ * cursor_seq last and gates the whole cursor line consistently. hot_x/hot_y are the glyph
+ * hotspot; gw/gh are glyph dims; cursor_id is a VGPU_CURSOR_ID_* shape identity.
+ * The ordering rule implies this call does not bump cursor_seq itself, so a shape published
+ * without a following vgpu_publish_cursor() may go unnoticed by the host — confirm.
+ * NOTE(review): units of hot_x/hot_y/gw/gh are presumably pixels, and "Tier-1" is not
+ * defined in this header — confirm both against the contract. */
+void vgpu_publish_cursor_shape(const vgpu_region_view* rv,
+                               uint32_t hot_x, uint32_t hot_y,
+                               uint32_t gw, uint32_t gh, uint32_t cursor_id);
+
+/* Publish the monotonic timestamp (ns) of the last scene-content change. Single 8-aligned
+ * atomic store (heartbeat pattern). The producer reports the raw stamp only; the host derives
+ * "ms idle" by subtracting from its own clock — no behavioural distillation in the producer.
+ *   rv:        borrowed region view to publish into.
+ *   change_ns: raw timestamp of the change, in nanoseconds.
+ * NOTE(review): subtracting this stamp from the host's clock only works if the producer's
+ * and the host's monotonic clocks share an epoch and rate — confirm how that is guaranteed. */
+void vgpu_publish_content_change(const vgpu_region_view* rv, uint64_t change_ns);
+
+/* Publish display geometry under the geom_seq seqlock (odd/even, like the frame seqlock).
+ * Sampled rarely (session start + reactive resample on desc-size delta / backend recreate),
+ * read by the host with bounded retry. virt_* is the virtual-desktop bbox (interprets negative
+ * cursor_pos); cap_x/cap_y is the captured output's origin in virtual-desktop coords (the
+ * captured surface SIZE comes from desc.width/height, not from here). dpi/refresh_mhz describe
+ * the captured output (96=100% / milli-Hz; 0=unknown).
+ *   rv:            borrowed region view to publish into.
+ *   virt_x/virt_y: origin of the virtual-desktop bbox (signed, may be negative).
+ *   virt_w/virt_h: size of the virtual-desktop bbox.
+ *   cap_x/cap_y:   origin of the captured output in virtual-desktop coords (signed).
+ *   dpi:           96 means 100% scaling; 0 means unknown.
+ *   refresh_mhz:   refresh rate in milli-Hz (60000 = 60 Hz); 0 means unknown. */
+void vgpu_publish_geometry(const vgpu_region_view* rv,
+                           int32_t virt_x, int32_t virt_y,
+                           uint32_t virt_w, uint32_t virt_h,
+                           int32_t cap_x, int32_t cap_y,
+                           uint32_t dpi, uint32_t refresh_mhz);
+
+#endif /* VGPU_STREAM_ENGINE_H */