vgpu in-guest producer in-tree, release CI, flexible vmie discovery

- src/si/vgpu-stream: in-guest vgpu producer built as a Windows cross-compiled target (if(WIN32))
- .gitea: release workflow — cross-build the agent and build/publish the deb against system vmie
- cmake/makefile: resolve vmie from a source tree (LIBVMIE_PATH) or installed libvmie-dev
This commit is contained in:
2026-06-22 18:35:12 +03:00
parent 9bde398b6c
commit bd8b966017
31 changed files with 2393 additions and 8 deletions
+52
View File
@@ -0,0 +1,52 @@
#ifndef VGPU_ATOMIC_SHIM_H
#define VGPU_ATOMIC_SHIM_H
/* atomic-shim.h — x86-TSO memory-order accessors (arch-specific, not OS-specific).
 *
 * The shared region type carries NO _Atomic members: the consumer maps the
 * region as raw bytes, so synchronization lives entirely in the producer-side
 * accessors defined here. The implementation is per-compiler and is never
 * exposed in the contract.
 *
 * On x86_64 every naturally-aligned MOV up to 8 bytes is atomic, and at the
 * hardware level stores already have release semantics and loads already have
 * acquire semantics. The only things we must prevent are (1) compiler
 * reordering across the sync point and (2) store-buffer visibility delay
 * between the data writes and the publish store, for which an explicit SFENCE
 * is used at publish boundaries.
 *
 * NOTE(review): ordinary write-back stores are already kept in program order
 * on x86, so SFENCE chiefly matters when the data writes use non-temporal or
 * write-combining stores — confirm which case applies to the ring memory.
 */
#include <stdint.h>
#if defined(_MSC_VER)
#include <intrin.h>
/* Compiler-only barrier (MSVC build): _ReadWriteBarrier() keeps the compiler
 * from moving memory accesses across this point. It is a compiler intrinsic,
 * not a CPU fence instruction. */
static inline void vgpu_compiler_barrier(void) { _ReadWriteBarrier(); }
/* Store fence (MSVC build): issues SFENCE through the _mm_sfence() intrinsic.
 * Intended for publish boundaries, between the data writes and the publish
 * store (per this file's header comment). */
static inline void vgpu_sfence(void) { _mm_sfence(); }
/* Release-store of a 32-bit value (MSVC build).
 *   p — destination word, accessed through a volatile pointer.
 *   v — value to store.
 * Order matters: the compiler barrier comes FIRST so that earlier memory
 * accesses cannot be sunk below the store, then the plain volatile store is
 * performed. Only the compiler is constrained here; the file header relies on
 * x86 stores already being release at the hardware level. */
static inline void vgpu_store_release32(volatile uint32_t* p, uint32_t v) {
_ReadWriteBarrier();
*p = v;
}
/* Acquire-load of a 32-bit value (MSVC build).
 *   p — source word, read through a const volatile pointer.
 * Order matters: the volatile load happens FIRST, then the compiler barrier,
 * so later memory accesses cannot be hoisted above the load. Only the compiler
 * is constrained here; the file header relies on x86 loads already being
 * acquire at the hardware level. Returns the value that was read. */
static inline uint32_t vgpu_load_acquire32(const volatile uint32_t* p) {
uint32_t v = *p;
_ReadWriteBarrier();
return v;
}
#else /* gcc / mingw / clang */
/* Compiler-only barrier (gcc / mingw / clang build): an empty asm statement
 * with a "memory" clobber. The template is empty, so no instruction is
 * emitted; the clobber tells the compiler that memory may have changed, which
 * stops it from caching or moving memory accesses across this point. */
static inline void vgpu_compiler_barrier(void) {
    __asm__ __volatile__(""
                         : /* no outputs */
                         : /* no inputs */
                         : "memory");
}
/* Store fence (gcc / mingw / clang build): emits the x86 SFENCE instruction.
 * The "memory" clobber additionally makes the statement a compiler barrier, so
 * the compiler cannot move memory accesses across the fence either. */
static inline void vgpu_sfence(void) {
    __asm__ __volatile__("sfence"
                         : /* no outputs */
                         : /* no inputs */
                         : "memory");
}
/* Release-store of a 32-bit value (gcc / mingw / clang build).
 *   p — destination word.
 *   v — value to store.
 * Delegates to the __atomic_store_n builtin with __ATOMIC_RELEASE ordering. */
static inline void vgpu_store_release32(volatile uint32_t* p, uint32_t v) {
    __atomic_store_n(p, v, __ATOMIC_RELEASE);
}
/* Acquire-load of a 32-bit value (gcc / mingw / clang build).
 *   p — source word.
 * Delegates to the __atomic_load_n builtin with __ATOMIC_ACQUIRE ordering and
 * returns the value that was read. */
static inline uint32_t vgpu_load_acquire32(const volatile uint32_t* p) {
    const uint32_t observed = __atomic_load_n(p, __ATOMIC_ACQUIRE);
    return observed;
}
#endif
#endif /* VGPU_ATOMIC_SHIM_H */
+28
View File
@@ -0,0 +1,28 @@
#ifndef VGPU_CAPTURE_H
#define VGPU_CAPTURE_H
/* capture.h — extension seam for capture backends.
* A backend produces desktop frames and submits them to the presenter. This
* header is OS-agnostic: it names backends through an opaque vgpu_ctx* and a
* uniform start contract. A platform layer defines vgpu_ctx and any private
* backend plumbing (see src/stream/win32/capture-win32.h). A future Linux layer
* implements the same seam against its own vgpu_ctx + region/sync/clock. */
/* Opaque runtime context. Only the incomplete struct type is declared here;
 * the platform layer supplies the definition (win32: ctx.h). Code that sees
 * this header alone can hold and pass vgpu_ctx pointers but cannot look
 * inside the structure. */
typedef struct vgpu_ctx vgpu_ctx;
/* Start a capture backend.
 *   ctx — runtime context; it is also what the backend's capture thread(s)
 *         receive.
 *   fps — requested capture rate. NOTE(review): accepted range and the meaning
 *         of 0 are not stated in this header — confirm against the backends.
 * Returns 1 on success. On success the backend has spawned its capture
 * thread(s) (which received ctx) and set ctx->backend / ctx->draw_cursor_cap.
 * NOTE(review): the failure value is not spelled out here — presumably 0.
 * Submit contract: each captured desktop frame is handed to the presenter via
 * vgpu_present_submit(). */
typedef int (*capture_start_fn)(vgpu_ctx* ctx, int fps);
/* One row of the backend table: a label paired with the function that starts
 * that backend. */
typedef struct {
    const char*      name;   /* backend label (NOTE(review): presumably what selection "by env" matches — confirm) */
    capture_start_fn start;  /* start entry point; see capture_start_fn for the contract */
} capture_backend;
/* Data-driven backend table; the entry point selects by env or availability.
 *   count — out-parameter. NOTE(review): presumably receives the number of
 *           entries in the returned table — confirm in the implementation.
 * Returns a pointer to const entries, so callers must not modify the table. */
const capture_backend* capture_backends(int* count);
#endif /* VGPU_CAPTURE_H */
+88
View File
@@ -0,0 +1,88 @@
#ifndef VGPU_STREAM_ENGINE_H
#define VGPU_STREAM_ENGINE_H
/* stream.h — OS-agnostic streaming protocol over the shared contract.
* Declares the neutral region-view handle (resolved contract pointers) and the
* seqlock publish / control-reconcile API. No platform headers: the engine
* operates purely on the contract; a platform layer (e.g. src/stream/win32/)
* builds the region and hands its pointers in as a vgpu_region_view. */
#include <stdint.h>
#include "vgpu_stream.h" /* contract: producer/control types, slot geometry */
/* Neutral view of the live contract: the three resolved blocks the engine
 * publishes into / reconciles against. This is a borrowed view — the platform
 * region owns the backing memory, and the view confers no ownership. */
typedef struct {
    vgpu_producer_t* producer;  /* producer block of the contract */
    vgpu_control_t*  control;   /* control block of the contract */
    uint8_t*         ring;      /* frame ring, addressed as raw bytes */
} vgpu_region_view;
/* Snapshot of the control block taken after a clean generation read.
 * All members are plain 32-bit words copied out of the contract. */
typedef struct {
    uint32_t gen;             /* even generation that was read (for ctrl_ack) */
    uint32_t desired_state;   /* VGPU_CMD_* */
    uint32_t target_fps;      /* requested frame rate */
    uint32_t draw_cursor;     /* cursor-drawing request (NOTE(review): presumably a 0/1 flag — confirm) */
    uint32_t full_frame_req;  /* full-frame request value (NOTE(review): presumably echoed via vgpu_publish_full_frame_ack — confirm) */
    uint32_t consumer_tick;   /* consumer-side tick (NOTE(review): presumably a liveness counter — confirm) */
    uint32_t attached;        /* consumer attached indicator (NOTE(review): presumably a 0/1 flag — confirm) */
} vgpu_control_view;
/* Seqlock-publish a tight BGRA frame into the next ring slot.
 *   rv           — borrowed region view to publish into.
 *   tight_bgra   — source pixels (NOTE(review): "tight" presumably means no
 *                  row padding — confirm).
 *   width/height — frame dimensions.
 *   timestamp_ns — frame timestamp in nanoseconds.
 * The frame size is checked against SLOT_STRIDE; a frame that does not fit in
 * a slot is rejected. On publish the call writes desc[], bumps frame_id and
 * release-stores latest.
 * Returns 0 on publish, 1 if dropped (frame too large for a slot). Note that
 * 0 means success here, whereas vgpu_control_read returns 1 on success. */
int vgpu_publish_frame(const vgpu_region_view* rv,
                       const uint8_t* tight_bgra,
                       uint32_t width,
                       uint32_t height,
                       uint64_t timestamp_ns);
/* Read the control block under its generation seqlock (bounded retry).
 *   rv  — borrowed region view whose control block is read.
 *   out — receives the resolved view; filled on a clean read.
 * Returns 1 on a clean read (view filled), 0 if the writer kept the block busy
 * past the retry limit.
 * NOTE(review): this header does not say whether *out is left untouched on a
 * 0 return — treat its contents as unspecified until confirmed. */
int vgpu_control_read(const vgpu_region_view* rv, vgpu_control_view* out);
/* Echo the applied generation back to the host.
 *   rv  — borrowed region view.
 *   gen — generation being acknowledged. NOTE(review): presumably the
 *         vgpu_control_view.gen value of the read that was applied (that
 *         field is annotated "for ctrl_ack") — confirm. */
void vgpu_publish_ctrl_ack(const vgpu_region_view* rv, uint32_t gen);
/* Status / lifecycle helpers (cold line).
 * Every helper takes the borrowed region view; the setters additionally take
 * the value to publish. Which contract fields they write is defined by the
 * contract header and the implementation, neither of which is visible here.
 * NOTE(review): "cold line" presumably names a rarely-written cache line, as
 * opposed to the per-frame publish path — confirm against the contract. */
void vgpu_set_status(const vgpu_region_view* rv, uint32_t status);
void vgpu_set_backend(const vgpu_region_view* rv, uint32_t backend);
void vgpu_set_error(const vgpu_region_view* rv, uint32_t error_code);
/* NOTE(review): presumably the rate actually in effect, as opposed to the
 * requested vgpu_control_view.target_fps — confirm. */
void vgpu_set_applied_fps(const vgpu_region_view* rv, uint32_t fps);
void vgpu_bump_run_epoch(const vgpu_region_view* rv);
void vgpu_tick_heartbeat(const vgpu_region_view* rv);
/* NOTE(review): req is presumably the vgpu_control_view.full_frame_req value
 * being acknowledged — confirm. */
void vgpu_publish_full_frame_ack(const vgpu_region_view* rv, uint32_t req);
/* Publish the on-screen cursor position (host-RO).
 * Position is sensor data: it is reported independent of control.draw_cursor,
 * because the host may draw its own overlay even when the producer does not
 * composite the cursor.
 *   rv      — borrowed region view.
 *   x, y    — screen coordinates; signed, because multi-monitor layouts may
 *             yield negative values.
 *   visible — non-zero when the cursor is shown.
 * x|y are packed into one 8-aligned 64-bit field (single atomic store), and
 * cursor_seq is bumped last. */
void vgpu_publish_cursor(const vgpu_region_view* rv, int32_t x, int32_t y, uint32_t visible);
/* Publish Tier-1 cursor shape data (host-RO).
 * The shape fields sit under the same cursor_seq gate as vgpu_publish_cursor.
 * Call this BEFORE vgpu_publish_cursor: the position publish bumps cursor_seq
 * last, and that bump is what gates the whole cursor line consistently.
 *   rv           — borrowed region view.
 *   hot_x, hot_y — glyph hotspot.
 *   gw, gh       — glyph dimensions.
 *   cursor_id    — shape identity, one of VGPU_CURSOR_ID_*. */
void vgpu_publish_cursor_shape(const vgpu_region_view* rv,
                               uint32_t hot_x,
                               uint32_t hot_y,
                               uint32_t gw,
                               uint32_t gh,
                               uint32_t cursor_id);
/* Publish the monotonic timestamp (ns) of the last scene-content change.
 *   rv        — borrowed region view.
 *   change_ns — raw monotonic stamp, in nanoseconds, of that change.
 * Written as a single 8-aligned atomic store (heartbeat pattern). The producer
 * reports the raw stamp only; the host derives "ms idle" by subtracting it
 * from its own clock — no behavioural distillation in the producer. */
void vgpu_publish_content_change(const vgpu_region_view* rv, uint64_t change_ns);
/* Publish display geometry under the geom_seq seqlock (odd/even, like the
 * frame seqlock). Geometry is sampled rarely — at session start, plus a
 * reactive resample on a desc-size delta or backend recreate — and the host
 * reads it with bounded retry.
 *   rv             — borrowed region view.
 *   virt_x, virt_y,
 *   virt_w, virt_h — virtual-desktop bounding box; this is what gives meaning
 *                    to negative cursor_pos values.
 *   cap_x, cap_y   — origin of the captured output in virtual-desktop coords.
 *                    The captured surface SIZE comes from desc.width/height,
 *                    not from this call.
 *   dpi            — DPI of the captured output (96 = 100%; 0 = unknown).
 *   refresh_mhz    — refresh rate of the captured output in milli-Hz
 *                    (0 = unknown). */
void vgpu_publish_geometry(const vgpu_region_view* rv,
                           int32_t virt_x,
                           int32_t virt_y,
                           uint32_t virt_w,
                           uint32_t virt_h,
                           int32_t cap_x,
                           int32_t cap_y,
                           uint32_t dpi,
                           uint32_t refresh_mhz);
#endif /* VGPU_STREAM_ENGINE_H */