vgpu in-guest producer in-tree, release CI, flexible vmie discovery

- src/si/vgpu-stream: in-guest vgpu producer built as a Windows cross-compiled target (if(WIN32)) - .gitea: release workflow — cross-build the agent and build/publish the deb against system vmie - cmake/makefile: resolve vmie from a source tree (LIBVMIE_PATH) or installed libvmie-dev
2026-06-26 04:36:37 +03:00 · 2026-06-22 18:35:12 +03:00
parent 9bde398b6c
commit bd8b966017
31 changed files with 2393 additions and 8 deletions
@@ -0,0 +1,212 @@
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <string.h>
+#include <stdio.h>
+#include "present.h"
+#include "stream.h"        /* OS-agnostic publish / control API + region-view */
+#include "cursor.h"
+#include "geometry.h"      /* one-shot display-geometry sample at session start */
+
+/* cursor arena sizing */
+#define VGPU_CUR_MAX     256u
+#define VGPU_CUR_BGRA    (VGPU_CUR_MAX * VGPU_CUR_MAX * 4u)
+#define VGPU_CUR_MASK    (VGPU_CUR_MAX * VGPU_CUR_MAX)
+
+static uint64_t now_ns(void) {
+    static LARGE_INTEGER freq = { .QuadPart = 0 };
+    if (freq.QuadPart == 0) QueryPerformanceFrequency(&freq);
+    LARGE_INTEGER c; QueryPerformanceCounter(&c);
+    return (uint64_t)((double)c.QuadPart * 1e9 / (double)freq.QuadPart);
+}
+
+int vgpu_present_init(vgpu_ctx* ctx, vgpu_region_t* region, uint32_t default_fps) {
+    memset(ctx, 0, sizeof *ctx);
+    ctx->view.producer = region->producer;
+    ctx->view.control  = region->control;
+    ctx->view.ring     = region->ring;
+    ctx->default_fps = default_fps ? default_fps : 30u;
+    ctx->backend  = VGPU_BK_NONE;
+    ctx->draw_cursor_cap = 1;
+
+    /* one arena: content + frame + cursor buffers */
+    size_t bytes = VGPU_STAGING_BYTES   /* content */
+                 + VGPU_STAGING_BYTES   /* frame   */
+                 + VGPU_CUR_BGRA        /* cursor bgra */
+                 + VGPU_CUR_MASK        /* and */
+                 + VGPU_CUR_MASK;       /* xor */
+    uint8_t* a = (uint8_t*)VirtualAlloc(NULL, bytes, MEM_RESERVE | MEM_COMMIT,
+                                        PAGE_READWRITE);
+    if (!a) {
+        fprintf(stderr, "present: arena VirtualAlloc %zu MiB failed (%lu)\n",
+                bytes / (1024 * 1024), GetLastError());
+        return 1;
+    }
+    ctx->arena       = a;
+    ctx->arena_bytes = bytes;
+
+    size_t off = 0;
+    ctx->content_buf      = a + off; off += VGPU_STAGING_BYTES;
+    ctx->frame_buf        = a + off; off += VGPU_STAGING_BYTES;
+    ctx->cursor.bgra      = a + off; off += VGPU_CUR_BGRA;
+    ctx->cursor.and_mask  = a + off; off += VGPU_CUR_MASK;
+    ctx->cursor.xor_mask  = a + off; off += VGPU_CUR_MASK;
+
+    InitializeCriticalSection(&ctx->lock);
+    ctx->submit_event = CreateEvent(NULL, FALSE, FALSE, NULL);
+    ctx->content_seq  = 0;
+    ctx->content_w = ctx->content_h = 0;
+    return 0;
+}
+
+void vgpu_present_deinit(vgpu_ctx* ctx) {
+    if (ctx->submit_event) { CloseHandle(ctx->submit_event); ctx->submit_event = NULL; }
+    DeleteCriticalSection(&ctx->lock);
+    if (ctx->arena) { VirtualFree(ctx->arena, 0, MEM_RELEASE); ctx->arena = NULL; }
+}
+
+void vgpu_present_submit(vgpu_ctx* ctx, const uint8_t* src,
+                         uint32_t W, uint32_t H, uint32_t src_pitch) {
+    if (W > VGPU_MAX_WIDTH)  W = VGPU_MAX_WIDTH;
+    if (H > VGPU_MAX_HEIGHT) H = VGPU_MAX_HEIGHT;
+    if (W == 0 || H == 0) return;
+
+    EnterCriticalSection(&ctx->lock);
+    uint8_t* d = ctx->content_buf;
+    const uint32_t row = W * 4u;
+    for (uint32_t y = 0; y < H; y++)
+        memcpy(d + (size_t)y * row, src + (size_t)y * src_pitch, row);
+    ctx->content_w = W;
+    ctx->content_h = H;
+    ctx->content_seq++;
+    LeaveCriticalSection(&ctx->lock);
+    /* static-idle: stamp the moment the source delivered new content (the raw perception;
+     * the host derives "ms idle" from its own clock). Single 8-aligned MOV, off the lock. */
+    vgpu_publish_content_change(&ctx->view, now_ns());
+    SetEvent(ctx->submit_event);
+}
+
+void vgpu_present_run(vgpu_ctx* ctx) {
+    const vgpu_region_view* rv = &ctx->view;   /* neutral handle for the engine */
+    const DWORD poll_ms = 8;
+    int64_t  last_seq   = -1;
+    uint32_t prev_state = VGPU_CMD_STOP;
+    uint32_t last_ff_ack = rv->producer->full_frame_ack;
+    DWORD    last_beat  = GetTickCount();
+    uint64_t last_publish_ns = 0;   /* 0 → first eligible frame publishes immediately */
+    int      last_cur_x = 0, last_cur_y = 0, last_cur_vis = 0;
+    HCURSOR  last_cur_handle = NULL;
+
+    /* one-shot display geometry: publish once before the loop (flat pull contract). The
+     * captured-output origin is (0,0) for the primary/full-screen capture path; backends
+     * resample reactively on recreate / capture-size change. No periodic poll in the loop. */
+    geometry_sample_and_publish(ctx, 0, 0);
+
+    for (;;) {
+        WaitForSingleObject(ctx->submit_event, poll_ms);
+
+        /* --- heartbeat: always ticks, independent of desired_state --- */
+        DWORD nowt = GetTickCount();
+        if (nowt - last_beat >= VGPU_HEARTBEAT_PERIOD_MS) {
+            vgpu_tick_heartbeat(rv);
+            last_beat = nowt;
+        }
+
+        /* --- reconcile control (gen-seqlock -> apply -> ack) --- */
+        vgpu_control_view cv;
+        uint32_t desired = prev_state;
+        uint32_t draw_cursor = 1;
+        int      force_full = 0;
+        uint32_t fps = ctx->default_fps;   /* publish-rate cap (applied) */
+        uint32_t ff_req = last_ff_ack;     /* full_frame_req value to honor */
+        if (vgpu_control_read(rv, &cv)) {
+            desired = cv.desired_state;
+            draw_cursor = cv.draw_cursor;
+            fps = cv.target_fps ? cv.target_fps : ctx->default_fps;
+            vgpu_set_applied_fps(rv, fps);
+            vgpu_publish_ctrl_ack(rv, cv.gen);
+
+            ff_req = cv.full_frame_req;
+            if ((ff_req - last_ff_ack) != 0u)
+                force_full = 1;            /* edge pending, wrap-tolerant */
+        }
+
+        /* --- lifecycle transitions --- */
+        if (desired != prev_state) {
+            if (desired == VGPU_CMD_RUN && prev_state != VGPU_CMD_RUN) {
+                vgpu_bump_run_epoch(rv);
+                vgpu_set_status(rv, VGPU_ST_CAPTURING);
+                force_full = 1;     /* fresh frame on start */
+            } else if (desired == VGPU_CMD_PAUSE) {
+                vgpu_set_status(rv, VGPU_ST_PAUSED);
+            } else if (desired == VGPU_CMD_STOP) {
+                vgpu_set_status(rv, VGPU_ST_STOPPED);
+            }
+            prev_state = desired;
+        } else if (last_seq < 0 && desired == VGPU_CMD_RUN) {
+            vgpu_set_status(rv, VGPU_ST_CAPTURING);
+        }
+
+        if (desired != VGPU_CMD_RUN) {
+            /* PAUSED/STOPPED: no new frames; heartbeat still ticks. We do NOT
+             * ack a pending full_frame here — acking without publishing would
+             * be a false "honored". A pending request is honored on the next
+             * transition to RUN (force_full=1 there → publish + ack). */
+            continue;
+        }
+
+        /* --- compose + publish on content change OR forced full frame, but
+         *     rate-limited to the applied fps cap (the single publish point →
+         *     contract-level cap, independent of the capture backend). A
+         *     force_full bypasses the cap (due=1). present does NOT sample the
+         *     cursor (capture threads source it); it only reads ctx->cursor under
+         *     ctx->lock for compositing, and detects cursor motion via a delta so
+         *     a pure cursor move over static desktop still recomposes. --- */
+        uint64_t interval_ns = fps > 0 ? (1000000000ull / fps) : 0;
+        uint64_t now = now_ns();
+        int due = force_full || interval_ns == 0
+                  || (now - last_publish_ns) >= interval_ns;
+
+        int compose_cursor = (ctx->draw_cursor_cap && draw_cursor);
+
+        EnterCriticalSection(&ctx->lock);
+        int64_t  seq = ctx->content_seq;
+        uint32_t W = ctx->content_w, H = ctx->content_h;
+        int cur_changed = compose_cursor
+                          && ((ctx->cursor.visible != last_cur_vis)
+                              || (ctx->cursor.x != last_cur_x)
+                              || (ctx->cursor.y != last_cur_y)
+                              || (ctx->cursor.handle != last_cur_handle));
+        int have = (W && H);
+        int content_new = have && (seq != last_seq || cur_changed || force_full);
+        /* take the frame ONLY when due — so we never drop the latest content;
+         * if not due, last_seq is left untouched and it publishes next due. */
+        int dirty = content_new && due;
+        if (dirty) {
+            memcpy(ctx->frame_buf, ctx->content_buf, (size_t)W * H * 4u);
+            last_seq = seq;
+            if (compose_cursor)
+                cursor_draw(ctx, ctx->frame_buf, W, H);
+            last_cur_vis = ctx->cursor.visible;
+            last_cur_x = ctx->cursor.x; last_cur_y = ctx->cursor.y;
+            last_cur_handle = ctx->cursor.handle;
+        }
+        LeaveCriticalSection(&ctx->lock);
+
+        if (!dirty) {
+            /* not due, or nothing to publish. A force_full with content has
+             * due=1 → dirty=1, so it never lands here while have is true; thus
+             * no spurious ack edge. */
+            continue;
+        }
+
+        if (vgpu_publish_frame(rv, ctx->frame_buf, W, H, now) == 0) {
+            last_publish_ns = now;
+            if (force_full) {
+                vgpu_publish_full_frame_ack(rv, ff_req);
+                last_ff_ack = ff_req;
+            }
+        } else {
+            vgpu_set_error(rv, 1u);  /* frame too large for slot (mode > max) */
+        }
+    }
+}