diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml
new file mode 100644
index 0000000..80590b9
--- /dev/null
+++ b/.gitea/workflows/release.yml
@@ -0,0 +1,123 @@
+name: release
+
+on:
+  push:
+    tags:
+      - 'v*'
+
+# No deployment-specific values are hardcoded: server/owner/repo come from the CI context,
+# the publish token from a secret. Mirrors the sibling vmie release pipeline.
+jobs:
+  # In-guest vgpu producer (Windows, cross-compiled) -> attached to the release.
+  windows-agent:
+    runs-on: ubuntu-latest
+    container:
+      image: node:20-bookworm-slim
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install toolchain  # mingw-w64 cross compiler plus the zip/jq/curl used by the later steps
+        run: |
+          apt-get update
+          apt-get install -y --no-install-recommends \
+            cmake make zip jq curl ca-certificates gcc-mingw-w64-x86-64
+
+      - name: Cross-build the agent  # configure with the mingw toolchain file, then build into build-win/
+        run: |
+          cmake -S . -B build-win -DCMAKE_BUILD_TYPE=Release \
+                -DCMAKE_TOOLCHAIN_FILE=cmake/toolchain-mingw-w64.cmake
+          cmake --build build-win -j
+
+      - name: Package  # stage the .exe (plus LICENSE when present) and zip it under dist/
+        env:
+          TAG: ${{ github.ref_name }}  # the pushed tag; becomes part of the archive name
+        run: |
+          set -euo pipefail
+          mkdir -p dist/vgpu-streamer
+          cp build-win/vgpu-streamer.exe dist/vgpu-streamer/
+          [ -f LICENSE ] && cp LICENSE dist/vgpu-streamer/ || true
+          (cd dist && zip -r "vgpu-streamer-${TAG}-win64.zip" vgpu-streamer)
+
+      - name: Attach to release
+        env:
+          GITEA_TOKEN: ${{ secrets.PUBLISH_TOKEN }}
+          SERVER: ${{ github.server_url }}
+          REPO: ${{ github.repository }}
+          TAG: ${{ github.ref_name }}
+        run: |
+          set -euo pipefail
+          asset="vgpu-streamer-${TAG}-win64.zip"
+          api="${SERVER}/api/v1/repos/${REPO}"
+          auth="Authorization: token ${GITEA_TOKEN}"
+          # Reuse this tag's release on a re-run, else create it; TAG/asset reach JSON and jq only via --arg.
+          rid=$(curl -sSL -H "$auth" "${api}/releases/tags/${TAG}" | jq -r '.id // empty' || true)
+          if [ -z "$rid" ]; then
+            rid=$(jq -cn --arg t "$TAG" '{tag_name: $t, name: $t}' \
+              | curl -fsSL -X POST -H "$auth" -H "Content-Type: application/json" -d @- \
+                  "${api}/releases" | jq -er '.id')
+          fi
+          # Drop a same-named asset left by a previous run, then upload (asset name URL-encoded).
+          curl -fsSL -H "$auth" "${api}/releases/${rid}/assets" \
+            | jq -r --arg n "$asset" '.[] | select(.name == $n) | .id' \
+            | while read -r aid; do
+                if [ -n "$aid" ]; then curl -fsSL -X DELETE -H "$auth" "${api}/releases/${rid}/assets/${aid}"; fi
+              done
+          curl -fsSL -X POST -H "$auth" -F "attachment=@dist/${asset};type=application/zip" \
+            "${api}/releases/${rid}/assets?name=$(jq -rn --arg n "$asset" '$n | @uri')"
+
+  # Host package (daemon + libs) -> the Gitea Debian registry. Built against the published
+  # vmie dev package (external dependency), installed from the same registry.
+  deb:
+    runs-on: ubuntu-latest
+    container:
+      image: node:20-bookworm-slim
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install toolchain + vmie (external dependency)  # registry is verified with its own key (signed-by), not trusted=yes
+        env:
+          SERVER: ${{ github.server_url }}
+          OWNER: ${{ github.repository_owner }}
+        run: |
+          set -euo pipefail
+          apt-get update
+          apt-get install -y --no-install-recommends cmake make gcc libc6-dev dpkg-dev file ca-certificates curl
+          key="/usr/share/keyrings/gitea-${OWNER}.asc"
+          curl -fsSL "${SERVER}/api/packages/${OWNER}/debian/repository.key" -o "$key"
+          echo "deb [signed-by=${key}] ${SERVER}/api/packages/${OWNER}/debian stable main" > /etc/apt/sources.list.d/gitea.list
+          apt-get update
+          apt-get install -y libvmie-dev
+
+      - name: Build package  # VERSION is the tag with its leading "v" stripped
+        env:
+          TAG: ${{ github.ref_name }}
+        run: make deb VERSION="${TAG#v}"
+
+      - name: Publish to Debian registry
+        env:
+          TOKEN: ${{ secrets.PUBLISH_TOKEN }}   # requires scope: package:write
+          SERVER: ${{ github.server_url }}
+          OWNER: ${{ github.repository_owner }}
+          DISTRIBUTION: stable
+          COMPONENT: main
+        run: |
+          set -euo pipefail
+          url="${SERVER}/api/packages/${OWNER}/debian/pool/${DISTRIBUTION}/${COMPONENT}/upload"
+          auth="Authorization: token ${TOKEN}"
+          for deb in dist/*.deb; do
+            # 201 = uploaded; 409 = this version already present (re-run); 000 = curl transport error.
+            code=$(curl -sS -o /dev/null -w '%{http_code}' -X PUT -H "$auth" -T "$deb" "$url" || true)
+            echo "$deb -> HTTP $code"
+            if [ "$code" != 201 ] && [ "$code" != 409 ]; then
+              echo "upload failed: $deb (HTTP $code)" >&2
+              exit 1
+            fi
+          done
diff --git a/.gitignore b/.gitignore
index bad1588..0ac2f23 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ compile*
 Testing/
 CLAUDE.md
 dist/
+!.gitea/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2ba2128..630bc2f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.16)
-project(vmsig VERSION 0.3.0 LANGUAGES C)
+project(vmsig VERSION 0.3.1 LANGUAGES C)
 
 set(CMAKE_C_STANDARD 17)
 set(CMAKE_C_STANDARD_REQUIRED ON)
@@ -16,6 +16,37 @@ option(VMSIG_WITH_VMIE  "Link real vmie (libvmie.a, PIC) for armed memctx" OFF)
 # The input driver (vmctl) is ABSORBED in-tree (src/si/input/) — no external flag.
 set(LIBVMIE_PATH  "" CACHE PATH "Path to the vmie library sources (for VMSIG_WITH_VMIE)")
 
+# ---- in-guest vgpu producer (Windows agent, cross-compiled) -----------------
+# The host signaling stack below is Linux-only (epoll/eventfd/timerfd), so a Windows-targeted
+# build (mingw toolchain, CMAKE_SYSTEM_NAME=Windows) produces ONLY this agent. Producer and
+# host consumer share the ABI header include/vgpu_stream.h, so they version together in one tree.
+#   cmake -S . -B .build-win -DCMAKE_TOOLCHAIN_FILE=cmake/toolchain-mingw-w64.cmake
+if(WIN32)   # Windows-targeted configure: define the agent target only
+    add_executable(vgpu-streamer
+            src/si/vgpu-stream/win32/main.c
+            src/si/vgpu-stream/publish.c   # OS-agnostic publish protocol
+            src/si/vgpu-stream/win32/region.c
+            src/si/vgpu-stream/win32/present.c
+            src/si/vgpu-stream/win32/cursor.c
+            src/si/vgpu-stream/win32/geometry.c
+            src/si/vgpu-stream/win32/capture.c
+            src/si/vgpu-stream/win32/capture_nvfbc.c
+            src/si/vgpu-stream/win32/capture_dda.c
+            src/si/vgpu-stream/win32/capture_gdi.c)
+    target_include_directories(vgpu-streamer PRIVATE
+            ${CMAKE_CURRENT_SOURCE_DIR}/include
+            ${CMAKE_CURRENT_SOURCE_DIR}/src/si/vgpu-stream/include
+            ${CMAKE_CURRENT_SOURCE_DIR}/src/si/vgpu-stream/win32
+            ${CMAKE_CURRENT_SOURCE_DIR}/third_party)   # vendor NvFBC + Windows.h shim
+    target_compile_definitions(vgpu-streamer PRIVATE CINTERFACE WIN32_LEAN_AND_MEAN=)   # sources call COM via lpVtbl
+    target_compile_options(vgpu-streamer PRIVATE
+            $<$<C_COMPILER_ID:GNU>:-O2;-Wall;-Wextra>
+            $<$<C_COMPILER_ID:MSVC>:/O2;/W3>)
+    target_link_libraries(vgpu-streamer PRIVATE d3d11 dxgi dxguid uuid user32 gdi32)
+    target_link_options(vgpu-streamer PRIVATE $<$<C_COMPILER_ID:GNU>:-static;-s>)   # GNU only: static link, stripped
+    return()   # a Windows-targeted build is the agent ONLY; the host stack below is skipped
+endif()
+
 find_package(Threads REQUIRED)
 
 # ---- signaling library ------------------------------------------------------
@@ -58,9 +89,24 @@ target_link_libraries(vmsig PRIVATE Threads::Threads)
 # package Depends on libvmie). Headers + symbols come from the imported target.
 if(VMSIG_WITH_VMIE)
     add_library(vmie SHARED IMPORTED)
-    set_target_properties(vmie PROPERTIES
-            IMPORTED_LOCATION             ${LIBVMIE_PATH}/.build/libvmie.so
-            INTERFACE_INCLUDE_DIRECTORIES ${LIBVMIE_PATH}/include)
+    if(LIBVMIE_PATH)
+        # dev: link against an in-place source-tree build
+        set_target_properties(vmie PROPERTIES
+                IMPORTED_LOCATION             ${LIBVMIE_PATH}/.build/libvmie.so
+                INTERFACE_INCLUDE_DIRECTORIES ${LIBVMIE_PATH}/include)
+    else()
+        # CI/system: the installed libvmie-dev package (/usr, or via CMAKE_PREFIX_PATH).
+        # find_*(... REQUIRED) needs CMake >= 3.18 while this project declares 3.16, where the
+        # word is not a keyword and a missing library would not stop configuration; check explicitly.
+        find_library(VMIE_LIBRARY     NAMES vmie)
+        find_path(   VMIE_INCLUDE_DIR NAMES memmodel.h PATH_SUFFIXES vmie)
+        if(NOT VMIE_LIBRARY OR NOT VMIE_INCLUDE_DIR)
+            message(FATAL_ERROR "libvmie not found: install libvmie-dev or set LIBVMIE_PATH")
+        endif()
+        set_target_properties(vmie PROPERTIES
+                IMPORTED_LOCATION             ${VMIE_LIBRARY}
+                INTERFACE_INCLUDE_DIRECTORIES ${VMIE_INCLUDE_DIR})
+    endif()
     target_link_libraries(vmsig PRIVATE vmie)
     target_compile_definitions(vmsig PRIVATE VMSIG_WITH_VMIE)
 endif()
diff --git a/Makefile b/Makefile
index 4455490..275d9d8 100644
--- a/Makefile
+++ b/Makefile
@@ -20,12 +20,12 @@ DIST        ?= $(CURDIR)/dist
 .PHONY: deb clean
 
 # Armed package: the shipped daemon needs vmie for memctx. vmie stays an external dependency
-# (the package Depends on its runtime; pass DEPENDS to add it).
+# (package Depends on its runtime). vmie is found from a source tree (LIBVMIE_PATH) or, when
+# that is empty, from the installed libvmie-dev (system / CMAKE_PREFIX_PATH) — the CI path.
 deb:
-	@test -n "$(LIBVMIE_PATH)" || { echo "set LIBVMIE_PATH=/path/to/vmie sources (armed memctx)"; exit 1; }
 	rm -rf $(STAGE)
-	cmake -S . -B $(BUILD_DIR) -DCMAKE_BUILD_TYPE=Release -DVMSIG_INSTALL=ON \
-	      -DVMSIG_WITH_VMIE=ON -DLIBVMIE_PATH=$(LIBVMIE_PATH)
+	cmake -S . -B $(BUILD_DIR) -DCMAKE_BUILD_TYPE=Release -DVMSIG_INSTALL=ON -DVMSIG_WITH_VMIE=ON \
+	      $(if $(LIBVMIE_PATH),-DLIBVMIE_PATH=$(LIBVMIE_PATH),)
 	cmake --build $(BUILD_DIR) -j
 	DESTDIR=$(STAGE) cmake --install $(BUILD_DIR) --prefix /usr
 	mkdir -p $(STAGE)/DEBIAN
diff --git a/cmake/toolchain-mingw-w64.cmake b/cmake/toolchain-mingw-w64.cmake
new file mode 100644
index 0000000..315a1b1
--- /dev/null
+++ b/cmake/toolchain-mingw-w64.cmake
@@ -0,0 +1,8 @@
+set(CMAKE_SYSTEM_NAME Windows)                      # cross-compiling: the target OS is Windows
+set(CMAKE_SYSTEM_PROCESSOR x86_64)
+set(CMAKE_C_COMPILER   x86_64-w64-mingw32-gcc)      # mingw-w64 cross gcc
+set(CMAKE_RC_COMPILER  x86_64-w64-mingw32-windres)
+set(CMAKE_FIND_ROOT_PATH /usr/x86_64-w64-mingw32)   # target root used by find_*()
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)        # programs: never looked up in the target root
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)         # libraries: target root only
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)         # headers: target root only
diff --git a/src/si/vgpu-stream/include/atomic-shim.h b/src/si/vgpu-stream/include/atomic-shim.h
new file mode 100644
index 0000000..2576514
--- /dev/null
+++ b/src/si/vgpu-stream/include/atomic-shim.h
@@ -0,0 +1,52 @@
+#ifndef VGPU_ATOMIC_SHIM_H
+#define VGPU_ATOMIC_SHIM_H
+
+/* atomic-shim.h — x86-TSO memory-order accessors (arch, not OS).
+ *
+ * x86-TSO memory-order shim. NO _Atomic in the shared region type: the consumer
+ * maps the region as raw bytes. Synchronization lives entirely in the producer's
+ * accessors here. Per-compiler implementation, never exposed in the contract.
+ *
+ * On x86_64 every naturally-aligned MOV up to 8 bytes is atomic and stores are
+ * already release / loads already acquire at the hardware level; the only things
+ * we must prevent are (1) compiler reordering across the sync point and
+ * (2) store-buffer visibility delay between the data writes and the publish
+ * store, for which an explicit SFENCE is used at publish boundaries.
+ */
+
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+
+#include <intrin.h>
+
+static inline void vgpu_compiler_barrier(void) { _ReadWriteBarrier(); }   /* constrains the compiler only */
+static inline void vgpu_sfence(void) { _mm_sfence(); }                    /* SFENCE at publish boundaries */
+
+static inline void vgpu_store_release32(volatile uint32_t* p, uint32_t v) {
+    _ReadWriteBarrier();   /* barrier first: earlier accesses stay above the store */
+    *p = v;                /* volatile 32-bit store; relies on x86 stores already being release */
+}
+
+static inline uint32_t vgpu_load_acquire32(const volatile uint32_t* p) {
+    uint32_t v = *p;       /* volatile 32-bit load; relies on x86 loads already being acquire */
+    _ReadWriteBarrier();   /* barrier after: later accesses stay below the load */
+    return v;
+}
+
+#else /* gcc / mingw / clang */
+
+static inline void vgpu_compiler_barrier(void) { __asm__ __volatile__("" ::: "memory"); }   /* empty asm + "memory" clobber */
+static inline void vgpu_sfence(void) { __asm__ __volatile__("sfence" ::: "memory"); }       /* SFENCE, also a compiler barrier */
+
+static inline void vgpu_store_release32(volatile uint32_t* p, uint32_t v) {
+    __atomic_store_n(p, v, __ATOMIC_RELEASE);   /* compiler builtin; no _Atomic type in the shared layout */
+}
+
+static inline uint32_t vgpu_load_acquire32(const volatile uint32_t* p) {
+    return __atomic_load_n(p, __ATOMIC_ACQUIRE);
+}
+
+#endif
+
+#endif /* VGPU_ATOMIC_SHIM_H */
diff --git a/src/si/vgpu-stream/include/capture.h b/src/si/vgpu-stream/include/capture.h
new file mode 100644
index 0000000..fa0be57
--- /dev/null
+++ b/src/si/vgpu-stream/include/capture.h
@@ -0,0 +1,28 @@
+#ifndef VGPU_CAPTURE_H
+#define VGPU_CAPTURE_H
+
+/* capture.h — extension seam for capture backends.
+ * A backend produces desktop frames and submits them to the presenter. This
+ * header is OS-agnostic: it names backends through an opaque vgpu_ctx* and a
+ * uniform start contract. A platform layer defines vgpu_ctx and any private
+ * backend plumbing (see src/si/vgpu-stream/win32/capture-win32.h). A future Linux layer
+ * implements the same seam against its own vgpu_ctx + region/sync/clock. */
+
+/* Opaque runtime context, defined by the platform layer (win32: ctx.h). */
+typedef struct vgpu_ctx vgpu_ctx;
+
+/* Start a capture backend. Returns 1 on success; on success the backend has
+ * spawned its capture thread(s) (which received ctx) and set ctx->backend /
+ * ctx->draw_cursor_cap. The submit contract: each captured desktop frame is
+ * handed to the presenter via vgpu_present_submit(). */
+typedef int (*capture_start_fn)(vgpu_ctx* ctx, int fps);
+
+typedef struct {
+    const char*      name;
+    capture_start_fn start;
+} capture_backend;
+
+/* Data-driven backend table; the entry point selects by env or availability. */
+const capture_backend* capture_backends(int* count);
+
+#endif /* VGPU_CAPTURE_H */
diff --git a/src/si/vgpu-stream/include/stream.h b/src/si/vgpu-stream/include/stream.h
new file mode 100644
index 0000000..9ea7e29
--- /dev/null
+++ b/src/si/vgpu-stream/include/stream.h
@@ -0,0 +1,88 @@
+#ifndef VGPU_STREAM_ENGINE_H
+#define VGPU_STREAM_ENGINE_H
+
+/* stream.h — OS-agnostic streaming protocol over the shared contract.
+ * Declares the neutral region-view handle (resolved contract pointers) and the
+ * seqlock publish / control-reconcile API. No platform headers: the engine
+ * operates purely on the contract; a platform layer (e.g. src/si/vgpu-stream/win32/)
+ * builds the region and hands its pointers in as a vgpu_region_view. */
+
+#include <stdint.h>
+#include "vgpu_stream.h"   /* contract: producer/control types, slot geometry */
+
+/* Neutral view of the live contract: the three resolved blocks the engine
+ * publishes into / reconciles against. The platform region owns the backing
+ * memory; this is a borrowed view (no ownership). */
+typedef struct {
+    vgpu_producer_t* producer;   /* producer block: seq/desc/latest, status and cursor fields */
+    vgpu_control_t*  control;    /* control block, read through vgpu_control_read() */
+    uint8_t*         ring;       /* frame ring base; slot S starts at ring + S * VGPU_SLOT_STRIDE */
+} vgpu_region_view;
+
+/* Resolved view of the control block after a clean generation read. */
+typedef struct {
+    uint32_t gen;            /* even generation that was read (for ctrl_ack) */
+    uint32_t desired_state;  /* VGPU_CMD_* */
+    uint32_t target_fps;     /* copy of control->target_fps */
+    uint32_t draw_cursor;    /* copy of control->draw_cursor */
+    uint32_t full_frame_req; /* copy of control->full_frame_req */
+    uint32_t consumer_tick;  /* copy of control->consumer_tick */
+    uint32_t attached;       /* copy of control->attached */
+} vgpu_control_view;
+
+/* Seqlock-publish a tight BGRA frame into the next ring slot.
+ * Clamps by SLOT_STRIDE (rejects frames that do not fit). Writes desc[],
+ * bumps frame_id, release-stores latest. Returns 0 on publish, 1 if dropped
+ * (frame too large for a slot). */
+int vgpu_publish_frame(const vgpu_region_view* rv, const uint8_t* tight_bgra,
+                       uint32_t width, uint32_t height, uint64_t timestamp_ns);
+
+/* Read control block under its generation seqlock (bounded retry). Returns 1
+ * on a clean read (view filled), 0 if the writer kept it busy past the limit. */
+int vgpu_control_read(const vgpu_region_view* rv, vgpu_control_view* out);
+
+/* Echo the applied generation back to the host. */
+void vgpu_publish_ctrl_ack(const vgpu_region_view* rv, uint32_t gen);
+
+/* Status / lifecycle helpers (cold line). */
+void vgpu_set_status(const vgpu_region_view* rv, uint32_t status);
+void vgpu_set_backend(const vgpu_region_view* rv, uint32_t backend);
+void vgpu_set_error(const vgpu_region_view* rv, uint32_t error_code);
+void vgpu_set_applied_fps(const vgpu_region_view* rv, uint32_t fps);
+void vgpu_bump_run_epoch(const vgpu_region_view* rv);
+void vgpu_tick_heartbeat(const vgpu_region_view* rv);
+void vgpu_publish_full_frame_ack(const vgpu_region_view* rv, uint32_t req);
+
+/* Publish the on-screen cursor position (host-RO). Position is sensor data and is
+ * reported independent of control.draw_cursor (host may draw its own overlay even when the
+ * producer does not composite the cursor). x,y are screen coords (signed; multi-monitor may
+ * be negative); visible!=0 when the cursor is shown. Packs x|y into one 8-aligned 64-bit
+ * field (single atomic store) and bumps cursor_seq last. */
+void vgpu_publish_cursor(const vgpu_region_view* rv, int32_t x, int32_t y, uint32_t visible);
+
+/* Publish Tier-1 cursor shape data (host-RO), written under the same cursor_seq gate as
+ * vgpu_publish_cursor: call this BEFORE vgpu_publish_cursor so the position publish bumps
+ * cursor_seq last and gates the whole cursor line consistently. hot_x/hot_y are the glyph
+ * hotspot; gw/gh are glyph dims; cursor_id is a VGPU_CURSOR_ID_* shape identity. */
+void vgpu_publish_cursor_shape(const vgpu_region_view* rv,
+                               uint32_t hot_x, uint32_t hot_y,
+                               uint32_t gw, uint32_t gh, uint32_t cursor_id);
+
+/* Publish the monotonic timestamp (ns) of the last scene-content change. Single 8-aligned
+ * atomic store (heartbeat pattern). The producer reports the raw stamp only; the host derives
+ * "ms idle" by subtracting from its own clock — no behavioural distillation in the producer. */
+void vgpu_publish_content_change(const vgpu_region_view* rv, uint64_t change_ns);
+
+/* Publish display geometry under the geom_seq seqlock (odd/even, like the frame seqlock).
+ * Sampled rarely (session start + reactive resample on desc-size delta / backend recreate),
+ * read by the host with bounded retry. virt_* is the virtual-desktop bbox (interprets negative
+ * cursor_pos); cap_x/cap_y is the captured output's origin in virtual-desktop coords (the
+ * captured surface SIZE comes from desc.width/height, not from here). dpi/refresh_mhz describe
+ * the captured output (96=100% / milli-Hz; 0=unknown). */
+void vgpu_publish_geometry(const vgpu_region_view* rv,
+                           int32_t virt_x, int32_t virt_y,
+                           uint32_t virt_w, uint32_t virt_h,
+                           int32_t cap_x, int32_t cap_y,
+                           uint32_t dpi, uint32_t refresh_mhz);
+
+#endif /* VGPU_STREAM_ENGINE_H */
diff --git a/src/si/vgpu-stream/publish.c b/src/si/vgpu-stream/publish.c
new file mode 100644
index 0000000..d1cf2d1
--- /dev/null
+++ b/src/si/vgpu-stream/publish.c
@@ -0,0 +1,163 @@
+/* publish.c — OS-agnostic implementation of the streaming protocol.
+ * Operates purely on the contract through a borrowed vgpu_region_view; no
+ * platform headers, no runtime context. The x86-TSO ordering lives in the
+ * atomic shim. */
+
+#include <string.h>
+#include "vgpu_stream.h"   /* contract types / slot geometry */
+#include "atomic-shim.h"   /* x86-TSO memory-order accessors */
+#include "stream.h"        /* region-view handle + this API */
+
+#define VGPU_CTRL_READ_TRIES 16u
+
+int vgpu_publish_frame(const vgpu_region_view* rv, const uint8_t* tight_bgra,
+                       uint32_t width, uint32_t height, uint64_t timestamp_ns) {
+    vgpu_producer_t* p = rv->producer;
+    /* Drop (return 1) frames that cannot fit a slot. width is range-checked too: width*4 can
+     * wrap uint32, which made `need` look small and let a bogus descriptor be published. */
+    const uint32_t stride = width * 4u;               /* tight invariant */
+    const uint64_t need   = (uint64_t)height * stride;
+    if (width > UINT32_MAX / 4u || need > VGPU_SLOT_STRIDE) return 1;
+
+    uint32_t cur = vgpu_load_acquire32(&p->latest);
+    uint32_t S   = (cur == VGPU_LATEST_NONE) ? 0u : ((cur + 1u) % VGPU_SLOT_COUNT);   /* slot after latest */
+
+    uint8_t* dst = rv->ring + (size_t)S * VGPU_SLOT_STRIDE;
+
+    /* seqlock: even -> odd (writing) */
+    vgpu_store_release32(&p->seq[S], p->seq[S] + 1u);
+    vgpu_compiler_barrier();
+
+    /* descriptor (self-describing slot) */
+    p->desc[S].width        = width;
+    p->desc[S].height       = height;
+    p->desc[S].stride       = stride;
+    p->desc[S].format       = VGPU_FMT_BGRA8888;
+    p->desc[S].frame_id     = p->frame_id + 1u;
+    p->desc[S].timestamp_ns = timestamp_ns;
+
+    /* pixels (source is already tight) */
+    memcpy(dst, tight_bgra, (size_t)need);
+
+    vgpu_sfence();
+    /* seqlock: odd -> even (stable) */
+    vgpu_store_release32(&p->seq[S], p->seq[S] + 1u);
+    vgpu_sfence();
+
+    p->frame_id += 1u;
+    vgpu_store_release32(&p->latest, S);   /* publish last: the slot is complete before latest moves */
+    return 0;
+}
+
+int vgpu_control_read(const vgpu_region_view* rv, vgpu_control_view* out) {
+    volatile vgpu_control_t* c = rv->control;
+    /* Seqlock read: copy the fields between two loads of ctrl_gen; accept only if even and unchanged. */
+    for (uint32_t t = 0; t < VGPU_CTRL_READ_TRIES; t++) {
+        uint32_t g0 = vgpu_load_acquire32(&c->ctrl_gen);
+        if (g0 & 1u)
+            continue;                       /* writer in progress */
+        vgpu_compiler_barrier();
+
+        uint32_t desired = c->desired_state;   /* volatile reads into locals; out is filled only after validation */
+        uint32_t fps     = c->target_fps;
+        uint32_t cursor  = c->draw_cursor;
+        uint32_t ffreq   = c->full_frame_req;
+        uint32_t ctick   = c->consumer_tick;
+        uint32_t att     = c->attached;
+
+        vgpu_compiler_barrier();
+        uint32_t g1 = vgpu_load_acquire32(&c->ctrl_gen);
+        if (g0 != g1)
+            continue;                       /* torn read, retry */
+
+        out->gen            = g0;
+        out->desired_state  = desired;
+        out->target_fps     = fps;
+        out->draw_cursor    = cursor;
+        out->full_frame_req = ffreq;
+        out->consumer_tick  = ctick;
+        out->attached       = att;
+        return 1;                           /* clean read */
+    }
+    return 0;                               /* no clean read within VGPU_CTRL_READ_TRIES; *out untouched */
+}
+
+void vgpu_publish_ctrl_ack(const vgpu_region_view* rv, uint32_t gen) {
+    vgpu_store_release32(&rv->producer->ctrl_ack, gen);   /* release-store gen into producer->ctrl_ack */
+}
+
+void vgpu_set_status(const vgpu_region_view* rv, uint32_t status) {
+    vgpu_store_release32(&rv->producer->status, status);   /* release-store into producer->status */
+}
+
+void vgpu_set_backend(const vgpu_region_view* rv, uint32_t backend) {
+    vgpu_store_release32(&rv->producer->backend, backend);   /* release-store into producer->backend */
+}
+
+void vgpu_set_error(const vgpu_region_view* rv, uint32_t error_code) {
+    vgpu_store_release32(&rv->producer->error_code, error_code);   /* release-store into producer->error_code */
+}
+
+void vgpu_set_applied_fps(const vgpu_region_view* rv, uint32_t fps) {
+    vgpu_store_release32(&rv->producer->applied_fps, fps);   /* release-store into producer->applied_fps */
+}
+
+void vgpu_bump_run_epoch(const vgpu_region_view* rv) {
+    vgpu_producer_t* p = rv->producer;
+    vgpu_store_release32(&p->run_epoch, p->run_epoch + 1u);   /* plain read + release store, not an atomic RMW */
+}
+
+void vgpu_tick_heartbeat(const vgpu_region_view* rv) {
+    /* increment in place: the aligned 64-bit store is atomic on x86_64, the read-modify-write as a whole is not */
+    rv->producer->heartbeat += 1u;
+    vgpu_compiler_barrier();   /* stops the compiler moving the store past this point */
+}
+
+void vgpu_publish_full_frame_ack(const vgpu_region_view* rv, uint32_t req) {
+    vgpu_store_release32(&rv->producer->full_frame_ack, req);   /* release-store req into producer->full_frame_ack */
+}
+
+void vgpu_publish_cursor(const vgpu_region_view* rv, int32_t x, int32_t y, uint32_t visible) {
+    vgpu_producer_t* p = rv->producer;
+    /* pack: low 32 = x, high 32 = y (signed → two's-complement bits) */
+    uint64_t packed = ((uint64_t)(uint32_t)y << 32) | (uint64_t)(uint32_t)x;
+    /* 64-bit aligned single MOV is atomic on x86_64; the release stores below order it */
+    p->cursor_pos = packed;
+    vgpu_store_release32(&p->cursor_visible, visible);
+    /* publish seq last: its release-store gates the pos/visible writes above for the host */
+    vgpu_store_release32(&p->cursor_seq, p->cursor_seq + 1u);   /* incremented by one on every call */
+}
+
+void vgpu_publish_cursor_shape(const vgpu_region_view* rv, uint32_t hot_x, uint32_t hot_y,
+                               uint32_t gw, uint32_t gh, uint32_t cursor_id) {
+    vgpu_producer_t* p = rv->producer;
+    /* pack 16|16 strictly unsigned: the low half is masked to 16 bits; the high half is shifted
+     * in, so bits above 16 of hot_y/gh are lost. No own seq: the caller's vgpu_publish_cursor bumps cursor_seq. */
+    vgpu_store_release32(&p->cursor_hotspot, (hot_y << 16) | (hot_x & 0xFFFFu));   /* high = hot_y, low = hot_x */
+    vgpu_store_release32(&p->cursor_glyph,   (gh    << 16) | (gw    & 0xFFFFu));   /* high = gh, low = gw */
+    vgpu_store_release32(&p->cursor_id,      cursor_id);
+}
+
+void vgpu_publish_content_change(const vgpu_region_view* rv, uint64_t change_ns) {
+    /* 64-bit aligned single MOV is atomic on x86_64; barrier orders it (heartbeat pattern) */
+    rv->producer->content_change_ns = change_ns;   /* stored as given */
+    vgpu_compiler_barrier();                       /* compiler-only ordering after the store */
+}
+
+void vgpu_publish_geometry(const vgpu_region_view* rv, int32_t virt_x, int32_t virt_y,
+                           uint32_t virt_w, uint32_t virt_h,
+                           int32_t cap_x, int32_t cap_y,
+                           uint32_t dpi, uint32_t refresh_mhz) {
+    vgpu_producer_t* p = rv->producer;
+    /* seqlock: even -> odd (writing) */
+    vgpu_store_release32(&p->geom_seq, p->geom_seq + 1u);
+    vgpu_compiler_barrier();   /* field stores below must not be hoisted above the odd mark */
+    p->virt_x = virt_x; p->virt_y = virt_y;
+    p->virt_w = virt_w; p->virt_h = virt_h;
+    p->cap_x  = cap_x;  p->cap_y  = cap_y;
+    p->dpi    = dpi;    p->refresh_mhz = refresh_mhz;
+    vgpu_sfence();             /* same fence placement as the frame seqlock in vgpu_publish_frame */
+    /* seqlock: odd -> even (stable) */
+    vgpu_store_release32(&p->geom_seq, p->geom_seq + 1u);
+    vgpu_sfence();
+}
diff --git a/src/si/vgpu-stream/win32/capture-win32.h b/src/si/vgpu-stream/win32/capture-win32.h
new file mode 100644
index 0000000..a1939a0
--- /dev/null
+++ b/src/si/vgpu-stream/win32/capture-win32.h
@@ -0,0 +1,19 @@
+#ifndef VGPU_CAPTURE_WIN32_H
+#define VGPU_CAPTURE_WIN32_H
+
+/* capture-win32.h — private win32 plumbing shared by the capture backends.
+ * Not part of the OS-agnostic capture seam (see src/si/vgpu-stream/include/capture.h):
+ * it depends on the win32 vgpu_ctx and the thread-handoff convention. */
+
+#include "ctx.h"            /* win32 vgpu_ctx (full definition) */
+
+/* Thread argument passed to capture threads via LPVOID. Heap-allocated by the
+ * backend's *_start, owned and freed by the thread. Carries the explicit ctx
+ * (no global state) plus per-backend state pointer. */
+typedef struct {
+    vgpu_ctx* ctx;             /* explicit runtime context for the thread */
+    int       fps;             /* presumably the fps given to *_start — TODO confirm against the backends */
+    void*     backend_state;   /* opaque per-backend handle block */
+} capture_thread_arg;
+
+#endif /* VGPU_CAPTURE_WIN32_H */
diff --git a/src/si/vgpu-stream/win32/capture.c b/src/si/vgpu-stream/win32/capture.c
new file mode 100644
index 0000000..3efab8a
--- /dev/null
+++ b/src/si/vgpu-stream/win32/capture.c
@@ -0,0 +1,19 @@
+/* capture.c — win32 registration of the capture backends into the neutral
+ * capture seam's backend table (data-driven; no per-backend branching). */
+
+#include "capture.h"       /* neutral seam: capture_backend / capture_backends */
+#include "capture_nvfbc.h"
+#include "capture_dda.h"
+#include "capture_gdi.h"
+
+/* backend registry behind the neutral seam; main selects by EYES env or first available */
+static const capture_backend g_backends[] = {
+    { .name = "nvfbc", .start = nvfbc_start },
+    { .name = "dda",   .start = dda_start   },
+    { .name = "gdi",   .start = gdi_start   },
+};
+
+const capture_backend* capture_backends(int* count) {
+    *count = (int)(sizeof(g_backends) / sizeof(*g_backends));   /* number of table entries */
+    return &g_backends[0];
+}
diff --git a/src/si/vgpu-stream/win32/capture_dda.c b/src/si/vgpu-stream/win32/capture_dda.c
new file mode 100644
index 0000000..74e6d24
--- /dev/null
+++ b/src/si/vgpu-stream/win32/capture_dda.c
@@ -0,0 +1,198 @@
+#define WIN32_LEAN_AND_MEAN
+#define COBJMACROS
+#include <windows.h>
+#include <d3d11.h>
+#include <dxgi1_2.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "capture_dda.h"
+#include "capture-win32.h"   /* capture_thread_arg (win32-private) */
+#include "present.h"
+#include "cursor.h"          /* cursor_resolve_id + ctx->cursor compose state */
+#include "geometry.h"        /* reactive geometry resample on recreate */
+#include "stream.h"          /* vgpu_publish_cursor / vgpu_publish_cursor_shape */
+
+typedef struct {
+    ID3D11Device*           dev;                /* D3D11 device; also passed to DuplicateOutput */
+    ID3D11DeviceContext*    dctx;               /* context used for CopyResource / Map / Unmap */
+    IDXGIOutput1*           out1;               /* output being duplicated (recreates dup on loss) */
+    IDXGIOutputDuplication* dup;                /* live duplication; NULL while lost */
+    ID3D11Texture2D*        staging;            /* CopyResource destination, mapped with D3D11_MAP_READ */
+    UINT                    W, H;               /* frame size handed to vgpu_present_submit */
+    int32_t                 cap_x, cap_y;       /* captured output origin (virt coords) */
+    UINT64                  last_mouse_update;  /* shape-gate by fi.LastMouseUpdateTime */
+    int                     seeded;             /* cold-start position seed done */
+} dda_state;
+
+/* Source the cursor from the already-fetched frame info (0 syscalls for position) and publish
+ * it under the cursor_seq gate. fi->PointerPosition is only meaningful on frames that carry a
+ * mouse update (LastMouseUpdateTime != 0); on every other frame the last known position and
+ * visibility are kept, seeded once at cold start via one GetCursorInfo. The shape is re-extracted
+ * only when the update stamp changed (shape-gate). ctx->cursor compose fields are written under
+ * ctx->lock; the producer-block publish uses release/seq, no lock. */
+static void dda_source_cursor(vgpu_ctx* ctx, dda_state* st,
+                              const DXGI_OUTDUPL_FRAME_INFO* fi) {
+    const UINT64 upd = (UINT64)fi->LastMouseUpdateTime.QuadPart;
+    int vis, x, y;
+
+    if (upd != 0) {   /* frame carries a mouse update: the pointer fields are valid */
+        vis = fi->PointerPosition.Visible ? 1 : 0;
+        x = fi->PointerPosition.Position.x; y = fi->PointerPosition.Position.y;
+    } else {          /* no update: fi->PointerPosition must be ignored — keep the last known state */
+        vis = ctx->cursor.visible;
+        x = ctx->cursor.x; y = ctx->cursor.y;
+        if (!st->seeded) {   /* cold start: nothing sourced yet, ask the OS once */
+            CURSORINFO ci; ci.cbSize = sizeof ci;
+            if (GetCursorInfo(&ci)) {
+                vis = (ci.flags & CURSOR_SHOWING) != 0;
+                x = ci.ptScreenPos.x; y = ci.ptScreenPos.y;
+            }
+        }
+    }
+    st->seeded = 1;
+
+    /* shape-gate: re-extract only when the mouse-update stamp advanced */
+    if (upd != 0 && upd != st->last_mouse_update) {
+        CURSORINFO ci; ci.cbSize = sizeof ci;
+        if (GetCursorInfo(&ci) && ci.hCursor && ci.hCursor != ctx->cursor.handle) {
+            EnterCriticalSection(&ctx->lock);
+            cursor_apply_shape(ctx, ci.hCursor);
+            LeaveCriticalSection(&ctx->lock);
+        }
+        st->last_mouse_update = upd;
+    }
+
+    EnterCriticalSection(&ctx->lock);
+    ctx->cursor.visible = vis;
+    ctx->cursor.x = x; ctx->cursor.y = y;
+    uint32_t hx = (uint32_t)ctx->cursor.hot_x, hy = (uint32_t)ctx->cursor.hot_y;
+    uint32_t gw = (uint32_t)ctx->cursor.gw,    gh = (uint32_t)ctx->cursor.gh;
+    uint32_t cid = (uint32_t)ctx->cursor.cursor_id;
+    LeaveCriticalSection(&ctx->lock);
+
+    vgpu_publish_cursor_shape(&ctx->view, hx, hy, gw, gh, cid);
+    vgpu_publish_cursor(&ctx->view, (int32_t)x, (int32_t)y, (uint32_t)vis);
+}
+
+/* Capture thread: takes ownership of arg (freed here), then loops forever acquiring and submitting frames. */
+static DWORD WINAPI dda_thread(LPVOID param) {
+    capture_thread_arg* arg = (capture_thread_arg*)param;
+    vgpu_ctx*  ctx = arg->ctx;
+    dda_state* st  = (dda_state*)arg->backend_state;
+    free(arg);
+
+    for (;;) {
+        DXGI_OUTDUPL_FRAME_INFO fi;
+        IDXGIResource* res = NULL;
+        if (!st->dup) {   /* lost earlier and not yet recreated — retry here, never call through a NULL dup */
+            if (FAILED(st->out1->lpVtbl->DuplicateOutput(st->out1, (IUnknown*)st->dev, &st->dup))) {
+                st->dup = NULL; Sleep(200); continue;
+            }
+            /* display config may have changed across the access loss → resample geometry */
+            geometry_sample_and_publish(ctx, st->cap_x, st->cap_y);
+        }
+        HRESULT hr = st->dup->lpVtbl->AcquireNextFrame(st->dup, 1000, &fi, &res);
+        if (hr == DXGI_ERROR_WAIT_TIMEOUT) continue;
+        if (hr == DXGI_ERROR_ACCESS_LOST) {   /* drop the dead duplication; the next iteration recreates it */
+            st->dup->lpVtbl->Release(st->dup); st->dup = NULL; continue;
+        }
+        if (FAILED(hr)) { Sleep(50); continue; }
+
+        dda_source_cursor(ctx, st, &fi);
+
+        ID3D11Texture2D* tex = NULL;
+        res->lpVtbl->QueryInterface(res, &IID_ID3D11Texture2D, (void**)&tex);
+        if (tex) {
+            st->dctx->lpVtbl->CopyResource(st->dctx,
+                (ID3D11Resource*)st->staging, (ID3D11Resource*)tex);
+            D3D11_MAPPED_SUBRESOURCE m;
+            if (SUCCEEDED(st->dctx->lpVtbl->Map(st->dctx,
+                    (ID3D11Resource*)st->staging, 0, D3D11_MAP_READ, 0, &m))) {
+                vgpu_present_submit(ctx, (const uint8_t*)m.pData, st->W, st->H, m.RowPitch);
+                st->dctx->lpVtbl->Unmap(st->dctx, (ID3D11Resource*)st->staging, 0);
+            }
+            tex->lpVtbl->Release(tex);
+        }
+        if (res) res->lpVtbl->Release(res);
+        st->dup->lpVtbl->ReleaseFrame(st->dup);
+    }
+    return 0;  /* unreachable; satisfies -Wreturn-type */
+}
+
+/* Bring up the DXGI Desktop Duplication backend.
+ *
+ * Creates a hardware D3D11 device, duplicates output 0 of the device's adapter, allocates a
+ * CPU-readable BGRA staging texture of the duplicated size and launches dda_thread.
+ * ctx: context the capture thread publishes into; fps is only forwarded in the thread arg.
+ * Returns 1 once the thread has been created, 0 on any failure (every COM object and the
+ * state block acquired so far are released). */
+int dda_start(vgpu_ctx* ctx, int fps) {
+    (void)fps;
+    dda_state* st = (dda_state*)calloc(1, sizeof *st);
+    if (st == NULL) return 0;
+
+    /* device + device context */
+    D3D_FEATURE_LEVEL level;
+    HRESULT hr = D3D11CreateDevice(NULL, D3D_DRIVER_TYPE_HARDWARE, NULL, 0, NULL, 0,
+                                   D3D11_SDK_VERSION, &st->dev, &level, &st->dctx);
+    if (FAILED(hr)) {
+        fprintf(stderr, "eyes(dda): D3D11CreateDevice failed\n");
+        goto fail;
+    }
+
+    /* device -> adapter -> output 0 -> IDXGIOutput1; each step only runs if the last worked */
+    IDXGIDevice*  dxgi = NULL;
+    IDXGIAdapter* adp  = NULL;
+    IDXGIOutput*  out0 = NULL;
+    st->dev->lpVtbl->QueryInterface(st->dev, &IID_IDXGIDevice, (void**)&dxgi);
+    if (dxgi != NULL) dxgi->lpVtbl->GetAdapter(dxgi, &adp);
+    if (adp != NULL)  adp->lpVtbl->EnumOutputs(adp, 0, &out0);
+    if (out0 != NULL) {
+        DXGI_OUTPUT_DESC desc;
+        if (SUCCEEDED(out0->lpVtbl->GetDesc(out0, &desc))) {
+            /* origin of the captured output in desktop coordinates */
+            st->cap_x = (int32_t)desc.DesktopCoordinates.left;
+            st->cap_y = (int32_t)desc.DesktopCoordinates.top;
+        }
+        out0->lpVtbl->QueryInterface(out0, &IID_IDXGIOutput1, (void**)&st->out1);
+        out0->lpVtbl->Release(out0);
+    }
+    if (adp != NULL)  adp->lpVtbl->Release(adp);
+    if (dxgi != NULL) dxgi->lpVtbl->Release(dxgi);
+
+    /* duplicate the output (fails as well when no IDXGIOutput1 was obtained) */
+    int duplicated = 0;
+    if (st->out1 != NULL) {
+        hr = st->out1->lpVtbl->DuplicateOutput(st->out1, (IUnknown*)st->dev, &st->dup);
+        duplicated = SUCCEEDED(hr);
+    }
+    if (!duplicated) {
+        fprintf(stderr, "eyes(dda): DuplicateOutput failed\n");
+        goto fail;
+    }
+
+    /* capture size comes from the duplication's mode description */
+    DXGI_OUTDUPL_DESC dupl;
+    st->dup->lpVtbl->GetDesc(st->dup, &dupl);
+    st->W = dupl.ModeDesc.Width;
+    st->H = dupl.ModeDesc.Height;
+
+    /* staging texture the capture thread maps for CPU read */
+    D3D11_TEXTURE2D_DESC tex;
+    memset(&tex, 0, sizeof tex);
+    tex.Width            = st->W;
+    tex.Height           = st->H;
+    tex.MipLevels        = 1;
+    tex.ArraySize        = 1;
+    tex.Format           = DXGI_FORMAT_B8G8R8A8_UNORM;
+    tex.SampleDesc.Count = 1;
+    tex.Usage            = D3D11_USAGE_STAGING;
+    tex.CPUAccessFlags   = D3D11_CPU_ACCESS_READ;
+    hr = st->dev->lpVtbl->CreateTexture2D(st->dev, &tex, NULL, &st->staging);
+    if (FAILED(hr)) {
+        fprintf(stderr, "eyes(dda): CreateTexture2D failed\n");
+        goto fail;
+    }
+
+    capture_thread_arg* targ = (capture_thread_arg*)malloc(sizeof *targ);
+    if (targ == NULL) goto fail;
+    targ->ctx           = ctx;
+    targ->fps           = fps;
+    targ->backend_state = st;
+
+    /* DDA frames are content-only → presenter draws cursor */
+    ctx->backend = VGPU_BK_DDA;
+    ctx->draw_cursor_cap = 1;
+
+    HANDLE th = CreateThread(NULL, 0, dda_thread, targ, 0, NULL);
+    if (th == NULL) {
+        free(targ);
+        goto fail;
+    }
+    CloseHandle(th);
+
+    fprintf(stderr, "eyes(dda): desktop %ux%u (content-only; cursor by presenter)\n",
+            st->W, st->H);
+    return 1;
+
+fail:
+    /* unwind in reverse acquisition order; calloc left unacquired members NULL */
+    if (st->staging != NULL) st->staging->lpVtbl->Release(st->staging);
+    if (st->dup != NULL)     st->dup->lpVtbl->Release(st->dup);
+    if (st->out1 != NULL)    st->out1->lpVtbl->Release(st->out1);
+    if (st->dctx != NULL)    st->dctx->lpVtbl->Release(st->dctx);
+    if (st->dev != NULL)     st->dev->lpVtbl->Release(st->dev);
+    free(st);
+    return 0;
+}
diff --git a/src/si/vgpu-stream/win32/capture_dda.h b/src/si/vgpu-stream/win32/capture_dda.h
new file mode 100644
index 0000000..8031c26
--- /dev/null
+++ b/src/si/vgpu-stream/win32/capture_dda.h
@@ -0,0 +1,10 @@
+#ifndef VGPU_CAPTURE_DDA_H
+#define VGPU_CAPTURE_DDA_H
+
+/* capture_dda.h — DXGI Desktop Duplication capture backend (win32). */
+
+#include "ctx.h"           /* win32 vgpu_ctx */
+
+int dda_start(vgpu_ctx* ctx, int fps);   /* returns 1 once the capture thread is created, 0 on failure */
+
+#endif /* VGPU_CAPTURE_DDA_H */
diff --git a/src/si/vgpu-stream/win32/capture_gdi.c b/src/si/vgpu-stream/win32/capture_gdi.c
new file mode 100644
index 0000000..71df921
--- /dev/null
+++ b/src/si/vgpu-stream/win32/capture_gdi.c
@@ -0,0 +1,79 @@
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include "capture_gdi.h"
+#include "capture-win32.h"   /* capture_thread_arg (win32-private) */
+#include "present.h"
+#include "cursor.h"          /* cursor_sample (position+shape+id) for compose+publish */
+#include "geometry.h"        /* reactive geometry resample on capture-size change */
+#include "stream.h"          /* vgpu_publish_cursor / vgpu_publish_cursor_shape */
+
+/* GDI capture thread: BitBlt the primary screen into a top-down 32-bit DIB and submit it.
+ *
+ * param: heap-allocated capture_thread_arg; unpacked and freed here. arg->fps <= 0 falls back
+ * to 30. Never returns in practice (infinite loop).
+ *
+ * The DIB is (re)created whenever the primary screen size changes; geometry is re-published at
+ * that point. After each frame the cursor is sampled under ctx->lock and published. */
+static DWORD WINAPI gdi_thread(LPVOID param) {
+    capture_thread_arg* arg = (capture_thread_arg*)param;
+    vgpu_ctx* ctx = arg->ctx;
+    int fps = arg->fps > 0 ? arg->fps : 30;
+    free(arg);
+
+    HDC screen = GetDC(NULL);
+    HDC mem = CreateCompatibleDC(screen);
+    HBITMAP dib = NULL;
+    HGDIOBJ stock_bmp = NULL;   /* bitmap that was selected into mem before our first DIB */
+    void* bits = NULL;
+    int W = 0, H = 0;
+    DWORD interval = (DWORD)(1000 / fps);
+    if (interval == 0) interval = 1;   /* fps > 1000 would otherwise spin on Sleep(0) */
+
+    for (;;) {
+        int w = GetSystemMetrics(SM_CXSCREEN), h = GetSystemMetrics(SM_CYSCREEN);
+        if (w <= 0 || h <= 0) { Sleep(200); continue; }
+        if (w != W || h != H || !dib) {
+            if (dib) {
+                /* DeleteObject fails on a bitmap that is still selected into a DC, which
+                 * would leak the old DIB on every size change: deselect it first */
+                if (stock_bmp) SelectObject(mem, stock_bmp);
+                DeleteObject(dib);
+                dib = NULL;
+            }
+            BITMAPINFO bi; memset(&bi, 0, sizeof bi);
+            bi.bmiHeader.biSize = sizeof(BITMAPINFOHEADER);
+            bi.bmiHeader.biWidth = w; bi.bmiHeader.biHeight = -h;   /* negative = top-down */
+            bi.bmiHeader.biPlanes = 1; bi.bmiHeader.biBitCount = 32;
+            bi.bmiHeader.biCompression = BI_RGB;
+            dib = CreateDIBSection(screen, &bi, DIB_RGB_COLORS, &bits, NULL, 0);
+            if (!dib) {
+                fprintf(stderr, "eyes(gdi): CreateDIBSection %dx%d failed\n", w, h);
+                Sleep(200); continue;
+            }
+            HGDIOBJ prev = SelectObject(mem, dib);
+            if (!stock_bmp) stock_bmp = prev;   /* remember the DC's original bitmap once */
+            W = w; H = h;
+            fprintf(stderr, "eyes(gdi): desktop %dx%d (BitBlt; cursor by presenter)\n", W, H);
+            /* capture size changed (primary at origin (0,0)) → resample geometry */
+            geometry_sample_and_publish(ctx, 0, 0);
+        }
+        if (BitBlt(mem, 0, 0, W, H, screen, 0, 0, SRCCOPY))
+            vgpu_present_submit(ctx, (const uint8_t*)bits,
+                                (uint32_t)W, (uint32_t)H, (uint32_t)W * 4u);
+
+        /* source the cursor for present's compositing (under ctx->lock) and publish it */
+        EnterCriticalSection(&ctx->lock);
+        cursor_sample(ctx);
+        uint32_t hx = (uint32_t)ctx->cursor.hot_x, hy = (uint32_t)ctx->cursor.hot_y;
+        uint32_t gw = (uint32_t)ctx->cursor.gw,    gh = (uint32_t)ctx->cursor.gh;
+        uint32_t cid = (uint32_t)ctx->cursor.cursor_id;
+        int32_t  cx = (int32_t)ctx->cursor.x, cy = (int32_t)ctx->cursor.y;
+        uint32_t cvis = (uint32_t)(ctx->cursor.visible != 0);
+        LeaveCriticalSection(&ctx->lock);
+        vgpu_publish_cursor_shape(&ctx->view, hx, hy, gw, gh, cid);
+        vgpu_publish_cursor(&ctx->view, cx, cy, cvis);
+
+        Sleep(interval);
+    }
+    return 0;  /* unreachable; satisfies -Wreturn-type */
+}
+
+/* Start the GDI BitBlt backend: mark ctx as GDI with presenter-drawn cursor and launch
+ * gdi_thread with the requested fps. Returns 1 when the thread was created, 0 otherwise. */
+int gdi_start(vgpu_ctx* ctx, int fps) {
+    /* GDI BitBlt excludes cursor → presenter draws it */
+    ctx->backend = VGPU_BK_GDI;
+    ctx->draw_cursor_cap = 1;
+
+    capture_thread_arg* targ = (capture_thread_arg*)malloc(sizeof *targ);
+    if (targ == NULL) return 0;
+    targ->ctx           = ctx;
+    targ->fps           = fps;
+    targ->backend_state = NULL;
+
+    HANDLE th = CreateThread(NULL, 0, gdi_thread, targ, 0, NULL);
+    if (th != NULL) {
+        /* the thread owns targ from here on; only the handle is dropped */
+        CloseHandle(th);
+        return 1;
+    }
+    free(targ);
+    return 0;
+}
diff --git a/src/si/vgpu-stream/win32/capture_gdi.h b/src/si/vgpu-stream/win32/capture_gdi.h
new file mode 100644
index 0000000..dae936a
--- /dev/null
+++ b/src/si/vgpu-stream/win32/capture_gdi.h
@@ -0,0 +1,10 @@
+#ifndef VGPU_CAPTURE_GDI_H
+#define VGPU_CAPTURE_GDI_H
+
+/* capture_gdi.h — GDI BitBlt capture backend (win32, universal fallback). */
+
+#include "ctx.h"           /* win32 vgpu_ctx */
+
+int gdi_start(vgpu_ctx* ctx, int fps);   /* returns 1 once the capture thread is created, 0 on failure; fps <= 0 captures at 30 */
+
+#endif /* VGPU_CAPTURE_GDI_H */
diff --git a/src/si/vgpu-stream/win32/capture_nvfbc.c b/src/si/vgpu-stream/win32/capture_nvfbc.c
new file mode 100644
index 0000000..0086c22
--- /dev/null
+++ b/src/si/vgpu-stream/win32/capture_nvfbc.c
@@ -0,0 +1,162 @@
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include "capture_nvfbc.h"
+#include "capture-win32.h"   /* capture_thread_arg (win32-private) */
+#include "present.h"
+#include "cursor.h"          /* cursor_apply_shape / ctx->cursor */
+#include "geometry.h"        /* reactive geometry resample on recreate */
+#include "stream.h"          /* vgpu_publish_cursor / vgpu_publish_cursor_shape */
+#include "nvfbc_tosys_c.h"
+
+typedef struct {
+    NvFBCToSys_c*              fbc;           /* current ToSys grab session; rewritten by the capture thread on recreate */
+    void*                      buf;           /* frame buffer pointer filled in by NvFBCToSysSetUp through ppBuffer */
+    NvFBC_CreateFunctionExType create;        /* NvFBC_CreateEx entry point, kept so the session can be recreated */
+    HCURSOR                    last_handle;   /* shape-gate by HCURSOR change */
+} nvfbc_state;
+
+/* Publish the cursor that goes with one NvFBC grab (under the cursor_seq gate).
+ * Visibility comes from the grab info (gi->bHWMouseVisible); NvFBC does not expose a position,
+ * so a single GetCursorInfo per frame supplies x/y, and the previous position is kept when
+ * that call fails. The shape is re-extracted only when the HCURSOR differs from the last one
+ * seen. NvFBC composites the cursor itself (draw_cursor_cap==0) → present never reads
+ * ctx->cursor for drawing, so no ctx->lock is taken around the compose fields here.
+ * gi.bProtectedContent / gi.dwSourcePID are available but out of scope (not in the contract). */
+static void nvfbc_source_cursor(vgpu_ctx* ctx, nvfbc_state* st,
+                                const NvFBCFrameGrabInfo* gi) {
+    vgpu_cursor_t* cur = &ctx->cursor;
+    const int shown = gi->bHWMouseVisible ? 1 : 0;
+    int px = cur->x;
+    int py = cur->y;
+
+    CURSORINFO info;
+    info.cbSize = sizeof info;
+    if (GetCursorInfo(&info)) {
+        px = info.ptScreenPos.x;
+        py = info.ptScreenPos.y;
+        const int shape_changed = info.hCursor != NULL && info.hCursor != st->last_handle;
+        if (shape_changed) {
+            cursor_apply_shape(ctx, info.hCursor);
+            st->last_handle = info.hCursor;
+        }
+    }
+
+    cur->visible = shown;
+    cur->x = px;
+    cur->y = py;
+
+    vgpu_publish_cursor_shape(&ctx->view,
+                              (uint32_t)cur->hot_x, (uint32_t)cur->hot_y,
+                              (uint32_t)cur->gw,    (uint32_t)cur->gh,
+                              (uint32_t)cur->cursor_id);
+    vgpu_publish_cursor(&ctx->view, (int32_t)px, (int32_t)py, (uint32_t)shown);
+}
+
+/* Create an NvFBC ToSys session on adapter 0 and set it up for ARGB grabs with the HW cursor.
+ * pCreate: the NvFBC_CreateEx entry point. ppBuf: receives the frame buffer pointer; it is
+ * reset to NULL once creation succeeded and left untouched if creation itself fails.
+ * Returns the session, or NULL on failure (a session whose setup failed is released). */
+static NvFBCToSys_c* nvfbc_create(NvFBC_CreateFunctionExType pCreate, void** ppBuf) {
+    NvFBCCreateParams create_params;
+    memset(&create_params, 0, sizeof create_params);
+    create_params.dwVersion       = NVFBC_CREATE_PARAMS_VER;
+    create_params.dwInterfaceType = NVFBC_TO_SYS_C;
+    create_params.dwAdapterIdx    = 0;
+    if (pCreate(&create_params) != NVFBC_SUCCESS) return NULL;
+    if (create_params.pNvFBC == NULL) return NULL;
+
+    NvFBCToSys_c* session = (NvFBCToSys_c*)create_params.pNvFBC;
+    *ppBuf = NULL;
+
+    NVFBC_TOSYS_SETUP_PARAMS_C setup;
+    memset(&setup, 0, sizeof setup);
+    setup.dwVersion = NVFBC_TOSYS_SETUP_PARAMS_VER_C;
+    setup.bits      = 1u;                 /* bWithHWCursor = 1 (bit 0) */
+    setup.eMode     = NVFBC_TOSYS_ARGB;
+    setup.ppBuffer  = ppBuf;
+
+    NVFBCRESULT rc = session->lpVtbl->NvFBCToSysSetUp(session, &setup);
+    if (rc == NVFBC_SUCCESS && *ppBuf != NULL)
+        return session;
+
+    /* setup failed or produced no buffer: do not leak the session */
+    session->lpVtbl->NvFBCToSysRelease(session);
+    return NULL;
+}
+
+/* NvFBC capture thread: grabs full-source frames with a 1000 ms wait and submits them.
+ * param: heap-allocated capture_thread_arg; unpacked and freed here. Never returns in practice.
+ * A failed grab either triggers a session recreate (invalidated session or bMustRecreate set,
+ * retried every 200 ms until it works, then geometry is re-published) or a 50 ms back-off. */
+static DWORD WINAPI nvfbc_thread(LPVOID param) {
+    capture_thread_arg* targ = (capture_thread_arg*)param;
+    vgpu_ctx*    ctx = targ->ctx;
+    nvfbc_state* st  = (nvfbc_state*)targ->backend_state;
+    free(targ);
+
+    NvFBCToSys_c* fbc = st->fbc;
+    void*         buf = st->buf;
+
+    for (;;) {
+        NvFBCFrameGrabInfo gi;
+        NVFBC_TOSYS_GRAB_FRAME_PARAMS_C gp;
+        memset(&gi, 0, sizeof gi);
+        memset(&gp, 0, sizeof gp);
+        gp.dwVersion           = NVFBC_TOSYS_GRAB_FRAME_PARAMS_VER_C;
+        gp.dwFlags             = NVFBC_TOSYS_WAIT_WITH_TIMEOUT_C;
+        gp.dwWaitTime          = 1000;
+        gp.eGMode              = NVFBC_TOSYS_SOURCEMODE_FULL;
+        gp.pNvFBCFrameGrabInfo = &gi;
+
+        NVFBCRESULT rc = fbc->lpVtbl->NvFBCToSysGrabFrame(fbc, &gp);
+        if (rc == NVFBC_SUCCESS) {
+            /* frames with a zero dimension are not submitted; the cursor is still published */
+            if (gi.dwWidth && gi.dwHeight)
+                vgpu_present_submit(ctx, (const uint8_t*)buf,
+                                    gi.dwWidth, gi.dwHeight, gi.dwBufferWidth * 4u);
+            nvfbc_source_cursor(ctx, st, &gi);
+            continue;
+        }
+
+        if (rc != NVFBC_ERROR_INVALIDATED_SESSION && !gi.bMustRecreate) {
+            Sleep(50);
+            continue;
+        }
+
+        fprintf(stderr, "eyes(nvfbc): session invalidated (r=%d), recreating\n", (int)rc);
+        fbc->lpVtbl->NvFBCToSysRelease(fbc);
+        for (fbc = nvfbc_create(st->create, &buf);
+             fbc == NULL;
+             fbc = nvfbc_create(st->create, &buf))
+            Sleep(200);
+        st->fbc = fbc;
+        st->buf = buf;
+        /* grab session was recreated → display config may have changed: resample */
+        geometry_sample_and_publish(ctx, 0, 0);
+    }
+    return 0;  /* unreachable; satisfies -Wreturn-type */
+}
+
+/* Start the NvFBC ToSys backend.
+ *
+ * Loads NvFBC64.dll, resolves its entry points, enables NvFBC, checks that capture is possible
+ * on adapter 0, creates the grab session and launches nvfbc_thread.
+ * ctx: context the capture thread publishes into; fps is only forwarded in the thread arg.
+ * Returns 1 once the thread has been created. Returns 0 on any failure, in which case the
+ * session, the state blocks and the loaded library are all released so a fallback backend
+ * starts from a clean process state. */
+int nvfbc_start(vgpu_ctx* ctx, int fps) {
+    NvFBCToSys_c*       fbc = NULL;
+    nvfbc_state*        st  = NULL;
+    capture_thread_arg* arg = NULL;
+    void*               buf = NULL;
+
+    HMODULE lib = LoadLibraryA("NvFBC64.dll");
+    if (!lib) {
+        fprintf(stderr, "eyes(nvfbc): LoadLibrary NvFBC64.dll failed (%lu)\n", GetLastError());
+        return 0;
+    }
+    NvFBC_SetGlobalFlagsType    pSetFlags = (NvFBC_SetGlobalFlagsType)(void*)GetProcAddress(lib, "NvFBC_SetGlobalFlags");
+    NvFBC_EnableFunctionType    pEnable   = (NvFBC_EnableFunctionType)(void*)GetProcAddress(lib, "NvFBC_Enable");
+    NvFBC_CreateFunctionExType  pCreate   = (NvFBC_CreateFunctionExType)(void*)GetProcAddress(lib, "NvFBC_CreateEx");
+    NvFBC_GetStatusExFunctionType pStatus = (NvFBC_GetStatusExFunctionType)(void*)GetProcAddress(lib, "NvFBC_GetStatusEx");
+    if (!pEnable || !pCreate || !pStatus) {
+        fprintf(stderr, "eyes(nvfbc): missing exports\n");
+        goto fail;
+    }
+    /* NvFBC_SetGlobalFlags is optional: only called when the export exists */
+    if (pSetFlags) pSetFlags(NVFBC_GLOBAL_FLAGS_NO_INITIAL_REFRESH);
+    if (pEnable(NVFBC_STATE_ENABLE) != NVFBC_SUCCESS) {
+        fprintf(stderr, "eyes(nvfbc): NvFBC_Enable failed\n");
+        goto fail;
+    }
+    NvFBCStatusEx stx; memset(&stx, 0, sizeof stx);
+    stx.dwVersion = NVFBC_STATUS_VER; stx.dwAdapterIdx = 0;
+    if (pStatus(&stx) != NVFBC_SUCCESS || !stx.bIsCapturePossible) {
+        fprintf(stderr, "eyes(nvfbc): capture NOT possible on this GPU/license\n");
+        goto fail;
+    }
+    fbc = nvfbc_create(pCreate, &buf);
+    if (!fbc) {
+        fprintf(stderr, "eyes(nvfbc): CreateEx/ToSysSetUp failed\n");
+        goto fail;
+    }
+
+    st = (nvfbc_state*)malloc(sizeof *st);
+    if (!st) goto fail;
+    st->fbc = fbc; st->buf = buf; st->create = pCreate; st->last_handle = NULL;
+
+    arg = (capture_thread_arg*)malloc(sizeof *arg);
+    if (!arg) goto fail;
+    arg->ctx = ctx; arg->fps = fps; arg->backend_state = st;
+
+    ctx->backend = VGPU_BK_NVFBC;
+    ctx->draw_cursor_cap = 0;   /* NvFBC composites HW cursor itself */
+
+    HANDLE t = CreateThread(NULL, 0, nvfbc_thread, arg, 0, NULL);
+    if (!t) goto fail;
+    CloseHandle(t);
+
+    fprintf(stderr, "eyes(nvfbc): session up (ToSys ARGB/BGRA), iface=0x%lx\n",
+            (unsigned long)stx.dwNvFBCVersion);
+    return 1;
+
+fail:
+    /* no thread is running on this path, so nothing else references these objects */
+    if (fbc) fbc->lpVtbl->NvFBCToSysRelease(fbc);
+    free(arg);
+    free(st);
+    FreeLibrary(lib);   /* drop the DLL reference taken by LoadLibraryA above */
+    return 0;
+}
diff --git a/src/si/vgpu-stream/win32/capture_nvfbc.h b/src/si/vgpu-stream/win32/capture_nvfbc.h
new file mode 100644
index 0000000..d1a5dcf
--- /dev/null
+++ b/src/si/vgpu-stream/win32/capture_nvfbc.h
@@ -0,0 +1,10 @@
+#ifndef VGPU_CAPTURE_NVFBC_H
+#define VGPU_CAPTURE_NVFBC_H
+
+/* capture_nvfbc.h — NVIDIA NvFBC ToSys capture backend (win32). */
+
+#include "ctx.h"           /* win32 vgpu_ctx */
+
+int nvfbc_start(vgpu_ctx* ctx, int fps);   /* returns 1 once the capture thread is created, 0 when NvFBC is unavailable or setup fails */
+
+#endif /* VGPU_CAPTURE_NVFBC_H */
diff --git a/src/si/vgpu-stream/win32/ctx.h b/src/si/vgpu-stream/win32/ctx.h
new file mode 100644
index 0000000..977604c
--- /dev/null
+++ b/src/si/vgpu-stream/win32/ctx.h
@@ -0,0 +1,66 @@
+#ifndef VGPU_CTX_H
+#define VGPU_CTX_H
+
+/* ctx.h — win32 runtime context. Embeds the neutral region-view (the engine's
+ * borrowed handle onto the contract) alongside win32-owned staging/cursor/sync
+ * state. Object = memory: ctx owns the staging arena and cursor state. */
+
+#include <stdint.h>
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include "stream.h"        /* vgpu_region_view (neutral contract handle) */
+#include "region.h"        /* vgpu_region_t (win32 pinned region) */
+
+/*
+ * vgpu_ctx — the explicitly-passed context. Replaces all former g_* shared
+ * state. Object = memory: ctx owns the producer staging arena and cursor
+ * state; capture threads receive a vgpu_ctx* via their LPVOID thread param.
+ *
+ * Staging is a fixed arena sized for the max mode (no STL, no per-frame
+ * malloc). content_buf holds the latest submitted desktop; frame_buf is the
+ * composed (cursor-drawn) frame the publisher copies into a ring slot.
+ */
+
+#define VGPU_STAGING_BYTES ((size_t)VGPU_MAX_WIDTH * VGPU_MAX_HEIGHT * 4u)
+
+/* Cursor sample/compose state (GDI). Fixed buffers, no heap. Filled by cursor_sample / cursor_apply_shape, read by cursor_draw. */
+typedef struct {
+    HCURSOR  handle;          /* HCURSOR whose shape is currently extracted; compared to detect shape changes */
+    int      visible;         /* nonzero = cursor is showing; cursor_draw skips drawing when 0 */
+    int      x, y;            /* cursor position (hotspot point); glyph origin is x-hot_x, y-hot_y */
+    int      hot_x, hot_y;    /* hotspot offset inside the glyph, from ICONINFO */
+    int      gw, gh;          /* glyph dims */
+    int      cursor_id;       /* VGPU_CURSOR_ID_* resolved on shape change */
+    int      mono;            /* 1 = AND/XOR monochrome cursor */
+    uint8_t* bgra;            /* color cursor BGRA (arena) */
+    uint8_t* and_mask;        /* mono AND (arena), one byte (0/1) per pixel */
+    uint8_t* xor_mask;        /* mono XOR (arena), one byte (0/1) per pixel */
+} vgpu_cursor_t;
+
+typedef struct vgpu_ctx {
+    /* neutral contract handle (borrowed from region) — engine publishes through
+     * this; win32 code reads region blocks via view.producer / view.control */
+    vgpu_region_view view;
+
+    /* producer staging arena (owned) */
+    uint8_t*         arena;          /* one VirtualAlloc block for all buffers */
+    size_t           arena_bytes;    /* presumably the byte size of arena — TODO confirm against the allocator */
+    uint8_t*         content_buf;    /* latest submitted desktop, tight BGRA */
+    uint8_t*         frame_buf;      /* composed frame to publish, tight BGRA */
+
+    /* submit handoff (capture thread -> publish pump) */
+    CRITICAL_SECTION lock;           /* also taken by the DDA/GDI capture threads while they update cursor */
+    HANDLE           submit_event;   /* NOTE(review): presumably signalled on submit — confirm in present.c */
+    int64_t          content_seq;    /* bumped on every submit */
+    uint32_t         content_w, content_h;   /* presumably the dims of content_buf's frame — TODO confirm */
+
+    /* cursor */
+    vgpu_cursor_t    cursor;         /* sampled cursor state; written by the capture backend */
+
+    /* runtime config (resolved from control) */
+    uint32_t         default_fps;    /* fps from CLI; used when target_fps==0 */
+    uint32_t         backend;        /* VGPU_BK_* chosen */
+    int              draw_cursor_cap; /* backend capability: does it need SW cursor (1 = presenter draws it, 0 = backend composites it) */
+} vgpu_ctx;
+
+#endif /* VGPU_CTX_H */
diff --git a/src/si/vgpu-stream/win32/cursor.c b/src/si/vgpu-stream/win32/cursor.c
new file mode 100644
index 0000000..5941cb1
--- /dev/null
+++ b/src/si/vgpu-stream/win32/cursor.c
@@ -0,0 +1,175 @@
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <string.h>
+#include "cursor.h"
+#include "vgpu_stream.h"   /* VGPU_CURSOR_ID_* */
+
+/* Max supported cursor glyph; buffers are pre-arena'd in ctx (no heap here). */
+#define VGPU_CURSOR_MAX 256
+
+/* Expand a 1-bpp bitmap into one byte (0 or 1) per pixel, top-down.
+ *
+ * hbm: source monochrome bitmap. w, h: pixel dims to read; h is clamped to 2*VGPU_CURSOR_MAX
+ * rows. out: receives w*h bytes (h after clamping).
+ * out is left untouched when w or h is not positive or when the packed rows would not fit the
+ * scratch buffer. If GetDIBits fails, out is all zeros rather than decoded stack garbage. */
+static void read_mono(HBITMAP hbm, int w, int h, uint8_t* out /* w*h */) {
+    /* bounded scratch on stack: max (256/32*4)=32 bytes/row * 512 rows */
+    static const int kMaxRows = VGPU_CURSOR_MAX * 2;
+    uint8_t raw[(VGPU_CURSOR_MAX / 32 * 4) * (VGPU_CURSOR_MAX * 2)];
+
+    /* a non-positive dimension would turn into a huge size_t below */
+    if (w <= 0 || h <= 0) return;
+    if (h > kMaxRows) h = kMaxRows;
+    int stride = ((w + 31) / 32) * 4;   /* 1-bpp rows are padded to 32-bit boundaries */
+    if ((size_t)stride * h > sizeof raw) return;
+
+    struct { BITMAPINFOHEADER hdr; RGBQUAD pal[2]; } bi;
+    memset(&bi, 0, sizeof bi);
+    bi.hdr.biSize = sizeof(BITMAPINFOHEADER);
+    bi.hdr.biWidth = w; bi.hdr.biHeight = -h;   /* negative = top-down rows */
+    bi.hdr.biPlanes = 1; bi.hdr.biBitCount = 1; bi.hdr.biCompression = BI_RGB;
+
+    /* zero the scratch so rows GetDIBits does not fill decode as 0, not as stack residue */
+    memset(raw, 0, (size_t)stride * h);
+    memset(out, 0, (size_t)w * h);
+
+    HDC dc = GetDC(NULL);
+    int got = GetDIBits(dc, hbm, 0, (UINT)h, raw, (BITMAPINFO*)&bi, DIB_RGB_COLORS);
+    ReleaseDC(NULL, dc);
+    if (got <= 0) return;   /* nothing copied: keep the all-zero result */
+
+    for (int y = 0; y < h; y++)
+        for (int x = 0; x < w; x++) {
+            int bit = 7 - (x & 7);   /* leftmost pixel is the most significant bit */
+            out[(size_t)y * w + x] = (raw[(size_t)y * stride + (x >> 3)] >> bit) & 1u;
+        }
+}
+
+/* Extract hotspot, glyph dims and pixels of hc into ctx->cursor.
+ *
+ * Color cursors fill cur->bgra (alpha synthesized from the AND mask when the color bitmap
+ * carries no alpha); monochrome cursors fill cur->and_mask / cur->xor_mask from the two halves
+ * of the mask bitmap. Dims are clamped to VGPU_CURSOR_MAX.
+ * On any failure (GetIconInfo / GetObject failing, or non-positive dims) gw and gh stay 0,
+ * which cursor_draw treats as "nothing to draw". The ICONINFO bitmaps are always deleted. */
+static void extract(vgpu_ctx* ctx, HCURSOR hc) {
+    vgpu_cursor_t* cur = &ctx->cursor;
+    cur->gw = cur->gh = 0;
+    cur->mono = 0;
+
+    ICONINFO ii;
+    if (!GetIconInfo(hc, &ii)) return;
+    cur->hot_x = (int)ii.xHotspot;
+    cur->hot_y = (int)ii.yHotspot;
+
+    if (ii.hbmColor) {
+        BITMAP bm; memset(&bm, 0, sizeof bm);
+        int w = 0, h = 0;
+        /* GetObject returns 0 on failure; never size buffers from an unfilled BITMAP */
+        if (GetObject(ii.hbmColor, sizeof bm, &bm)) { w = bm.bmWidth; h = bm.bmHeight; }
+        if (w > VGPU_CURSOR_MAX) w = VGPU_CURSOR_MAX;
+        if (h > VGPU_CURSOR_MAX) h = VGPU_CURSOR_MAX;
+        if (w > 0 && h > 0) {
+            BITMAPINFO bi; memset(&bi, 0, sizeof bi);
+            bi.bmiHeader.biSize = sizeof(BITMAPINFOHEADER);
+            bi.bmiHeader.biWidth = w; bi.bmiHeader.biHeight = -h;   /* top-down */
+            bi.bmiHeader.biPlanes = 1; bi.bmiHeader.biBitCount = 32;
+            bi.bmiHeader.biCompression = BI_RGB;
+            memset(cur->bgra, 0, (size_t)w * h * 4);
+            HDC dc = GetDC(NULL);
+            GetDIBits(dc, ii.hbmColor, 0, h, cur->bgra, &bi, DIB_RGB_COLORS);
+            ReleaseDC(NULL, dc);
+            cur->gw = w; cur->gh = h; cur->mono = 0;
+
+            /* no pixel with alpha → derive opacity from the AND mask (1 = transparent) */
+            int has_alpha = 0;
+            for (size_t i = 0; i < (size_t)w * h; i++)
+                if (cur->bgra[i * 4 + 3]) { has_alpha = 1; break; }
+            if (!has_alpha && ii.hbmMask) {
+                read_mono(ii.hbmMask, w, h, cur->and_mask);
+                for (size_t i = 0; i < (size_t)w * h; i++)
+                    cur->bgra[i * 4 + 3] = cur->and_mask[i] ? 0 : 255;
+            }
+        }
+    } else if (ii.hbmMask) {
+        BITMAP bm; memset(&bm, 0, sizeof bm);
+        int w = 0, h = 0;
+        /* the mask bitmap stacks the AND half over the XOR half, hence bmHeight / 2 */
+        if (GetObject(ii.hbmMask, sizeof bm, &bm)) { w = bm.bmWidth; h = bm.bmHeight / 2; }
+        if (w > VGPU_CURSOR_MAX) w = VGPU_CURSOR_MAX;
+        if (h > VGPU_CURSOR_MAX) h = VGPU_CURSOR_MAX;
+        if (w > 0 && h > 0) {
+            /* both halves are read in one pass into a static scratch (not on the stack, so
+             * extract is not reentrant), then split into and_mask / xor_mask.
+             * NOTE(review): when bm.bmHeight / 2 exceeds VGPU_CURSOR_MAX the XOR half starts
+             * at source row bm.bmHeight / 2, not at row h — confirm oversized mono cursors
+             * are out of scope. */
+            static uint8_t both[VGPU_CURSOR_MAX * VGPU_CURSOR_MAX * 2];
+            read_mono(ii.hbmMask, w, bm.bmHeight, both);
+            for (int y = 0; y < h; y++)
+                for (int x = 0; x < w; x++) {
+                    cur->and_mask[(size_t)y * w + x] = both[(size_t)y * w + x];
+                    cur->xor_mask[(size_t)y * w + x] = both[(size_t)(y + h) * w + x];
+                }
+            cur->gw = w; cur->gh = h; cur->mono = 1;
+        }
+    }
+    /* GetIconInfo hands out bitmap copies the caller must delete */
+    if (ii.hbmColor) DeleteObject(ii.hbmColor);
+    if (ii.hbmMask)  DeleteObject(ii.hbmMask);
+}
+
+/* Map hc to a VGPU_CURSOR_ID_* by handle comparison against the system cursors.
+ * The LoadCursor(NULL, IDC_*) handles are fetched on the first call (even when hc is NULL)
+ * and cached in function-static storage. Returns VGPU_CURSOR_ID_UNKNOWN for a NULL handle or
+ * one that matches no cached system cursor. */
+int cursor_resolve_id(HCURSOR hc) {
+    static const struct { LPCTSTR idc; int id; } kSpec[] = {
+        { IDC_ARROW,       VGPU_CURSOR_ID_ARROW       },
+        { IDC_IBEAM,       VGPU_CURSOR_ID_IBEAM       },
+        { IDC_WAIT,        VGPU_CURSOR_ID_WAIT        },
+        { IDC_CROSS,       VGPU_CURSOR_ID_CROSS       },
+        { IDC_HAND,        VGPU_CURSOR_ID_HAND        },
+        { IDC_SIZENS,      VGPU_CURSOR_ID_SIZENS      },
+        { IDC_SIZEWE,      VGPU_CURSOR_ID_SIZEWE      },
+        { IDC_SIZENWSE,    VGPU_CURSOR_ID_SIZENWSE    },
+        { IDC_SIZENESW,    VGPU_CURSOR_ID_SIZENESW    },
+        { IDC_SIZEALL,     VGPU_CURSOR_ID_SIZEALL     },
+        { IDC_NO,          VGPU_CURSOR_ID_NO          },
+        { IDC_APPSTARTING, VGPU_CURSOR_ID_APPSTARTING },
+    };
+    enum { N = (int)(sizeof kSpec / sizeof kSpec[0]) };
+    static HCURSOR handles[N];
+    static int ready = 0;
+
+    /* one-time fill of the handle cache */
+    if (!ready) {
+        int slot = 0;
+        while (slot < N) {
+            handles[slot] = LoadCursor(NULL, kSpec[slot].idc);
+            slot++;
+        }
+        ready = 1;
+    }
+
+    int resolved = VGPU_CURSOR_ID_UNKNOWN;
+    if (hc != NULL) {
+        for (int slot = 0; slot < N; slot++) {
+            if (handles[slot] == hc) {
+                resolved = kSpec[slot].id;
+                break;   /* first match wins */
+            }
+        }
+    }
+    return resolved;
+}
+
+/* Load the shape of hc into ctx->cursor: glyph/hotspot/dims via extract, then record hc as
+ * the current handle together with its resolved VGPU_CURSOR_ID_*. */
+void cursor_apply_shape(vgpu_ctx* ctx, HCURSOR hc) {
+    vgpu_cursor_t* cur = &ctx->cursor;
+    extract(ctx, hc);
+    cur->handle    = hc;
+    cur->cursor_id = cursor_resolve_id(hc);
+}
+
+/* Sample the system cursor into ctx->cursor.
+ * Position and visibility are refreshed on every call; the shape is re-extracted only when
+ * the cursor is showing and its HCURSOR differs from the recorded one. When GetCursorInfo
+ * fails the cursor is marked hidden. Returns nonzero if visibility, position or handle
+ * differed from the previous state, else 0. */
+int cursor_sample(vgpu_ctx* ctx) {
+    vgpu_cursor_t* cur = &ctx->cursor;
+    CURSORINFO info;
+    info.cbSize = sizeof info;
+
+    if (GetCursorInfo(&info)) {
+        const int shown = (info.flags & CURSOR_SHOWING) ? 1 : 0;
+        const int px = info.ptScreenPos.x;
+        const int py = info.ptScreenPos.y;
+        /* compare against the old state before anything below overwrites it */
+        const int same = shown == cur->visible && px == cur->x && py == cur->y
+                         && info.hCursor == cur->handle;
+
+        if (shown && info.hCursor != NULL && info.hCursor != cur->handle)
+            cursor_apply_shape(ctx, info.hCursor);
+
+        cur->visible = shown;
+        cur->x = px;
+        cur->y = py;
+        return !same;
+    }
+
+    /* query failed: hide the cursor; that is a change only if it was visible before */
+    const int was_visible = cur->visible;
+    cur->visible = 0;
+    return was_visible;
+}
+
+/* Compose the sampled cursor onto dst, a tight BGRA frame of W x H pixels.
+ * Does nothing when the cursor is hidden or has no glyph. The glyph is placed with its hotspot
+ * at (cursor.x, cursor.y); pixels falling outside the frame are skipped. Color glyphs are
+ * alpha-blended into B, G and R; monochrome glyphs apply the AND/XOR rules (black, white,
+ * invert, or leave untouched). The destination alpha byte is never written. */
+void cursor_draw(vgpu_ctx* ctx, uint8_t* dst, uint32_t W, uint32_t H) {
+    const vgpu_cursor_t* cur = &ctx->cursor;
+    if (!cur->visible || cur->gw == 0) return;
+
+    const int left = cur->x - cur->hot_x;
+    const int top  = cur->y - cur->hot_y;
+    const int gw   = cur->gw;
+
+    for (int row = 0; row < cur->gh; row++) {
+        const int py = top + row;
+        if (py < 0 || py >= (int)H) continue;
+        uint8_t* line = dst + (size_t)py * W * 4;
+
+        for (int col = 0; col < gw; col++) {
+            const int px = left + col;
+            if (px < 0 || px >= (int)W) continue;
+            uint8_t* d = line + (size_t)px * 4;
+            const size_t idx = (size_t)row * gw + col;
+
+            if (cur->mono) {
+                const int and_bit = cur->and_mask[idx];
+                const int xor_bit = cur->xor_mask[idx];
+                if (and_bit == 0) {
+                    if (xor_bit == 0)      { d[0] = d[1] = d[2] = 0; }     /* black */
+                    else if (xor_bit == 1) { d[0] = d[1] = d[2] = 255; }   /* white */
+                } else if (and_bit == 1 && xor_bit == 1) {
+                    /* invert the screen pixel */
+                    d[0] = (uint8_t)(255 - d[0]);
+                    d[1] = (uint8_t)(255 - d[1]);
+                    d[2] = (uint8_t)(255 - d[2]);
+                }
+                continue;
+            }
+
+            const uint8_t* s = &cur->bgra[idx * 4];
+            const uint32_t a = s[3];
+            if (a == 0) continue;   /* fully transparent glyph pixel */
+            for (int c = 0; c < 3; c++)
+                d[c] = (uint8_t)((s[c] * a + d[c] * (255 - a)) / 255);
+        }
+    }
+}
diff --git a/src/si/vgpu-stream/win32/cursor.h b/src/si/vgpu-stream/win32/cursor.h
new file mode 100644
index 0000000..81d4842
--- /dev/null
+++ b/src/si/vgpu-stream/win32/cursor.h
@@ -0,0 +1,26 @@
+#ifndef VGPU_CURSOR_H
+#define VGPU_CURSOR_H
+
+/* cursor.h — win32 GDI cursor sample/compose onto a tight BGRA frame. */
+
+#include <stdint.h>
+#include "ctx.h"           /* win32 vgpu_ctx (cursor state) */
+
+/* Sample the current cursor (position/shape) into ctx->cursor.
+ * Returns 1 if anything changed since last sample, else 0. */
+int  cursor_sample(vgpu_ctx* ctx);
+
+/* Resolve a HCURSOR to a VGPU_CURSOR_ID_* by comparing against the system cursor table
+ * (LoadCursor(NULL, IDC_*) loaded once on first use). Returns VGPU_CURSOR_ID_UNKNOWN for
+ * custom cursors. Not hot-path: called only under the shape-change gate. */
+int  cursor_resolve_id(HCURSOR hc);
+
+/* Extract glyph/hotspot/dims for hc into ctx->cursor, resolve its cursor_id, and record it as
+ * the current handle. For backends that source position elsewhere (DDA from frame info) and
+ * only need the shape on a shape-change gate. Caller serializes ctx->cursor writes. */
+void cursor_apply_shape(vgpu_ctx* ctx, HCURSOR hc);
+
+/* Alpha/AND-XOR compose the sampled cursor onto a tight BGRA frame. */
+void cursor_draw(vgpu_ctx* ctx, uint8_t* bgra, uint32_t width, uint32_t height);
+
+#endif /* VGPU_CURSOR_H */
diff --git a/src/si/vgpu-stream/win32/geometry.c b/src/si/vgpu-stream/win32/geometry.c
new file mode 100644
index 0000000..39bfa2d
--- /dev/null
+++ b/src/si/vgpu-stream/win32/geometry.c
@@ -0,0 +1,52 @@
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include "geometry.h"
+#include "stream.h"        /* vgpu_publish_geometry */
+
+/* GetDpiForMonitor lives in Shcore.dll (per-monitor DPI awareness API). Loaded dynamically so
+ * the binary does not hard-depend on it; absence degrades dpi to "unknown" (0). */
+typedef HRESULT (WINAPI *GetDpiForMonitor_t)(HMONITOR, int /*MDT_*/, UINT*, UINT*);
+#define VGPU_MDT_EFFECTIVE_DPI 0
+
+/* Effective horizontal DPI of mon, or 0 when it cannot be determined (Shcore.dll or its
+ * GetDpiForMonitor export missing, mon NULL, the call not returning S_OK, or a zero DPI).
+ * The export is looked up once, on the first call, and cached. */
+static UINT monitor_dpi(HMONITOR mon) {
+    static GetDpiForMonitor_t get_dpi = NULL;
+    static int resolved = 0;
+    if (!resolved) {
+        HMODULE shcore = LoadLibraryA("Shcore.dll");
+        if (shcore != NULL)
+            get_dpi = (GetDpiForMonitor_t)(void*)GetProcAddress(shcore, "GetDpiForMonitor");
+        resolved = 1;
+    }
+    if (get_dpi == NULL || mon == NULL) return 0u;
+
+    UINT dpi_x = 0, dpi_y = 0;
+    HRESULT hr = get_dpi(mon, VGPU_MDT_EFFECTIVE_DPI, &dpi_x, &dpi_y);
+    /* only the X value is reported; dpi_y is fetched because the API requires it */
+    return (hr == S_OK && dpi_x != 0u) ? dpi_x : 0u;
+}
+
+/* Current refresh rate of mon in milli-Hz, or 0 when unknown (mon NULL, monitor info or
+ * display settings unavailable, or the driver reporting 0/1). */
+static uint32_t monitor_refresh_mhz(HMONITOR mon) {
+    if (mon == NULL) return 0u;
+
+    MONITORINFOEXW info;
+    info.cbSize = sizeof info;
+    if (!GetMonitorInfoW(mon, (MONITORINFO*)&info)) return 0u;
+
+    /* the monitor's device name selects which display's settings to query */
+    DEVMODEW mode;
+    ZeroMemory(&mode, sizeof mode);
+    mode.dmSize = sizeof mode;
+    if (!EnumDisplaySettingsW(info.szDevice, ENUM_CURRENT_SETTINGS, &mode)) return 0u;
+
+    /* 0/1 = hardware default, not a real rate; otherwise whole Hz -> milli-Hz */
+    return mode.dmDisplayFrequency > 1u ? (uint32_t)mode.dmDisplayFrequency * 1000u : 0u;
+}
+
+/* Sample the virtual-desktop rectangle plus DPI and refresh of the monitor containing
+ * (cap_x, cap_y) — falling back to the primary monitor — and publish all of it together with
+ * the capture origin through vgpu_publish_geometry. */
+void geometry_sample_and_publish(vgpu_ctx* ctx, int32_t cap_x, int32_t cap_y) {
+    /* monitor that owns the capture origin */
+    POINT cap_origin;
+    cap_origin.x = cap_x;
+    cap_origin.y = cap_y;
+
+    /* virtual desktop bounding box (spans all monitors) */
+    const int32_t  vx = (int32_t)GetSystemMetrics(SM_XVIRTUALSCREEN);
+    const int32_t  vy = (int32_t)GetSystemMetrics(SM_YVIRTUALSCREEN);
+    const uint32_t vw = (uint32_t)GetSystemMetrics(SM_CXVIRTUALSCREEN);
+    const uint32_t vh = (uint32_t)GetSystemMetrics(SM_CYVIRTUALSCREEN);
+
+    HMONITOR owner = MonitorFromPoint(cap_origin, MONITOR_DEFAULTTOPRIMARY);
+    const uint32_t dpi         = monitor_dpi(owner);
+    const uint32_t refresh_mhz = monitor_refresh_mhz(owner);
+
+    vgpu_publish_geometry(&ctx->view, vx, vy, vw, vh, cap_x, cap_y, dpi, refresh_mhz);
+}
diff --git a/src/si/vgpu-stream/win32/geometry.h b/src/si/vgpu-stream/win32/geometry.h
new file mode 100644
index 0000000..02d339f
--- /dev/null
+++ b/src/si/vgpu-stream/win32/geometry.h
@@ -0,0 +1,18 @@
+#ifndef VGPU_GEOMETRY_H
+#define VGPU_GEOMETRY_H
+
+/* geometry.h — win32 display-geometry sampler. Samples the virtual-desktop bbox plus the
+ * captured output's origin / DPI / refresh and publishes them under the geom_seq seqlock.
+ * Not per-frame: called once at session start and reactively on backend recreate / capture-
+ * size change (the captured surface SIZE itself travels in desc.width/height, not here). */
+
+#include <stdint.h>
+#include "ctx.h"           /* win32 vgpu_ctx (region-view) */
+
+/* Sample display geometry for the captured output whose top-left origin is (cap_x,cap_y) in
+ * virtual-desktop coordinates, and publish it. cap_x/cap_y is (0,0) for primary/full-screen
+ * backends and the duplicated output's DesktopCoordinates for DDA. The captured size is taken
+ * from desc.width/height and is not sampled here. */
+void geometry_sample_and_publish(vgpu_ctx* ctx, int32_t cap_x, int32_t cap_y);
+
+#endif /* VGPU_GEOMETRY_H */
diff --git a/src/si/vgpu-stream/win32/main.c b/src/si/vgpu-stream/win32/main.c
new file mode 100644
index 0000000..09477fe
--- /dev/null
+++ b/src/si/vgpu-stream/win32/main.c
@@ -0,0 +1,55 @@
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "region.h"        /* win32 pinned region */
+#include "ctx.h"           /* win32 vgpu_ctx (embeds region-view) */
+#include "present.h"       /* present/pump lifecycle */
+#include "stream.h"        /* OS-agnostic status/error/backend setters */
+#include "capture.h"       /* backend table */
+
+int main(int argc, char** argv) {
+    /* Entry point: create the pinned region, initialise present state, start the first
+     * capture backend that accepts, then hand control to the publish pump.
+     *   argv[1] (optional)  default fps; unparsable / non-positive / out-of-range -> 30.
+     *   EYES    (optional)  case-insensitive backend name; restricts probing to it.
+     * Returns 1 when any setup step fails. */
+
+    /* strtol rather than atoi: atoi's behavior is undefined when the value does not fit
+     * an int, whereas strtol saturates. The round-trip cast rejects anything that does
+     * not survive conversion to int. */
+    int fps = 30;
+    if (argc > 1) {
+        long parsed = strtol(argv[1], NULL, 10);
+        if (parsed > 0 && (long)(int)parsed == parsed) fps = (int)parsed;
+    }
+
+    vgpu_region_t region;
+    if (vgpu_region_create(&region) != 0) {
+        fprintf(stderr, "main: region_create failed\n");
+        return 1;
+    }
+
+    vgpu_ctx ctx;
+    if (vgpu_present_init(&ctx, &region, (uint32_t)fps) != 0) {
+        fprintf(stderr, "main: present_init failed\n");
+        vgpu_region_destroy(&region);
+        return 1;
+    }
+
+    /* An empty selector is treated like an unset one: it could never match a backend
+     * name and would otherwise silently disable every backend. */
+    const char* eyes = getenv("EYES");
+    if (eyes && eyes[0] == '\0') eyes = NULL;
+
+    int n = 0;
+    const capture_backend* bks = capture_backends(&n);
+    int started = 0;
+    int matched = 0;   /* did the EYES selector name at least one known backend? */
+    for (int i = 0; i < n && !started; i++) {
+        if (eyes && _stricmp(eyes, bks[i].name) != 0) continue;
+        matched = 1;
+        fprintf(stderr, "eyes: trying %s\n", bks[i].name);
+        started = bks[i].start(&ctx, fps);
+        if (!started) fprintf(stderr, "eyes: %s unavailable\n", bks[i].name);
+    }
+    if (!started) {
+        /* distinguish "selector matched nothing" from "every candidate refused" */
+        if (eyes && !matched)
+            fprintf(stderr, "eyes: unknown backend '%s' requested via EYES\n", eyes);
+        fprintf(stderr, "eyes: no capture backend available\n");
+        vgpu_set_status(&ctx.view, VGPU_ST_ERROR);
+        vgpu_set_error(&ctx.view, 2u);
+        vgpu_present_deinit(&ctx);
+        vgpu_region_destroy(&region);
+        return 1;
+    }
+
+    vgpu_set_backend(&ctx.view, ctx.backend);
+    vgpu_present_run(&ctx);   /* never returns */
+
+    /* Not reached while the pump loops forever; kept so teardown is already correct if
+     * the pump ever gains an exit path. */
+    vgpu_present_deinit(&ctx);
+    vgpu_region_destroy(&region);
+    return 0;
+}
diff --git a/src/si/vgpu-stream/win32/nvfbc_tosys_c.h b/src/si/vgpu-stream/win32/nvfbc_tosys_c.h
new file mode 100644
index 0000000..3446f4b
--- /dev/null
+++ b/src/si/vgpu-stream/win32/nvfbc_tosys_c.h
@@ -0,0 +1,93 @@
+#ifndef VGPU_NVFBC_TOSYS_C_H
+#define VGPU_NVFBC_TOSYS_C_H
+
+/*
+ * C mirror of NvFBC's ToSys interface. The vendor header
+ * third_party/NvFBC/nvFBCToSys.h declares INvFBCToSys_v3 as a C++ abstract
+ * class (a vtable of 5 pure-virtual __stdcall methods). We do NOT edit the
+ * vendor header; instead we replicate its single-inheritance vtable ABI as a
+ * COM-in-C interface so the producer stays pure C. Slot order MUST match the
+ * declaration order in nvFBCToSys.h:
+ *   0 NvFBCToSysSetUp
+ *   1 NvFBCToSysGrabFrame
+ *   2 NvFBCToSysCursorCapture
+ *   3 NvFBCToSysGPUBasedCPUSleep
+ *   4 NvFBCToSysRelease
+ * On x64 (mingw/MSVC) `this` is the implicit first integer argument; __stdcall
+ * is a no-op for x64 so a plain pointer arg matches the vtable slot.
+ */
+
+#include "NvFBC/nvFBC.h"          /* vendor (third_party/): NVFBCRESULT, NvU32, param structs */
+
+/* SetUp / GrabFrame param structs come from nvFBCToSys.h, but that header is C++.
+ * Redeclare the two we use here (layout-identical, C-clean). */
+
+typedef enum {
+    NVFBC_TOSYS_ARGB = 0,
+    NVFBC_TOSYS_RGB,               /* = 1 (implicit) */
+    NVFBC_TOSYS_YYYYUV420p,        /* = 2 (implicit) */
+    NVFBC_TOSYS_RGB_PLANAR,        /* = 3 (implicit) */
+    NVFBC_TOSYS_XOR,               /* = 4 (implicit) */
+    NVFBC_TOSYS_YUV444p,           /* = 5 (implicit) */
+    NVFBC_TOSYS_BUF_FMT_LAST       /* = 6; one past the last format listed here */
+} NVFBCToSysBufferFormat_c;        /* type of NVFBC_TOSYS_SETUP_PARAMS_C.eMode. NOTE(review):
+                                    * numeric values presumably have to track the vendor enum
+                                    * in nvFBCToSys.h - verify before reordering. */
+
+typedef enum {
+    NVFBC_TOSYS_SOURCEMODE_FULL = 0,
+    NVFBC_TOSYS_SOURCEMODE_SCALE,  /* = 1 (implicit) */
+    NVFBC_TOSYS_SOURCEMODE_CROP,   /* = 2 (implicit) */
+    NVFBC_TOSYS_SOURCEMODE_LAST    /* = 3; one past the last mode listed here */
+} NVFBCToSysGrabMode_c;            /* type of NVFBC_TOSYS_GRAB_FRAME_PARAMS_C.eGMode */
+
+enum {
+    NVFBC_TOSYS_NOFLAGS_C           = 0x0,
+    NVFBC_TOSYS_NOWAIT_C            = 0x1,
+    NVFBC_TOSYS_WAIT_WITH_TIMEOUT_C = 0x10
+};
+
+#define NVFBC_TO_SYS_C (0x1204)
+
+typedef struct {
+    NvU32 dwVersion;           /* version word; see NVFBC_TOSYS_SETUP_PARAMS_VER_C below */
+    NvU32 bits;                /* bWithHWCursor:1, bDiffMap:1, bSep:1, rsvd:29 */
+    NVFBCToSysBufferFormat_c eMode;
+    NvU32 dwReserved1;
+    void **ppBuffer;
+    void **ppDiffMap;
+    void  *hCursorCaptureEvent;
+    NvU32 dwReserved[58];
+    void *pReserved[29];
+} NVFBC_TOSYS_SETUP_PARAMS_C;  /* C redeclaration of the vendor SetUp parameter block. The
+                                * version macro below folds sizeof(this struct) into the
+                                * version word (NVFBC_STRUCT_VERSION in nvFBC.h ORs sizeof,
+                                * ver<<16 and NVFBC_DLL_VERSION<<24), so adding, removing or
+                                * resizing a field changes the value that macro yields. */
+#define NVFBC_TOSYS_SETUP_PARAMS_VER_C \
+    NVFBC_STRUCT_VERSION(NVFBC_TOSYS_SETUP_PARAMS_C, 2)
+
+typedef struct {
+    NvU32 dwVersion;           /* version word; see NVFBC_TOSYS_GRAB_FRAME_PARAMS_VER_C below */
+    NvU32 dwFlags;             /* NOTE(review): presumably one of the NVFBC_TOSYS_*_C flag
+                                * values declared above - confirm against nvFBCToSys.h */
+    NvU32 dwTargetWidth;
+    NvU32 dwTargetHeight;
+    NvU32 dwStartX;
+    NvU32 dwStartY;
+    NVFBCToSysGrabMode_c eGMode;
+    NvU32 dwWaitTime;
+    NvFBCFrameGrabInfo *pNvFBCFrameGrabInfo;   /* vendor struct from nvFBC.h */
+    NvU32 dwReserved[56];
+    void *pReserved[31];
+} NVFBC_TOSYS_GRAB_FRAME_PARAMS_C;   /* C redeclaration of the vendor GrabFrame parameter
+                                      * block. sizeof(this struct) is folded into the version
+                                      * word by NVFBC_STRUCT_VERSION (nvFBC.h), so the field
+                                      * list and array lengths must not drift. */
+#define NVFBC_TOSYS_GRAB_FRAME_PARAMS_VER_C \
+    NVFBC_STRUCT_VERSION(NVFBC_TOSYS_GRAB_FRAME_PARAMS_C, 1)
+
+/* COM-in-C interface mirror: the object is a struct whose only member points at a table
+ * of function pointers, and every entry takes the object pointer as its first argument.
+ * The order of the entries below is the vtable slot order and must not be rearranged. */
+typedef struct NvFBCToSys_c NvFBCToSys_c;
+typedef struct {
+    NVFBCRESULT (__stdcall *NvFBCToSysSetUp)(NvFBCToSys_c*, NVFBC_TOSYS_SETUP_PARAMS_C*);   /* slot 0 */
+    NVFBCRESULT (__stdcall *NvFBCToSysGrabFrame)(NvFBCToSys_c*, NVFBC_TOSYS_GRAB_FRAME_PARAMS_C*);   /* slot 1 */
+    NVFBCRESULT (__stdcall *NvFBCToSysCursorCapture)(NvFBCToSys_c*, void*);   /* slot 2 */
+    NVFBCRESULT (__stdcall *NvFBCToSysGPUBasedCPUSleep)(NvFBCToSys_c*, __int64);   /* slot 3 */
+    NVFBCRESULT (__stdcall *NvFBCToSysRelease)(NvFBCToSys_c*);   /* slot 4 */
+} NvFBCToSys_c_vtbl;
+struct NvFBCToSys_c {
+    const NvFBCToSys_c_vtbl* lpVtbl;   /* sole member: pointer to the slot table above */
+};
+
+#endif /* VGPU_NVFBC_TOSYS_C_H */
diff --git a/src/si/vgpu-stream/win32/present.c b/src/si/vgpu-stream/win32/present.c
new file mode 100644
index 0000000..c9513ec
--- /dev/null
+++ b/src/si/vgpu-stream/win32/present.c
@@ -0,0 +1,212 @@
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <string.h>
+#include <stdio.h>
+#include "present.h"
+#include "stream.h"        /* OS-agnostic publish / control API + region-view */
+#include "cursor.h"
+#include "geometry.h"      /* one-shot display-geometry sample at session start */
+
+/* cursor arena sizing */
+#define VGPU_CUR_MAX     256u
+#define VGPU_CUR_BGRA    (VGPU_CUR_MAX * VGPU_CUR_MAX * 4u)
+#define VGPU_CUR_MASK    (VGPU_CUR_MAX * VGPU_CUR_MAX)
+
+/* Timestamp in nanoseconds derived from QueryPerformanceCounter.
+ *
+ * The conversion is done in integer arithmetic, split into whole seconds plus a sub-second
+ * remainder: this is exact for every counter value (a double has only 53 mantissa bits)
+ * and cannot overflow, because the remainder is < hz and hz * 1e9 fits in 64 bits for any
+ * frequency below ~18 GHz. Returns 0 if the frequency query yields nothing usable. */
+static uint64_t now_ns(void) {
+    static LARGE_INTEGER freq = { .QuadPart = 0 };
+    if (freq.QuadPart == 0) QueryPerformanceFrequency(&freq);
+    if (freq.QuadPart <= 0) return 0;   /* never divide by a zero/invalid frequency */
+
+    LARGE_INTEGER c; QueryPerformanceCounter(&c);
+    const uint64_t ticks = (uint64_t)c.QuadPart;
+    const uint64_t hz    = (uint64_t)freq.QuadPart;
+    return (ticks / hz) * 1000000000ull
+         + ((ticks % hz) * 1000000000ull) / hz;
+}
+
+/* Initialize present/staging state inside ctx over an already-created region.
+ *
+ * ctx          zeroed here, then pointed at the region's producer/control/ring blocks.
+ * region       source of those three block pointers.
+ * default_fps  publish-rate default; 0 selects 30.
+ *
+ * Allocates one arena holding the content, frame and cursor buffers, plus the auto-reset
+ * submit event and the lock. Returns 0 on success, 1 on failure; on failure every resource
+ * acquired here has been released again, so the caller must not call vgpu_present_deinit. */
+int vgpu_present_init(vgpu_ctx* ctx, vgpu_region_t* region, uint32_t default_fps) {
+    memset(ctx, 0, sizeof *ctx);
+    ctx->view.producer = region->producer;
+    ctx->view.control  = region->control;
+    ctx->view.ring     = region->ring;
+    ctx->default_fps = default_fps ? default_fps : 30u;
+    ctx->backend  = VGPU_BK_NONE;
+    ctx->draw_cursor_cap = 1;   /* ANDed with the control block's draw_cursor in the pump */
+
+    /* one arena: content + frame + cursor buffers */
+    size_t bytes = VGPU_STAGING_BYTES   /* content */
+                 + VGPU_STAGING_BYTES   /* frame   */
+                 + VGPU_CUR_BGRA        /* cursor bgra */
+                 + VGPU_CUR_MASK        /* and */
+                 + VGPU_CUR_MASK;       /* xor */
+    uint8_t* a = (uint8_t*)VirtualAlloc(NULL, bytes, MEM_RESERVE | MEM_COMMIT,
+                                        PAGE_READWRITE);
+    if (!a) {
+        fprintf(stderr, "present: arena VirtualAlloc %zu MiB failed (%lu)\n",
+                bytes / (1024 * 1024), GetLastError());
+        return 1;
+    }
+
+    /* The pump blocks on this event with a timeout. A NULL handle would make every wait
+     * fail immediately and turn the pump into a busy loop, so treat creation failure as
+     * fatal and give the arena back before reporting it. */
+    HANDLE submit_event = CreateEvent(NULL, FALSE, FALSE, NULL);
+    if (!submit_event) {
+        fprintf(stderr, "present: CreateEvent failed (%lu)\n", GetLastError());
+        VirtualFree(a, 0, MEM_RELEASE);
+        return 1;
+    }
+
+    ctx->arena       = a;
+    ctx->arena_bytes = bytes;
+
+    /* carve the arena in the same order the sizes were summed above */
+    size_t off = 0;
+    ctx->content_buf      = a + off; off += VGPU_STAGING_BYTES;
+    ctx->frame_buf        = a + off; off += VGPU_STAGING_BYTES;
+    ctx->cursor.bgra      = a + off; off += VGPU_CUR_BGRA;
+    ctx->cursor.and_mask  = a + off; off += VGPU_CUR_MASK;
+    ctx->cursor.xor_mask  = a + off; off += VGPU_CUR_MASK;
+
+    InitializeCriticalSection(&ctx->lock);
+    ctx->submit_event = submit_event;
+    ctx->content_seq  = 0;
+    ctx->content_w = ctx->content_h = 0;
+    return 0;
+}
+
+void vgpu_present_deinit(vgpu_ctx* ctx) {
+    /* Release what vgpu_present_init acquired: submit event, lock, then the arena.
+     * The event handle and arena pointer are cleared once released. */
+    if (ctx->submit_event != NULL) {
+        CloseHandle(ctx->submit_event);
+        ctx->submit_event = NULL;
+    }
+
+    DeleteCriticalSection(&ctx->lock);
+
+    if (ctx->arena != NULL) {
+        VirtualFree(ctx->arena, 0, MEM_RELEASE);
+        ctx->arena = NULL;
+    }
+}
+
+/* Hand a freshly captured frame to the pump.
+ *
+ * src        top-left pixel of the source image, 4 bytes per pixel.
+ * W, H       source size in pixels; clamped to VGPU_MAX_WIDTH / VGPU_MAX_HEIGHT.
+ * src_pitch  distance in bytes between consecutive source rows.
+ *
+ * The (clamped) image is repacked tightly into ctx->content_buf under ctx->lock, the
+ * content sequence is bumped and the submit event is signalled. Calls with nothing
+ * usable (NULL source, zero size, or a pitch shorter than one clamped row) are ignored. */
+void vgpu_present_submit(vgpu_ctx* ctx, const uint8_t* src,
+                         uint32_t W, uint32_t H, uint32_t src_pitch) {
+    if (!src) return;
+    if (W > VGPU_MAX_WIDTH)  W = VGPU_MAX_WIDTH;
+    if (H > VGPU_MAX_HEIGHT) H = VGPU_MAX_HEIGHT;
+    if (W == 0 || H == 0) return;
+
+    const uint32_t row = W * 4u;
+    /* Each row copy reads `row` bytes starting at y * src_pitch; with a pitch shorter
+     * than a row the final copy would run past the end of the source image. */
+    if (src_pitch < row) return;
+
+    EnterCriticalSection(&ctx->lock);
+    uint8_t* d = ctx->content_buf;
+    if (src_pitch == row) {
+        /* source is already tightly packed: one contiguous copy */
+        memcpy(d, src, (size_t)row * H);
+    } else {
+        for (uint32_t y = 0; y < H; y++)
+            memcpy(d + (size_t)y * row, src + (size_t)y * src_pitch, row);
+    }
+    ctx->content_w = W;
+    ctx->content_h = H;
+    ctx->content_seq++;
+    LeaveCriticalSection(&ctx->lock);
+    /* static-idle: stamp the moment the source delivered new content (the raw perception;
+     * the host derives "ms idle" from its own clock). Single 8-aligned MOV, off the lock. */
+    vgpu_publish_content_change(&ctx->view, now_ns());
+    SetEvent(ctx->submit_event);
+}
+
+void vgpu_present_run(vgpu_ctx* ctx) {   /* Publish pump. Each pass: wait for a submit or
+     * the poll timeout, tick the heartbeat, reconcile the control block, then copy +
+     * compose + publish a frame when one is pending and due. The for(;;) below contains
+     * no break or return, so this function does not return. */
+    const vgpu_region_view* rv = &ctx->view;   /* neutral handle for the engine */
+    const DWORD poll_ms = 8;   /* wait timeout: bounds how late heartbeat/control run */
+    int64_t  last_seq   = -1;   /* content_seq of the last frame taken; -1 = none yet */
+    uint32_t prev_state = VGPU_CMD_STOP;   /* last desired_state acted on */
+    uint32_t last_ff_ack = rv->producer->full_frame_ack;   /* last full_frame_req honored */
+    DWORD    last_beat  = GetTickCount();   /* tick count at the last heartbeat */
+    uint64_t last_publish_ns = 0;   /* 0 → first eligible frame publishes immediately */
+    int      last_cur_x = 0, last_cur_y = 0, last_cur_vis = 0;   /* cursor state when the last frame was taken */
+    HCURSOR  last_cur_handle = NULL;
+
+    /* one-shot display geometry: publish once before the loop (flat pull contract). The
+     * captured-output origin is (0,0) for the primary/full-screen capture path; backends
+     * resample reactively on recreate / capture-size change. No periodic poll in the loop. */
+    geometry_sample_and_publish(ctx, 0, 0);
+
+    for (;;) {
+        WaitForSingleObject(ctx->submit_event, poll_ms);   /* result unused: a signal and a timeout run the same pass */
+
+        /* --- heartbeat: always ticks, independent of desired_state --- */
+        DWORD nowt = GetTickCount();
+        if (nowt - last_beat >= VGPU_HEARTBEAT_PERIOD_MS) {   /* unsigned difference: survives tick-count wrap */
+            vgpu_tick_heartbeat(rv);
+            last_beat = nowt;
+        }
+
+        /* --- reconcile control (gen-seqlock -> apply -> ack) --- */
+        vgpu_control_view cv;
+        uint32_t desired = prev_state;
+        uint32_t draw_cursor = 1;   /* NOTE(review): draw_cursor and fps are re-defaulted
+                                     * every pass and only overridden when vgpu_control_read
+                                     * returns nonzero; if that call can return 0 on some
+                                     * passes, those passes use draw_cursor=1 and default_fps
+                                     * - confirm that is intended. */
+        int      force_full = 0;
+        uint32_t fps = ctx->default_fps;   /* publish-rate cap (applied) */
+        uint32_t ff_req = last_ff_ack;     /* full_frame_req value to honor */
+        if (vgpu_control_read(rv, &cv)) {
+            desired = cv.desired_state;
+            draw_cursor = cv.draw_cursor;
+            fps = cv.target_fps ? cv.target_fps : ctx->default_fps;   /* target_fps 0 = use default */
+            vgpu_set_applied_fps(rv, fps);
+            vgpu_publish_ctrl_ack(rv, cv.gen);
+
+            ff_req = cv.full_frame_req;
+            if ((ff_req - last_ff_ack) != 0u)
+                force_full = 1;            /* edge pending, wrap-tolerant */
+        }
+
+        /* --- lifecycle transitions --- */
+        if (desired != prev_state) {
+            if (desired == VGPU_CMD_RUN && prev_state != VGPU_CMD_RUN) {
+                vgpu_bump_run_epoch(rv);
+                vgpu_set_status(rv, VGPU_ST_CAPTURING);
+                force_full = 1;     /* fresh frame on start */
+            } else if (desired == VGPU_CMD_PAUSE) {
+                vgpu_set_status(rv, VGPU_ST_PAUSED);
+            } else if (desired == VGPU_CMD_STOP) {
+                vgpu_set_status(rv, VGPU_ST_STOPPED);
+            }
+            prev_state = desired;
+        } else if (last_seq < 0 && desired == VGPU_CMD_RUN) {   /* RUN with no frame taken yet: status is re-asserted each pass */
+            vgpu_set_status(rv, VGPU_ST_CAPTURING);
+        }
+
+        if (desired != VGPU_CMD_RUN) {
+            /* PAUSED/STOPPED: no new frames; heartbeat still ticks. We do NOT
+             * ack a pending full_frame here — acking without publishing would
+             * be a false "honored". A pending request is honored on the next
+             * transition to RUN (force_full=1 there → publish + ack). */
+            continue;
+        }
+
+        /* --- compose + publish on content change OR forced full frame, but
+         *     rate-limited to the applied fps cap (the single publish point →
+         *     contract-level cap, independent of the capture backend). A
+         *     force_full bypasses the cap (due=1). present does NOT sample the
+         *     cursor (capture threads source it); it only reads ctx->cursor under
+         *     ctx->lock for compositing, and detects cursor motion via a delta so
+         *     a pure cursor move over static desktop still recomposes. --- */
+        uint64_t interval_ns = fps > 0 ? (1000000000ull / fps) : 0;
+        uint64_t now = now_ns();   /* used for the due check and passed to vgpu_publish_frame */
+        int due = force_full || interval_ns == 0
+                  || (now - last_publish_ns) >= interval_ns;
+
+        int compose_cursor = (ctx->draw_cursor_cap && draw_cursor);   /* both the local capability flag and the control request must be set */
+
+        EnterCriticalSection(&ctx->lock);
+        int64_t  seq = ctx->content_seq;
+        uint32_t W = ctx->content_w, H = ctx->content_h;
+        int cur_changed = compose_cursor
+                          && ((ctx->cursor.visible != last_cur_vis)
+                              || (ctx->cursor.x != last_cur_x)
+                              || (ctx->cursor.y != last_cur_y)
+                              || (ctx->cursor.handle != last_cur_handle));
+        int have = (W && H);   /* zero size = no frame has been submitted yet */
+        int content_new = have && (seq != last_seq || cur_changed || force_full);
+        /* take the frame ONLY when due — so we never drop the latest content;
+         * if not due, last_seq is left untouched and it publishes next due. */
+        int dirty = content_new && due;
+        if (dirty) {
+            memcpy(ctx->frame_buf, ctx->content_buf, (size_t)W * H * 4u);   /* snapshot under the lock; publish happens after it is released */
+            last_seq = seq;
+            if (compose_cursor)
+                cursor_draw(ctx, ctx->frame_buf, W, H);
+            last_cur_vis = ctx->cursor.visible;
+            last_cur_x = ctx->cursor.x; last_cur_y = ctx->cursor.y;
+            last_cur_handle = ctx->cursor.handle;
+        }
+        LeaveCriticalSection(&ctx->lock);
+
+        if (!dirty) {
+            /* not due, or nothing to publish. A force_full with content has
+             * due=1 → dirty=1, so it never lands here while have is true; thus
+             * no spurious ack edge. */
+            continue;
+        }
+
+        if (vgpu_publish_frame(rv, ctx->frame_buf, W, H, now) == 0) {
+            last_publish_ns = now;
+            if (force_full) {
+                vgpu_publish_full_frame_ack(rv, ff_req);
+                last_ff_ack = ff_req;
+            }
+        } else {
+            vgpu_set_error(rv, 1u);  /* frame too large for slot (mode > max) */
+        }
+    }
+}
diff --git a/src/si/vgpu-stream/win32/present.h b/src/si/vgpu-stream/win32/present.h
new file mode 100644
index 0000000..4ae9473
--- /dev/null
+++ b/src/si/vgpu-stream/win32/present.h
@@ -0,0 +1,24 @@
+#ifndef VGPU_PRESENT_H
+#define VGPU_PRESENT_H
+
+/* present.h — win32 present/pump lifecycle: staging arena, submit handoff, and
+ * the publish loop driving the OS-agnostic engine over ctx's region-view. */
+
+#include <stdint.h>
+#include "ctx.h"           /* win32 vgpu_ctx + vgpu_region_t */
+
+/* Initialize present/staging state inside ctx over an already-created region.
+ * Allocates the staging+cursor arena. Returns 0 on success. */
+int  vgpu_present_init(vgpu_ctx* ctx, vgpu_region_t* region, uint32_t default_fps);
+void vgpu_present_deinit(vgpu_ctx* ctx);
+
+/* Capture backends submit a freshly captured desktop frame (any source pitch).
+ * Repacked tight into ctx->content_buf, clamped to max mode. Thread-safe. */
+void vgpu_present_submit(vgpu_ctx* ctx, const uint8_t* bgra,
+                         uint32_t width, uint32_t height, uint32_t src_pitch);
+
+/* Run the publish pump: reconcile control, tick heartbeat, compose cursor,
+ * publish on change / on full_frame_req. Never returns (process lifetime). */
+void vgpu_present_run(vgpu_ctx* ctx);
+
+#endif /* VGPU_PRESENT_H */
diff --git a/src/si/vgpu-stream/win32/region.c b/src/si/vgpu-stream/win32/region.c
new file mode 100644
index 0000000..1181402
--- /dev/null
+++ b/src/si/vgpu-stream/win32/region.c
@@ -0,0 +1,172 @@
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <stdio.h>
+#include <string.h>
+#include "region.h"
+#include "atomic-shim.h"   /* x86-TSO ordering for contract init publish */
+
+#define VGPU_2MB (2u * 1024u * 1024u)
+
+/* Page-segregated init of the contract over an already-pinned region base.
+ * Init-ordering per contract: status=INIT, latest=NONE, backend, supported_formats,
+ * release-barrier; heartbeat starts later (in the run pump). */
+static void region_init_contract(vgpu_region_t* r) {
+    vgpu_producer_t* const prod = r->producer;
+    vgpu_control_t*  const ctl  = r->control;
+
+    /* Start both blocks from all-zero, then spell out each field's initial value. */
+    memset(prod, 0, sizeof(*prod));
+    memset(ctl,  0, sizeof(*ctl));
+
+    /* producer block */
+    prod->status            = VGPU_ST_INIT;
+    prod->backend           = VGPU_BK_NONE;
+    prod->error_code        = 0;
+    prod->applied_fps       = 0;
+    prod->supported_formats = (1u << VGPU_FMT_BGRA8888);
+    prod->run_epoch         = 0;
+    prod->heartbeat         = 0;
+    prod->frame_id          = 0;
+    prod->ctrl_ack          = 0;
+    prod->full_frame_ack    = 0;
+
+    uint32_t slot = 0;
+    while (slot < VGPU_SLOT_COUNT) {
+        prod->seq[slot] = 0;
+        slot++;
+    }
+
+    /* control block: desired_state starts at RUN, so the producer captures right away
+     * and the host may STOP/PAUSE it afterwards */
+    ctl->ctrl_gen       = 0;
+    ctl->desired_state  = VGPU_CMD_RUN;
+    ctl->target_fps     = 0;
+    ctl->draw_cursor    = 1;
+    ctl->full_frame_req = 0;
+    ctl->consumer_tick  = 0;
+    ctl->attached       = 0;
+
+    /* `latest` is written last: fence, then a release store, so every store above is
+     * ordered before it */
+    vgpu_sfence();
+    vgpu_store_release32(&prod->latest, VGPU_LATEST_NONE);
+}
+
+/* Try to enable SeLockMemoryPrivilege on the current process token.
+ * Returns nonzero only when the privilege was actually enabled, 0 otherwise. */
+static int adjust_lock_memory_privilege(void) {
+    HANDLE tok;
+    if (!OpenProcessToken(GetCurrentProcess(),
+                          TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &tok))
+        return 0;
+    TOKEN_PRIVILEGES tp;
+    memset(&tp, 0, sizeof tp);
+    tp.PrivilegeCount = 1;
+    /* SE_LOCK_MEMORY_NAME is a TEXT() literal, so it has to go through the TCHAR-generic
+     * entry point: the explicit ...A form only matches it in non-UNICODE builds and would
+     * be handed a wide string otherwise. */
+    if (!LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid)) {
+        CloseHandle(tok);
+        return 0;
+    }
+    tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
+    /* AdjustTokenPrivileges can return success without granting the privilege; the
+     * last-error value is what says whether it was really assigned. */
+    int ok = AdjustTokenPrivileges(tok, FALSE, &tp, sizeof tp, NULL, NULL)
+             && GetLastError() == ERROR_SUCCESS;
+    CloseHandle(tok);
+    return ok;
+}
+
+/* Allocate the contract region, make it resident, and initialise the contract in it.
+ *
+ * Tries a large-page allocation first (requires SeLockMemoryPrivilege); otherwise falls
+ * back to an ordinary allocation that is over-sized by 2 MiB, aligned by hand, and pinned
+ * with VirtualLock (or, failing that, pre-faulted page by page).
+ *
+ * out  receives the mapping and the resolved block pointers; zeroed first, and left
+ *      zeroed when the function fails.
+ * Returns 0 on success, 1 on failure. */
+int vgpu_region_create(vgpu_region_t* out) {
+    memset(out, 0, sizeof *out);
+
+    const uint64_t bytes = VGPU_REGION_BYTES;
+
+    void*    os_base  = NULL;
+    uint8_t* base     = NULL;
+    uint64_t os_total = 0;
+
+    /* --- preferred path: large pages --- */
+    if (adjust_lock_memory_privilege()) {
+        SIZE_T large_min = GetLargePageMinimum();
+        if (large_min && large_min <= VGPU_2MB) {
+            SIZE_T rounded = (SIZE_T)((bytes + VGPU_2MB - 1) & ~(uint64_t)(VGPU_2MB - 1));
+            void* p = VirtualAlloc(NULL, rounded,
+                                   MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES,
+                                   PAGE_READWRITE);
+            if (p) {
+                /* large pages are >= 2 MiB → base is already 2 MiB-aligned */
+                os_base  = p;
+                base     = (uint8_t*)p;
+                os_total = rounded;
+                fprintf(stderr, "region: MEM_LARGE_PAGES %llu MiB at %p\n",
+                        (unsigned long long)(rounded / (1024 * 1024)), p);
+            } else {
+                fprintf(stderr, "region: MEM_LARGE_PAGES failed (%lu), fallback\n",
+                        GetLastError());
+            }
+        }
+    } else {
+        fprintf(stderr, "region: SE_LOCK_MEMORY unavailable, fallback\n");
+    }
+
+    /* --- fallback path: ordinary pages, aligned and pinned by hand --- */
+    if (!base) {
+        /* over-allocate by one alignment unit so a 2 MiB boundary always fits inside */
+        uint64_t total = bytes + VGPU_2MB;
+        void* p = VirtualAlloc(NULL, (SIZE_T)total, MEM_RESERVE | MEM_COMMIT,
+                               PAGE_READWRITE);
+        if (!p) {
+            fprintf(stderr, "region: VirtualAlloc %llu MiB failed (%lu)\n",
+                    (unsigned long long)(total / (1024 * 1024)), GetLastError());
+            return 1;
+        }
+        uintptr_t addr    = (uintptr_t)p;
+        uintptr_t aligned = (addr + VGPU_2MB - 1) & ~(uintptr_t)(VGPU_2MB - 1);
+
+        /* The region must be RESIDENT, not merely committed: the host reads it out
+         * of guest RAM and only PRESENT pages are visible to it — a committed but
+         * demand-zero page has no PTE, so it is unreadable from the host. VirtualLock
+         * pins the pages into the working set, but it can lock at most the process
+         * MINIMUM working set, and the default quota is far below the region size
+         * (so a bare VirtualLock fails with ERROR_WORKING_SET_QUOTA). Raise the
+         * minimum first. NB: VirtualLock / SetProcessWorkingSetSize do NOT need
+         * SE_LOCK_MEMORY — that privilege is only for large pages / AWE. */
+        SIZE_T ws_min = (SIZE_T)(bytes + 64ull * 1024 * 1024);   /* region + headroom */
+        SIZE_T ws_max = ws_min + 128ull * 1024 * 1024;
+        SIZE_T cur_min = 0, cur_max = 0;
+        if (GetProcessWorkingSetSize(GetCurrentProcess(), &cur_min, &cur_max)) {
+            if (cur_min > ws_min) ws_min = cur_min;   /* never shrink an existing quota */
+            if (cur_max > ws_max) ws_max = cur_max;
+        }
+        if (!SetProcessWorkingSetSize(GetCurrentProcess(), ws_min, ws_max))
+            fprintf(stderr, "region: SetProcessWorkingSetSize(%llu MiB) failed (%lu)\n",
+                    (unsigned long long)(ws_min / (1024 * 1024)), GetLastError());
+
+        /* Remember whether the pin really succeeded so the summary line below reports
+         * what actually happened instead of always claiming "lock". */
+        int locked = VirtualLock((void*)aligned, (SIZE_T)bytes) ? 1 : 0;
+        if (!locked) {
+            fprintf(stderr, "region: VirtualLock failed (%lu) — pre-faulting region\n",
+                    GetLastError());
+            /* Last resort: fault every page so it is at least PRESENT now. Without
+             * the lock the trimmer may evict it under pressure, but the raised
+             * minimum working set above makes eviction far less likely. */
+            volatile uint8_t* q = (volatile uint8_t*)aligned;
+            for (uint64_t off = 0; off < bytes; off += 4096u) q[off] = q[off];
+        }
+
+        os_base  = p;
+        base     = (uint8_t*)aligned;
+        os_total = total;
+        fprintf(stderr, "region: fallback VirtualAlloc+%s %llu MiB, aligned at %p\n",
+                locked ? "lock" : "prefault",
+                (unsigned long long)(bytes / (1024 * 1024)), (void*)aligned);
+    }
+
+    /* belt and braces: both paths above are expected to yield an aligned base */
+    if (((uintptr_t)base & (VGPU_2MB - 1)) != 0) {
+        fprintf(stderr, "region: base %p not 2 MiB aligned\n", (void*)base);
+        VirtualFree(os_base, 0, MEM_RELEASE);
+        return 1;
+    }
+
+    out->os_base  = os_base;
+    out->base     = base;
+    out->os_total = os_total;
+    out->producer = (vgpu_producer_t*)(base + VGPU_PRODUCER_OFFSET);
+    out->control  = (vgpu_control_t*)(base + VGPU_CONTROL_OFFSET);
+    out->ring     = base + VGPU_RING_OFFSET;
+
+    region_init_contract(out);
+
+    fprintf(stderr, "region: contract ready (producer=%p control=%p ring=%p)\n",
+            (void*)out->producer, (void*)out->control, (void*)out->ring);
+    return 0;
+}
+
+void vgpu_region_destroy(vgpu_region_t* r) {
+    /* Nothing to release for a NULL handle or a region that holds no mapping. */
+    if (r == NULL || r->os_base == NULL)
+        return;
+
+    /* Unpin the contract range, free the raw allocation it lives in, then zero the
+     * handle so a second call is a no-op. */
+    VirtualUnlock(r->base, (SIZE_T)VGPU_REGION_BYTES);
+    VirtualFree(r->os_base, 0, MEM_RELEASE);
+    memset(r, 0, sizeof(*r));
+}
diff --git a/src/si/vgpu-stream/win32/region.h b/src/si/vgpu-stream/win32/region.h
new file mode 100644
index 0000000..113eb32
--- /dev/null
+++ b/src/si/vgpu-stream/win32/region.h
@@ -0,0 +1,28 @@
+#ifndef VGPU_REGION_H
+#define VGPU_REGION_H
+
+/* region.h — win32 pinned contract region (resolves blocks for the region-view). */
+
+#include <stdint.h>
+#include "vgpu_stream.h"   /* public contract: blocks, offsets, slot geometry */
+
+/*
+ * One contiguous 2 MiB-aligned pinned region holding the full contract:
+ * producer block (page 0), control block (page 1), then SLOT_COUNT frame slots
+ * starting at VGPU_RING_OFFSET. Object = memory: the region owns the mapping,
+ * its lifetime is the mapping's lifetime. No hidden global state.
+ *
+ * os_base and base can differ: when large pages are unavailable, region.c
+ * over-allocates by 2 MiB and rounds base up to the next 2 MiB boundary, so
+ * only os_base may be handed back to VirtualFree. producer / control / ring
+ * are all derived from base, never from os_base.
+ */
+typedef struct {
+    void*            os_base;   /* raw allocation base (for free) */
+    uint8_t*         base;      /* 2 MiB-aligned region base (== contract origin) */
+    uint64_t         os_total;  /* bytes reserved at os_base */
+    vgpu_producer_t* producer;  /* base + VGPU_PRODUCER_OFFSET */
+    vgpu_control_t*  control;   /* base + VGPU_CONTROL_OFFSET  */
+    uint8_t*         ring;      /* base + VGPU_RING_OFFSET      */
+} vgpu_region_t;
+
+/* Returns 0 on success, non-zero on failure (region zeroed on failure). */
+int  vgpu_region_create(vgpu_region_t* out);
+void vgpu_region_destroy(vgpu_region_t* r);
+
+#endif /* VGPU_REGION_H */
diff --git a/third_party/NvFBC/nvFBC.h b/third_party/NvFBC/nvFBC.h
new file mode 100644
index 0000000..63d50ff
--- /dev/null
+++ b/third_party/NvFBC/nvFBC.h
@@ -0,0 +1,275 @@
+/**
+ * \file This file contains definitions for NVFBC API.
+ * \copyright
+ *
+ * Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+ * NOTICE TO LICENSEE: This source code and/or documentation ("Licensed Deliverables")
+ * are subject to the applicable NVIDIA license agreement
+ * that governs the use of the Licensed Deliverables.
+ *
+ */
+
+#pragma once
+#include <Windows.h>
+
+typedef unsigned char NvU8;
+typedef unsigned long NvU32;
+typedef unsigned long long NvU64;
+
+/**
+ * \defgroup NVFBC The NVIDIA Frame Buffer Capture API.
+ * \brief Defines a set of interfaces for high performance Capture of desktop content.
+ */
+
+/**
+ * \defgroup NVFBC_ENUMS Enums
+ * \ingroup NVFBC
+ * \brief Enumerations to be used with NVFBC API
+ */
+
+/**
+ * \defgroup NVFBC_STRUCTS Structs
+ * \ingroup NVFBC
+ * \brief Defines Parameter Structures to be used with NVFBC APIs.
+ */
+
+/**
+ * \defgroup NVFBC_ENTRYPOINTS Entrypoints
+ * \ingroup NVFBC
+ * \brief Declarations for NVFBC Entrypoint functions
+ */
+
+/**
+ * \ingroup NVFBC
+ * Macro to define the NVFBC API version corresponding to this distribution.
+ */
+#define NVFBC_DLL_VERSION 0x50
+
+/**
+ * \ingroup NVFBC
+ * Macro to construct version numbers for parameter structs.
+ */
+#define NVFBC_STRUCT_VERSION(typeName, ver) (NvU32)(sizeof(typeName) | ((ver)<<16) | (NVFBC_DLL_VERSION << 24))
+
+/**
+ * \ingroup NVFBC
+ * Calling Convention
+ */
+#define NVFBCAPI __stdcall
+
+/**
+ * \ingroup NVFBC
+ * Indicates that there are no global overrides specified for NVFBC. To be used with NVFBC_SetGlobalFlags API
+ */
+#define NVFBC_GLOBAL_FLAGS_NONE                     0x00000000
+
+/**
+ * \ingroup NVFBC
+ * Indicates to NVFBC that stereo rendering is enabled. Currently unsupported. To be used with NVFBC_SetGlobalFlags API.
+ */
+#define NVFBC_GLOBAL_FLAGS_STEREO_BUFFER            0x00000001
+
+/**
+ * \ingroup NVFBC
+ * Indicates that NVFBC should not request a repaint of the desktop when initiating NVFBC capture. To be used with NVFBC_SetGlobalFlags API.
+ */
+#define NVFBC_GLOBAL_FLAGS_NO_INITIAL_REFRESH       0x00000002
+
+/**
+ * \ingroup NVFBC
+ * Indicates that NVFBC should not reset the graphics driver while servicing subsequent NVFBC_Enable API requests.
+*/
+
+#define NVFBC_GLOBAL_FLAGS_NO_DEVICE_RESET_TOGGLE   0x00000004
+
+/**
+ * \ingroup NVFBC_ENUMS
+ * \brief Enumerates status codes returned by NVFBC APIs.
+ */
+typedef enum _NVFBCRESULT
+{
+    NVFBC_SUCCESS = 0,
+    NVFBC_ERROR_GENERIC = -1,                     /**< Unexpected failure in NVFBC. */
+    NVFBC_ERROR_INVALID_PARAM = -2,               /**< One or more of the paramteres passed to NvFBC are invalid [This include NULL pointers]. */
+    NVFBC_ERROR_INVALIDATED_SESSION = -3,         /**< NvFBC session is invalid. Client needs to recreate session. */
+    NVFBC_ERROR_PROTECTED_CONTENT = -4,           /**< Protected content detected. Capture failed. */
+    NVFBC_ERROR_DRIVER_FAILURE = -5,              /**< GPU driver returned failure to process NvFBC command. */
+    NVFBC_ERROR_CUDA_FAILURE   = -6,              /**< CUDA driver returned failure to process NvFBC command. */
+    NVFBC_ERROR_UNSUPPORTED    = -7,              /**< API Unsupported on this version of NvFBC. */
+    NVFBC_ERROR_HW_ENC_FAILURE  = -8,             /**< HW Encoder returned failure to process NVFBC command. */
+    NVFBC_ERROR_INCOMPATIBLE_DRIVER = -9,         /**< NVFBC is not compatible with this version of the GPU driver. */
+    NVFBC_ERROR_UNSUPPORTED_PLATFORM = -10,       /**< NVFBC is not supported on this platform. */
+    NVFBC_ERROR_OUT_OF_MEMORY  = -11,             /**< Failed to allocate memory. */
+    NVFBC_ERROR_INVALID_PTR    = -12,             /**< A NULL pointer was passed. */
+    NVFBC_ERROR_INCOMPATIBLE_VERSION = -13,       /**< An API was called with a parameter struct that has an incompatible version. Check dwVersion field of paramter struct. */
+    NVFBC_ERROR_OPT_CAPTURE_FAILURE = -14,        /**< Desktop Capture failed. */
+    NVFBC_ERROR_INSUFFICIENT_PRIVILEGES  = -15,   /**< User doesn't have appropriate previlages. */
+    NVFBC_ERROR_INVALID_CALL = -16,               /**< NVFBC APIs called in wrong sequence. */
+    NVFBC_ERROR_SYSTEM_ERROR = -17,               /**< Win32 error. */
+    NVFBC_ERROR_INVALID_TARGET = -18,             /**< The target adapter idx can not be used for NVFBC capture. It may not correspond to an NVIDIA GPU, or may not be attached to desktop. */
+    NVFBC_ERROR_DYNAMIC_DISABLE = -20,            /**< NvFBC is dynamically disabled. Cannot continue to capture */
+} NVFBCRESULT;
+
+/**
+ * \ingroup NVFBC_ENUMS
+ * \brief Enumerates NVFBC states. To be used with NvFBC_Enable API
+ */
+typedef enum _NVFBC_STATE
+{
+    NVFBC_STATE_DISABLE          = 0,   /** Disables NvFBC. */
+    NVFBC_STATE_ENABLE              ,   /** Enables NvFBC. */
+    NVFBC_STATE_LAST                ,   /** Sentinel value. Shouldn't be used. */
+} NVFBC_STATE;
+
+/**
+ * \ingroup NVFBC_STRUCTS
+ * \brief Defines parameters that describe the grabbed data, and provides detailed information about status of the NVFBC session.
+ */
+typedef struct _NvFBCFrameGrabInfo
+{
+    DWORD   dwWidth;                /**< [out] Indicates the current width of captured buffer. */
+    DWORD   dwHeight;               /**< [out] Indicates the current height of captured buffer. */
+    DWORD   dwBufferWidth;          /**< [out] Indicates the current width of the pixel buffer(padded width). */
+    DWORD   dwReserved;             /**< [in] Reserved, do not use. */
+    BOOL    bOverlayActive;         /**< [out] Is set to 1 if overlay was active. */
+    BOOL    bMustRecreate;          /**< [out] Is set to 1 if the compressor must call NvBFC_Create again. */
+    BOOL    bFirstBuffer;           /**< [out] Is set to 1 is this was the first capture call, or first call after a desktop mode change.
+                                               Relevant only for XOR and diff modes supported by NVFBCToSys interface. */
+    BOOL    bHWMouseVisible;        /**< [out] Is set to 1 if HW cursor was enabled by OS at the time of the grab. */
+    BOOL    bProtectedContent;      /**< [out] Is set to 1 if protected content was active (DXVA encryption Session). */
+    DWORD   dwDriverInternalError;  /**< [out] Indicates the status code from lower layers. 0 or 0xFBCA11F9 indicates no error was returned. */
+    BOOL    bStereoOn;              /**< [out] Is set to 1 if stereo was on. */
+    BOOL    bIGPUCapture;           /**< [out] Is set to 1 if the captured frame is from iGPU. 0 if capture fails or if captured from dGPU*/
+    DWORD   dwSourcePID;            /**< [out] Indicates which process caused the last screen update that got grabbed*/
+    DWORD   dwReserved3;            /**< [in] Reserved, do not use. */
+    NvU32   dwReserved2[13];        /**< [in] Resereved, should be set to 0. */
+} NvFBCFrameGrabInfo;
+
+/**
+ * \ingroup NVFBC_STRUCTS
+ * \brief Defines the parameters to be used with NvFBC_GetStatusEx API.
+ */
+typedef struct _NvFBCStatusEx
+{
+    NvU32  dwVersion;              /**< [in]  Struct version. Set to NVFBC_STATUS_VER. */
+    NvU32  bIsCapturePossible :1;  /**< [out] Indicates if NvFBC feature is enabled. */
+    NvU32  bCurrentlyCapturing:1;  /**< [out] Indicates if NVFBC is currently capturing for the Adapter ordinal specified in dwAdapterIdx. */
+    NvU32  bCanCreateNow      :1;  /**< [out] Deprecated. Do not use. */
+    NvU32  bSupportMultiHead  :1;  /**< [out] MultiHead grab supported. */
+    NvU32  bSupportMultiClient:1;  /**< [out] Multiple capture clients on same display adapter supported. */
+    NvU32  bReservedBits      :27; /**< [in]  Reserved, do not use. */
+    NvU32  dwNvFBCVersion;         /**< [out] Indicates the highest NvFBC interface version supported by the loaded NVFBC library. */
+    NvU32  dwAdapterIdx;           /**< [in]  Adapter Ordinal corresponding to the display to be grabbed. IGNORED if bCapturePID is set */
+    void*  pPrivateData;           /**< [in]  Private data [optional]. */
+    NvU32  dwPrivateDataSize;      /**< [in]  Size of private data [optional]. */
+    NvU32  dwReserved[59];         /**< [in]  Reserved. Should be set to 0. */
+    void*  pReserved[31];          /**< [in]  Reserved. Should be set to NULL. */
+} NvFBCStatusEx;
+#define NVFBC_STATUS_VER_1  NVFBC_STRUCT_VERSION(NvFBCStatusEx, 1)
+#define NVFBC_STATUS_VER_2  NVFBC_STRUCT_VERSION(NvFBCStatusEx, 2)
+#define NVFBC_STATUS_VER    NVFBC_STATUS_VER_2
+
+/**
+ * \ingroup NVFBC_STRUCTS
+ * \brief Defines the parameters to be used with NvFBC_CreateEx API.
+ */
+typedef struct _NvFBCCreateParams
+{
+    NvU32  dwVersion;              /**< [in]  Struct version. Set to NVFBC_CREATE_PARAMS_VER. */
+    NvU32  dwInterfaceType;        /**< [in]  ID of the NVFBC interface Type being requested. */
+    NvU32  dwMaxDisplayWidth;      /**< [out] Max. display width allowed. */
+    NvU32  dwMaxDisplayHeight;     /**< [out] Max. display height allowed. */
+    void*  pDevice;                /**< [in]  Device pointer. */
+    void*  pPrivateData;           /**< [in]  Private data [optional].  */
+    NvU32  dwPrivateDataSize;      /**< [in]  Size of private data. */
+    NvU32  dwInterfaceVersion;     /**< [in]  Version of the capture interface. */
+    void*  pNvFBC;                 /**< [out] A pointer to the requested NVFBC object. */
+    NvU32  dwAdapterIdx;           /**< [in]  Adapter Ordinal corresponding to the display to be grabbed. If pDevice is set, this parameter is ignored. */
+    NvU32  dwNvFBCVersion;         /**< [out] Indicates the highest NvFBC interface version supported by the loaded NVFBC library. */
+    void*  cudaCtx;                /**< [in]  CUDA context created using cuD3D9CtxCreate with the D3D9 device passed as pDevice. Only used for NvFBCCuda interface.
+                                              It is mandatory to pass a valid D3D9 device if cudaCtx is passed. The call will fail otherwise.
+                                              Client must release NvFBCCuda object before destroying the cudaCtx. */
+    void*  pPrivateData2;           /**< [in]  Additional private data [optional].  */
+    NvU32  dwPrivateData2Size;      /**< [in]  Size of the additional private data. */
+    NvU32  dwReserved[55];         /**< [in]  Reserved. Should be set to 0. */
+    void*  pReserved[27];          /**< [in]  Reserved. Should be set to NULL. */
+}NvFBCCreateParams;
+#define NVFBC_CREATE_PARAMS_VER_1 NVFBC_STRUCT_VERSION(NvFBCCreateParams, 1)
+#define NVFBC_CREATE_PARAMS_VER_2 NVFBC_STRUCT_VERSION(NvFBCCreateParams, 2)
+#define NVFBC_CREATE_PARAMS_VER NVFBC_CREATE_PARAMS_VER_2
+
+/**
+* \ingroup NVFBC_STRUCTS
+* \brief Defines parameters for a Grab/Capture call to get HW cursor data in the NVFBCToSys capture session.
+*/
+typedef struct
+{
+    NvU32 dwVersion;                         /**< [in]:  Struct version. Set to NVFBC_MOUSE_GRAB_INFO_VER.*/
+    NvU32 dwWidth;                           /**< [out]: Width of mouse glyph captured.*/
+    NvU32 dwHeight;                          /**< [out]: Height of mouse glyph captured.*/
+    NvU32 dwPitch;                           /**< [out]: Pitch of mouse glyph captured.*/
+    NvU32 bIsHwCursor : 1;                   /**< [out]: Tells if cursor is HW cursor or SW cursor. If set to 0, ignore height, width, pitch and pBits.*/
+    NvU32 bReserved : 32;                    /**< [in]:  Reserved. NOTE(review): declared 32 bits wide after a 1-bit field, so the two cannot share one 32-bit unit; keep as-is unless the vendor layout is confirmed.*/
+    NvU32 dwPointerFlags;                    /**< [out]: Maps to DXGK_POINTERFLAGS::Value.*/
+    NvU32 dwXHotSpot;                        /**< [out]: Maps to DXGKARG_SETPOINTERSHAPE::XHot.*/
+    NvU32 dwYHotSpot;                        /**< [out]: Maps to DXGKARG_SETPOINTERSHAPE::YHot.*/
+    NvU32 dwUpdateCounter;                   /**< [out]: Cursor update Counter. */
+    NvU32 dwBufferSize;                      /**< [out]: Size of the buffer containing the captured cursor glyph. */
+    void * pBits;                            /**< [out]: Pointer to buffer containing the captured cursor glyph.*/
+    NvU32 dwReservedA[22];                   /**< [in]:  Reserved. Set to 0.*/
+    void * pReserved[15];                    /**< [in]:  Reserved. Set to 0.*/
+}NVFBC_CURSOR_CAPTURE_PARAMS;
+#define NVFBC_CURSOR_CAPTURE_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_CURSOR_CAPTURE_PARAMS, 1)
+
+/**
+ * \ingroup NVFBC_ENTRYPOINTS
+ * \brief NVFBC API to set global overrides.
+ * \param [in] dwFlags Global overrides for NVFBC. Use ::NVFBC_GLOBAL_FLAGS value.
+ */
+void NVFBCAPI NvFBC_SetGlobalFlags(DWORD dwFlags);
+
+/**
+ * \ingroup NVFBC_ENTRYPOINTS
+ * \brief NVFBC API to create an NVFBC capture session.
+ *  Instantiates an interface identified by NvFBCCreateParams::dwInterfaceType; the requested object is returned in NvFBCCreateParams::pNvFBC.
+ * \param [inout] pCreateParams Pointer to a struct of type ::NvFBCCreateParams, typecast to void*
+ * \return An applicable ::NVFBCRESULT value.
+ */
+NVFBCRESULT NVFBCAPI NvFBC_CreateEx(void * pCreateParams);
+
+/**
+ * \ingroup NVFBC_ENTRYPOINTS
+ * \brief NVFBC API to query Current NVFBC status.
+ *  Queries the status for the adapter pointed to by the NvFBCStatusEx::dwAdapterIdx parameter.
+ * \param [inout] pNvFBCStatusEx Pointer to a struct of type ::NvFBCStatusEx.
+ * \return An applicable ::NVFBCRESULT value.
+ */
+NVFBCRESULT NVFBCAPI NvFBC_GetStatusEx(NvFBCStatusEx *pNvFBCStatusEx);
+
+/**
+ * \ingroup NVFBC_ENTRYPOINTS
+ * \brief NVFBC API to enable / disable NVFBC feature.
+ * \param [in] nvFBCState Refer to ::NVFBC_STATE.
+ * \return An applicable ::NVFBCRESULT value.
+ */
+NVFBCRESULT NVFBCAPI NvFBC_Enable(NVFBC_STATE nvFBCState);
+
+/**
+ * \ingroup NVFBC_ENTRYPOINTS
+ * \brief NVFBC API to query highest GRID SDK version supported by the loaded NVFBC library.
+ * \param [out] pVersion Pointer to a 32-bit integer to hold the supported GRID SDK version.
+ * \return An applicable ::NVFBCRESULT value.
+ */
+NVFBCRESULT NVFBCAPI NvFBC_GetSDKVersion(NvU32 * pVersion);
+
+/**
+ * \cond API_PFN
+ */
+typedef void (NVFBCAPI * NvFBC_SetGlobalFlagsType) (DWORD dwFlags); /* Same signature as NvFBC_SetGlobalFlags. */
+typedef NVFBCRESULT (NVFBCAPI * NvFBC_CreateFunctionExType)  (void * pCreateParams); /* Same signature as NvFBC_CreateEx. */
+typedef NVFBCRESULT (NVFBCAPI * NvFBC_GetStatusExFunctionType) (void * pNvFBCStatus); /* Like NvFBC_GetStatusEx, but takes void* where the prototype takes NvFBCStatusEx*. */
+typedef NVFBCRESULT (NVFBCAPI * NvFBC_EnableFunctionType) (NVFBC_STATE nvFBCState); /* Same signature as NvFBC_Enable. */
+typedef NVFBCRESULT (NVFBCAPI * NvFBC_GetSDKVersionFunctionType) (NvU32 * pVersion); /* Same signature as NvFBC_GetSDKVersion. */
+/**
+ * \endcond API_PFN
+*/
diff --git a/third_party/NvFBC/nvFBCToSys.h b/third_party/NvFBC/nvFBCToSys.h
new file mode 100644
index 0000000..67fb9e8
--- /dev/null
+++ b/third_party/NvFBC/nvFBCToSys.h
@@ -0,0 +1,176 @@
+/**
+ * \file This file contains definitions for NVFBCToSys
+ *
+ * Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+ * NOTICE TO LICENSEE: This source code and/or documentation ("Licensed Deliverables")
+ * are subject to the applicable NVIDIA license agreement
+ * that governs the use of the Licensed Deliverables.
+ *
+ */
+
+#ifndef NVFBC_TO_SYS_H_
+#define NVFBC_TO_SYS_H_
+/**
+ * \defgroup NVFBC_TOSYS NVFBCToSys Interface
+ * \brief Interface for grabbing Desktop images and generating output in system memory.
+ */
+
+/**
+ * \defgroup NVFBC_TOSYS_ENUMS Enums
+ * \ingroup NVFBC_TOSYS
+ * \brief Enumerations used with NVFBCToSys interface.
+ */
+
+/**
+ * \defgroup NVFBC_TOSYS_STRUCTS Structs
+ * \ingroup NVFBC_TOSYS
+ * \brief  Parameter Structs Defined for use with NVFBCToSys interface.
+ */
+
+/**
+ * \defgroup NVFBC_TOSYS_INTERFACE Object Interface
+ * \ingroup NVFBC_TOSYS
+ * \brief Interface class definition for NVFBCToSys Capture API
+ */
+
+/**
+ * \ingroup NVFBC_TOSYS
+ * \brief Interface ID to be passed as NvFBCCreateParams::dwInterfaceType
+ * when creating an NVFBCToSys capture session object.
+ */
+#define NVFBC_TO_SYS (0x1204)
+
+/**
+ * \ingroup NVFBC_TOSYS_ENUMS
+ * \brief Enumerates output buffer pixel data formats supported by NVFBCToSys.
+ */
+typedef enum
+{
+    NVFBC_TOSYS_ARGB       = 0,              /**< Output Pixels in ARGB format: 32bpp, one byte per channel. */
+    NVFBC_TOSYS_RGB           ,              /**< Output Pixels in RGB format: 24bpp, one byte per channel. */
+    NVFBC_TOSYS_YYYYUV420p    ,              /**< Output Pixels in YUV420 format: 12bpp,
+                                                  the Y' channel at full resolution, U channel at half resolution (1 byte for four pixels), V channel at half resolution. */
+    NVFBC_TOSYS_RGB_PLANAR    ,              /**< Output Pixels in planar RGB format: 24bpp,
+                                                  stored sequentially in memory as complete red channel, complete green channel, complete blue channel. */
+    NVFBC_TOSYS_XOR           ,              /**< Output Pixels in RGB format: 24bpp XOR'd with the prior frame. */
+    NVFBC_TOSYS_YUV444p       ,              /**< Output Pixels in YUV444 planar format, i.e. separate 8-bpp Y, U, V planes with no subsampling. */
+    NVFBC_TOSYS_BUF_FMT_LAST  ,              /**< Sentinel value. Do not use. */
+} NVFBCToSysBufferFormat;
+
+/**
+ * \ingroup NVFBC_TOSYS_ENUMS
+ * \brief Enumerates Capture/Grab modes supported by NVFBCToSys.
+ */
+typedef enum
+{
+    NVFBC_TOSYS_SOURCEMODE_FULL  = 0,        /**< Grab full res. */
+    NVFBC_TOSYS_SOURCEMODE_SCALE    ,        /**< Will convert current res to supplied resolution (dwTargetWidth and dwTargetHeight). */
+    NVFBC_TOSYS_SOURCEMODE_CROP     ,        /**< Native res, crops a subwindow, of dwTargetWidth and dwTargetHeight sizes, starting at dwStartX and dwStartY. */
+    NVFBC_TOSYS_SOURCEMODE_LAST     ,        /**< Sentinel value. Do not use. */
+}NVFBCToSysGrabMode;
+
+/**
+ * \ingroup NVFBC_TOSYS_ENUMS
+ * \enum NVFBC_TOSYS_GRAB_FLAGS Enumerates special commands for grab/capture supported by NVFBCToSys.
+ */
+typedef enum
+{
+    NVFBC_TOSYS_NOFLAGS           = 0x0,     /**< Default (no flags set). Grabbing will wait for a new frame or HW mouse move. */
+    NVFBC_TOSYS_NOWAIT            = 0x1,     /**< Grabbing will not wait for a new frame nor a HW cursor move. */
+    NVFBC_TOSYS_WAIT_WITH_TIMEOUT = 0x10,    /**< Grabbing will wait for a new frame or HW mouse move with a maximum wait time of NVFBC_TOSYS_GRAB_FRAME_PARAMS::dwWaitTime milliseconds. */
+} NVFBC_TOSYS_GRAB_FLAGS;
+
+/**
+ * \ingroup NVFBC_TOSYS_STRUCTS
+ * \brief Defines parameters used to configure NVFBCToSys capture session.
+ */
+typedef struct
+{
+    NvU32 dwVersion;                         /**< [in]: Struct version. Set to NVFBC_TOSYS_SETUP_PARAMS_VER.*/
+    NvU32 bWithHWCursor :1;                  /**< [in]: The client should set this to 1 if it requires the HW cursor to be composited on the captured image.*/
+    NvU32 bDiffMap      :1;                  /**< [in]: The client should set this to use the DiffMap feature.*/
+    NvU32 bEnableSeparateCursorCapture : 1;  /**< [in]: The client should set this to 1 if it wants to enable mouse capture in separate stream.*/
+    NvU32 bReservedBits :29;                 /**< [in]: Reserved. Set to 0.*/
+    NVFBCToSysBufferFormat eMode;            /**< [in]: Output image format.*/
+    NvU32 dwReserved1;                       /**< [in]: Reserved. Set to 0.*/
+    void **ppBuffer;                         /**< [out]: Container to hold NvFBC output buffers.*/
+    void **ppDiffMap;                        /**< [out]: Container to hold NvFBC output diffmap buffers.*/
+    void  *hCursorCaptureEvent;                 /**< [out]: Event handle; client should wait for this event before calling NvFBCToSysCursorCapture(). */
+    NvU32 dwReserved[58];                    /**< [in]: Reserved. Set to 0.*/
+    void *pReserved[29];                     /**< [in]: Reserved. Set to 0.*/
+} NVFBC_TOSYS_SETUP_PARAMS_V2;
+#define NVFBC_TOSYS_SETUP_PARAMS_VER2 NVFBC_STRUCT_VERSION(NVFBC_TOSYS_SETUP_PARAMS, 2)
+typedef  NVFBC_TOSYS_SETUP_PARAMS_V2 NVFBC_TOSYS_SETUP_PARAMS;
+#define NVFBC_TOSYS_SETUP_PARAMS_VER NVFBC_TOSYS_SETUP_PARAMS_VER2
+
+/**
+ * \ingroup NVFBC_TOSYS_STRUCTS
+ * \brief Defines parameters for a Grab/Capture call in the NVFBCToSys capture session.
+ * Also holds information regarding the grabbed data.
+ */
+typedef struct
+{
+    NvU32 dwVersion;                         /**< [in]: Struct version. Set to NVFBC_TOSYS_GRAB_FRAME_PARAMS_VER.*/
+    NvU32 dwFlags;                           /**< [in]: Special grabbing requests. This should be a bit-mask of NVFBC_TOSYS_GRAB_FLAGS values.*/
+    NvU32 dwTargetWidth;                     /**< [in]: Target image width. NvFBC will scale the captured image to fit target width and height. Used with NVFBC_TOSYS_SOURCEMODE_SCALE and NVFBC_TOSYS_SOURCEMODE_CROP. */
+    NvU32 dwTargetHeight;                    /**< [in]: Target image height. NvFBC will scale the captured image to fit target width and height. Used with NVFBC_TOSYS_SOURCEMODE_SCALE and NVFBC_TOSYS_SOURCEMODE_CROP. */
+    NvU32 dwStartX;                          /**< [in]: x-coordinate of starting pixel for cropping. Used with NVFBC_TOSYS_SOURCEMODE_CROP. */
+    NvU32 dwStartY;                          /**< [in]: y-coordinate of starting pixel for cropping. Used with NVFBC_TOSYS_SOURCEMODE_CROP. */
+    NVFBCToSysGrabMode eGMode;               /**< [in]: Frame grab mode.*/
+    NvU32 dwWaitTime;                        /**< [in]: Time limit for NvFBCToSysGrabFrame() to wait until a new frame is available or a HW mouse moves. Use with NVFBC_TOSYS_WAIT_WITH_TIMEOUT. */
+    NvFBCFrameGrabInfo *pNvFBCFrameGrabInfo; /**< [in/out]: Frame grab information and feedback from NvFBC driver.*/
+    NvU32 dwReserved[56];                    /**< [in]: Reserved. Set to 0.*/
+    void *pReserved[31];                     /**< [in]: Reserved. Set to NULL.*/
+} NVFBC_TOSYS_GRAB_FRAME_PARAMS_V1;
+#define NVFBC_TOSYS_GRAB_FRAME_PARAMS_VER1 NVFBC_STRUCT_VERSION(NVFBC_TOSYS_GRAB_FRAME_PARAMS, 1)
+typedef NVFBC_TOSYS_GRAB_FRAME_PARAMS_V1 NVFBC_TOSYS_GRAB_FRAME_PARAMS;
+#define NVFBC_TOSYS_GRAB_FRAME_PARAMS_VER NVFBC_TOSYS_GRAB_FRAME_PARAMS_VER1
+
+
+/**
+ * \ingroup NVFBC_TOSYS_INTERFACE
+ * \brief Interface class definition for NVFBCToSys Capture API.
+ */
+class INvFBCToSys_v3
+{
+public:
+    /**
+     * \brief Sets up NVFBC System Memory capture according to the provided parameters.
+     * \param [inout] pParam Pointer to a struct of type ::NVFBC_TOSYS_SETUP_PARAMS; its ppBuffer, ppDiffMap and hCursorCaptureEvent fields are outputs.
+     * \return An applicable ::NVFBCRESULT value.
+     */
+    virtual NVFBCRESULT NVFBCAPI NvFBCToSysSetUp              (NVFBC_TOSYS_SETUP_PARAMS_V2 *pParam) = 0;
+
+    /**
+     * \brief Captures the desktop and dumps the captured data to a System memory buffer.
+     *  If the API returns a failure, the client should check the return codes and ::NvFBCFrameGrabInfo output fields to determine if the session needs to be re-created.
+     * \param [inout] pParam Pointer to a struct of type ::NVFBC_TOSYS_GRAB_FRAME_PARAMS.
+     * \return An applicable ::NVFBCRESULT value.
+     */
+    virtual NVFBCRESULT NVFBCAPI NvFBCToSysGrabFrame          (NVFBC_TOSYS_GRAB_FRAME_PARAMS *pParam) = 0;
+
+    /**
+     * \brief Captures HW cursor data whenever shape of mouse is changed.
+     * \param [inout] pParam Pointer to a struct of type ::NVFBC_CURSOR_CAPTURE_PARAMS.
+     * \return An applicable ::NVFBCRESULT value.
+     */
+    virtual NVFBCRESULT NVFBCAPI NvFBCToSysCursorCapture      (NVFBC_CURSOR_CAPTURE_PARAMS *pParam) = 0;
+
+    /**
+     * \brief A high precision implementation of Sleep().
+     *  Can provide sub quantum (usually 16ms) sleep that does not burn CPU cycles.
+     * \param [in] qwMicroSeconds The number of microseconds that the thread should sleep for.
+     * \return An applicable ::NVFBCRESULT value.
+     */
+    virtual NVFBCRESULT NVFBCAPI NvFBCToSysGPUBasedCPUSleep   (__int64 qwMicroSeconds) = 0;
+
+    /**
+     * \brief Destroys the NVFBCToSys capture session.
+     * \return An applicable ::NVFBCRESULT value.
+     */
+    virtual NVFBCRESULT NVFBCAPI NvFBCToSysRelease            () = 0;
+};
+
+typedef INvFBCToSys_v3 NvFBCToSys;
+
+#endif // NVFBC_TO_SYS_H_
diff --git a/third_party/Windows.h b/third_party/Windows.h
new file mode 100644
index 0000000..6d21fea
--- /dev/null
+++ b/third_party/Windows.h
@@ -0,0 +1,2 @@
+/* Windows.h — case-compat shim for the vendor NvFBC header, not our API. */
+#include <windows.h>