/* publish.c — OS-agnostic implementation of the streaming protocol.
 * Operates purely on the contract through a borrowed vgpu_region_view; no
 * platform headers, no runtime context. The x86-TSO ordering lives in the
 * atomic shim. */

#include <string.h>
#include "vgpu_stream.h"   /* contract types / slot geometry */
#include "atomic-shim.h"   /* x86-TSO memory-order accessors */
#include "stream.h"        /* region-view handle + this API */

/* Upper bound on the number of attempts vgpu_control_read makes to obtain a
 * consistent control snapshot before it gives up and returns 0. */
#define VGPU_CTRL_READ_TRIES 16u

/* Publish one tightly packed BGRA frame into the next ring slot.
 *
 * rv            borrowed region view; rv->producer and rv->ring are used.
 * tight_bgra    source pixels, read as `height` rows of `width * 4` bytes
 *               with no row padding.
 * width/height  frame size in pixels.
 * timestamp_ns  stored verbatim in the slot descriptor.
 *
 * Returns 0 once the frame is published. Returns 1 if the frame is rejected:
 * the row stride does not fit in 32 bits, or the pixel data would exceed
 * VGPU_SLOT_STRIDE bytes. A rejected frame leaves the shared region untouched.
 */
int vgpu_publish_frame(const vgpu_region_view* rv, const uint8_t* tight_bgra,
                       uint32_t width, uint32_t height, uint64_t timestamp_ns) {
    vgpu_producer_t* p = rv->producer;

    /* Compute the stride in 64 bits: a 32-bit `width * 4u` wraps for
     * width >= 2^30, which would shrink `need`, let an oversized frame pass
     * the slot-size check, and publish a descriptor whose stride does not
     * match its width. */
    const uint64_t stride64 = (uint64_t)width * 4u;
    if (stride64 > 0xFFFFFFFFu)
        return 1;
    const uint32_t stride = (uint32_t)stride64;        /* tight invariant */
    const uint64_t need   = (uint64_t)height * stride; /* both < 2^32: no wrap */
    if (need > VGPU_SLOT_STRIDE)                       /* clamp by slot size */
        return 1;

    /* Target the slot after the last published one, wrapping at
     * VGPU_SLOT_COUNT; slot 0 when nothing has been published yet. */
    uint32_t cur = vgpu_load_acquire32(&p->latest);
    uint32_t S   = (cur == VGPU_LATEST_NONE) ? 0u : ((cur + 1u) % VGPU_SLOT_COUNT);

    uint8_t* dst = rv->ring + (size_t)S * VGPU_SLOT_STRIDE;

    /* seqlock: even -> odd (writing) */
    vgpu_store_release32(&p->seq[S], p->seq[S] + 1u);
    vgpu_compiler_barrier();

    /* descriptor (self-describing slot) */
    p->desc[S].width        = width;
    p->desc[S].height       = height;
    p->desc[S].stride       = stride;
    p->desc[S].format       = VGPU_FMT_BGRA8888;
    p->desc[S].frame_id     = p->frame_id + 1u;
    p->desc[S].timestamp_ns = timestamp_ns;

    /* pixels (source is already tight) */
    memcpy(dst, tight_bgra, (size_t)need);

    vgpu_sfence();
    /* seqlock: odd -> even (stable) */
    vgpu_store_release32(&p->seq[S], p->seq[S] + 1u);
    vgpu_sfence();

    /* Advance the frame counter, then publish the slot index last so `latest`
     * only ever names a slot whose sequence has already been closed. */
    p->frame_id += 1u;
    vgpu_store_release32(&p->latest, S);
    return 0;
}

/* Take a consistent snapshot of the control block.
 *
 * Each attempt loads ctrl_gen, reads the control fields, then loads ctrl_gen
 * again. The snapshot is accepted only if the first generation was even and
 * the second load returned the same value. At most VGPU_CTRL_READ_TRIES
 * attempts are made.
 *
 * Returns 1 with the fields of *out filled in on success; returns 0 without
 * writing to *out when every attempt saw an odd or changed generation.
 */
int vgpu_control_read(const vgpu_region_view* rv, vgpu_control_view* out) {
    volatile vgpu_control_t* ctl = rv->control;
    uint32_t attempt = 0;

    while (attempt < VGPU_CTRL_READ_TRIES) {
        attempt++;

        const uint32_t gen_before = vgpu_load_acquire32(&ctl->ctrl_gen);
        if ((gen_before & 1u) == 0u) {      /* odd means a writer is mid-update */
            vgpu_compiler_barrier();

            /* Field reads stay between the two generation loads. */
            const uint32_t snap_desired = ctl->desired_state;
            const uint32_t snap_fps     = ctl->target_fps;
            const uint32_t snap_cursor  = ctl->draw_cursor;
            const uint32_t snap_ffreq   = ctl->full_frame_req;
            const uint32_t snap_tick    = ctl->consumer_tick;
            const uint32_t snap_att     = ctl->attached;

            vgpu_compiler_barrier();
            const uint32_t gen_after = vgpu_load_acquire32(&ctl->ctrl_gen);

            if (gen_after == gen_before) {  /* unchanged: snapshot is coherent */
                out->gen            = gen_before;
                out->desired_state  = snap_desired;
                out->target_fps     = snap_fps;
                out->draw_cursor    = snap_cursor;
                out->full_frame_req = snap_ffreq;
                out->consumer_tick  = snap_tick;
                out->attached       = snap_att;
                return 1;
            }
            /* generation moved underneath us: torn read, try again */
        }
    }
    return 0;
}

void vgpu_publish_ctrl_ack(const vgpu_region_view* rv, uint32_t gen) {
    vgpu_store_release32(&rv->producer->ctrl_ack, gen);
}

void vgpu_set_status(const vgpu_region_view* rv, uint32_t status) {
    vgpu_store_release32(&rv->producer->status, status);
}

void vgpu_set_backend(const vgpu_region_view* rv, uint32_t backend) {
    vgpu_store_release32(&rv->producer->backend, backend);
}

void vgpu_set_error(const vgpu_region_view* rv, uint32_t error_code) {
    vgpu_store_release32(&rv->producer->error_code, error_code);
}

void vgpu_set_applied_fps(const vgpu_region_view* rv, uint32_t fps) {
    vgpu_store_release32(&rv->producer->applied_fps, fps);
}

/* Increment the producer block's run_epoch by one and publish the new value
 * with a release store. The current value is read with a plain load. */
void vgpu_bump_run_epoch(const vgpu_region_view* rv) {
    vgpu_producer_t* prod = rv->producer;
    const uint32_t next_epoch = prod->run_epoch + 1u;
    vgpu_store_release32(&prod->run_epoch, next_epoch);
}

void vgpu_tick_heartbeat(const vgpu_region_view* rv) {
    /* 64-bit aligned single MOV is atomic on x86_64; barrier orders it */
    rv->producer->heartbeat += 1u;
    vgpu_compiler_barrier();
}

void vgpu_publish_full_frame_ack(const vgpu_region_view* rv, uint32_t req) {
    vgpu_store_release32(&rv->producer->full_frame_ack, req);
}

/* Publish the cursor position and visibility, then bump cursor_seq.
 *
 * x, y     signed coordinates; each is reinterpreted as its 32-bit
 *          two's-complement pattern and packed into one 64-bit word
 *          (x in bits 0..31, y in bits 32..63).
 * visible  stored verbatim in cursor_visible.
 */
void vgpu_publish_cursor(const vgpu_region_view* rv, int32_t x, int32_t y, uint32_t visible) {
    vgpu_producer_t* prod = rv->producer;

    const uint64_t x_bits = (uint64_t)(uint32_t)x;
    const uint64_t y_bits = (uint64_t)(uint32_t)y << 32;

    /* Plain store of the packed word; it relies on an aligned 64-bit store
     * being atomic on x86_64, and the release stores below keep it ordered. */
    prod->cursor_pos = y_bits | x_bits;
    vgpu_store_release32(&prod->cursor_visible, visible);

    /* cursor_seq goes last so its release store covers the writes above. */
    const uint32_t next_seq = prod->cursor_seq + 1u;
    vgpu_store_release32(&prod->cursor_seq, next_seq);
}

/* Publish the cursor hotspot, glyph size and cursor id.
 *
 * hot_x/hot_y and gw/gh are each packed 16|16 into one 32-bit word: the first
 * value masked to the low 16 bits, the second shifted into the high 16 bits.
 * All operands are unsigned, so no sign bits can leak into the high half.
 *
 * This function does not bump a sequence counter of its own; the original
 * design note says a following vgpu_publish_cursor call bumps cursor_seq last
 * and thereby gates these fields.
 */
void vgpu_publish_cursor_shape(const vgpu_region_view* rv, uint32_t hot_x, uint32_t hot_y,
                               uint32_t gw, uint32_t gh, uint32_t cursor_id) {
    vgpu_producer_t* prod = rv->producer;

    const uint32_t hotspot_word = (hot_y << 16) | (hot_x & 0xFFFFu);
    const uint32_t glyph_word   = (gh    << 16) | (gw    & 0xFFFFu);

    vgpu_store_release32(&prod->cursor_hotspot, hotspot_word);
    vgpu_store_release32(&prod->cursor_glyph,   glyph_word);
    vgpu_store_release32(&prod->cursor_id,      cursor_id);
}

void vgpu_publish_content_change(const vgpu_region_view* rv, uint64_t change_ns) {
    /* 64-bit aligned single MOV is atomic on x86_64; barrier orders it (heartbeat pattern) */
    rv->producer->content_change_ns = change_ns;
    vgpu_compiler_barrier();
}

/* Publish display geometry under the geom_seq sequence counter.
 *
 * geom_seq is incremented once before the fields are written and once after,
 * so it is odd while the update is in flight (given an even starting value)
 * and even again once the fields are stable.
 *
 * virt_x/virt_y/virt_w/virt_h, cap_x/cap_y, dpi and refresh_mhz are copied
 * verbatim into the same-named producer fields.
 */
void vgpu_publish_geometry(const vgpu_region_view* rv, int32_t virt_x, int32_t virt_y,
                           uint32_t virt_w, uint32_t virt_h,
                           int32_t cap_x, int32_t cap_y,
                           uint32_t dpi, uint32_t refresh_mhz) {
    vgpu_producer_t* prod = rv->producer;

    /* open the write window: even -> odd */
    const uint32_t seq_writing = prod->geom_seq + 1u;
    vgpu_store_release32(&prod->geom_seq, seq_writing);
    vgpu_compiler_barrier();

    prod->virt_x = virt_x;
    prod->virt_y = virt_y;
    prod->virt_w = virt_w;
    prod->virt_h = virt_h;
    prod->cap_x  = cap_x;
    prod->cap_y  = cap_y;
    prod->dpi    = dpi;
    prod->refresh_mhz = refresh_mhz;

    vgpu_sfence();

    /* close the write window: odd -> even */
    const uint32_t seq_stable = prod->geom_seq + 1u;
    vgpu_store_release32(&prod->geom_seq, seq_stable);
    vgpu_sfence();
}