src/stream/include/atomic-shim.h

#ifndef VGPU_ATOMIC_SHIM_H
#define VGPU_ATOMIC_SHIM_H

/* atomic-shim.h — x86-TSO memory-order accessors (arch, not OS).
 *
 * x86-TSO memory-order shim. NO _Atomic in the shared region type: the consumer
 * maps the region as raw bytes. Synchronization lives entirely in the producer's
 * accessors here. Per-compiler implementation, never exposed in the contract.
 *
 * On x86_64 every naturally-aligned MOV up to 8 bytes is atomic and ordinary
 * stores are already release / loads already acquire at the hardware level; the
 * only things we must prevent are (1) compiler reordering across the sync point
 * and (2) weakly-ordered data writes (non-temporal / write-combining stores,
 * which TSO does not order against later stores) becoming visible after the
 * publish store, for which an explicit SFENCE is used at publish boundaries.
 */

#include <stdint.h>

#if defined(_MSC_VER)

#include <intrin.h>

/* Compiler-level barrier (MSVC): invokes the _ReadWriteBarrier() intrinsic so
 * the compiler does not reorder memory accesses across this call. */
static inline void vgpu_compiler_barrier(void) {
    _ReadWriteBarrier();
}
/* Store fence (MSVC): issues SFENCE through the _mm_sfence() intrinsic. Meant
 * for publish boundaries, as described in the file header. */
static inline void vgpu_sfence(void) {
    _mm_sfence();
}

/* Release-style 32-bit store (MSVC).
 *
 * p: destination word, written through a volatile pointer.
 * v: value to publish.
 *
 * The compiler barrier comes first so that accesses written before this call
 * are not moved below the publishing store by the compiler. */
static inline void vgpu_store_release32(volatile uint32_t* p, uint32_t v) {
    _ReadWriteBarrier();   /* keep earlier accesses above the store */
    p[0] = v;              /* single volatile 32-bit write */
}

/* Acquire-style 32-bit load (MSVC).
 *
 * p: source word, read through a volatile pointer.
 * Returns the value read.
 *
 * The compiler barrier follows the read so that accesses written after this
 * call are not hoisted above it by the compiler. */
static inline uint32_t vgpu_load_acquire32(const volatile uint32_t* p) {
    const uint32_t observed = p[0];   /* single volatile 32-bit read */
    _ReadWriteBarrier();              /* keep later accesses below the load */
    return observed;
}

#else /* gcc / mingw / clang */

/* Compiler-level barrier (gcc / clang): an empty asm statement carrying a
 * "memory" clobber. The template is empty, so no instruction is emitted; the
 * clobber tells the compiler that memory may have changed, so it does not
 * reorder or cache memory accesses across this point. */
static inline void vgpu_compiler_barrier(void) {
    __asm__ __volatile__(
        ""            /* empty template */
        :             /* no outputs */
        :             /* no inputs */
        : "memory"    /* clobber */
    );
}
/* Store fence (gcc / clang): emits the x86 SFENCE instruction. The "memory"
 * clobber also makes the statement a compiler barrier. Meant for publish
 * boundaries, as described in the file header. */
static inline void vgpu_sfence(void) {
    __asm__ __volatile__(
        "sfence"      /* x86 store fence */
        :             /* no outputs */
        :             /* no inputs */
        : "memory"    /* clobber */
    );
}

/* Release 32-bit store (gcc / clang).
 *
 * p: destination word.
 * v: value to publish.
 *
 * Implemented with the __atomic_store_n builtin using __ATOMIC_RELEASE
 * ordering. */
static inline void vgpu_store_release32(volatile uint32_t* p, uint32_t v) {
    const uint32_t published = v;
    __atomic_store_n(p, published, __ATOMIC_RELEASE);
}

/* Acquire 32-bit load (gcc / clang).
 *
 * p: source word.
 * Returns the value read.
 *
 * Implemented with the __atomic_load_n builtin using __ATOMIC_ACQUIRE
 * ordering. */
static inline uint32_t vgpu_load_acquire32(const volatile uint32_t* p) {
    const uint32_t observed = __atomic_load_n(p, __ATOMIC_ACQUIRE);
    return observed;
}

#endif

#endif /* VGPU_ATOMIC_SHIM_H */
Initial commit: win32 vGPU stream capture module 2026-06-17 12:55:19 +03:00			`#ifndef VGPU_ATOMIC_SHIM_H`
			`#define VGPU_ATOMIC_SHIM_H`

			`/* atomic-shim.h — x86-TSO memory-order accessors (arch, not OS).`
			`*`
			`* x86-TSO memory-order shim. NO _Atomic in the shared region type: the consumer`
			`* maps the region as raw bytes. Synchronization lives entirely in the producer's`
			`* accessors here. Per-compiler implementation, never exposed in the contract.`
			`*`
			`* On x86_64 every naturally-aligned MOV up to 8 bytes is atomic and stores are`
			`* already release / loads already acquire at the hardware level; the only things`
			`* we must prevent are (1) compiler reordering across the sync point and`
			`* (2) store-buffer visibility delay between the data writes and the publish`
			`* store, for which an explicit SFENCE is used at publish boundaries.`
			`*/`

			`#include <stdint.h>`

			`#if defined(_MSC_VER)`

			`#include <intrin.h>`

			`static inline void vgpu_compiler_barrier(void) { _ReadWriteBarrier(); }`
			`static inline void vgpu_sfence(void) { _mm_sfence(); }`

			`static inline void vgpu_store_release32(volatile uint32_t* p, uint32_t v) {`
			`_ReadWriteBarrier();`
			`*p = v;`
			`}`

			`static inline uint32_t vgpu_load_acquire32(const volatile uint32_t* p) {`
			`uint32_t v = *p;`
			`_ReadWriteBarrier();`
			`return v;`
			`}`

			`#else /* gcc / mingw / clang */`

			`static inline void vgpu_compiler_barrier(void) { __asm__ __volatile__("" ::: "memory"); }`
			`static inline void vgpu_sfence(void) { __asm__ __volatile__("sfence" ::: "memory"); }`

			`static inline void vgpu_store_release32(volatile uint32_t* p, uint32_t v) {`
			`__atomic_store_n(p, v, __ATOMIC_RELEASE);`
			`}`

			`static inline uint32_t vgpu_load_acquire32(const volatile uint32_t* p) {`
			`return __atomic_load_n(p, __ATOMIC_ACQUIRE);`
			`}`

			`#endif`

			`#endif /* VGPU_ATOMIC_SHIM_H */`