#ifndef VGPU_ATOMIC_SHIM_H
#define VGPU_ATOMIC_SHIM_H
/* atomic-shim.h — x86-TSO memory-order accessors (arch, not OS).
 *
 * x86-TSO memory-order shim. NO _Atomic in the shared region type: the consumer
 * maps the region as raw bytes. Synchronization lives entirely in the producer's
 * accessors here. Per-compiler implementation, never exposed in the contract.
 *
 * On x86_64 every naturally-aligned MOV up to 8 bytes is atomic and stores are
 * already release / loads already acquire at the hardware level; the only things
 * we must prevent are (1) compiler reordering across the sync point and
 * (2) store-buffer visibility delay between the data writes and the publish
 * store, for which an explicit SFENCE is used at publish boundaries.
 *
 * NOTE(review): for ordinary write-back memory x86-TSO already makes stores
 * visible in program order, so SFENCE adds ordering only for weakly-ordered
 * stores (non-temporal stores, write-combining mappings). Confirm how the
 * shared region is mapped before relying on, or removing, the SFENCE.
 *
 * All pointers passed to the 32-bit accessors must be naturally aligned
 * (4-byte) for the single-MOV atomicity argument above to hold.
 */

#include <stdint.h>

#if defined(_MSC_VER)

#include <intrin.h>

/* Compiler-only barrier: forbids the compiler from moving memory accesses
 * across this point. Uses the _ReadWriteBarrier intrinsic. */
static inline void vgpu_compiler_barrier(void)
{
    _ReadWriteBarrier();
}

/* Store fence: issues SFENCE via the _mm_sfence intrinsic. Intended for
 * publish boundaries (see the header comment). */
static inline void vgpu_sfence(void)
{
    _mm_sfence();
}

/* Release store of a 32-bit value.
 *
 * p: destination word in the shared region.
 * v: value to publish.
 *
 * The compiler barrier precedes the store so earlier accesses cannot be sunk
 * below it by the compiler; ordering at the hardware level is left to x86-TSO.
 */
static inline void vgpu_store_release32(volatile uint32_t *p, uint32_t v)
{
    _ReadWriteBarrier();
    *p = v;
}

/* Acquire load of a 32-bit value.
 *
 * p: source word in the shared region.
 * Returns the value read from *p.
 *
 * The compiler barrier follows the load so later accesses cannot be hoisted
 * above it by the compiler; ordering at the hardware level is left to x86-TSO.
 */
static inline uint32_t vgpu_load_acquire32(const volatile uint32_t *p)
{
    uint32_t v = *p;
    _ReadWriteBarrier();
    return v;
}

#else /* gcc / mingw / clang */

/* Compiler-only barrier: an empty asm statement with a "memory" clobber, so
 * the compiler may not move memory accesses across this point. */
static inline void vgpu_compiler_barrier(void)
{
    __asm__ __volatile__("" ::: "memory");
}

/* Store fence: issues SFENCE; the "memory" clobber makes it a compiler
 * barrier as well. Intended for publish boundaries (see the header comment). */
static inline void vgpu_sfence(void)
{
    __asm__ __volatile__("sfence" ::: "memory");
}

/* Release store of a 32-bit value.
 *
 * p: destination word in the shared region.
 * v: value to publish.
 */
static inline void vgpu_store_release32(volatile uint32_t *p, uint32_t v)
{
    __atomic_store_n(p, v, __ATOMIC_RELEASE);
}

/* Acquire load of a 32-bit value.
 *
 * p: source word in the shared region.
 * Returns the value read from *p.
 */
static inline uint32_t vgpu_load_acquire32(const volatile uint32_t *p)
{
    return __atomic_load_n(p, __ATOMIC_ACQUIRE);
}

#endif

#endif /* VGPU_ATOMIC_SHIM_H */