Zero-copy hot path, correctness hardening

gva_ptr: leaf-bounded zero-copy guest reads. gva_sweep is redesigned to build
on it: large-page leaves are lent directly to the callback while 4K runs stay
buffered, and the run loop is guarded against wrap-around at the top of the
address space. gva_gpa now fetches PTEs zero-copy; the new W32MS_LTO build
option (OFF by default) lets LTO fold the per-fetch call boundary (the shipped
-O2 default is unchanged).

Correctness: subtract-form bounds check (no overflow from addition), memcpy
decode in place of type-punned wide loads, zero-init of the PDB name before
compare, rejection of ranges that cross the PCI hole, and single-sourced
VA_CANON and USER bounds.
Hot/cold attributes were audited across the translation and scan path.
This commit is contained in:
2026-06-15 00:58:27 +03:00
parent 1ec70b7ede
commit 4015e839eb
9 changed files with 84 additions and 39 deletions
+6
View File
@@ -5,6 +5,8 @@ set(CMAKE_C_STANDARD 17) # generation B uses no C23 feature
set(CMAKE_C_STANDARD_REQUIRED ON) set(CMAKE_C_STANDARD_REQUIRED ON)
set(CMAKE_C_EXTENSIONS ON) # deliberate: strnlen (POSIX) + void* arithmetic (GNU) set(CMAKE_C_EXTENSIONS ON) # deliberate: strnlen (POSIX) + void* arithmetic (GNU)
option(W32MS_LTO "Enable LTO" OFF) # build-only; shipped default is -O2, no LTO
# ---- host: VMI core as a static library --------------------------------- # ---- host: VMI core as a static library ---------------------------------
add_library(w32ms STATIC add_library(w32ms STATIC
src/gpa.c src/gpa.c
@@ -19,6 +21,10 @@ target_include_directories(w32ms
PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include # public API: include/*.h PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include # public API: include/*.h
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) # private: src/include/*.h via "include/..." PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) # private: src/include/*.h via "include/..."
target_compile_options(w32ms PRIVATE -O2 -Wall -Wextra) target_compile_options(w32ms PRIVATE -O2 -Wall -Wextra)
if(W32MS_LTO)
target_compile_options(w32ms PRIVATE -flto)
target_link_options(w32ms PRIVATE -flto)
endif()
# ---- host: CLI demonstrator over the library ---------------------------- # ---- host: CLI demonstrator over the library ----------------------------
add_executable(w32ms_cli src/cli.c) add_executable(w32ms_cli src/cli.c)
+4 -1
View File
@@ -35,7 +35,9 @@ static void clean_ctx(gpa_ctx* ctx) {
} }
static int out_of_bounds(gpa_ctx* ctx, uintptr_t* offs, const size_t nmemb) { static int out_of_bounds(gpa_ctx* ctx, uintptr_t* offs, const size_t nmemb) {
return gpa_offset(ctx, *offs, offs) || *offs + nmemb > ctx->fsize; return gpa_offset(ctx, *offs, offs)
|| nmemb > ctx->fsize - *offs
|| (*offs < ctx->low && nmemb > ctx->low - *offs); /* range crosses split */
} }
__attribute__((hot)) __attribute__((hot))
@@ -57,6 +59,7 @@ int gpa_write(gpa_ctx* ctx, uintptr_t offs, const void* src, const size_t nmemb)
/* Zero-copy host pointer to [offs, offs+nmemb) GPA, or NULL if that range is not /* Zero-copy host pointer to [offs, offs+nmemb) GPA, or NULL if that range is not
* fully backed by the mapped image. Same split + bounds check as gpa_read. */ * fully backed by the mapped image. Same split + bounds check as gpa_read. */
__attribute__((hot))
void* gpa_ptr(gpa_ctx* ctx, uintptr_t offs, const size_t nmemb) { void* gpa_ptr(gpa_ctx* ctx, uintptr_t offs, const size_t nmemb) {
if (out_of_bounds(ctx, &offs, nmemb)) { if (out_of_bounds(ctx, &offs, nmemb)) {
return NULL; return NULL;
+43 -19
View File
@@ -5,9 +5,6 @@
#include "include/memory.h" #include "include/memory.h"
#include "../include/include.h" #include "../include/include.h"
/* sign-extend a 48-bit canonical VA */
#define VA_CANON(v) (((v) & (1ull << 47)) ? ((v) | 0xFFFF000000000000ull) : (v))
/* PTE permission bits we propagate down the walk. */ /* PTE permission bits we propagate down the walk. */
#define PTE_RW (1ull << 1) #define PTE_RW (1ull << 1)
#define PTE_US (1ull << 2) #define PTE_US (1ull << 2)
@@ -16,15 +13,17 @@
/* ---- single-address translation (hot) ----------------------------------- * /* ---- single-address translation (hot) ----------------------------------- *
* Translate `va` under `cr3` to a GPA. On success: *gpa = GPA of `va`, and * Translate `va` under `cr3` to a GPA. On success: *gpa = GPA of `va`, and
* *leaf (if non-NULL) = bytes from `va` to the end of the containing leaf. */ * *leaf (if non-NULL) = bytes from `va` to the end of the containing leaf. */
__attribute__((hot))
static int gva_gpa(gva_ctx* ctx, uintptr_t cr3, uintptr_t va, static int gva_gpa(gva_ctx* ctx, uintptr_t cr3, uintptr_t va,
uintptr_t* gpa, size_t* leaf) { uintptr_t* gpa, size_t* leaf) {
uint64_t t = cr3 & PFN_MASK, e; uint64_t t = cr3 & PFN_MASK, e;
const uint64_t* pe;
const unsigned i4 = (va >> 39) & 0x1ff, i3 = (va >> 30) & 0x1ff, const unsigned i4 = (va >> 39) & 0x1ff, i3 = (va >> 30) & 0x1ff,
i2 = (va >> 21) & 0x1ff, i1 = (va >> 12) & 0x1ff; i2 = (va >> 21) & 0x1ff, i1 = (va >> 12) & 0x1ff;
if (gpa_read(&p_(ctx), t + i4 * 8, &e, 8) || !(e & PG_P)) return -1; if (!(pe = gpa_ptr(&p_(ctx), t + i4 * 8, 8)) || !((e = *pe) & PG_P)) return -1;
t = e & PFN_MASK; t = e & PFN_MASK;
if (gpa_read(&p_(ctx), t + i3 * 8, &e, 8) || !(e & PG_P)) return -1; if (!(pe = gpa_ptr(&p_(ctx), t + i3 * 8, 8)) || !((e = *pe) & PG_P)) return -1;
if (e & PG_PS) { /* 1 GiB leaf */ if (e & PG_PS) { /* 1 GiB leaf */
const uint64_t off = va & 0x3FFFFFFF; const uint64_t off = va & 0x3FFFFFFF;
*gpa = (e & PFN_MASK & ~0x3FFFFFFFull) + off; *gpa = (e & PFN_MASK & ~0x3FFFFFFFull) + off;
@@ -32,7 +31,7 @@ static int gva_gpa(gva_ctx* ctx, uintptr_t cr3, uintptr_t va,
return 0; return 0;
} }
t = e & PFN_MASK; t = e & PFN_MASK;
if (gpa_read(&p_(ctx), t + i2 * 8, &e, 8) || !(e & PG_P)) return -1; if (!(pe = gpa_ptr(&p_(ctx), t + i2 * 8, 8)) || !((e = *pe) & PG_P)) return -1;
if (e & PG_PS) { /* 2 MiB leaf */ if (e & PG_PS) { /* 2 MiB leaf */
const uint64_t off = va & 0x1FFFFF; const uint64_t off = va & 0x1FFFFF;
*gpa = (e & PFN_MASK & ~0x1FFFFFull) + off; *gpa = (e & PFN_MASK & ~0x1FFFFFull) + off;
@@ -40,13 +39,22 @@ static int gva_gpa(gva_ctx* ctx, uintptr_t cr3, uintptr_t va,
return 0; return 0;
} }
t = e & PFN_MASK; t = e & PFN_MASK;
if (gpa_read(&p_(ctx), t + i1 * 8, &e, 8) || !(e & PG_P)) return -1; if (!(pe = gpa_ptr(&p_(ctx), t + i1 * 8, 8)) || !((e = *pe) & PG_P)) return -1;
const uint64_t off = va & 0xFFF; /* 4 KiB leaf */ const uint64_t off = va & 0xFFF; /* 4 KiB leaf */
*gpa = (e & PFN_MASK) + off; *gpa = (e & PFN_MASK) + off;
if (leaf) *leaf = 0x1000 - off; if (leaf) *leaf = 0x1000 - off;
return 0; return 0;
} }
/* Zero-copy borrowed read: leaf-bounded host pointer at `va` (see memory.h).
 *
 * Translates `va` under `cr3` with gva_gpa, then asks gpa_ptr for a host
 * pointer covering everything from `va` to the end of the containing leaf.
 *
 *   avail  out, optional: bytes from `va` to the end of the containing leaf.
 *          Written whenever translation succeeds, even if NULL is then
 *          returned because gpa_ptr rejected the range. May be NULL, matching
 *          gva_gpa's optional `leaf` out-parameter.
 *
 * Returns the borrowed pointer, or NULL if translation fails or gpa_ptr
 * rejects the leaf-sized range. */
__attribute__((hot))
const void* gva_ptr(gva_ctx* ctx, uintptr_t cr3, uintptr_t va, size_t* avail) {
    uintptr_t gpa;
    size_t leaf;
    if (gva_gpa(ctx, cr3, va, &gpa, &leaf)) {
        return NULL;                    /* translation failed */
    }
    if (avail) {                        /* out-parameter is optional */
        *avail = leaf;
    }
    /* Request the whole remainder of the leaf in one call; gpa_ptr performs
     * the bounds check and returns NULL if the range is not acceptable. */
    return gpa_ptr(&p_(ctx), gpa, leaf);
}
__attribute__((hot)) __attribute__((hot))
int gva_read(gva_ctx* ctx, uintptr_t cr3, uintptr_t va, void* dst, size_t nmemb) { int gva_read(gva_ctx* ctx, uintptr_t cr3, uintptr_t va, void* dst, size_t nmemb) {
uint8_t* d = dst; uint8_t* d = dst;
@@ -169,6 +177,7 @@ static int rgn_hit(uint64_t base, uint64_t span, uint64_t lo, uint64_t hi) {
return !(end < lo || base > hi); return !(end < lo || base > hi);
} }
__attribute__((hot))
int gva_regions(gva_ctx* ctx, uintptr_t cr3, uint64_t lo, uint64_t hi, int gva_regions(gva_ctx* ctx, uintptr_t cr3, uint64_t lo, uint64_t hi,
uint32_t prot_any, vregion* out, int nmax) { uint32_t prot_any, vregion* out, int nmax) {
if (nmax <= 0) return 0; if (nmax <= 0) return 0;
@@ -230,6 +239,7 @@ int gva_regions(gva_ctx* ctx, uintptr_t cr3, uint64_t lo, uint64_t hi,
#define SWEEP_WIN (1u << 20) /* 1 MiB window (multiple of 8) */ #define SWEEP_WIN (1u << 20) /* 1 MiB window (multiple of 8) */
#define SWEEP_RMAX (1u << 16) /* max runs enumerated per sweep */ #define SWEEP_RMAX (1u << 16) /* max runs enumerated per sweep */
__attribute__((hot))
int gva_sweep(gva_ctx* ctx, uintptr_t cr3, uint64_t lo, uint64_t hi, int gva_sweep(gva_ctx* ctx, uintptr_t cr3, uint64_t lo, uint64_t hi,
uint32_t prot_any, size_t overlap, gva_sweep_cb cb, void* user) { uint32_t prot_any, size_t overlap, gva_sweep_cb cb, void* user) {
if (overlap >= SWEEP_WIN) return -1; if (overlap >= SWEEP_WIN) return -1;
@@ -245,26 +255,40 @@ int gva_sweep(gva_ctx* ctx, uintptr_t cr3, uint64_t lo, uint64_t hi,
for (int r = 0; r < nr && !rc; r++) { for (int r = 0; r < nr && !rc; r++) {
uint64_t base = rg[r].va; /* VA of buf[0] */ uint64_t base = rg[r].va; /* VA of buf[0] */
uint64_t va = rg[r].va; uint64_t va = rg[r].va;
const uint64_t vend = rg[r].va + rg[r].len; const uint64_t vend = rg[r].va + (rg[r].len - 1); /* inclusive last */
size_t fill = 0; size_t fill = 0;
while (va < vend) { while (va <= vend) {
size_t pg = 0x1000 - (size_t)(va & 0xFFF); /* to page edge */ size_t avail;
if (pg > (size_t)(vend - va)) pg = (size_t)(vend - va); const uint8_t* p = gva_ptr(ctx, cr3, va, &avail);
if (pg > SWEEP_WIN - fill) pg = SWEEP_WIN - fill; if (!p) { /* gap: flush+skip */
if (gva_read(ctx, cr3, va, buf + fill, pg)) { /* gap: flush+skip */
if (fill && cb(user, buf, fill, base, overlap, 1)) { rc = 1; break; } if (fill && cb(user, buf, fill, base, overlap, 1)) { rc = 1; break; }
if (vend - va < 0x1000 - (va & 0xFFF)) break; /* skip past top: done */
va += 0x1000 - (va & 0xFFF); va += 0x1000 - (va & 0xFFF);
base = va; fill = 0; base = va; fill = 0;
continue; continue;
} }
fill += pg; va += pg; size_t n = avail; /* leaf-contiguous */
if (n > (size_t)(vend - va + 1)) n = (size_t)(vend - va + 1);
if (n > SWEEP_WIN - fill) n = SWEEP_WIN - fill;
const int end = (n == (size_t)(vend - va + 1)); /* chunk hits vend */
if (fill == SWEEP_WIN) { if (fill == 0 && avail > 0x1000) { /* large-page lend */
const int last = (va >= vend); if (cb(user, p, n, va, 0, end)) { rc = 1; break; }
if (cb(user, buf, fill, base, overlap, last)) { rc = 1; break; } if (end) break; /* avoid va wrap */
if (last || overlap == 0 || overlap >= fill) { va += n;
if (overlap == 0) base = va;
else { memcpy(buf, p + n - overlap, overlap); base = va - overlap; fill = overlap; }
continue;
}
memcpy(buf + fill, p, n); /* buffered window */
fill += n; va += n;
if (end || fill == SWEEP_WIN) {
if (cb(user, buf, fill, base, overlap, end)) { rc = 1; break; }
if (end) { fill = 0; break; } /* avoid va wrap */
if (overlap == 0 || overlap >= fill) {
base = va; fill = 0; base = va; fill = 0;
} else { /* carry overlap */ } else { /* carry overlap */
memmove(buf, buf + fill - overlap, overlap); memmove(buf, buf + fill - overlap, overlap);
+7 -8
View File
@@ -100,11 +100,9 @@ static int find_ntoskrnl(gva_ctx* ctx, uintptr_t cr3, uint64_t* base, uint8_t gu
} }
uint64_t va = (uint64_t)p4<<39 | (uint64_t)p3<<30 | (uint64_t)p2<<21; uint64_t va = (uint64_t)p4<<39 | (uint64_t)p3<<30 | (uint64_t)p2<<21;
if (va & (1ull<<47)) { va = VA_CANON(va);
va |= 0xFFFF000000000000ull; /* canonical sign-extend */
}
uint16_t mz; char pdb[16]; uint16_t mz; char pdb[16] = {0};
if (gva_read(ctx, cr3, va, &mz, 2) || mz != MZ) { if (gva_read(ctx, cr3, va, &mz, 2) || mz != MZ) {
continue; continue;
} }
@@ -132,10 +130,11 @@ static uint32_t ko_export_rva(gva_ctx* ctx, uintptr_t cr3, uint64_t kbase, const
if (gva_read(ctx, cr3, kbase + exp_rva, ed, sizeof ed)) { if (gva_read(ctx, cr3, kbase + exp_rva, ed, sizeof ed)) {
return 0; return 0;
} }
const uint32_t nnames = *(uint32_t*)(ed + 0x18); uint32_t nnames, a_funcs, a_names, a_ords;
const uint32_t a_funcs = *(uint32_t*)(ed + 0x1C); memcpy(&nnames, ed + 0x18, 4);
const uint32_t a_names = *(uint32_t*)(ed + 0x20); memcpy(&a_funcs, ed + 0x1C, 4);
const uint32_t a_ords = *(uint32_t*)(ed + 0x24); memcpy(&a_names, ed + 0x20, 4);
memcpy(&a_ords, ed + 0x24, 4);
for (uint32_t i = 0; i < nnames; i++) { for (uint32_t i = 0; i < nnames; i++) {
uint32_t nrva; char nm[40]; uint32_t nrva; char nm[40];
+9
View File
@@ -10,6 +10,9 @@ struct gva_ctx; /* forward: completed below; lets profile.h name it *
#define PG_P 0x1ull #define PG_P 0x1ull
#define PG_PS 0x80ull #define PG_PS 0x80ull
/* sign-extend a 48-bit canonical VA */
#define VA_CANON(v) (((v) & (1ull << 47)) ? ((v) | 0xFFFF000000000000ull) : (v))
/* Canonical VA window bounds, single-sourced here for every scanning TU. /* Canonical VA window bounds, single-sourced here for every scanning TU.
* USER_MIN is 0x10000: Windows reserves the low 64 KiB, so no live user pointer * USER_MIN is 0x10000: Windows reserves the low 64 KiB, so no live user pointer
* targets below it - starting there drops a class of false positives. */ * targets below it - starting there drops a class of false positives. */
@@ -59,6 +62,12 @@ int gpa_write(gpa_ctx* ctx, uintptr_t offs, const void* src, size_t nmemb);
* whole leaf (or a 4096-byte page table) can be taken in one call. */ * whole leaf (or a 4096-byte page table) can be taken in one call. */
void* gpa_ptr(gpa_ctx* ctx, uintptr_t offs, size_t nmemb); void* gpa_ptr(gpa_ctx* ctx, uintptr_t offs, size_t nmemb);
/* Zero-copy borrowed read: host pointer to the guest byte at `va` (under `cr3`),
* valid for *avail contiguous bytes (to the end of the containing leaf). NULL if
* `va` is not mapped or the leaf is not fully covered by the image (caller falls
* back to gva_read). Borrowed: valid until gva_ctx_free, do NOT retain/free. */
const void* gva_ptr(gva_ctx* ctx, uintptr_t cr3, uintptr_t va, size_t* avail) __attribute__((hot));
/* bootstrap helpers (gva.c) */ /* bootstrap helpers (gva.c) */
int khalf_score(const gva_ctx* ctx, uint64_t pml4) __attribute__((cold)); int khalf_score(const gva_ctx* ctx, uint64_t pml4) __attribute__((cold));
int cr3_recover(gva_ctx* ctx, uint64_t va_self, uint64_t target_pa, uintptr_t* cr3_out) __attribute__((cold)); int cr3_recover(gva_ctx* ctx, uint64_t va_self, uint64_t target_pa, uintptr_t* cr3_out) __attribute__((cold));
+1 -1
View File
@@ -4,7 +4,7 @@
#include "include/memory.h" #include "include/memory.h"
#include "../include/include.h" #include "../include/include.h"
#define pr_(ctx) (ctx->prof) #define pr_(ctx) ((ctx)->prof)
#define RING_GUARD 100000u #define RING_GUARD 100000u
#define MOD_GUARD 4096u #define MOD_GUARD 4096u
+4 -2
View File
@@ -53,7 +53,8 @@ static int discover_core(gva_ctx* ctx, uintptr_t cr3, uint64_t sys_ep) {
int pid_off = -1; int pid_off = -1;
for (int o = 0x80; o + 8 <= name_off; o += 8) { for (int o = 0x80; o + 8 <= name_off; o += 8) {
if (*(uint64_t*)(buf + o) != 4) { uint64_t v; memcpy(&v, buf + o, 8);
if (v != 4) {
continue; continue;
} }
const uint16_t links = (uint16_t)(o + 8); const uint16_t links = (uint16_t)(o + 8);
@@ -70,7 +71,8 @@ static int discover_core(gva_ctx* ctx, uintptr_t cr3, uint64_t sys_ep) {
int dtb_off = -1; int dtb_off = -1;
for (int o = 0x18; o <= 0x60; o += 8) { for (int o = 0x18; o <= 0x60; o += 8) {
const uint64_t c = *(uint64_t*)(buf + o) & PFN_MASK; uint64_t v; memcpy(&v, buf + o, 8);
const uint64_t c = v & PFN_MASK;
uint8_t probe; uint8_t probe;
if (c && khalf_score(ctx, c) >= 16 && !gva_read(ctx, c, sys_ep, &probe, 1)) { if (c && khalf_score(ctx, c) >= 16 && !gva_read(ctx, c, sys_ep, &probe, 1)) {
dtb_off = o; dtb_off = o;
+1
View File
@@ -72,6 +72,7 @@ void sig_free(sig_pattern_t* p) {
p->bytes = p->mask = NULL; p->len = 0; p->bytes = p->mask = NULL; p->len = 0;
} }
__attribute__((hot))
void sig_each(mem_view_t v, const sig_pattern_t* p, void sig_each(mem_view_t v, const sig_pattern_t* p,
int (*cb)(void*, uint64_t), void* user) { int (*cb)(void*, uint64_t), void* user) {
if (!v.data || !p || p->len == 0 || v.size < p->len) return; if (!v.data || !p || p->len == 0 || v.size < p->len) return;
+9 -8
View File
@@ -3,7 +3,7 @@
#include "include/memory.h" #include "include/memory.h"
#include "../include/include.h" #include "../include/include.h"
static void utf8_emit(uint32_t cp, char* dst, size_t size, size_t* need) { static void utf8_emit(uint32_t cp, char* dst, size_t size, size_t* need, size_t* wrote) {
uint8_t b[4]; size_t k; uint8_t b[4]; size_t k;
if (cp < 0x80) { b[0]=(uint8_t)cp; k=1; } if (cp < 0x80) { b[0]=(uint8_t)cp; k=1; }
else if (cp < 0x800) { b[0]=0xC0|(uint8_t)(cp>>6); b[1]=0x80|(cp&0x3F); k=2; } else if (cp < 0x800) { b[0]=0xC0|(uint8_t)(cp>>6); b[1]=0x80|(cp&0x3F); k=2; }
@@ -11,12 +11,13 @@ static void utf8_emit(uint32_t cp, char* dst, size_t size, size_t* need) {
else { b[0]=0xF0|(uint8_t)(cp>>18); b[1]=0x80|((cp>>12)&0x3F); b[2]=0x80|((cp>>6)&0x3F); b[3]=0x80|(cp&0x3F); k=4; } else { b[0]=0xF0|(uint8_t)(cp>>18); b[1]=0x80|((cp>>12)&0x3F); b[2]=0x80|((cp>>6)&0x3F); b[3]=0x80|(cp&0x3F); k=4; }
if (dst && *need + k < size) { if (dst && *need + k < size) {
for (size_t j = 0; j < k; j++) dst[*need + j] = (char)b[j]; for (size_t j = 0; j < k; j++) dst[*need + j] = (char)b[j];
*wrote = *need + k; /* end of last full code point */
} }
*need += k; *need += k;
} }
size_t gva_read_text(gva_ctx* ctx, uintptr_t cr3, uintptr_t va, size_t nmemb, char* dst, size_t size) { size_t gva_read_text(gva_ctx* ctx, uintptr_t cr3, uintptr_t va, size_t nmemb, char* dst, size_t size) {
size_t need = 0; size_t need = 0, wrote = 0;
uint16_t stage[256]; uint16_t stage[256];
uint32_t hi = 0; uint32_t hi = 0;
nmemb &= ~(size_t)1; nmemb &= ~(size_t)1;
@@ -31,21 +32,21 @@ size_t gva_read_text(gva_ctx* ctx, uintptr_t cr3, uintptr_t va, size_t nmemb, ch
uint32_t u = stage[i]; uint32_t u = stage[i];
if (hi) { if (hi) {
if (u >= 0xDC00 && u <= 0xDFFF) { if (u >= 0xDC00 && u <= 0xDFFF) {
utf8_emit(0x10000u + ((hi - 0xD800u) << 10) + (u - 0xDC00u), dst, size, &need); utf8_emit(0x10000u + ((hi - 0xD800u) << 10) + (u - 0xDC00u), dst, size, &need, &wrote);
hi = 0; hi = 0;
continue; continue;
} }
utf8_emit(0xFFFD, dst, size, &need); utf8_emit(0xFFFD, dst, size, &need, &wrote);
hi = 0; hi = 0;
} }
if (u >= 0xD800 && u <= 0xDBFF) hi = u; if (u >= 0xD800 && u <= 0xDBFF) hi = u;
else if (u >= 0xDC00 && u <= 0xDFFF) utf8_emit(0xFFFD, dst, size, &need); else if (u >= 0xDC00 && u <= 0xDFFF) utf8_emit(0xFFFD, dst, size, &need, &wrote);
else utf8_emit(u, dst, size, &need); else utf8_emit(u, dst, size, &need, &wrote);
} }
va += chunk; va += chunk;
nmemb -= chunk; nmemb -= chunk;
} }
if (hi) utf8_emit(0xFFFD, dst, size, &need); if (hi) utf8_emit(0xFFFD, dst, size, &need, &wrote);
if (dst && size) dst[need < size ? need : size - 1] = 0; if (dst && size) dst[need < size ? need : wrote] = 0;
return need; return need;
} }