From 4015e839ebca4304ae46116a31d7080a9fff29ee Mon Sep 17 00:00:00 2001 From: Gregory Lirent Date: Mon, 15 Jun 2026 00:58:27 +0300 Subject: [PATCH] Zero-copy hot path, correctness hardening MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gva_ptr: leaf-bounded zero-copy guest reads. gva_sweep redesigned to drive on it — large-page leaves are lent to the callback while 4K runs stay buffered, and the run loop is guarded against wrap at the top of the address space. gva_gpa fetches PTEs zero-copy; an optional W32MS_LTO build option folds the per-fetch call boundary (shipped -O2 default unchanged). Correctness: subtract-form bounds check (no add overflow), memcpy decode in place of type-punned wide loads, zero-init PDB name before compare, PCI-hole-crossing range rejection, truncated gva_read_text output NUL-terminated at the end of the last complete UTF-8 sequence, parenthesized pr_() macro argument, single-sourced VA_CANON and USER bounds. hot/cold attributes audited across the translation and scan path. --- CMakeLists.txt | 6 +++++ src/gpa.c | 5 +++- src/gva.c | 62 ++++++++++++++++++++++++++++++-------------- src/host.c | 15 +++++------ src/include/memory.h | 9 +++++++ src/proc.c | 2 +- src/profile.c | 6 +++-- src/sigscan.c | 1 + src/text.c | 17 ++++++------ 9 files changed, 84 insertions(+), 39 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4f77163..62981f4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,8 @@ set(CMAKE_C_STANDARD 17) # generation B uses no C23 feature set(CMAKE_C_STANDARD_REQUIRED ON) set(CMAKE_C_EXTENSIONS ON) # deliberate: strnlen (POSIX) + void* arithmetic (GNU) +option(W32MS_LTO "Enable LTO" OFF) # build-only; shipped default is -O2, no LTO + # ---- host: VMI core as a static library --------------------------------- add_library(w32ms STATIC src/gpa.c @@ -19,6 +21,10 @@ target_include_directories(w32ms PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include # public API: include/*.h PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) # private: src/include/*.h via "include/..."
target_compile_options(w32ms PRIVATE -O2 -Wall -Wextra) +if(W32MS_LTO) + target_compile_options(w32ms PRIVATE -flto) + target_link_options(w32ms PRIVATE -flto) +endif() # ---- host: CLI demonstrator over the library ---------------------------- add_executable(w32ms_cli src/cli.c) diff --git a/src/gpa.c b/src/gpa.c index fdb4fd9..99edbea 100644 --- a/src/gpa.c +++ b/src/gpa.c @@ -35,7 +35,9 @@ static void clean_ctx(gpa_ctx* ctx) { } static int out_of_bounds(gpa_ctx* ctx, uintptr_t* offs, const size_t nmemb) { - return gpa_offset(ctx, *offs, offs) || *offs + nmemb > ctx->fsize; + return gpa_offset(ctx, *offs, offs) + || nmemb > ctx->fsize - *offs + || (*offs < ctx->low && nmemb > ctx->low - *offs); /* range crosses split */ } __attribute__((hot)) @@ -57,6 +59,7 @@ int gpa_write(gpa_ctx* ctx, uintptr_t offs, const void* src, const size_t nmemb) /* Zero-copy host pointer to [offs, offs+nmemb) GPA, or NULL if that range is not * fully backed by the mapped image. Same split + bounds check as gpa_read. */ +__attribute__((hot)) void* gpa_ptr(gpa_ctx* ctx, uintptr_t offs, const size_t nmemb) { if (out_of_bounds(ctx, &offs, nmemb)) { return NULL; diff --git a/src/gva.c b/src/gva.c index 2f685b0..6b6921d 100644 --- a/src/gva.c +++ b/src/gva.c @@ -5,9 +5,6 @@ #include "include/memory.h" #include "../include/include.h" -/* sign-extend a 48-bit canonical VA */ -#define VA_CANON(v) (((v) & (1ull << 47)) ? ((v) | 0xFFFF000000000000ull) : (v)) - /* PTE permission bits we propagate down the walk. */ #define PTE_RW (1ull << 1) #define PTE_US (1ull << 2) @@ -16,15 +13,17 @@ /* ---- single-address translation (hot) ----------------------------------- * * Translate `va` under `cr3` to a GPA. On success: *gpa = GPA of `va`, and * *leaf (if non-NULL) = bytes from `va` to the end of the containing leaf. 
*/ +__attribute__((hot)) static int gva_gpa(gva_ctx* ctx, uintptr_t cr3, uintptr_t va, uintptr_t* gpa, size_t* leaf) { uint64_t t = cr3 & PFN_MASK, e; + const uint64_t* pe; const unsigned i4 = (va >> 39) & 0x1ff, i3 = (va >> 30) & 0x1ff, i2 = (va >> 21) & 0x1ff, i1 = (va >> 12) & 0x1ff; - if (gpa_read(&p_(ctx), t + i4 * 8, &e, 8) || !(e & PG_P)) return -1; + if (!(pe = gpa_ptr(&p_(ctx), t + i4 * 8, 8)) || !((e = *pe) & PG_P)) return -1; t = e & PFN_MASK; - if (gpa_read(&p_(ctx), t + i3 * 8, &e, 8) || !(e & PG_P)) return -1; + if (!(pe = gpa_ptr(&p_(ctx), t + i3 * 8, 8)) || !((e = *pe) & PG_P)) return -1; if (e & PG_PS) { /* 1 GiB leaf */ const uint64_t off = va & 0x3FFFFFFF; *gpa = (e & PFN_MASK & ~0x3FFFFFFFull) + off; @@ -32,7 +31,7 @@ static int gva_gpa(gva_ctx* ctx, uintptr_t cr3, uintptr_t va, return 0; } t = e & PFN_MASK; - if (gpa_read(&p_(ctx), t + i2 * 8, &e, 8) || !(e & PG_P)) return -1; + if (!(pe = gpa_ptr(&p_(ctx), t + i2 * 8, 8)) || !((e = *pe) & PG_P)) return -1; if (e & PG_PS) { /* 2 MiB leaf */ const uint64_t off = va & 0x1FFFFF; *gpa = (e & PFN_MASK & ~0x1FFFFFull) + off; @@ -40,13 +39,22 @@ static int gva_gpa(gva_ctx* ctx, uintptr_t cr3, uintptr_t va, return 0; } t = e & PFN_MASK; - if (gpa_read(&p_(ctx), t + i1 * 8, &e, 8) || !(e & PG_P)) return -1; + if (!(pe = gpa_ptr(&p_(ctx), t + i1 * 8, 8)) || !((e = *pe) & PG_P)) return -1; const uint64_t off = va & 0xFFF; /* 4 KiB leaf */ *gpa = (e & PFN_MASK) + off; if (leaf) *leaf = 0x1000 - off; return 0; } +/* zero-copy borrowed read: leaf-bounded host pointer at `va` (see memory.h). 
*/ +__attribute__((hot)) +const void* gva_ptr(gva_ctx* ctx, uintptr_t cr3, uintptr_t va, size_t* avail) { + uintptr_t gpa; size_t leaf; + if (gva_gpa(ctx, cr3, va, &gpa, &leaf)) return NULL; + *avail = leaf; + return gpa_ptr(&p_(ctx), gpa, leaf); +} + __attribute__((hot)) int gva_read(gva_ctx* ctx, uintptr_t cr3, uintptr_t va, void* dst, size_t nmemb) { uint8_t* d = dst; @@ -169,6 +177,7 @@ static int rgn_hit(uint64_t base, uint64_t span, uint64_t lo, uint64_t hi) { return !(end < lo || base > hi); } +__attribute__((hot)) int gva_regions(gva_ctx* ctx, uintptr_t cr3, uint64_t lo, uint64_t hi, uint32_t prot_any, vregion* out, int nmax) { if (nmax <= 0) return 0; @@ -230,6 +239,7 @@ int gva_regions(gva_ctx* ctx, uintptr_t cr3, uint64_t lo, uint64_t hi, #define SWEEP_WIN (1u << 20) /* 1 MiB window (multiple of 8) */ #define SWEEP_RMAX (1u << 16) /* max runs enumerated per sweep */ +__attribute__((hot)) int gva_sweep(gva_ctx* ctx, uintptr_t cr3, uint64_t lo, uint64_t hi, uint32_t prot_any, size_t overlap, gva_sweep_cb cb, void* user) { if (overlap >= SWEEP_WIN) return -1; @@ -245,26 +255,40 @@ int gva_sweep(gva_ctx* ctx, uintptr_t cr3, uint64_t lo, uint64_t hi, for (int r = 0; r < nr && !rc; r++) { uint64_t base = rg[r].va; /* VA of buf[0] */ uint64_t va = rg[r].va; - const uint64_t vend = rg[r].va + rg[r].len; + const uint64_t vend = rg[r].va + (rg[r].len - 1); /* inclusive last */ size_t fill = 0; - while (va < vend) { - size_t pg = 0x1000 - (size_t)(va & 0xFFF); /* to page edge */ - if (pg > (size_t)(vend - va)) pg = (size_t)(vend - va); - if (pg > SWEEP_WIN - fill) pg = SWEEP_WIN - fill; - - if (gva_read(ctx, cr3, va, buf + fill, pg)) { /* gap: flush+skip */ + while (va <= vend) { + size_t avail; + const uint8_t* p = gva_ptr(ctx, cr3, va, &avail); + if (!p) { /* gap: flush+skip */ if (fill && cb(user, buf, fill, base, overlap, 1)) { rc = 1; break; } + if (vend - va < 0x1000 - (va & 0xFFF)) break; /* skip past top: done */ va += 0x1000 - (va & 0xFFF); base = va; fill 
= 0; continue; } - fill += pg; va += pg; + size_t n = avail; /* leaf-contiguous */ + if (n > (size_t)(vend - va + 1)) n = (size_t)(vend - va + 1); + if (n > SWEEP_WIN - fill) n = SWEEP_WIN - fill; + const int end = (n == (size_t)(vend - va + 1)); /* chunk hits vend */ - if (fill == SWEEP_WIN) { - const int last = (va >= vend); - if (cb(user, buf, fill, base, overlap, last)) { rc = 1; break; } - if (last || overlap == 0 || overlap >= fill) { + if (fill == 0 && avail > 0x1000) { /* large-page lend */ + if (cb(user, p, n, va, 0, end)) { rc = 1; break; } + if (end) break; /* avoid va wrap */ + va += n; + if (overlap == 0) base = va; + else { memcpy(buf, p + n - overlap, overlap); base = va - overlap; fill = overlap; } + continue; + } + + memcpy(buf + fill, p, n); /* buffered window */ + fill += n; va += n; + + if (end || fill == SWEEP_WIN) { + if (cb(user, buf, fill, base, overlap, end)) { rc = 1; break; } + if (end) { fill = 0; break; } /* avoid va wrap */ + if (overlap == 0 || overlap >= fill) { base = va; fill = 0; } else { /* carry overlap */ memmove(buf, buf + fill - overlap, overlap); diff --git a/src/host.c b/src/host.c index 78982d3..8ae0154 100644 --- a/src/host.c +++ b/src/host.c @@ -100,11 +100,9 @@ static int find_ntoskrnl(gva_ctx* ctx, uintptr_t cr3, uint64_t* base, uint8_t gu } uint64_t va = (uint64_t)p4<<39 | (uint64_t)p3<<30 | (uint64_t)p2<<21; - if (va & (1ull<<47)) { - va |= 0xFFFF000000000000ull; /* canonical sign-extend */ - } + va = VA_CANON(va); - uint16_t mz; char pdb[16]; + uint16_t mz; char pdb[16] = {0}; if (gva_read(ctx, cr3, va, &mz, 2) || mz != MZ) { continue; } @@ -132,10 +130,11 @@ static uint32_t ko_export_rva(gva_ctx* ctx, uintptr_t cr3, uint64_t kbase, const if (gva_read(ctx, cr3, kbase + exp_rva, ed, sizeof ed)) { return 0; } - const uint32_t nnames = *(uint32_t*)(ed + 0x18); - const uint32_t a_funcs = *(uint32_t*)(ed + 0x1C); - const uint32_t a_names = *(uint32_t*)(ed + 0x20); - const uint32_t a_ords = *(uint32_t*)(ed + 0x24); + 
uint32_t nnames, a_funcs, a_names, a_ords; + memcpy(&nnames, ed + 0x18, 4); + memcpy(&a_funcs, ed + 0x1C, 4); + memcpy(&a_names, ed + 0x20, 4); + memcpy(&a_ords, ed + 0x24, 4); for (uint32_t i = 0; i < nnames; i++) { uint32_t nrva; char nm[40]; diff --git a/src/include/memory.h b/src/include/memory.h index b945e46..e4e1677 100644 --- a/src/include/memory.h +++ b/src/include/memory.h @@ -10,6 +10,9 @@ struct gva_ctx; /* forward: completed below; lets profile.h name it * #define PG_P 0x1ull #define PG_PS 0x80ull +/* sign-extend a 48-bit canonical VA */ +#define VA_CANON(v) (((v) & (1ull << 47)) ? ((v) | 0xFFFF000000000000ull) : (v)) + /* Canonical VA window bounds, single-sourced here for every scanning TU. * USER_MIN is 0x10000: Windows reserves the low 64 KiB, so no live user pointer * targets below it - starting there drops a class of false positives. */ @@ -59,6 +62,12 @@ int gpa_write(gpa_ctx* ctx, uintptr_t offs, const void* src, size_t nmemb); * whole leaf (or a 4096-byte page table) can be taken in one call. */ void* gpa_ptr(gpa_ctx* ctx, uintptr_t offs, size_t nmemb); +/* Zero-copy borrowed read: host pointer to the guest byte at `va` (under `cr3`), + * valid for *avail contiguous bytes (to the end of the containing leaf). NULL if + * `va` is not mapped or the leaf is not fully covered by the image (caller falls + * back to gva_read). Borrowed: valid until gva_ctx_free, do NOT retain/free. 
*/ +const void* gva_ptr(gva_ctx* ctx, uintptr_t cr3, uintptr_t va, size_t* avail) __attribute__((hot)); + /* bootstrap helpers (gva.c) */ int khalf_score(const gva_ctx* ctx, uint64_t pml4) __attribute__((cold)); int cr3_recover(gva_ctx* ctx, uint64_t va_self, uint64_t target_pa, uintptr_t* cr3_out) __attribute__((cold)); diff --git a/src/proc.c b/src/proc.c index 235a3f4..64041e6 100644 --- a/src/proc.c +++ b/src/proc.c @@ -4,7 +4,7 @@ #include "include/memory.h" #include "../include/include.h" -#define pr_(ctx) (ctx->prof) +#define pr_(ctx) ((ctx)->prof) #define RING_GUARD 100000u #define MOD_GUARD 4096u diff --git a/src/profile.c b/src/profile.c index 20312fa..c76e65c 100644 --- a/src/profile.c +++ b/src/profile.c @@ -53,7 +53,8 @@ static int discover_core(gva_ctx* ctx, uintptr_t cr3, uint64_t sys_ep) { int pid_off = -1; for (int o = 0x80; o + 8 <= name_off; o += 8) { - if (*(uint64_t*)(buf + o) != 4) { + uint64_t v; memcpy(&v, buf + o, 8); + if (v != 4) { continue; } const uint16_t links = (uint16_t)(o + 8); @@ -70,7 +71,8 @@ static int discover_core(gva_ctx* ctx, uintptr_t cr3, uint64_t sys_ep) { int dtb_off = -1; for (int o = 0x18; o <= 0x60; o += 8) { - const uint64_t c = *(uint64_t*)(buf + o) & PFN_MASK; + uint64_t v; memcpy(&v, buf + o, 8); + const uint64_t c = v & PFN_MASK; uint8_t probe; if (c && khalf_score(ctx, c) >= 16 && !gva_read(ctx, c, sys_ep, &probe, 1)) { dtb_off = o; diff --git a/src/sigscan.c b/src/sigscan.c index a908a26..f141ee8 100644 --- a/src/sigscan.c +++ b/src/sigscan.c @@ -72,6 +72,7 @@ void sig_free(sig_pattern_t* p) { p->bytes = p->mask = NULL; p->len = 0; } +__attribute__((hot)) void sig_each(mem_view_t v, const sig_pattern_t* p, int (*cb)(void*, uint64_t), void* user) { if (!v.data || !p || p->len == 0 || v.size < p->len) return; diff --git a/src/text.c b/src/text.c index 4ba11f0..b1ed11a 100644 --- a/src/text.c +++ b/src/text.c @@ -3,7 +3,7 @@ #include "include/memory.h" #include "../include/include.h" -static void 
utf8_emit(uint32_t cp, char* dst, size_t size, size_t* need) { +static void utf8_emit(uint32_t cp, char* dst, size_t size, size_t* need, size_t* wrote) { uint8_t b[4]; size_t k; if (cp < 0x80) { b[0]=(uint8_t)cp; k=1; } else if (cp < 0x800) { b[0]=0xC0|(uint8_t)(cp>>6); b[1]=0x80|(cp&0x3F); k=2; } @@ -11,12 +11,13 @@ static void utf8_emit(uint32_t cp, char* dst, size_t size, size_t* need) { else { b[0]=0xF0|(uint8_t)(cp>>18); b[1]=0x80|((cp>>12)&0x3F); b[2]=0x80|((cp>>6)&0x3F); b[3]=0x80|(cp&0x3F); k=4; } if (dst && *need + k < size) { for (size_t j = 0; j < k; j++) dst[*need + j] = (char)b[j]; + *wrote = *need + k; /* end of last full code point */ } *need += k; } size_t gva_read_text(gva_ctx* ctx, uintptr_t cr3, uintptr_t va, size_t nmemb, char* dst, size_t size) { - size_t need = 0; + size_t need = 0, wrote = 0; uint16_t stage[256]; uint32_t hi = 0; nmemb &= ~(size_t)1; @@ -31,21 +32,21 @@ size_t gva_read_text(gva_ctx* ctx, uintptr_t cr3, uintptr_t va, size_t nmemb, ch uint32_t u = stage[i]; if (hi) { if (u >= 0xDC00 && u <= 0xDFFF) { - utf8_emit(0x10000u + ((hi - 0xD800u) << 10) + (u - 0xDC00u), dst, size, &need); + utf8_emit(0x10000u + ((hi - 0xD800u) << 10) + (u - 0xDC00u), dst, size, &need, &wrote); hi = 0; continue; } - utf8_emit(0xFFFD, dst, size, &need); + utf8_emit(0xFFFD, dst, size, &need, &wrote); hi = 0; } if (u >= 0xD800 && u <= 0xDBFF) hi = u; - else if (u >= 0xDC00 && u <= 0xDFFF) utf8_emit(0xFFFD, dst, size, &need); - else utf8_emit(u, dst, size, &need); + else if (u >= 0xDC00 && u <= 0xDFFF) utf8_emit(0xFFFD, dst, size, &need, &wrote); + else utf8_emit(u, dst, size, &need, &wrote); } va += chunk; nmemb -= chunk; } - if (hi) utf8_emit(0xFFFD, dst, size, &need); - if (dst && size) dst[need < size ? need : size - 1] = 0; + if (hi) utf8_emit(0xFFFD, dst, size, &need, &wrote); + if (dst && size) dst[need < size ? need : wrote] = 0; return need; } \ No newline at end of file