From 3199fbf2584bac77c0e281213224bf259206876e Mon Sep 17 00:00:00 2001 From: Gregory Lirent Date: Tue, 16 Jun 2026 18:11:29 +0300 Subject: [PATCH] Add a light x86-64 decoder; back code-xref with it The reversing keystone: a length-disassembly decoder with control-flow and RIP-relative target extraction (x86dec.h), pure over a byte buffer - no vmie_mem, no cr3, no Windows. Table-driven length over the 1-byte / 0F / 0F38 / 0F3A maps, legacy + REX + VEX prefixes, ModRM/SIB, displacements and immediates (66 and REX.W operand-size aware). It reports the instruction length plus the rel and RIP-relative targets of near call/jmp/jcc and any RIP-relative memory operand. EVEX is a documented gap (decodes as length 0). This is the primitive the rest of the static-reversing layer builds on (function inventory, call graph, xref). gva_code_xref now brute-scans with the decoder instead of its own ad-hoc E8/E9 and REX.W-lea heuristic, which is removed - one decoder in the tree. Because a brute scan can re-enter a prefixed instruction one byte in and decode a shorter aliased form with the same target, the scan drops a match that starts inside the extent of an already-accepted one; real, non-overlapping instructions are unaffected. 
--- CMakeLists.txt | 1 + include/scan.h | 18 +- include/x86dec.h | 56 +++++ src/handlers/codescan.c | 83 ++++---- src/handlers/x86dec.c | 454 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 560 insertions(+), 52 deletions(-) create mode 100644 include/x86dec.h create mode 100644 src/handlers/x86dec.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 3bd20fd..19763b9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,6 +21,7 @@ add_library(vmie STATIC src/handlers/sigscan.c src/handlers/sigset.c src/handlers/codescan.c + src/handlers/x86dec.c src/handlers/pmap.c src/handlers/snapdiff.c) target_include_directories(vmie diff --git a/include/scan.h b/include/scan.h index 30c00f1..0b10763 100644 --- a/include/scan.h +++ b/include/scan.h @@ -70,14 +70,16 @@ int gva_sig_scan_multi(vmie_mem* m, uintptr_t cr3, uint64_t lo, uint64_t hi, uint32_t prot_any, const sigset* s, sig_multi_hit* out, int max); -/* code-xref: every instruction in the X-regions of [lo,hi] whose rel32 operand - * targets `target_va`. Heuristic decoder (NOT a full disassembler): recognizes - * E8 call / E9 jmp (next_rip + disp32) and the RIP-relative ModRM forms - * (mod=00, rm=101) of lea/mov (REX.W 8D / 8B) where target = next_rip + - * (int32)disp. Records each matching instruction-start VA. The sweep forces - * VR_X and carries a >=15-byte overlap (max x86 instruction length) so no - * instruction is cut at a window seam. Writes up to `max` VAs to `out` (NULL to - * count only) and returns the TOTAL number of matches, or -1 on bad input. */ +/* code-xref: every instruction in the X-regions of [lo,hi] whose near rel + * branch or RIP-relative memory operand resolves to `target_va`. 
Brute-scans + * each byte offset with the light x86-64 decoder (x86dec.h, NOT a full + * disassembler): an E8/E9/EB/Jcc rel branch matches when next_rip + rel == + * target_va, and any RIP-relative operand (ModRM mod=00, rm=101) matches when + * next_rip + disp32 == target_va (this covers lea/mov and any other rip-rel + * form). Records each matching instruction-start VA. The sweep forces VR_X and + * carries a >=15-byte overlap (max x86 instruction length) so no instruction is + * cut at a window seam. Writes up to `max` VAs to `out` (NULL to count only) and + * returns the TOTAL number of matches, or -1 on bad input. */ int gva_code_xref(vmie_mem* m, uintptr_t cr3, uint64_t lo, uint64_t hi, uint64_t target_va, uint64_t* out, int max); diff --git a/include/x86dec.h b/include/x86dec.h new file mode 100644 index 0000000..4b542f0 --- /dev/null +++ b/include/x86dec.h @@ -0,0 +1,56 @@ +/* x86dec.h - light x86-64 length decoder + control-flow / RIP-relative targets. + * + * A PURE primitive: it decodes ONE 64-bit-mode instruction over a raw byte + * buffer and reports its total length plus, when present, the near control-flow + * branch displacement (rel8/rel32) and the RIP-relative memory displacement + * (ModRM mod=00, rm=101). It does NOT decode registers or full operands - just + * enough for length-disassembly and control-flow / memory target recovery + * (function inventory, call graphs, xref databases, IAT and hook detection). + * + * It touches no I/O, no allocations, and no other module: <stddef.h> / <stdint.h> + * only. Length-correct over legacy prefixes (66/67/F0/F2/F3/segment), REX, the + * 1-byte / 0F two-byte / 0F38 / 0F3A opcode maps, ModRM+SIB, disp8/disp32, the + * immediate sizes (with 66 and REX.W operand-size effects), and VEX (C4/C5). + * + * DOCUMENTED GAP: EVEX (0x62, AVX-512) is NOT decoded - it yields len=0 + * (undecodable). It is rare in ordinary user code; full EVEX support, if ever + * needed, is a separate task. 
Any byte stream that does not decode, or that + * would run past min(avail,15), also yields len=0. + */ +#ifndef VMIE_X86DEC_H +#define VMIE_X86DEC_H +#include +#include + +typedef enum { + X86_OTHER, /* no tracked control-flow effect */ + X86_CALL, /* E8 rel32, or FF /2 indirect (has_rel=0) */ + X86_JMP, /* E9 rel32 / EB rel8, or FF /4 indirect (has_rel=0) */ + X86_JCC, /* 70-7F rel8 / 0F 80-8F rel32 */ + X86_RET, /* C3 / C2 imm16 / CB / CA */ + X86_INT3 /* CC */ +} x86_flow; + +typedef struct { + uint8_t len; /* total length 1..15; 0 = undecodable / exceeds avail */ + x86_flow flow; /* control-flow class */ + int has_rel; /* 1: a rel8/rel32 branch displacement is present */ + int32_t rel; /* sign-extended branch displacement (if has_rel) */ + int has_riprel; /* 1: RIP-relative memory operand (ModRM mod=00,rm=101) */ + int32_t riprel; /* sign-extended RIP-relative disp32 (if has_riprel) */ +} x86_insn; + +/* Decode ONE 64-bit-mode instruction at `code` (`avail` readable bytes). Fills + * *out and returns the length (1..15), or 0 if the bytes do not decode or would + * exceed min(avail,15). Length-correct over: legacy prefixes (66/67/F0/F2/F3/ + * seg), REX, 1-byte / 0F two-byte / 0F38 / 0F3A maps, ModRM+SIB, disp8/disp32, + * imm8/16/32/64 (66 and REX.W operand-size effects), and VEX (C4/C5). EVEX + * (0x62) is a documented gap: len=0. */ +int x86_decode(const uint8_t* code, size_t avail, x86_insn* out); + +/* Absolute target of a rel branch: ip + insn->len + insn->rel (0 unless has_rel). */ +uint64_t x86_branch_target(uint64_t ip, const x86_insn* insn); +/* Absolute target of a RIP-relative operand: ip + insn->len + insn->riprel (0 unless has_riprel). 
*/ +uint64_t x86_riprel_target(uint64_t ip, const x86_insn* insn); + +#endif /* VMIE_X86DEC_H */ diff --git a/src/handlers/codescan.c b/src/handlers/codescan.c index a9db7c5..81a8b8b 100644 --- a/src/handlers/codescan.c +++ b/src/handlers/codescan.c @@ -1,14 +1,15 @@ -/* codescan.c - windowed multi-pattern scan + heuristic rel32 code-xref. +/* codescan.c - windowed multi-pattern scan + decoder-driven code-xref. * * Both bridges stream guest memory through gva_sweep and report guest VAs: * gva_sig_scan_multi - drives a compiled sigset over each window, seam-deduped * (overlap = longest pattern len - 1). - * gva_code_xref - heuristic decode of the rel32 instruction forms in - * X-regions; records instruction starts whose computed - * target equals target_va. Overlap >= 15 (max x86 insn - * length) keeps an instruction whole across a seam. + * gva_code_xref - brute-scans X-regions with the light x86-64 decoder + * (x86dec.h); records instruction starts whose rel branch + * or RIP-relative operand resolves to target_va. Overlap + * >= 15 (max x86 insn length) keeps an instruction whole + * across a seam. * - * Handler boundary: only memmodel.h / scan.h / sigscan.h. + * Handler boundary: only memmodel.h / scan.h / sigscan.h / x86dec.h. */ #include #include @@ -16,6 +17,7 @@ #include "memmodel.h" #include "sigscan.h" #include "scan.h" +#include "x86dec.h" /* x86-64 maximum instruction length; the code-xref sweep overlap. A decoded * instruction may be up to this long, so a window must carry this many leading @@ -65,37 +67,28 @@ int gva_sig_scan_multi(vmie_mem* m, uintptr_t cr3, uint64_t lo, uint64_t hi, return c.n; } -/* ---- heuristic rel32 code-xref ------------------------------------------- * - * Decode just enough to recover a rel32 target. 
Two recognized shapes: - * E8/E9 disp32 (call/jmp) : start+5 + disp - * REX.W 8D|8B modrm(00,*,101) disp32 (lea/mov rip) : start+7 + disp - * The lea/mov form REQUIRES the REX.W prefix (0x48..0x4F with W set), per the - * 64-bit operand RIP-relative encoding; a bare 8D/8B is not accepted (it would - * also let the decoder re-recognize the same instruction one byte past its REX - * prefix). Returns the encoded length (>=5) and writes the target via *target, - * or 0 if `p[0..avail)` is not one of the forms. */ -__attribute__((hot)) -static size_t decode_rel32(const uint8_t* p, size_t avail, - uint64_t start_va, uint64_t* target) { - if (avail >= 5 && (p[0] == 0xE8 || p[0] == 0xE9)) { - int32_t disp; memcpy(&disp, p + 1, 4); - *target = start_va + 5 + (int64_t)disp; - return 5; - } - /* REX.W prefix (0x48..0x4F: bit 3 = W), then 8D/8B with RIP-rel ModRM */ - if (avail >= 7 && (p[0] & 0xF8) == 0x48 && (p[1] == 0x8D || p[1] == 0x8B)) { - const uint8_t modrm = p[2]; - if ((modrm & 0xC0) == 0x00 && (modrm & 0x07) == 0x05) { /* mod=00 rm=101 */ - int32_t disp; memcpy(&disp, p + 3, 4); - *target = start_va + 7 + (int64_t)disp; /* rex op modrm disp32 */ - return 7; - } - } - return 0; -} - +/* ---- decoder-driven code-xref -------------------------------------------- * + * Brute-scan every byte offset of the window with the light x86-64 decoder. On + * a decoded instruction (len>0) whose rel branch (x86_branch_target) or RIP- + * relative operand (x86_riprel_target) resolves to the requested target VA, + * record the instruction's start VA. We step +1 regardless of decode length + * because function starts are not known here; the decoder rejects junk at + * non-instruction offsets far better than the old ad-hoc heuristic. 
+ * + * Two distinct de-duplications, both intrinsic to a +1 brute-scan: + * - SEAM: a match that STARTS in the trailing overlap of a non-last window is + * dropped; the next window re-presents that instruction whole in its leading + * overlap (overlap >= X86_MAX_INSN). + * - INTERIOR: a match whose start lies inside the byte extent of an already- + * accepted same-target match is dropped. Real instructions never overlap, so + * such an interior hit is always a decode artifact of stepping into a prefix + * (e.g. the bare `8D 05 disp32` lea re-found one byte past a `48 8D 05 disp32` + * REX.W lea: same next_rip and disp => same target). `cover` tracks the VA + * just past the last accepted match; offsets ascend, so the outermost (real) + * instruction is always seen first. */ struct xref_cb { uint64_t target; + uint64_t cover; /* VA just past the last accepted match */ uint64_t* out; int max, n; }; @@ -103,18 +96,20 @@ __attribute__((hot)) static int xref_sweep_cb(void* u, const uint8_t* data, size_t len, uint64_t base, size_t ov, int last) { struct xref_cb* c = u; - /* Decode at every byte offset (heuristic, overlapping). A match that STARTS - * in the trailing overlap of a non-last window is dropped: the next window - * re-presents that instruction whole in its leading overlap. */ const size_t limit = last ? len : (len > ov ? 
len - ov : 0); for (size_t off = 0; off < len; off++) { if (!last && off >= limit) { break; } - uint64_t tgt = 0; - const size_t ilen = decode_rel32(data + off, len - off, base + off, &tgt); - if (ilen && tgt == c->target) { - if (c->out && c->n < c->max) { c->out[c->n] = base + off; } - c->n++; - } + const uint64_t va = base + off; + x86_insn in; + const int ilen = x86_decode(data + off, len - off, &in); + if (ilen <= 0) { continue; } + const int hit = (in.has_rel && x86_branch_target(va, &in) == c->target) || + (in.has_riprel && x86_riprel_target(va, &in) == c->target); + if (!hit) { continue; } + if (va < c->cover) { continue; } /* interior alias of a prior hit */ + c->cover = va + (uint64_t)ilen; + if (c->out && c->n < c->max) { c->out[c->n] = va; } + c->n++; } return 0; } diff --git a/src/handlers/x86dec.c b/src/handlers/x86dec.c new file mode 100644 index 0000000..2302fb8 --- /dev/null +++ b/src/handlers/x86dec.c @@ -0,0 +1,454 @@ +/* x86dec.c - light x86-64 length decoder (see x86dec.h). + * + * Length-only + control-flow / RIP-relative target extraction over a raw byte + * buffer. 64-bit mode. Declarative: per-opcode properties live in static const + * tables (one per opcode map); the decode loop reads them, it does not branch + * per opcode. No globals, no allocations - all state is on the stack. + * + * Boundary: includes ONLY x86dec.h (which pulls <stddef.h> / <stdint.h>). It + * names no other module and no OS object. + * + * Per-opcode property byte (OP_*): + * MODRM - opcode carries a ModRM byte (then maybe SIB / disp / RIP-rel) + * immediate class (low 3 bits) - how many immediate bytes follow the operand + * encoding, resolved against the effective operand size: + * IM_0 none + * IM_8 1 byte + * IM_16 2 bytes + * IM_32 4 bytes + * IM_Z 2 if 66-prefix else 4 (word/dword immediate, never 8) + * IM_V 8 if REX.W, else 2 if 66, else 4 (word/dword/qword immediate; mov r,imm) + * IM_P far ptr: IM_Z + 2 (seg) - legacy, unused in 64-bit but length-safe + * BAD - not decoded (e.g. 
EVEX prefix 0x62); forces len=0 + */ +#include "x86dec.h" + +/* ---- property-byte layout ------------------------------------------------ */ + +#define OP_MODRM 0x80u /* opcode has a ModRM byte */ +#define OP_BAD 0x40u /* undecodable opcode (forces len=0) */ +#define OP_IMASK 0x07u /* immediate-class field (low 3 bits) */ + +enum { + IM_0 = 0, /* no immediate */ + IM_8, /* imm8 */ + IM_16, /* imm16 */ + IM_32, /* imm32 */ + IM_Z, /* imm16 if 66 else imm32 (never qword) */ + IM_V, /* imm16 if 66, imm64 if REX.W, else imm32 */ + IM_P /* far pointer: IM_Z + 2 (legacy; length only) */ +}; + +#define M OP_MODRM +#define B OP_BAD + +/* ---- one-byte opcode map (no 0F prefix) ---------------------------------- * + * Indexed by the opcode byte. Control-flow opcodes (E8/E9/EB/70-7F/C2/C3/CA/CB/ + * CC/FF) get their immediate size from this table too; their flow class is + * resolved separately in classify_one(). */ +static const uint8_t OP1[256] = { + /* 00 */ M, M, M, M, IM_8, IM_Z, 0, 0, /* ADD; 06 PUSH ES,07 POP ES */ + /* 08 */ M, M, M, M, IM_8, IM_Z, 0, B, /* OR; 0F is escape (handled) */ + /* 10 */ M, M, M, M, IM_8, IM_Z, 0, 0, /* ADC */ + /* 18 */ M, M, M, M, IM_8, IM_Z, 0, 0, /* SBB */ + /* 20 */ M, M, M, M, IM_8, IM_Z, 0, 0, /* AND (26 seg = prefix) */ + /* 28 */ M, M, M, M, IM_8, IM_Z, 0, 0, /* SUB (2E seg = prefix) */ + /* 30 */ M, M, M, M, IM_8, IM_Z, 0, 0, /* XOR (36 seg = prefix) */ + /* 38 */ M, M, M, M, IM_8, IM_Z, 0, 0, /* CMP (3E seg = prefix) */ + /* 40 */ 0, 0, 0, 0, 0, 0, 0, 0, /* REX (prefix, handled before) */ + /* 48 */ 0, 0, 0, 0, 0, 0, 0, 0, /* REX */ + /* 50 */ 0, 0, 0, 0, 0, 0, 0, 0, /* PUSH r */ + /* 58 */ 0, 0, 0, 0, 0, 0, 0, 0, /* POP r */ + /* 60 */ 0, 0, M, M, 0, 0, 0, 0, /* 62=EVEX (BAD, special-cased) */ + /* 68 */ IM_Z, M|IM_Z, IM_8, M|IM_8, 0, 0, 0, 0, /* PUSH imm; IMUL; INS/OUTS */ + /* 70 */ IM_8, IM_8, IM_8, IM_8, IM_8, IM_8, IM_8, IM_8, /* Jcc rel8 */ + /* 78 */ IM_8, IM_8, IM_8, IM_8, IM_8, IM_8, IM_8, IM_8, /* Jcc rel8 */ + /* 80 
*/ M|IM_8, M|IM_Z, M|IM_8, M|IM_8, M, M, M, M, /* grp1; TEST; XCHG */ + /* 88 */ M, M, M, M, M, M, M, M, /* MOV; LEA(8D); MOV sreg; POP */ + /* 90 */ 0, 0, 0, 0, 0, 0, 0, 0, /* NOP/XCHG */ + /* 98 */ 0, 0, IM_P, 0, 0, 0, 0, 0, /* CWDE..; 9A far CALL (legacy) */ + /* A0 */ IM_V, IM_V, IM_V, IM_V, 0, 0, 0, 0, /* MOV moffs (addr-size=64) */ + /* A8 */ IM_8, IM_Z, 0, 0, 0, 0, 0, 0, /* TEST AL/eAX; STOS/LODS/SCAS */ + /* B0 */ IM_8, IM_8, IM_8, IM_8, IM_8, IM_8, IM_8, IM_8, /* MOV r8,imm8 */ + /* B8 */ IM_V, IM_V, IM_V, IM_V, IM_V, IM_V, IM_V, IM_V, /* MOV r,imm(v) */ + /* C0 */ M|IM_8, M|IM_8, IM_16, 0, M, M, M|IM_8, M|IM_Z, /* shift; RET imm16/RET; LES/LDS; MOV imm */ + /* C8 */ IM_16|0, 0, IM_16, 0, 0, IM_8, 0, 0, /* C8 ENTER(imm16+imm8) special below; CA RET far imm16; CB; CD int imm8 */ + /* D0 */ M, M, M, M, IM_8, IM_8, 0, 0, /* shift grp2; AAM/AAD imm8; XLAT */ + /* D8 */ M, M, M, M, M, M, M, M, /* x87 ESC (ModRM) */ + /* E0 */ IM_8, IM_8, IM_8, IM_8, IM_8, IM_8, IM_8, IM_8, /* LOOP/JCXZ/IN/OUT rel8/imm8 */ + /* E8 */ IM_Z, IM_Z, IM_P, IM_8, 0, 0, 0, 0, /* E8 CALL rel32; E9 JMP rel32; EA far; EB JMP rel8 */ + /* F0 */ 0, B, 0, 0, 0, 0, M, M, /* F0 LOCK(prefix); F1 ICEBP=BAD; F6/F7 grp3 (imm via ext) */ + /* F8 */ 0, 0, 0, 0, 0, 0, M, M /* flags; FE/FF grp */ +}; + +/* ENTER (0xC8) takes imm16 + imm8 = 3 immediate bytes; the table cannot encode + * that combination, so it is added explicitly in decode. */ + +/* ---- 0F two-byte opcode map ---------------------------------------------- * + * Escapes 0F38 / 0F3A are handled before this table (opcodes 0x38 / 0x3A). The + * 0F 80..8F range is Jcc rel32 (immediate IM_Z) - flow set in classify. 
*/ +static const uint8_t OP2[256] = { + /* 00 */ M, M, M, M, B, 0, 0, 0, /* grp6/grp7; 04 invalid */ + /* 08 */ 0, 0, B, B, B, M, B, B, /* WBINVD; UD2; prefetch(0D) */ + /* 10 */ M, M, M, M, M, M, M, M, /* SSE mov* */ + /* 18 */ M, M, M, M, M, M, M, M, /* hint-NOP / prefetch (ModRM) */ + /* 20 */ M, M, M, M, B, B, B, B, /* MOV cr/dr (ModRM) */ + /* 28 */ M, M, M, M, M, M, M, M, /* SSE */ + /* 30 */ 0, 0, 0, 0, 0, 0, B, 0, /* WRMSR/RDTSC/RDMSR/RDPMC */ + /* 38 */ B, B, B, B, B, B, B, B, /* escapes (38/3A done earlier) */ + /* 40 */ M, M, M, M, M, M, M, M, /* CMOVcc */ + /* 48 */ M, M, M, M, M, M, M, M, /* CMOVcc */ + /* 50 */ M, M, M, M, M, M, M, M, /* SSE */ + /* 58 */ M, M, M, M, M, M, M, M, /* SSE */ + /* 60 */ M, M, M, M, M, M, M, M, /* MMX/SSE */ + /* 68 */ M, M, M, M, M, M, M, M, /* MMX/SSE; 6E/6F mov */ + /* 70 */ M|IM_8, M, M, M, M, M, M, 0, /* PSHUF imm8; grp shifts; EMMS */ + /* 78 */ M, M, B, B, M, M, M, M, /* VMREAD/WRITE; SSE */ + /* 80 */ IM_Z, IM_Z, IM_Z, IM_Z, IM_Z, IM_Z, IM_Z, IM_Z, /* Jcc rel32 */ + /* 88 */ IM_Z, IM_Z, IM_Z, IM_Z, IM_Z, IM_Z, IM_Z, IM_Z, /* Jcc rel32 */ + /* 90 */ M, M, M, M, M, M, M, M, /* SETcc */ + /* 98 */ M, M, M, M, M, M, M, M, /* SETcc */ + /* A0 */ 0, 0, 0, M, M|IM_8, M, B, B, /* PUSH/POP FS; CPUID; BT; SHLD */ + /* A8 */ 0, 0, 0, M, M|IM_8, M, B, M, /* PUSH/POP GS; RSM; BTS; SHRD; IMUL */ + /* B0 */ M, M, M, M, M, M, M, M, /* CMPXCHG; LSS/LFS/LGS; MOVZX */ + /* B8 */ M, B, M|IM_8, M, M, M, M, M, /* POPCNT; grp8 BT imm8; BSF/BSR */ + /* C0 */ M, M, M|IM_8, M, M|IM_8, M|IM_8, M|IM_8, M, /* XADD; CMPPS imm8; pinsr/extr; grp9 */ + /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0, /* BSWAP */ + /* D0 */ M, M, M, M, M, M, M, M, /* SSE/MMX */ + /* D8 */ M, M, M, M, M, M, M, M, /* SSE/MMX */ + /* E0 */ M, M, M, M, M, M, M, M, /* SSE/MMX */ + /* E8 */ M, M, M, M, M, M, M, M, /* SSE/MMX */ + /* F0 */ M, M, M, M, M, M, M, M, /* SSE/MMX */ + /* F8 */ M, M, M, M, M, M, M, B /* SSE/MMX; FF invalid */ +}; + +/* ---- decode state 
-------------------------------------------------------- */ + +typedef struct { + int rex_w; /* REX.W set (1) - selects 64-bit operand size */ + int pfx66; /* 0x66 operand-size override present */ +} dstate; + +/* immediate byte count for an immediate class under the effective op-size. */ +static size_t imm_bytes(unsigned imclass, const dstate* st) { + switch (imclass) { + case IM_0: { return 0; } + case IM_8: { return 1; } + case IM_16: { return 2; } + case IM_32: { return 4; } + case IM_Z: { return st->pfx66 ? 2u : 4u; } + case IM_V: { return st->rex_w ? 8u : (st->pfx66 ? 2u : 4u); } + case IM_P: { return (st->pfx66 ? 2u : 4u) + 2u; } + default: { return 0; } + } +} + +/* Decode a ModRM (and any SIB / displacement). `p` points at the ModRM byte, + * `avail` is the bytes remaining from there. On success returns the number of + * bytes consumed (ModRM + SIB + disp) and, for a RIP-relative operand (64-bit + * mod=00 rm=101), sets *has_rip and *rip. Returns 0 if it would run past avail. + * Address-size (67) does not change the ModRM/SIB/disp32 byte layout in long + * mode, so it is not consulted here. 
*/ +__attribute__((hot)) +static size_t decode_modrm(const uint8_t* p, size_t avail, + int* has_rip, int32_t* rip) { + if (avail < 1) { return 0; } + const uint8_t modrm = p[0]; + const unsigned mod = (modrm >> 6) & 3u; + const unsigned rm = modrm & 7u; + size_t n = 1; /* the ModRM byte itself */ + + if (mod == 3u) { return n; } /* register direct: no mem */ + + size_t disp = 0; + int has_sib = 0; + if (rm == 4u) { /* SIB follows */ + has_sib = 1; + if (avail < n + 1) { return 0; } + const uint8_t sib = p[n]; + n += 1; + /* base==101 with mod==00 means disp32 (no base register) */ + if (mod == 0u && (sib & 7u) == 5u) { disp = 4; } + } + + if (mod == 0u) { + if (rm == 5u) { /* RIP-relative disp32 (64-bit) */ + if (avail < n + 4) { return 0; } + int32_t d; + d = (int32_t)((uint32_t)p[n] | ((uint32_t)p[n + 1] << 8) | + ((uint32_t)p[n + 2] << 16) | ((uint32_t)p[n + 3] << 24)); + if (has_rip) { *has_rip = 1; } + if (rip) { *rip = d; } + n += 4; + return n; + } + /* mod=00, rm in {SIB special above handled disp}; else no disp */ + } else if (mod == 1u) { + disp = 1; /* disp8 */ + } else { /* mod == 2 */ + disp = 4; /* disp32 */ + } + (void)has_sib; + + if (disp) { + if (avail < n + disp) { return 0; } + n += disp; + } + return n; +} + +/* ---- VEX (C4 3-byte / C5 2-byte) ----------------------------------------- * + * VEX-encoded instructions carry a ModRM and (rarely) an imm8. We treat them + * as: [VEX bytes] [opcode] [ModRM(+SIB+disp)] [imm8?]. The map (mmmmm in C4) + * selects 0F / 0F38 / 0F3A; we always assume a ModRM follows the opcode (true + * for the VEX-encoded SSE/AVX space) and add the imm8 for the 0F3A map (which + * is the imm8 map) - this is length-correct for the common AVX encodings the + * decoder needs to step over. VEX.W does not change this length. 
*/ +__attribute__((hot)) +static int decode_vex(const uint8_t* code, size_t avail, x86_insn* out) { + size_t n; + unsigned mmmmm; + + if (code[0] == 0xC5u) { /* 2-byte VEX */ + if (avail < 2) { return 0; } + n = 2; /* C5 + byte1 */ + mmmmm = 1u; /* implied 0F map */ + } else { /* 0xC4: 3-byte VEX */ + if (avail < 3) { return 0; } + mmmmm = code[1] & 0x1fu; /* 1=0F, 2=0F38, 3=0F3A */ + n = 3; /* C4 + byte1 + byte2 */ + } + + if (avail < n + 1) { return 0; } /* need an opcode byte */ + n += 1; /* the opcode byte */ + + int rip_present = 0; + int32_t rip = 0; + const size_t m = decode_modrm(code + n, avail - n, &rip_present, &rip); + if (m == 0) { return 0; } + n += m; + + /* 0F3A map is the imm8 map: every opcode carries a trailing imm8. */ + if (mmmmm == 3u) { + if (avail < n + 1) { return 0; } + n += 1; + } + + if (n < 1 || n > 15) { return 0; } + out->len = (uint8_t)n; + out->flow = X86_OTHER; + out->has_rel = 0; + out->rel = 0; + out->has_riprel = rip_present; + out->riprel = rip; + return (int)n; +} + +/* ---- branch displacement read -------------------------------------------- * + * Read a rel8 (bytes==1) or rel32 (bytes==4) branch displacement at `p`, + * sign-extend into out->rel, and mark has_rel. */ +static void read_rel(const uint8_t* p, size_t bytes, x86_insn* out) { + if (bytes == 1) { + out->rel = (int32_t)(int8_t)p[0]; + } else { /* 4 bytes */ + out->rel = (int32_t)((uint32_t)p[0] | ((uint32_t)p[1] << 8) | + ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24)); + } + out->has_rel = 1; +} + +/* ---- main decode --------------------------------------------------------- */ + +__attribute__((hot)) +int x86_decode(const uint8_t* code, size_t avail, x86_insn* out) { + if (!code || !out || avail == 0) { + if (out) { + out->len = 0; out->flow = X86_OTHER; + out->has_rel = 0; out->rel = 0; + out->has_riprel = 0; out->riprel = 0; + } + return 0; + } + + /* zero the result; on any failure we leave len=0. 
*/ + out->len = 0; out->flow = X86_OTHER; + out->has_rel = 0; out->rel = 0; + out->has_riprel = 0; out->riprel = 0; + + const size_t cap = avail < 15u ? avail : 15u; /* never decode past 15 */ + size_t n = 0; + dstate st = { 0, 0 }; + + /* ---- legacy prefixes (66/67/F0/F2/F3/segment) ---- */ + for (; n < cap; n++) { + const uint8_t b = code[n]; + if (b == 0x66u) { st.pfx66 = 1; continue; } + if (b == 0x67u) { continue; } /* address-size: no length effect */ + if (b == 0xF0u || b == 0xF2u || b == 0xF3u) { continue; } /* lock/rep */ + if (b == 0x2Eu || b == 0x36u || b == 0x3Eu || b == 0x26u || /* seg */ + b == 0x64u || b == 0x65u) { continue; } + break; + } + if (n >= cap) { return 0; } /* prefixes only / out of bytes */ + + /* ---- VEX (C4/C5) must precede REX and is mutually exclusive with it ---- */ + if (code[n] == 0xC5u || code[n] == 0xC4u) { + const int r = decode_vex(code + n, cap - n, out); + if (r == 0) { out->len = 0; return 0; } + const size_t total = n + (size_t)r; + if (total < 1 || total > 15 || total > avail) { out->len = 0; return 0; } + out->len = (uint8_t)total; + return (int)total; + } + + /* ---- EVEX (0x62) is a documented gap: undecodable ---- */ + if (code[n] == 0x62u) { return 0; } + + /* ---- REX prefix (0x40..0x4F): must be the last prefix ---- */ + if ((code[n] & 0xF0u) == 0x40u) { + st.rex_w = (code[n] & 0x08u) ? 1 : 0; + n += 1; + if (n >= cap) { return 0; } + } + + /* ---- opcode: 1-byte, or 0F (two-byte / 0F38 / 0F3A) ---- */ + uint8_t op = code[n]; + const uint8_t* tbl = OP1; + int two_byte = 0; + n += 1; + + if (op == 0x0Fu) { + if (n >= cap) { return 0; } + op = code[n]; + n += 1; + two_byte = 1; + if (op == 0x38u) { /* 0F38 map: all ModRM, no imm */ + if (n >= cap) { return 0; } + op = code[n]; + n += 1; + /* every 0F38 opcode has a ModRM and no immediate. 
*/ + int rip_present = 0; int32_t rip = 0; + const size_t m = decode_modrm(code + n, cap - n, + &rip_present, &rip); + if (m == 0) { return 0; } + n += m; + out->has_riprel = rip_present; out->riprel = rip; + if (n < 1 || n > 15 || n > avail) { return 0; } + out->len = (uint8_t)n; + return (int)n; + } + if (op == 0x3Au) { /* 0F3A map: ModRM + imm8 */ + if (n >= cap) { return 0; } + op = code[n]; + n += 1; + int rip_present = 0; int32_t rip = 0; + const size_t m = decode_modrm(code + n, cap - n, + &rip_present, &rip); + if (m == 0) { return 0; } + n += m; + if (n >= cap) { return 0; } /* trailing imm8 */ + n += 1; + out->has_riprel = rip_present; out->riprel = rip; + if (n < 1 || n > 15 || n > avail) { return 0; } + out->len = (uint8_t)n; + return (int)n; + } + tbl = OP2; + } + + const uint8_t prop = tbl[op]; + if (prop & OP_BAD) { return 0; } + + /* ---- ModRM (+SIB+disp), if any ---- */ + int have_modrm = 0; + uint8_t modrm = 0; /* captured for grp5 classify */ + if (prop & OP_MODRM) { + if (n >= cap) { return 0; } + modrm = code[n]; + have_modrm = 1; + int rip_present = 0; int32_t rip = 0; + const size_t m = decode_modrm(code + n, cap - n, &rip_present, &rip); + if (m == 0) { return 0; } + n += m; + out->has_riprel = rip_present; out->riprel = rip; + } + + /* ---- immediate ---- */ + size_t im = imm_bytes(prop & OP_IMASK, &st); + + /* opcodes the table cannot fully encode (combined immediates) ---- */ + if (!two_byte && op == 0xC8u) { im = 3; } /* ENTER imm16, imm8 */ + + /* grp3 F6/F7: only the TEST sub-opcode (reg field /0 or /1) takes an + * immediate (imm8 for F6, immZ for F7). NOT/NEG/MUL/IMUL/DIV/IDIV (/2../7) + * take none. The table marks F6/F7 as ModRM-only; resolve the immediate + * here from the captured ModRM reg field. */ + if (!two_byte && (op == 0xF6u || op == 0xF7u) && have_modrm) { + const unsigned reg = (modrm >> 3) & 7u; + if (reg <= 1u) { im = (op == 0xF6u) ? 1u : (st.pfx66 ? 
2u : 4u); } + } + + if (im) { + if (cap < n + im) { return 0; } + n += im; + } + + if (n < 1 || n > 15 || n > avail) { return 0; } + + /* ---- control-flow classification (near forms) ---- + * `im` is the immediate byte count; a rel branch's displacement is the + * last `im` bytes of the instruction (code + n - im). */ + if (!two_byte) { + switch (op) { + case 0xE8u: { /* CALL rel32 */ + out->flow = X86_CALL; + read_rel(code + (n - im), im, out); + break; + } + case 0xE9u: case 0xEBu: { /* JMP rel32 / rel8 */ + out->flow = X86_JMP; + read_rel(code + (n - im), im, out); + break; + } + case 0xC2u: case 0xC3u: case 0xCAu: case 0xCBu: { /* RET forms */ + out->flow = X86_RET; + break; + } + case 0xCCu: { /* INT3 */ + out->flow = X86_INT3; + break; + } + case 0xFFu: { /* grp5: /2 CALL, /4 JMP (ind) */ + if (have_modrm) { + const unsigned reg = (modrm >> 3) & 7u; + if (reg == 2u || reg == 3u) { out->flow = X86_CALL; } + else if (reg == 4u || reg == 5u) { out->flow = X86_JMP; } + } + break; /* indirect: has_rel stays 0 */ + } + default: { /* 70..7F Jcc rel8 */ + if (op >= 0x70u && op <= 0x7Fu) { + out->flow = X86_JCC; + read_rel(code + (n - im), im, out); + } + break; + } + } + } else if (op >= 0x80u && op <= 0x8Fu) { /* 0F 80..8F Jcc rel32 */ + out->flow = X86_JCC; + read_rel(code + (n - im), im, out); + } + + out->len = (uint8_t)n; + return (int)n; +} + +uint64_t x86_branch_target(uint64_t ip, const x86_insn* insn) { + if (!insn || !insn->has_rel) { return 0; } + return ip + (uint64_t)insn->len + (uint64_t)(int64_t)insn->rel; +} + +uint64_t x86_riprel_target(uint64_t ip, const x86_insn* insn) { + if (!insn || !insn->has_riprel) { return 0; } + return ip + (uint64_t)insn->len + (uint64_t)(int64_t)insn->riprel; +}