mirror of
https://github.com/pound-emu/pound.git
synced 2025-12-11 07:36:57 +00:00
Renames generate_decoder_tests.py to generate_jit_decoder_tests.py. Signed-off-by: Ronald Caesar <github43132@proton.me>
454 lines
17 KiB
Python
454 lines
17 KiB
Python
"""
|
|
This script employs several sophisticated techniques to ensure the quality and
|
|
correctness of the generated tests.
|
|
|
|
|
|
# Instruction Parsing
|
|
|
|
The script begins by parsing the `arm32.inc` file. It uses a regular expression
|
|
to find all occurrences of the `INST()` macro and extracts three key pieces of
|
|
information for each instruction:
|
|
|
|
1. **Mnemonic**: A short, unique identifier (e.g., `ADD_imm`).
|
|
2. **Name**: A human-readable description (e.g., `"ADD (imm)"`).
|
|
3. **Bitstring**: A 32-character string representing the instruction's binary
|
|
encoding.
|
|
|
|
The bitstring is the most critical piece. It's a mix of `'0'`, `'1'`, and
|
|
wildcard characters (like `v`, `n`, `c`) that represent variable fields.
|
|
|
|
|
|
# Randomized Instantiation & Constraint System
|
|
|
|
To create a concrete test case from an abstract bitstring, the script must
|
|
generate a valid 32-bit integer. It does this by:
|
|
|
|
1. Setting the fixed `'0'` and `'1'` bits.
|
|
2. Randomly generating `'0'` or `'1'` for all wildcard bits.
|
|
|
|
However, simple randomization can be problematic. Certain ARM instructions
|
|
are specializations of more general patterns. For example, the `SXTB`
|
|
instruction is a special case of the `SXTAB` instruction where the `Rn`
|
|
register field is `1111`.
|
|
|
|
If we randomly generate an `SXTAB` test case where `Rn` happens to be `1111`,
|
|
the decoder might (correctly) identify it as `SXTB`. This would cause the
|
|
`SXTAB` test to fail.
|
|
|
|
To prevent this, the script uses a **constraint system**
|
|
(`get_instruction_constraints`). This function defines rules to avoid
|
|
generating ambiguous encodings. When generating a test for a general
|
|
instruction (`SXTAB`), it forces the specialized bits (`Rn`) to be a value other
|
|
than the one that would cause it to alias to the more specific instruction
|
|
(`SXTB`). This ensures each test validates exactly one unique instruction
|
|
definition.
|
|
|
|
|
|
# Oracle-Based Negative Testing
|
|
|
|
The most powerful feature of this script is its **negative verification**
|
|
strategy. For every fixed `'0'` or `'1'` bit in an instruction's bitstring,
|
|
the script generates a test case where that single bit is flipped.
|
|
This creates an instruction that is intentionally invalid *for that specific
|
|
pattern*.
|
|
|
|
The script then uses an "oracle" — a Python-based reference decoder
|
|
(`python_decode`) — to predict what this corrupted instruction *should*
|
|
decode to. The corrupted value might match a different valid instruction, or
|
|
it might be completely invalid (decode to `NULL`).
|
|
|
|
The generated C++ test then asserts that the actual C++ decoder's output
|
|
exactly matches the oracle's prediction. This guarantees that the decoder
|
|
rejects invalid patterns that are only one bit off from a valid one.
|
|
|
|
|
|
# Fuzz Testing
|
|
|
|
Finally, the script generates a fuzz test that feeds a large number (100,000)
|
|
of completely random 32-bit integers to the decoder. This test serves as a
|
|
stability and integrity check. If the decoder identifies any of these random
|
|
inputs as a valid instruction, it cross-verifies that the input truly matches
|
|
the mask and expected value for that instruction. This ensures the decoder
|
|
never produces "false positives" and is robust against arbitrary data.
|
|
"""
|
|
|
|
#!/usr/bin/env python3
|
|
import re
|
|
import sys
|
|
import argparse
|
|
import random
|
|
from typing import List, Dict, Optional, Tuple
|
|
|
|
CPP_HEADER = """/*
|
|
* GENERATED FILE - DO NOT EDIT
|
|
*
|
|
* This file is generated by scripts/generate_jit_decoder_tests.py
|
|
*
|
|
* PURPOSE:
|
|
* Provides 100% requirements-based test coverage for the ARM32 Instruction Decoder.
|
|
*/
|
|
|
|
#include "jit/frontend/decoder/arm32.h"
|
|
#include <gtest/gtest.h>
|
|
#include <random>
|
|
|
|
class Arm32DecoderGeneratedTest : public ::testing::Test {
|
|
protected:
|
|
void SetUp() override {
|
|
}
|
|
};
|
|
"""
|
|
|
|
|
|
class Instruction:
    """
    Plain record describing one parsed instruction definition.

    The class carries data only; it defines no behavior beyond storing the
    constructor arguments under attributes of the same name.
    """

    def __init__(
        self,
        mnemonic: str,
        name: str,
        bitstring: str,
        val: int,
        mask: int,
        expected: int,
    ):
        # Identification: short identifier and human-readable name.
        self.mnemonic, self.name = mnemonic, name
        # The encoding pattern the instruction was defined with.
        self.bitstring = bitstring
        # A randomly generated valid encoding of the pattern.
        self.val = val
        # Matching pair: a word `w` matches when (w & mask) == expected.
        self.mask, self.expected = mask, expected
|
|
|
|
|
|
def get_instruction_constraints(name: str) -> Dict[int, int]:
    """
    Look up the bits that must be pinned when randomizing an instruction.

    Some general patterns overlap with more specific instructions for certain
    wildcard values. The returned mapping forces those wildcard bits to a
    value that keeps the randomized encoding from aliasing the specific
    instruction.

    Args:
        name: The human-readable name of the instruction (e.g., "SXTAB").

    Returns:
        A new dictionary mapping bit positions (31-0) to the required value
        (0 or 1). Empty when the instruction has no constraints.
    """
    # Each rule: (instruction names, bit positions to pin, pinned value).
    rules = (
        # Load/Store & Coprocessor collisions.
        # MRRC requires bit 24 = 0, so force P=1 (bit 24) to keep LDC/STC
        # from looking like MRRC.
        (("LDC", "LDC2", "STC", "STC2"), (24,), 1),
        # Unprivileged loads (LDRT/STRT) become aliases if P=0, W=1; forcing
        # P=1 on the standard loads/stores avoids them.
        (
            (
                "LDR (reg)",
                "LDRB (reg)",
                "LDRH (reg)",
                "LDRSB (reg)",
                "LDRSH (reg)",
                "STR (reg)",
                "STRB (reg)",
                "STRH (reg)",
                "LDRD (reg)",
                "STRD (reg)",
            ),
            (24,),
            1,
        ),
        (("LDR (imm)", "LDRB (imm)", "STR (imm)", "STRB (imm)"), (24,), 1),
        # Extend instructions: SXTB is SXTAB with Rn=1111 (bits 19-16), so
        # the generic forms get Rn forced to 0000.
        (
            ("SXTAB", "SXTAB16", "SXTAH", "UXTAB", "UXTAB16", "UXTAH"),
            (19, 18, 17, 16),
            0,
        ),
        # Multiply instructions: SMMUL is SMMLA with Ra=1111; force Ra=0000.
        (("SMMLA", "SMMLS"), (15, 14, 13, 12), 0),
        # SMUAD is SMLAD with Ra=1111; force Ra=0000 for the SMLAD family.
        (("SMLAD", "SMLSD", "SMLALD", "SMLSLD"), (15, 14, 13, 12), 0),
    )

    pinned: Dict[int, int] = {}
    for names, bit_positions, value in rules:
        if name in names:
            for bit in bit_positions:
                pinned[bit] = value
    return pinned
|
|
|
|
|
|
def calculate_mask_and_expected(bitstring: str) -> Tuple[int, int]:
    """
    Derive the (mask, expected) matching pair of a bitstring pattern.

    The first character of the pattern corresponds to bit 31. Fixed
    characters contribute to the result; every other character is a wildcard
    and leaves its bit clear in both values:

    - '1' sets the bit in both mask and expected.
    - '0' sets the bit in the mask only.

    Args:
        bitstring: The 32-character instruction pattern.

    Returns:
        A tuple containing the (mask, expected) integer values.
    """
    mask = 0
    expected = 0
    for offset, symbol in enumerate(bitstring):
        if symbol not in ("0", "1"):
            continue  # wildcard: contributes to neither value
        bit = 1 << (31 - offset)
        mask |= bit
        if symbol == "1":
            expected |= bit
    return mask, expected
|
|
|
|
|
|
def parse_bitstring_randomized(name: str, bitstring: str) -> int:
    """
    Generates a concrete, valid instruction word from a bitstring pattern.

    Fixed bits ('0'/'1') are always copied verbatim, so the result matches the
    pattern. Wildcard bits take the value required by
    get_instruction_constraints(name) when a constraint exists for that bit
    and are randomized otherwise. A constraint that names a fixed bit is
    ignored instead of being allowed to override the pattern, which would
    produce an encoding that no longer belongs to the instruction.

    Args:
        name: The human-readable name of the instruction.
        bitstring: The 32-character instruction pattern.

    Returns:
        A 32-bit integer representing a valid encoding of the instruction.

    Raises:
        ValueError: If `bitstring` is not exactly 32 characters long.
    """
    if len(bitstring) != 32:
        raise ValueError(f"Invalid bitstring length: {len(bitstring)}")

    constraints = get_instruction_constraints(name)

    val: int = 0
    for i, char in enumerate(bitstring):
        bit_pos = 31 - i  # bitstring[0] describes the most significant bit

        # Fixed bits define the instruction and always win.
        if char == "1":
            val |= 1 << bit_pos
            continue
        if char == "0":
            continue

        # Constrained wildcard: use the pinned value. No random draw is
        # consumed here, which keeps the seeded sequence stable.
        if bit_pos in constraints:
            if constraints[bit_pos] == 1:
                val |= 1 << bit_pos
            continue

        # Free wildcard: one draw per bit. random.choice is kept on purpose so
        # that a fixed seed keeps producing the same encodings.
        if random.choice([True, False]):
            val |= 1 << bit_pos
    return val
|
|
|
|
|
|
def parse_inc_file(input_path: str) -> List[Instruction]:
    """
    Parses an arm32.inc file and returns a list of Instruction objects.

    Blank lines and lines starting with `//` are skipped. Each remaining line
    is searched for an `INST(mnemonic, "name", "bitstring")` macro; only the
    first match on a line is used. For every match a randomized valid
    encoding and the mask/expected pair are computed.

    Args:
        input_path: The path to the arm32.inc file.

    Returns:
        A list of Instruction objects, one for each INST macro found, in the
        order they appear in the file.

    Raises:
        SystemExit: With status 1 if `input_path` does not exist.
        ValueError: Propagated from parse_bitstring_randomized when a
            bitstring is not 32 characters long.
    """
    instructions: List[Instruction] = []
    regex = re.compile(r'INST\(\s*([A-Za-z0-9_]+),\s*"(.*?)",\s*"(.*?)"\s*\)')

    try:
        # Explicit encoding so parsing does not depend on the platform locale.
        with open(input_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
    except FileNotFoundError:
        # Diagnostics belong on stderr so they never mix with normal output.
        print(f"Error: Could not find input file: {input_path}", file=sys.stderr)
        sys.exit(1)

    for line in lines:
        line = line.strip()
        if not line or line.startswith("//"):
            continue

        match = regex.search(line)
        if not match:
            continue

        mnemonic, name, bitstring = match.group(1), match.group(2), match.group(3)

        val = parse_bitstring_randomized(name, bitstring)
        mask, expected = calculate_mask_and_expected(bitstring)

        # Manual patch for MSR (imm), whose constraint is not expressed in the
        # constraint table. Its bitstring is cccc00110010mmmm1111rrrrvvvvvvvv,
        # so bit 16 is the lowest bit of the `mmmm` field (bits 19-16), not
        # part of the immediate `vvvvvvvv` (bits 7-0). Forcing it to 1 keeps
        # `mmmm` non-zero; an all-zero field would alias a different
        # instruction.
        # NOTE(review): in the ARM encoding tables that space holds the hint
        # instructions - confirm against arm32.inc.
        if name == "MSR (imm)":
            val |= 1 << 16

        instructions.append(
            Instruction(mnemonic, name, bitstring, val, mask, expected)
        )
    return instructions
|
|
|
|
|
|
def python_decode(val: int, instructions: List[Instruction]) -> Optional[str]:
    """
    Reference decoder (oracle): name of the first definition matching `val`.

    Definitions are tried in list order and a definition matches when
    `(val & mask) == expected`, so earlier entries take precedence over later
    ones. This simulates the linear scan of the C decoder to predict the
    correct result.

    Args:
        val: The 32-bit instruction word to decode.
        instructions: The list of instruction definitions, in order of precedence.

    Returns:
        The name of the matching instruction, or None if no match is found.
    """
    matching_names = (
        candidate.name
        for candidate in instructions
        if (val & candidate.mask) == candidate.expected
    )
    return next(matching_names, None)
|
|
|
|
|
|
def generate_cpp_tests(instructions: List[Instruction], output_path: str) -> None:
    """
    Write the generated C++ decoder test file.

    The output starts with CPP_HEADER, continues with one TEST_F per entry of
    `instructions`, and ends with a single fuzz test. Each per-instruction
    test contains:

    1. A positive check that the instruction's `val` decodes to its `name`.
    2. One negative check per fixed ('0'/'1') bit of its bitstring: the bit is
       flipped in `val` and `python_decode` predicts what the C++ decoder has
       to return for the corrupted word (another instruction or nullptr).

    Args:
        instructions: A list of all instruction definitions; also used as the
            oracle's instruction table.
        output_path: The path to write the generated .cpp file. It is opened
            in write mode, so existing content is replaced.
    """
    with open(output_path, "w") as out:
        emit = out.write
        emit(CPP_HEADER)

        occurrences: Dict[str, int] = {}

        for inst in instructions:
            mnemonic = inst.mnemonic
            word = inst.val
            label = inst.name

            # The first definition of a mnemonic keeps the bare test name;
            # later ones get the running count appended to stay unique.
            seen = occurrences.get(mnemonic, 0) + 1
            occurrences[mnemonic] = seen
            if seen == 1:
                test_name = f"Verify_{mnemonic}"
            else:
                test_name = f"Verify_{mnemonic}_{seen}"

            emit(f"TEST_F(Arm32DecoderGeneratedTest, {test_name}) {{\n")

            # --- 1. Positive Verification ---
            emit(
                f' // 1. Positive Verification: Ensures "{label}" is correctly identified.\n'
            )
            emit(f" const uint32_t valid_inst = {word:#010x};\n")
            emit(
                " const pvm_jit_decoder_arm32_instruction_info_t* info = pvm_jit_decoder_arm32_decode(valid_inst);\n\n"
            )
            emit(
                f' ASSERT_NE(info, nullptr) << "Failed to decode known valid pattern for {label}: {word:#x}";\n'
            )
            emit(
                f' EXPECT_STREQ(info->name, "{label}") << "Decoded as the wrong instruction variant.";\n'
            )
            emit(
                ' EXPECT_EQ((valid_inst & info->mask), info->expected) << "Mask/Expected mismatch on positive test.";\n\n'
            )

            # --- 2. Negative Verification (Oracle Based) ---
            emit(
                " // 2. Negative Verification: Flip each fixed bit to ensure correct alternative decoding or rejection.\n"
            )

            for index, symbol in enumerate(inst.bitstring):
                # Only fixed bits define the pattern; wildcards are skipped.
                if symbol not in ("0", "1"):
                    continue

                position = 31 - index
                flipped = word ^ (1 << position)

                # ORACLE: what the corrupted word SHOULD decode to.
                predicted = python_decode(flipped, instructions)

                emit(" {\n")
                emit(f" // Test case: Flipping fixed bit {position}\n")
                emit(f" const uint32_t corrupt_inst = {flipped:#010x};\n")
                emit(
                    " const pvm_jit_decoder_arm32_instruction_info_t* neg_info = pvm_jit_decoder_arm32_decode(corrupt_inst);\n\n"
                )

                if predicted is not None:
                    # The corrupted word is another valid instruction.
                    emit(
                        f" // Oracle predicts this should decode as: {predicted}\n"
                    )
                    emit(
                        f" ASSERT_NE(neg_info, nullptr) << \"Safety Violation: Python Oracle predicted '{predicted}' but C++ decoder returned null\";\n"
                    )
                    emit(
                        f' EXPECT_STREQ(neg_info->name, "{predicted}") << "Safety Violation: Incorrect decode on single-bit corruption.";\n'
                    )
                else:
                    # The corrupted word must be rejected.
                    emit(" // Oracle predicts no match.\n")
                    emit(
                        ' EXPECT_EQ(neg_info, nullptr) << "Safety Violation: Should have decoded to nullptr, but got " << (neg_info ? neg_info->name : "nullptr");\n'
                    )

                emit(" }\n")

            emit("}\n\n")

        # --- 3. Fuzz test for overall stability ---
        out.writelines(
            [
                "TEST_F(Arm32DecoderGeneratedTest, Stability_Fuzz_Test) {\n",
                " // Feeds a large number of random inputs to the decoder to check for crashes or false positives.\n",
                " std::mt19937 rng(42); // Fixed seed for deterministic runs\n",
                " std::uniform_int_distribution<uint32_t> dist;\n\n",
                " for(int i = 0; i < 100000; ++i) {\n",
                " uint32_t random_inst = dist(rng);\n",
                " const pvm_jit_decoder_arm32_instruction_info_t* info = pvm_jit_decoder_arm32_decode(random_inst);\n",
                " if (info) {\n",
                " // If the decoder claims a match, it MUST be a valid match.\n",
                " ASSERT_EQ((random_inst & info->mask), info->expected) \n",
                ' << "Integrity Violation: Decoded " << std::hex << random_inst << " as \\"" << info->name << "\\" but mask/expected failed.";\n',
                " }\n",
                " }\n",
                "}\n",
            ]
        )
|
|
|
|
|
|
def main() -> None:
    """
    Command-line entry point.

    Takes two positional arguments (the arm32.inc input path and the output
    .cpp path), prints the "input -> output" mapping, seeds the random module
    and then parses the input and writes the generated tests.
    """
    cli = argparse.ArgumentParser(description="Generate ARM32 Decoder Tests")
    cli.add_argument("input", help="Path to arm32.inc")
    cli.add_argument("output", help="Path to output test_arm32_generated.cpp")
    options = cli.parse_args()

    source_path, target_path = options.input, options.output
    print(f"{source_path} -> {target_path}")

    # Seed before any encoding is randomized: a fixed seed makes the
    # generated test file reproducible from run to run.
    random.seed(12345)

    generate_cpp_tests(parse_inc_file(source_path), target_path)
|
|
|
|
|
|
# Run the generator only when this file is executed as a script; importing
# the module does not trigger main().
if __name__ == "__main__":
    main()
|