""" This script employs several sophisticated techniques to ensure the quality and correctness of the generated tests. # Instruction Parsing The script begins by parsing the `arm32.inc` file. It uses a regular expression to find all occurrences of the `INST()` macro and extracts three key pieces of information for each instruction: 1. **Mnemonic**: A short, unique identifier (e.g., `ADD_imm`). 2. **Name**: A human-readable description (e.g., `"ADD (imm)"`). 3. **Bitstring**: A 32-character string representing the instruction's binary encoding. The bitstring is the most critical piece. It's a mix of `'0'`, `'1'`, and wildcard characters (like `v`, `n`, `c`) that represent variable fields. # Randomized Instantiation & Constraint System To create a concrete test case from an abstract bitstring, the script must generate a valid 32-bit integer. It does this by: 1. Setting the fixed `'0'` and `'1'` bits. 2. Randomly generating `'0'` or `'1'` for all wildcard bits. However, simple randomization can be problematic. Certain ARM instructions are specializations of more general patterns. For example, the `SXTB` instruction is a special case of the `SXTAB` instruction where the `Rn` register field is `1111`. If we randomly generate an `SXTAB` test case where `Rn` happens to be `1111`, the decoder might (correctly) identify it as `SXTB`. This would cause the `SXTAB` test to fail. To prevent this, the script uses a **constraint system** (`get_instruction_constraints`). This function defines rules to avoid generating ambiguous encodings. When generating a test for a general instruction (`SXTAB`), it forces the specialized bits (`Rn`) to be a value other than the one that would cause it to alias to the more specific instruction (`SXTB`). This ensures each test validates exa ctly one unique instruction definition. # Oracle-Based Negative Testing The most powerful feature of this script is its **negative verification** strategy. For every fixed `'0'` or `'1'` bit in an instruction's bitstring, the script generates a test case where that single bit is flipped. This creates an instruction that is intentionally invalid *for that specific pattern*. The script then uses an "oracle" — a Python-based reference decoder (`python_decode`) — to predict what this corrupted instruction *should* decode to. The corrupted value might match a different valid instruction, or it might be completely invalid (decode to `NULL`). The generated C++ test then asserts that the actual C++ decoder's output exactly matches the oracle's prediction. This guarantees that the decoder rejects invalid patterns that are only one bit off from a valid one. # Fuzz Testing Finally, the script generates a fuzz test that feeds a large number (100,000) of completely random 32-bit integers to the decoder. This test serves as a stability and integrity check. If the decoder identifies any of these random inputs as a valid instruction, it cross-verifies that the input truly matches the mask and expected value for that instruction. This ensures the decoder never produces "false positives" and is robust against arbitrary data. """ #!/usr/bin/env python3 import re import sys import argparse import random from typing import List, Dict, Optional, Tuple CPP_HEADER = """/* * GENERATED FILE - DO NOT EDIT * * This file is generated by scripts/generate_jit_decoder_tests.py * * PURPOSE: * Provides 100% requirements-based test coverage for the ARM32 Instruction Decoder. */ #include "jit/frontend/decoder/arm32.h" #include #include class Arm32DecoderGeneratedTest : public ::testing::Test { protected: void SetUp() override { } }; """ class Instruction: """A container for a parsed instruction definition.""" def __init__( self, mnemonic: str, name: str, bitstring: str, val: int, mask: int, expected: int, ): self.mnemonic: str = mnemonic self.name: str = name self.bitstring: str = bitstring self.val: int = val # A randomly generated valid encoding self.mask: int = mask self.expected: int = expected def get_instruction_constraints(name: str) -> Dict[int, int]: """ Returns a dictionary of {bit_index: value} to force specific instructions to avoid generating encodings that belong to other, more specific instructions. This prevents instruction aliasing during randomized test case generation. Args: name: The human-readable name of the instruction (e.g., "SXTAB"). Returns: A dictionary mapping bit positions (31-0) to a required value (0 or 1). """ constraints: Dict[int, int] = {} # ------------------------------------------------------------------------- # Load/Store & Coprocessor Collisions # ------------------------------------------------------------------------- # To ensure LDC doesn't look like MRRC, we force P=1 (Bit 24). MRRC requires bit 24=0. if name in ["LDC", "LDC2", "STC", "STC2"]: constraints[24] = 1 # Unprivileged Loads (LDRT/STRT) become aliases if P=0, W=1. # Force P=1 for standard loads to avoid these aliases. if name in [ "LDR (reg)", "LDRB (reg)", "LDRH (reg)", "LDRSB (reg)", "LDRSH (reg)", "STR (reg)", "STRB (reg)", "STRH (reg)", "LDRD (reg)", "STRD (reg)", ]: constraints[24] = 1 if name in ["LDR (imm)", "LDRB (imm)", "STR (imm)", "STRB (imm)"]: constraints[24] = 1 # ------------------------------------------------------------------------- # Extend Instructions (e.g., SXTB/SXTAB) # ------------------------------------------------------------------------- # SXTB is SXTAB with Rn=1111 (bits 19-16). # When testing the more generic SXTAB, ensure Rn is not 1111. We force 0. if name in ["SXTAB", "SXTAB16", "SXTAH", "UXTAB", "UXTAB16", "UXTAH"]: constraints[19] = 0 constraints[18] = 0 constraints[17] = 0 constraints[16] = 0 # ------------------------------------------------------------------------- # Multiply Instructions # ------------------------------------------------------------------------- # SMMUL is SMMLA with Ra=1111. Force Ra!=1111 for SMMLA tests. if name in ["SMMLA", "SMMLS"]: constraints[15] = 0 constraints[14] = 0 constraints[13] = 0 constraints[12] = 0 # SMUAD is SMLAD with Ra=1111. Force Ra!=1111 for SMLAD-family tests. if name in ["SMLAD", "SMLSD", "SMLALD", "SMLSLD"]: constraints[15] = 0 constraints[14] = 0 constraints[13] = 0 constraints[12] = 0 return constraints def calculate_mask_and_expected(bitstring: str) -> Tuple[int, int]: """ Calculates the mask and expected value from a bitstring pattern. - '1' sets the bit in both mask and expected. - '0' sets the bit in the mask only. - Wildcards leave the bit as 0 in both. Args: bitstring: The 32-character instruction pattern. Returns: A tuple containing the (mask, expected) integer values. """ mask: int = 0 expected: int = 0 for i, char in enumerate(bitstring): bit_pos = 31 - i if char == "0": mask |= 1 << bit_pos elif char == "1": mask |= 1 << bit_pos expected |= 1 << bit_pos return mask, expected def parse_bitstring_randomized(name: str, bitstring: str) -> int: """ Generates a concrete, valid instruction word from a bitstring pattern. Applies constraints to avoid generating ambiguous instruction aliases. Args: name: The human-readable name of the instruction. bitstring: The 32-character instruction pattern. Returns: A 32-bit integer representing a valid encoding of the instruction. """ val: int = 0 if len(bitstring) != 32: raise ValueError(f"Invalid bitstring length: {len(bitstring)}") constraints = get_instruction_constraints(name) for i, char in enumerate(bitstring): bit_pos = 31 - i # Apply constraints first if they exist for this bit if bit_pos in constraints: if constraints[bit_pos] == 1: val |= 1 << bit_pos continue # Set fixed bits or randomize wildcard bits if char == "1": val |= 1 << bit_pos elif char not in ("0", "1"): if random.choice([True, False]): val |= 1 << bit_pos return val def parse_inc_file(input_path: str) -> List[Instruction]: """ Parses an arm32.inc file and returns a list of Instruction objects. Args: input_path: The path to the arm32.inc file. Returns: A list of Instruction objects, one for each INST macro found. """ instructions: List[Instruction] = [] regex = re.compile(r'INST\(\s*([A-Za-z0-9_]+),\s*"(.*?)",\s*"(.*?)"\s*\)') try: with open(input_path, "r") as f: lines = f.readlines() except FileNotFoundError: print(f"Error: Could not find input file: {input_path}") sys.exit(1) for line in lines: line = line.strip() if not line or line.startswith("//"): continue match = regex.search(line) if match: mnemonic = match.group(1) name = match.group(2) bitstring = match.group(3) val = parse_bitstring_randomized(name, bitstring) mask, expected = calculate_mask_and_expected(bitstring) # Manual Patch for MSR (imm), which has a complex, non-randomizable constraint. # The bitstring is cccc00110010mmmm1111rrrrvvvvvvvv, but if `vvvv` fields are all 0, # it becomes a different instruction. We force a non-zero immediate to ensure a valid MSR. if name == "MSR (imm)": val |= 1 << 16 # Set bit 16 of the immediate field instructions.append( Instruction(mnemonic, name, bitstring, val, mask, expected) ) return instructions def python_decode(val: int, instructions: List[Instruction]) -> Optional[str]: """ Acts as a reference decoder (oracle). Returns the name of the first instruction that matches 'val'. This simulates the linear scan of the C decoder to predict the correct result. Args: val: The 32-bit instruction word to decode. instructions: The list of instruction definitions, in order of precedence. Returns: The name of the matching instruction, or None if no match is found. """ for inst in instructions: if (val & inst.mask) == inst.expected: return inst.name return None def generate_cpp_tests(instructions: List[Instruction], output_path: str) -> None: """ Generates the C++ test file content and writes it to the output path. Args: instructions: A list of all instruction definitions. output_path: The path to write the generated .cpp file. """ with open(output_path, "w") as f: f.write(CPP_HEADER) mnemonic_counts: Dict[str, int] = {} for inst in instructions: base_mnemonic = inst.mnemonic val = inst.val name = inst.name bitstring = inst.bitstring # Generate a unique test name, handling multiple definitions for one mnemonic if base_mnemonic not in mnemonic_counts: mnemonic_counts[base_mnemonic] = 1 test_name = f"Verify_{base_mnemonic}" else: mnemonic_counts[base_mnemonic] += 1 count = mnemonic_counts[base_mnemonic] test_name = f"Verify_{base_mnemonic}_{count}" f.write(f"TEST_F(Arm32DecoderGeneratedTest, {test_name}) {{\n") # --- 1. Positive Verification --- f.write( f' // 1. Positive Verification: Ensures "{name}" is correctly identified.\n' ) f.write(f" const uint32_t valid_inst = {val:#010x};\n") f.write( f" const pvm_jit_decoder_arm32_instruction_info_t* info = pvm_jit_decoder_arm32_decode(valid_inst);\n\n" ) f.write( f' ASSERT_NE(info, nullptr) << "Failed to decode known valid pattern for {name}: {val:#x}";\n' ) f.write( f' EXPECT_STREQ(info->name, "{name}") << "Decoded as the wrong instruction variant.";\n' ) f.write( f' EXPECT_EQ((valid_inst & info->mask), info->expected) << "Mask/Expected mismatch on positive test.";\n\n' ) # --- 2. Negative Verification (Oracle Based) --- f.write( f" // 2. Negative Verification: Flip each fixed bit to ensure correct alternative decoding or rejection.\n" ) for i, char in enumerate(bitstring): bit_pos = 31 - i # We only test the fixed bits, as they define the instruction pattern. if char in ("0", "1"): mask = 1 << bit_pos corrupt_inst = val ^ mask # ORACLE: Determine what this corrupted instruction SHOULD decode to. expected_decoded_name = python_decode(corrupt_inst, instructions) f.write(f" {{\n") f.write(f" // Test case: Flipping fixed bit {bit_pos}\n") f.write( f" const uint32_t corrupt_inst = {corrupt_inst:#010x};\n" ) f.write( f" const pvm_jit_decoder_arm32_instruction_info_t* neg_info = pvm_jit_decoder_arm32_decode(corrupt_inst);\n\n" ) if expected_decoded_name is None: # Should decode to NOTHING f.write(f" // Oracle predicts no match.\n") f.write( f' EXPECT_EQ(neg_info, nullptr) << "Safety Violation: Should have decoded to nullptr, but got " << (neg_info ? neg_info->name : "nullptr");\n' ) else: # Should decode to the OTHER valid instruction f.write( f" // Oracle predicts this should decode as: {expected_decoded_name}\n" ) f.write( f" ASSERT_NE(neg_info, nullptr) << \"Safety Violation: Python Oracle predicted '{expected_decoded_name}' but C++ decoder returned null\";\n" ) f.write( f' EXPECT_STREQ(neg_info->name, "{expected_decoded_name}") << "Safety Violation: Incorrect decode on single-bit corruption.";\n' ) f.write(f" }}\n") f.write(f"}}\n\n") # --- 3. Generate Fuzz Test for overall stability --- f.write("TEST_F(Arm32DecoderGeneratedTest, Stability_Fuzz_Test) {\n") f.write( " // Feeds a large number of random inputs to the decoder to check for crashes or false positives.\n" ) f.write(" std::mt19937 rng(42); // Fixed seed for deterministic runs\n") f.write(" std::uniform_int_distribution dist;\n\n") f.write(" for(int i = 0; i < 100000; ++i) {\n") f.write(" uint32_t random_inst = dist(rng);\n") f.write( " const pvm_jit_decoder_arm32_instruction_info_t* info = pvm_jit_decoder_arm32_decode(random_inst);\n" ) f.write(" if (info) {\n") f.write( " // If the decoder claims a match, it MUST be a valid match.\n" ) f.write(" ASSERT_EQ((random_inst & info->mask), info->expected) \n") f.write( ' << "Integrity Violation: Decoded " << std::hex << random_inst << " as \\"" << info->name << "\\" but mask/expected failed.";\n' ) f.write(" }\n") f.write(" }\n") f.write("}\n") def main() -> None: """Main entry point for the script.""" parser = argparse.ArgumentParser(description="Generate ARM32 Decoder Tests") parser.add_argument("input", help="Path to arm32.inc") parser.add_argument("output", help="Path to output test_arm32_generated.cpp") args = parser.parse_args() print(f"{args.input} -> {args.output}") # Use a fixed seed for deterministic test generation. This is crucial for reproducibility. random.seed(12345) instructions = parse_inc_file(args.input) generate_cpp_tests(instructions, args.output) if __name__ == "__main__": main()