import re import os import struct import subprocess import random from collections import defaultdict from typing import List, Dict, Tuple, Set US_PATH = '../pokeemerald' def elf_funcs(elf_path: str, unknown=True) -> Dict[str, Tuple[int, int]]: # Find all functions in an ELF file p = subprocess.run(f'readelf -s -W {elf_path}', stdout=subprocess.PIPE, encoding='utf-8', shell=True) lines = p.stdout.split('\n') elf_reg = re.compile(r'\s*\d+:\s([a-fA-F0-9]{8})\s+(\d+)\s+(\w+)\s+(\w+)\s+(\w+)\s+(\w+)\s+(\w+)') symbols = {} # Map function names to (address, size tuples) for line in lines: m = elf_reg.match(line) if m and m.group(3) == 'FUNC': # Collect functions addr, size, dtype, bind, vis, ndx, name = m.groups() is_unknown = name[:4] == 'sub_' if unknown is not None and is_unknown != unknown: continue addr = int(addr, 16) size = int(size) symbols[name] = (addr-1, size) if unknown is True: unk_str = 'unknown ' elif unknown is False: unk_str = 'known ' else: unk_str = '' print(f'Found {len(symbols)} {unk_str}functions') return symbols # Builds an "instruction graph" # Maps instructions to a dictionary # That dictionary maps following instructions, to the places they follow at # So if you have: # 0 MOV r0, r1 # 2 BX lr # {(MOV r0, r1): {(BX lr): {2}}} def build_graph(rom: str, symbols: Dict[str, Tuple[int, int]]) -> Dict[int, Dict[int, Set[int]]]: graph = defaultdict(lambda: defaultdict(set)) with open(rom, 'rb') as f: count = 0 print('Graphed 0 functions', end='') for name, (addr, size) in symbols.items(): f.seek(addr & 0xfffffe) data = f.read(size) instrs = [t[0] for t in struct.iter_unpack(' Dict[int, int]: # Calculate the size of *addresses*. Crude asf with open(map_path, 'r') as f: lines = list(line for line in f) sizes = {} # Maps addresses to their sizes reg = re.compile(r'\s*(0x[\da-fA-F]+)') for l1, l2 in zip(lines[:-1], lines[1:]): m1 = reg.match(l1) if not m1: continue m2 = reg.match(l2) if not m2: continue addr1, addr2 = int(m1.group(1), 16), int(m2.group(1), 16) if not addr1 & 0xff000000 == addr2 & 0xff000000 == 0x08000000: # Only pay attention to ROM addresses continue size = addr2 - addr1 sizes[addr1] = size return sizes def asm_funcs(asm_path: str, sizes: Dict[int, int], unknown=True) -> Dict[str, Tuple[int, int]]: reg = re.compile(r'(\w+): @ (0x[\da-fA-F]+)') symbols = {} # Maps names to (address, size) tuples with open(asm_path, 'r') as f: for line in f: m = reg.match(line) if m: is_unknown = m.group(1).startswith('sub_') if unknown != is_unknown: # Only want certain functions continue addr = int(m.group(2), 16) if addr not in sizes: continue symbols[m.group(1)] = (addr, sizes[addr]) print(f'Found {len(symbols)} {"unknown" if unknown else "known"} JP functions') return symbols def match_instrs(instrs: List[int], graph): # Match a sequence of instructions matches = [] if not instrs: return [] for base in graph[None][instrs[0]]: last = instrs[0] # print(f'Try: {base:08x}') match_len = 1 for match_len, instr in enumerate(instrs[1:], 1): addr = base + 2*match_len potential = graph[last][instr] # print(f'{instr:04x} {addr:08x} {not sequence_break}') if addr not in potential: # Sequence is broken break last = instr matches.append((match_len, base)) matches.sort(reverse=True) # Get the best match if len(instrs) >= 10 and matches and len(instrs)-1 == matches[0][0]: # Perfect match if not any(t[0] == matches[0][0] for t in matches[1:]): # No other perfect matches return matches[:1] return matches def decode(instrs: List[int]): # Primitive, unfinished THUMB decoder for op in instrs: print(f'{op:04x}', end=' ') if op & 0xf800 == 0x1800: # Add/subtract opcode = (op >> 9) & 3 codes = ('ADD', 'SUB', 'ADD', 'SUB') val = (op >> 6) & 7 rs = (op >> 3) & 7 rd = op & 7 flag = 'r' if opcode < 2 else '#' print(f'{codes[opcode]} r{rd},r{rs},{flag}{val}') elif op & 0xe000 == 0: # Move shifted register opcode = (op >> 11) & 3 codes = ('LSL', 'LSR', 'ASR') offset = (op >> 6) & 31 rs = (op >> 3) & 7 rd = op & 7 print(f'{codes[opcode]} r{rd},r{rs},#{offset}') elif op & 0xe000 == 0x2000: # Add/sub immediate opcode = (op >> 11) & 3 codes = ('MOV', 'CMP', 'ADD', 'SUB') rd = (op >> 8) & 7 nn = op & 255 print(f'{codes[opcode]} r{rd},#{nn}') elif op & 0xfc00 == 0x4000: # ALU ops opcode = (op >> 6) & 15 codes = ('AND', 'EOR', 'LSL', 'LSR', 'ASR', 'ADC', 'SBC', 'ROR', 'TST', 'NEG', 'CMP', 'CMN', 'ORR', 'MUL', 'BIC', 'MVN') rs = (op >> 3) & 7 rd = op & 7 print(f'{codes[opcode]} r{rd}, r{rs}') elif op & 0xf800 == 0x4800: # Load PC-relative r = (op >> 8) & 7 print(f'LDR r{r} PC+{4*(op & 255)}') elif op & 0xf200 == 0x5200: # Load register offset opcode = (op >> 10) & 3 codes = ('STR', 'STB', 'LDR', 'LDB') ro = (op >> 6) & 7 rb = (op >> 3) & 7 rd = op & 7 print(f'{codes[opcode]} r{rd},[r{rb},r{ro}]') elif op & 0xe000 == 0x6000: # Load immediate offset opcode = (op >> 11) & 3 codes = ('STR', 'LDR', 'STB', 'LDB') nn = (op >> 6) & 31 if opcode < 2: nn *= 4 rb = (op >> 3) & 7 rd = op & 7 print(f'{codes[opcode]} r{rd},[r{rb},#{nn}]') elif op & 0xfc00 == 0x4400: # High register ops opcode = (op >> 8) & 3 codes = ('ADD', 'CMP', 'MOV', 'NOP', 'BX') MSBd = (op >> 7) & 1 MSBs = (op >> 6) & 1 rs = (op >> 3) & 7 rd = op & 7 rs |= MSBs << 3 rd += MSBd << 3 if opcode == 3: flag = 'BLX' if MSBd else 'BX' print(f'{flag} r{rs}') else: print(f'{codes[opcode]} r{rd},r{rs}') else: print() def match_iter(instrs, graph, slices=10): # Iteratively try to find a match matches = match_instrs(instrs, graph) # Try to match the entire sequence first if len(matches) == 1: # One perfect match return matches[0][1] elif not matches: # No match anywhere return None potential = {t[1] for t in matches} # All potential matches for _ in range(slices): # Slice the list randomly and search for matches start = random.randint(1, len(instrs)) end = random.randint(1, len(instrs)) if start == end: end += 1 elif end < start: start, end = end, start matches = match_instrs(instrs[start:end], graph) new_set = {t[1]-2*start for t in matches} potential = potential & new_set # Take intersection of old and new locations if not potential: return None elif len(potential) == 1: addr = potential.pop() return addr return None def match_loop(rom: str, unknown, jp_known, us_known: dict, graph) -> Dict[str, str]: # Interactive rv_known = {addr: name for name, (addr, size) in us_known.items() if name not in jp_known} print(f'Pool: {len(rv_known)}') keys = list(unknown.keys()) random.shuffle(keys) unk_exp = re.compile(r'sub_[\dA-Fa-f]+') replace: Dict[str, str] = {} # Maps old names to new names # print('At any time, type "quit" to quit and replace all correct names') match_count = 0 print('Matched 0 functions', end='', flush=True) with open(rom, 'rb') as f: for name in keys: addr, size = unknown[name] if not size: continue f.seek(addr & 0xfffffe) data = f.read(size) instrs = [t[0] for t in struct.iter_unpack('