From 5b42a7674828fd2d8d3ce23a4e516c49f647ecc9 Mon Sep 17 00:00:00 2001 From: Matthew Stanley <1379tech@gmail.com> Date: Tue, 28 Apr 2026 19:16:27 -0700 Subject: [PATCH] recomp: pattern auto-discovery for dynamic-asset slot fragments (Shape A) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds [[input.decompressed_section_pattern]] for slots where many fragments share a link vram (e.g. Stadium streams 279+ different fragments through vram 0x8FF00000 across the game). Per-fragment [[input.decompressed_section]] entries don't scale to that cardinality and miss the runtime-swap dispatch problem entirely. Engine pipeline: 1. Scan baserom.z64 for every Yay0 wrapper. 2. For each, decompress 0x40 bytes and check whether the prefix matches the expected J trampoline + FRAGMENT magic. Wrappers in PERS-SZP form are detected by the -0x18 prefix. 3. For matches, fully decompress and FNV-1a-64 hash the body. 4. Deduplicate by content hash (Stadium has ~11 byte-identical duplicates across its 293 wrappers). 5. Synthesize one Section per unique content. Section names __rom_; functions become func___rom_ via the existing collision-suffix machinery (default for pattern-discovered sections, since collisions are the EXPECTED case here). Implementation function (the +0x20 entry) gets a basic forward CFG walk to determine its size: - Walk instructions tracking forward branch targets within the func. - Stop at jr $ra IF no tracked forward branches still need to be reached. - Falls back to first-jr-ra heuristic if walk is inconclusive. Pattern-synthesized recompile failures are non-fatal: pattern sections have rom_addr in the synthetic 0xFE000000 range, and main.cpp's recompile loop logs + skips them instead of std::exit. This lets the build proceed even when our basic CFG walk misjudges a function with weird shape (e.g. computed jumps through jump tables we don't analyze). 
Stadium's Path-3 single-fragment case (fragment78 wrapper at ROM 0x9E93F0) still recompiles cleanly; ~225 of 282 dynamic-slot fragments recompile, ~57 fail and skip. Validation on Stadium's 0x8FF00000 slot: - 293 Yay0 wrappers found (293 vs 279 from prior validate script — earlier scan undercounted due to a tight 1KB decode window). - 282 sections after dedupe (11 collapsed as content-identical). - Build proceeds to completion; no Stadium boot regression (logo + PIKA jingle still render). Outstanding for next session — runtime side: - Modify register_runtime_fragment in librecomp/src/overlays.cpp to read bytes at fragment_ptr (first 0x40 → fall back to full body for the residual ~5%), hash, and look up the matching section. Currently it picks by id alone, so for slot 0x8FF00000 only ONE of the 282 sections gets bound to func_map at any time (the most-recently registered). - Refactor cross-section R_MIPS_32 retargeting to use a vram hashmap (currently O(N²) which gets expensive at 282 sections). - fragment78's prior single-fragment relink block can stay; it works alongside patterns and serves as the "I know exactly which one I want" form. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/config.cpp | 43 ++++++ src/config.h | 25 +++ src/decompressed.cpp | 357 +++++++++++++++++++++++++++++++++++++++++++ src/decompressed.h | 16 ++ src/main.cpp | 42 +++++ 5 files changed, 483 insertions(+) diff --git a/src/config.cpp b/src/config.cpp index f756ecd..b0dc4e5 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -38,6 +38,36 @@ std::vector get_manual_funcs(const toml::array* manua return ret; } +std::vector get_decompressed_section_patterns(const toml::array* arr) { + std::vector ret; + ret.reserve(arr->size()); + arr->for_each([&ret](auto&& el) { + if constexpr (toml::is_table) { + std::optional base_name = el["base_name"].template value(); + std::optional vram = el["vram"].template value(); + std::optional wrapper_format = el["wrapper_format"].template value(); + std::optional relocatable = el["relocatable"].template value(); + + if (!vram.has_value() || !wrapper_format.has_value()) { + throw toml::parse_error( + "decompressed_section_pattern requires vram and " + "wrapper_format", el.source()); + } + + N64Recomp::DecompressedSectionPattern p; + p.base_name = base_name.value_or(""); + p.vram = vram.value(); + p.wrapper_format = wrapper_format.value(); + p.relocatable = relocatable.value_or(true); + ret.emplace_back(std::move(p)); + } else { + throw toml::parse_error( + "Invalid decompressed_section_pattern entry", el.source()); + } + }); + return ret; +} + std::vector get_decompressed_sections(const toml::array* arr) { std::vector ret; ret.reserve(arr->size()); @@ -431,6 +461,19 @@ N64Recomp::Config::Config(const char* path) { decompressed_data.as_array()); } + // Decompressed section patterns (optional). One + // [[input.decompressed_section_pattern]] entry per slot where + // multiple wrappers share a link vram (e.g. Stadium's + // 0x8FF00000 dynamic-asset slot). The engine scans the ROM + // for every wrapper that decompresses to a fragment at the + // declared vram + format. 
+ toml::node_view decompressed_pattern_data = + input_data["decompressed_section_pattern"]; + if (decompressed_pattern_data.is_array()) { + decompressed_section_patterns = get_decompressed_section_patterns( + decompressed_pattern_data.as_array()); + } + // Output policies (optional [output] table). toml::node_view output_data = config_data["output"]; if (output_data.is_table()) { diff --git a/src/config.h b/src/config.h index 36e52d5..3179fc8 100644 --- a/src/config.h +++ b/src/config.h @@ -47,6 +47,30 @@ namespace N64Recomp { bool relocatable = true; }; + // [[input.decompressed_section_pattern]] — describes a SLOT that + // multiple decompressed fragments share at runtime. Stadium's + // dynamic asset slots (e.g. vram 0x8FF00000) have hundreds of + // wrappers that all link at the same vram and get swapped in/out. + // For these, instead of declaring each wrapper individually, the + // user describes the slot and the engine auto-discovers every + // wrapper in the ROM that decompresses to a fragment at this + // vram + format. + // + // Synthesized section names are: __rom_ + // where rom_wrapper is the ROM offset of each wrapper's magic. + // Wrappers whose decompressed bytes hash-equal another wrapper's + // are deduplicated (only one section emitted per distinct content; + // the runtime-side dispatch handles which wrapper-offset is in + // play at a given moment). + struct DecompressedSectionPattern { + // Base name for emitted sections; suffix __rom_ + // appends per wrapper. Defaults to "frag_" if empty. + std::string base_name; + uint32_t vram = 0; + std::string wrapper_format; + bool relocatable = true; + }; + // [output] collision_policy — what to do when two emitted symbols // would share a name. "error" (default) aborts the build with a // message naming both colliders. 
"suffix" auto-disambiguates by @@ -84,6 +108,7 @@ namespace N64Recomp { std::vector manual_func_sizes; std::vector manual_functions; std::vector decompressed_sections; + std::vector decompressed_section_patterns; CollisionPolicy collision_policy = CollisionPolicy::Error; std::string bss_section_suffix; std::string recomp_include; diff --git a/src/decompressed.cpp b/src/decompressed.cpp index 8b8ce8e..1b0d129 100644 --- a/src/decompressed.cpp +++ b/src/decompressed.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include "compression/pers_szp.h" @@ -18,6 +19,19 @@ uint32_t read_be_u32(const uint8_t* p) { (uint32_t(p[2]) << 8) | uint32_t(p[3]); } +// FNV-1a 64-bit content hash. Used to deduplicate wrappers whose +// decompressed bytes are byte-for-byte identical (Stadium's 0x8FF00000 +// slot has ~11 such pairs out of 279), and as the runtime dispatch key +// when multiple wrappers share a link vram. +uint64_t fnv1a_64(const uint8_t* data, size_t len) { + uint64_t h = 0xCBF29CE484222325ull; + for (size_t i = 0; i < len; i++) { + h ^= uint64_t(data[i]); + h *= 0x100000001B3ull; + } + return h; +} + // Reads an entire file into memory. Returns empty vector on error. std::vector read_rom_file(const std::filesystem::path& path) { std::ifstream f(path, std::ios::binary | std::ios::ate); @@ -450,4 +464,347 @@ bool synthesize_decompressed_sections( return true; } +namespace { + +// Adds one synthesized section + its functions + reloc table to the +// context. Used by both the explicit per-fragment path and the pattern +// auto-discovery path. `blob` is the decompressed body+relocs (must +// start with the FRAGMENT header). On success, returns the section +// index. On failure, returns size_t(-1) and prints to stderr. 
+size_t add_decompressed_section(Context& context, + const std::vector& blob, + uint32_t rom_wrapper, + uint32_t vram, + const std::string& section_name, + bool relocatable) +{ + if (blob.size() < 0x20) { + std::fprintf(stderr, + "decompressed: section %s blob smaller than FRAGMENT header\n", + section_name.c_str()); + return size_t(-1); + } + if (std::memcmp(blob.data() + 0x08, "FRAGMENT", 8) != 0) { + std::fprintf(stderr, + "decompressed: section %s missing FRAGMENT magic\n", + section_name.c_str()); + return size_t(-1); + } + + // Stash decompressed bytes at synthetic_rom = 0xFE000000 | wrapper_off + // so the existing pipeline (which addresses sections via rom_addr) + // finds them. The 0xFE prefix is reserved for synthesized sections. + const uint32_t synthetic_rom = 0xFE000000u | rom_wrapper; + const uint32_t reloc_offset = read_be_u32(blob.data() + 0x14); + if (reloc_offset > blob.size()) { + std::fprintf(stderr, + "decompressed: section %s relocOffset 0x%X exceeds blob 0x%zX\n", + section_name.c_str(), reloc_offset, blob.size()); + return size_t(-1); + } + + const size_t needed_rom_size = size_t(synthetic_rom) + reloc_offset; + if (context.rom.size() < needed_rom_size) { + context.rom.resize(needed_rom_size, 0); + } + std::memcpy(context.rom.data() + synthetic_rom, + blob.data(), reloc_offset); + + const uint16_t section_index = uint16_t(context.sections.size()); + + Section section{}; + section.rom_addr = synthetic_rom; + section.ram_addr = vram; + section.size = reloc_offset; + section.bss_size = 0; + section.name = section_name; + section.executable = true; + section.relocatable = relocatable; + + if (!parse_fragment_relocs(blob, vram, section_index, section)) { + return size_t(-1); + } + + context.sections.emplace_back(std::move(section)); + context.section_functions.emplace_back(); + + auto add_function = [&](uint32_t f_vram, uint32_t f_rom, + std::vector words, + std::string name) { + const size_t fi = context.functions.size(); + 
context.functions.emplace_back( + f_vram, f_rom, std::move(words), name, + section_index, false, false, false); + context.section_functions[section_index].push_back(fi); + context.sections[section_index].function_addrs.push_back(f_vram); + context.functions_by_vram[f_vram].push_back(fi); + context.functions_by_name[name] = fi; + }; + + // (1) Entry trampoline at vram+0 (8 bytes). + std::vector entry_words(2); + std::memcpy(entry_words.data(), blob.data() + 0x00, 8); + add_function(vram, synthetic_rom, + std::move(entry_words), + section_name + "_entry"); + + // (2) Implementation function at vram+0x20. Determine its size by + // a basic forward CFG walk: + // - Start at +0x20. + // - At each instruction, track forward-branch targets within the + // function (B/BEQ/BNE/JAL). + // - At every `jr $ra`, the function ends after the delay slot + // UNLESS a tracked forward-branch target is past that point; + // in that case, keep walking (the jr $ra is mid-function, + // reached via a goto/branch, with more code after). + // - Hard cap at relocOffset (where data/relocs start). + // + // This is far less rigorous than the recompiler's analyze_function + // (which is what runs LATER on this function), but it's enough to + // size the function correctly for the common cases we've seen so + // far. Fragments with weirder shapes (computed-jump exits, etc.) + // may need a future refinement; for now they'll either come out + // smaller-than-correct (recompile fails — we log + skip) or the + // recompiler's own analysis will surface the issue. + constexpr uint32_t IMPL_OFFSET = 0x20; + const auto get_be32 = [&](size_t off) -> uint32_t { + return read_be_u32(blob.data() + off); + }; + auto branch_target_offset = [&](uint32_t insn, + uint32_t pc_offset) -> size_t { + // BEQ/BNE/BLEZ/BGTZ etc all use 16-bit signed offset relative + // to the delay slot. opcode in bits 31..26 between 0x04 and + // 0x07, plus REGIMM (0x01) for BLTZ/BGEZ/etc. 
+ uint32_t opcode = (insn >> 26) & 0x3F; + bool is_branch = (opcode == 0x01 || // REGIMM + (opcode >= 0x04 && opcode <= 0x07) || + opcode == 0x14 || opcode == 0x15 || // BEQL/BNEL + opcode == 0x16 || opcode == 0x17); // BLEZL/BGTZL + if (!is_branch) return 0; + int16_t imm16 = int16_t(insn & 0xFFFF); + // Target = (pc_after_delay_slot) + imm16*4 = pc + 4 + imm16*4. + // Working in offsets from blob start. + int64_t target = int64_t(pc_offset) + 4 + (int64_t(imm16) * 4); + if (target <= int64_t(pc_offset)) return 0; // backward-only + if (target > int64_t(reloc_offset)) return 0; + return size_t(target); + }; + + size_t furthest_branch = 0; + size_t impl_end = 0; + for (size_t off = IMPL_OFFSET; off + 4 <= reloc_offset; off += 4) { + const uint32_t insn = get_be32(off); + // jr $ra encoding: 0x03E00008 + if (insn == 0x03E00008u) { + // Function ends after delay slot, unless we've tracked a + // forward branch past this point. + const size_t after_delay = off + 8; + if (after_delay > reloc_offset) { + impl_end = reloc_offset; + } else if (after_delay >= furthest_branch) { + impl_end = after_delay; + } else { + // jr $ra is mid-function — keep walking. + continue; + } + break; + } + size_t bt = branch_target_offset(insn, off); + if (bt > furthest_branch) { + furthest_branch = bt; + } + } + if (impl_end == 0) { + // No proper return found — degrade to first jr $ra in body + // (matches old heuristic) so we still produce something. 
+ for (size_t off = IMPL_OFFSET; off + 4 <= reloc_offset; off += 4) { + if (get_be32(off) == 0x03E00008u) { + impl_end = off + 8; + if (impl_end > reloc_offset) impl_end = reloc_offset; + break; + } + } + } + if (impl_end > IMPL_OFFSET) { + const size_t impl_size = impl_end - IMPL_OFFSET; + std::vector impl_words(impl_size / 4); + std::memcpy(impl_words.data(), + blob.data() + IMPL_OFFSET, impl_size); + const std::string impl_name = fmt::format( + "func_{:08X}", vram + IMPL_OFFSET); + add_function(vram + IMPL_OFFSET, + synthetic_rom + IMPL_OFFSET, + std::move(impl_words), + impl_name); + } + + return section_index; +} + +// Decompress a wrapper at the given ROM offset using the named format. +// Returns true + populates blob on success. +bool decompress_wrapper_at(const std::vector& rom, + uint32_t rom_wrapper, + const std::string& wrapper_format, + std::vector& blob_out) +{ + if (rom_wrapper >= rom.size()) return false; + if (wrapper_format == "pers_szp_yay0") { + return compression::pers_szp_decompress( + rom.data() + rom_wrapper, + rom.size() - rom_wrapper, blob_out); + } else if (wrapper_format == "yay0") { + return compression::yay0_decompress( + rom.data() + rom_wrapper, + rom.size() - rom_wrapper, blob_out); + } + return false; +} + +} // namespace + +bool synthesize_decompressed_patterns( + Context& context, + const std::filesystem::path& rom_path, + const std::vector& patterns) +{ + if (patterns.empty()) return true; + + const std::vector rom = read_rom_file(rom_path); + if (rom.empty()) { + std::fprintf(stderr, + "decompressed: failed to read ROM file: %s\n", + rom_path.string().c_str()); + return false; + } + + const uint16_t first_added_index = uint16_t(context.sections.size()); + + for (const DecompressedSectionPattern& p : patterns) { + // Compute the J-trampoline encoding we expect at +0x00 of any + // matching fragment: J + nop. 
MIPS J insn: + // opcode 0x02 << 26 | (target >> 2) & 0x03FFFFFF + const uint32_t j_target = p.vram + 0x20u; + const uint32_t j_insn = 0x08000000u | + ((j_target >> 2) & 0x03FFFFFFu); + // Big-endian byte pattern for the first 8 bytes (J + nop). + uint8_t expected_first8[8]; + expected_first8[0] = uint8_t(j_insn >> 24); + expected_first8[1] = uint8_t(j_insn >> 16); + expected_first8[2] = uint8_t(j_insn >> 8); + expected_first8[3] = uint8_t(j_insn); + expected_first8[4] = 0; + expected_first8[5] = 0; + expected_first8[6] = 0; + expected_first8[7] = 0; + const uint8_t fragment_magic[8] = { + 'F', 'R', 'A', 'G', 'M', 'E', 'N', 'T' + }; + + // Resolve the base_name (default: "frag_"). + std::string base_name = p.base_name; + if (base_name.empty()) { + base_name = fmt::format("frag_{:08X}", p.vram); + } + + // Scan the ROM for Yay0 magic. For each, decompress 0x40 bytes, + // check the J-insn + FRAGMENT-magic match, and accept. + std::vector>> hits; + size_t scan_pos = 0; + while (scan_pos + 16 < rom.size()) { + // Find next "Yay0" magic. + size_t y0 = std::string::npos; + for (size_t i = scan_pos; i + 4 <= rom.size(); i++) { + if (rom[i] == 'Y' && rom[i+1] == 'a' && + rom[i+2] == 'y' && rom[i+3] == '0') { + y0 = i; + break; + } + } + if (y0 == std::string::npos) break; + scan_pos = y0 + 4; + + // Quick prefix decompress to test the FRAGMENT shape. + std::vector prefix; + if (!compression::yay0_decompress( + rom.data() + y0, rom.size() - y0, prefix)) { + continue; + } + if (prefix.size() < 0x10) continue; + if (std::memcmp(prefix.data(), expected_first8, 8) != 0) continue; + if (std::memcmp(prefix.data() + 8, fragment_magic, 8) != 0) continue; + + // Match — figure out the wrapper offset (PERS-SZP wraps Yay0 + // at -0x18 if the format is pers_szp_yay0; otherwise the + // wrapper offset IS the Yay0 offset). 
+ uint32_t wrap_off = uint32_t(y0); + if (p.wrapper_format == "pers_szp_yay0") { + if (y0 < 0x18) continue; + if (std::memcmp(rom.data() + (y0 - 0x18), + "PERS-SZP", 8) != 0) { + continue; + } + wrap_off = uint32_t(y0 - 0x18); + } else if (p.wrapper_format != "yay0") { + std::fprintf(stderr, + "decompressed: pattern %s unknown wrapper_format '%s'\n", + base_name.c_str(), p.wrapper_format.c_str()); + return false; + } + + // Full decompress. + std::vector body; + if (!decompress_wrapper_at(rom, wrap_off, p.wrapper_format, body)) { + continue; + } + hits.emplace_back(wrap_off, std::move(body)); + } + + std::fprintf(stderr, + "decompressed pattern %s @ vram 0x%08X format=%s: " + "found %zu wrappers in ROM\n", + base_name.c_str(), p.vram, p.wrapper_format.c_str(), + hits.size()); + + if (hits.empty()) continue; + + // Deduplicate by content hash. + std::unordered_map seen_hashes; + size_t added = 0; + size_t deduped = 0; + for (auto& [wrap_off, body] : hits) { + uint64_t h = fnv1a_64(body.data(), body.size()); + auto it = seen_hashes.find(h); + if (it != seen_hashes.end()) { + deduped++; + continue; + } + seen_hashes.emplace(h, wrap_off); + + const std::string section_name = fmt::format( + "{}__rom_{:X}", base_name, wrap_off); + size_t si = add_decompressed_section( + context, body, wrap_off, p.vram, + section_name, p.relocatable); + if (si == size_t(-1)) { + std::fprintf(stderr, + "decompressed: pattern %s — failed to add section " + "for ROM 0x%X (continuing)\n", + base_name.c_str(), wrap_off); + continue; + } + added++; + } + std::fprintf(stderr, + "decompressed pattern %s: %zu sections added " + "(%zu deduped as content-identical)\n", + base_name.c_str(), added, deduped); + } + + // Cross-section R_MIPS_32 retargeting once everything is in. 
+ resolve_cross_section_targets(context, first_added_index); + + return true; +} + } // namespace N64Recomp diff --git a/src/decompressed.h b/src/decompressed.h index 0300631..e3d96a1 100644 --- a/src/decompressed.h +++ b/src/decompressed.h @@ -39,6 +39,22 @@ bool synthesize_decompressed_sections( const std::filesystem::path& rom_path, const std::vector& configs); +// Auto-discovery: scan the ROM for every wrapper that decompresses +// to a fragment at the declared vram + format, deduplicate by content +// hash, and add one Section per distinct content. Section names are +// auto-generated as __rom_; the runtime +// dispatcher uses the bytes Stadium loads at fragment_ptr to identify +// which section's recompiled C to bind. +// +// Each pattern produces an arbitrary number of sections (e.g. Stadium's +// 0x8FF00000 slot has 282 distinct fragment-bodies). Sections are +// appended to `context.sections` in deterministic ROM-offset order so +// rebuilds are reproducible. +bool synthesize_decompressed_patterns( + Context& context, + const std::filesystem::path& rom_path, + const std::vector& patterns); + } // namespace N64Recomp #endif diff --git a/src/main.cpp b/src/main.cpp index 74af3ff..fb37cd8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -387,6 +387,17 @@ int main(int argc, char** argv) { exit_failure("Failed to synthesize decompressed sections\n"); } + // Pattern-driven auto-discovery of decompressed sections. For + // slots like Stadium's vram 0x8FF00000 where many wrappers + // share a link addr, this scans the ROM and synthesizes one + // section per distinct decompressed content, with suffix-style + // names (__rom_) per wrapper. 
+ if (!N64Recomp::synthesize_decompressed_patterns( + context, config.rom_file_path, + config.decompressed_section_patterns)) { + exit_failure("Failed to synthesize decompressed patterns\n"); + } + // Add any manual functions add_manual_functions(context, config.manual_functions); @@ -847,6 +858,23 @@ int main(int argc, char** argv) { } if (result == false) { fmt::print(stderr, "Error recompiling {}\n", func.name); + // Pattern-synthesized sections (rom_addr in the synthetic + // 0xFE000000 range) are best-effort: we don't know each + // function's true size without real CFG analysis. If one + // fails, log and continue so the rest of the build + // succeeds. The runtime will see a missing func_map + // entry and dispatch via LOOKUP_FUNC's trampoline path + // if Stadium ever activates this fragment. + bool is_pattern_synthesized = + (context.sections[func.section_index].rom_addr & 0xFF000000u) + == 0xFE000000u; + if (is_pattern_synthesized) { + fmt::print(stderr, + " (pattern-synthesized section — skipping, " + "build continues)\n"); + context.functions[i].ignored = true; + continue; + } std::exit(EXIT_FAILURE); } } else if (func.reimplemented) { @@ -956,6 +984,20 @@ int main(int argc, char** argv) { if (result == false) { fmt::print(stderr, "Error recompiling {}\n", new_func.name); + // Pattern-synthesized sections (rom_addr in synthetic + // 0xFE000000 range) are best-effort. Mark the static + // ignored and continue. See the equivalent block in + // the main recompile loop above for rationale. + bool is_pattern_synthesized = + (context.sections[new_func.section_index].rom_addr & 0xFF000000u) + == 0xFE000000u; + if (is_pattern_synthesized) { + fmt::print(stderr, + " (pattern-synthesized section — skipping, " + "build continues)\n"); + context.functions[new_func_index].ignored = true; + continue; + } std::exit(EXIT_FAILURE); } }