#!/usr/bin/env python3

import argparse, ast, math, os, re, struct
import bisect
from mips_isa import *
from multiprocessing import *

parser = argparse.ArgumentParser()
parser.add_argument('-j', dest='jobs', type=int, default=1, help='number of processes to run at once')
args = parser.parse_args()
jobs = args.jobs

ASM_OUT = "asm/"
DATA_OUT = "data/"

MIPS_BRANCH_LIKELY_INSNS = [
    MIPS_INS_BEQL, MIPS_INS_BGEZALL, MIPS_INS_BGEZL, MIPS_INS_BGTZL, MIPS_INS_BLEZL,
    MIPS_INS_BLTZALL, MIPS_INS_BLTZL, MIPS_INS_BNEL, MIPS_INS_BC1TL, MIPS_INS_BC1FL,
]
MIPS_BRANCH_INSNS = [
    *MIPS_BRANCH_LIKELY_INSNS,
    MIPS_INS_BEQ, MIPS_INS_BGEZ, MIPS_INS_BGEZAL, MIPS_INS_BGTZ, MIPS_INS_BNE,
    MIPS_INS_BLTZ, MIPS_INS_BLTZAL, MIPS_INS_BLEZ, MIPS_INS_BC1T, MIPS_INS_BC1F,
    MIPS_INS_BEQZ, MIPS_INS_BNEZ, MIPS_INS_B,
]
MIPS_JUMP_INSNS = [ MIPS_INS_JAL, MIPS_INS_JALR, MIPS_INS_J, MIPS_INS_JR ]
MIPS_FP_LOAD_INSNS = [ MIPS_INS_LWC1, MIPS_INS_LDC1 ]
MIPS_FP_STORE_INSNS = [ MIPS_INS_SWC1, MIPS_INS_SDC1 ]
MIPS_FP_LOAD_STORE_INSNS = [ *MIPS_FP_LOAD_INSNS, *MIPS_FP_STORE_INSNS ]
MIPS_STORE_INSNS = [
    MIPS_INS_SB, MIPS_INS_SH, MIPS_INS_SW, MIPS_INS_SWL, MIPS_INS_SWR,
    MIPS_INS_SD, MIPS_INS_SDL, MIPS_INS_SDR, MIPS_INS_SC, MIPS_INS_SCD,
    *MIPS_FP_STORE_INSNS
]
MIPS_LOAD_STORE_INSNS = [
    MIPS_INS_LB, MIPS_INS_LBU, MIPS_INS_LH, MIPS_INS_LHU, MIPS_INS_LW, MIPS_INS_LWL,
    MIPS_INS_LWR, MIPS_INS_LWU, MIPS_INS_LD, MIPS_INS_LDL, MIPS_INS_LDR, MIPS_INS_LL,
    MIPS_INS_LLD, *MIPS_STORE_INSNS, *MIPS_FP_LOAD_STORE_INSNS
]

functions = set()     # vram of functions
data_labels = set()   # vram of data labels, TODO this + functions can merge into something more general eventually
branch_labels = set() # vram of branch labels via branch/jump instructions
jtbls = set()         # vram of jump tables
jtbl_labels = set()   # vram of branch labels via jtbls, higher output priority than regular branch labels
floats = set()        # vram of floats
doubles = set()       # vram of doubles
prospective_strings = set() # vram of possible strings
strings = set()       # vram of confirmed strings
symbols = {}   # lui/addiu pairs : {addr of %hi : full symbol} AND {addr of %lo : full symbol} , twice the space complexity but much less time complexity
constants = {} # lui/ori pairs : {addr of %hi : full constant} AND {addr of %lo : full constant} , twice the space complexity but much less time complexity
dwords = set() # doublewords
multiply_referenced_rodata = set() # rodata with more than 1 reference outside of the same function cannot be migrated
files = set()  # vram start of file
vrom_variables = list() # (name,addr)
vrom_addrs = set()      # set of addrs from vrom_variables, for faster lookup

functions_ast = None
variables_ast = None
variable_addrs = None
def proper_name(symbol, in_data=False, is_symbol=True):
    # hacks
    if symbol == 0x809C46F0: # ovl_En_Encount4 fake symbol at the very end of the data section
        return variables_ast[0x809C46DC][0] + " + 0x14"
    elif symbol == 0x801EF66D: # z_message_nes constant-folding stray fairy array
        return variables_ast[0x801EF670][0] + f" - 0x{0x801EF670 - 0x801EF66D:X}"
    elif symbol == 0x80A09740: # boss_07 symbol with large addend folded into %lo
        return variables_ast[0x80A09A60][0] + f" - 0x{0x80A09A60 - 0x80A09740:X}"
    elif symbol == 0x80B80248: # bg_ikana_mirror symbol with large addend folded into %lo
        return variables_ast[0x80B801A8][0] + f" + 0x{0x80B80248 - 0x80B801A8:X}"
    elif symbol == 0x8084D2FC: # player symbol with very large addend folded into %lo, since we don't know the real symbol just use the first data symbol for now
        return variables_ast[0x8085B9F0][0] + f" - 0x{0x8085B9F0 - 0x8084D2FC:X}"
    elif symbol == 0x001ABAB0 or symbol == 0x001E3BB0: # OS_K0_TO_PHYSICAL on rspS2DEX text and data symbols
        return variables_ast[symbol + 0x80000000][0] + " - 0x80000000"
    elif symbol == 0x00AC0480: # do_action_static + 0x480, this is the only rom segment that has a constant offset folded into it so just hack it
        return "_do_action_staticSegmentRomStart + 0x480"

    # real names
    if symbol in functions and symbol in functions_ast.keys():
        return functions_ast[symbol][0]
    elif symbol in variables_ast.keys():
        return variables_ast[symbol][0]
    elif symbol in vrom_addrs:
        # prefer "start" vrom symbols
        if symbol in [addr for name,addr in vrom_variables if "SegmentRomStart" in name]:
            return [name for name,addr in vrom_variables if "SegmentRomStart" in name and addr == symbol][0]
        else:
            return [name for name,addr in vrom_variables if addr == symbol][0]

    # addends
    if is_symbol:
        symbol_index = bisect.bisect(variable_addrs, symbol)
        if symbol_index:
            vram_addr = variable_addrs[symbol_index-1]
            symbol_name, _, _, symbol_size = variables_ast[vram_addr]
            if vram_addr < symbol < vram_addr + symbol_size:
                return f"{symbol_name} + 0x{symbol-vram_addr:X}"

    # generated names
    if symbol in functions and not in_data:
        return f"func_{symbol:08X}"
    elif symbol in jtbl_labels:
        return f"L{symbol:08X}"
    elif symbol in jtbls:
        return f"jtbl_{symbol:08X}"
    else:
        if is_symbol:
            return f"D_{symbol:08X}"
        else:
            return f"0x{symbol:08X}"

def lookup_name(symbol, word):
    # hacks for vrom variables in data
    if word in vrom_addrs:
        if word == 0: # no makerom segment start
            return "0x00000000"
        if symbol in [0x801AE4A0, # effect table
                      0x801C2740, # object table
                      0x801C2650, # elf_message table
                      0x801C3CA0, # scene table
                      0x801AEFD0, # actor table
                      0x801D0B70, # kaleido table
                      0x801D0BB0, # fbdemo table
                      0x801BE4D4, # kankyo skybox table
                      0x801BD910, # gamestate table
                      0x801C2660  # scene textures table
                      ]:
            # data labels that are allowed to have vrom variables
            return proper_name(word, is_symbol=False)
        else:
            return f"0x{word:08X}"
    # hacks for variables with references to variables with addends (i.e. gSaveContext + 0x...)
    if symbol in [0x800980C4, 0x801AE8F0]: # __osViNext and flg_set table
        return proper_name(word, is_symbol=True)
    return proper_name(word, is_symbol=False)

def as_double(b):
    return struct.unpack(">d", b)[0]

def as_float(b):
    return struct.unpack(">f", b)[0]

def as_dword(b):
    return struct.unpack(">Q", b)[0]

def as_word(b):
    return struct.unpack(">I", b)[0]

def as_hword(b):
    return struct.unpack(">H", b)[0]

def as_double_list(b):
    return [i[0] for i in struct.iter_unpack(">d", b)]

def as_float_list(b):
    return [i[0] for i in struct.iter_unpack(">f", b)]

def as_dword_list(b):
    return [i[0] for i in struct.iter_unpack(">Q", b)]

def as_word_list(b):
    return [i[0] for i in struct.iter_unpack(">I", b)]

def as_hword_list(b):
    return [h[0] for h in struct.iter_unpack(">H", b)]

STR_INDENT = " "
def try_decode_string(data_bytes, output_ends = True, force_ascii = False):
    """ Swiss Cheese """
    hex_digits = "0123456789abcdefABCDEF"
    result = ""
    directive = "ascii" if force_ascii else "asciz"

    if 0x8C in data_bytes or 0x8D in data_bytes or 0x1B in data_bytes:
        char = ""
        index = None
        if 0x8C in data_bytes:
            char = "\\x8C"
            index = data_bytes.index(0x8C)
        elif 0x8D in data_bytes:
            char = "\\x8D"
            index = data_bytes.index(0x8D)
        elif 0x1B in data_bytes:
            char = "\\x1B"
            index = data_bytes.index(0x1B)
        else:
            assert False , "???"

        part1 = try_decode_string(data_bytes[0:index], output_ends=False)
        part2 = try_decode_string(data_bytes[index+1:], output_ends=False)
        if part2 != "":
            result = part1 + (("\"\n" + STR_INDENT + ".ascii \"") if part2[0] in hex_digits and part1 != "" else "") + \
                     char + (("\"\n" + STR_INDENT + ".asciz \"") if part2[0] in hex_digits else "") + part2
            if output_ends and part2[0] in hex_digits and part1 == "":
                directive = "ascii"
        else:
            result = part1 + (("\"\n" + STR_INDENT + ".ascii \"") if part2[0] in hex_digits and part1 != "" else "") + char + "\"\n"
    else:
        result = data_bytes.decode("EUC-JP").replace("\n", "\\n").replace("\r", "\\r").replace("\t", "\\t").replace("\0", "")

    return (("." + directive + " \"") if output_ends else "") + result + ("\"" if output_ends else "")

def reduce_float(flt_str, is_double=False):
    def n_digits(str):
        if "." not in str:
            return 0
        return len(str.split(".")[1].strip())

    def str_round(string, precision):
        if "." not in string or precision < 0:
            return string
        int_part = string.split(".")[0]
        frac_part = string.split(".")[1]
        while len(frac_part) > precision:
            last_digit = int(frac_part[-1])
            frac_part = frac_part[:-1]
            # round up & carry
            if last_digit >= 5:
                while len(frac_part) != 0:
                    last_digit = int(frac_part[-1])
                    frac_part = frac_part[:-1]
                    if last_digit != 9:
                        # complete carry
                        last_digit += 1
                        frac_part += repr(last_digit)
                        break
                if len(frac_part) == 0:
                    int_part = repr(int(int_part) + 1)
                    frac_part = "0"
        return int_part + "." + frac_part

    def to_binary(flt_str):
        return as_dword(struct.pack(">d", float(flt_str))) if is_double else as_word(struct.pack(">f", float(flt_str)))

    exponent = ""
    if "e" in flt_str:
        exponent = "e" + flt_str.split("e")[1]
        flt_str = flt_str.split("e")[0]

    original_binary = to_binary(flt_str + exponent)

    precision = n_digits(flt_str)
    if precision == 0:
        if exponent != "":
            return flt_str + exponent
        return flt_str + (".0" if not flt_str.endswith(".") else "0")

    while precision > 1:
        precision -= 1
        old_str = flt_str
        flt_str = str_round(flt_str, precision)
        if to_binary(flt_str + exponent) != original_binary:
            flt_str = old_str
            break

    if flt_str.endswith("."):
        flt_str += "0"
    return flt_str + exponent

def format_f64(d_dwd):
    d = as_double(struct.pack(">Q", d_dwd))
    if math.isnan(d):
        return f".long 0x{d_dwd:016X}"
    return f".double {reduce_float(repr(d), is_double=True)}"

def format_f32(f_wd):
    # GNU AS isn't IEEE compliant?
    if f_wd in [0xB8E4AECD, 0xB8E4C3AA, 0x38D1B718]:
        return f".word 0x{f_wd:08X}"
    if f_wd == 0x7F7FFFFF: # FLT_MAX
        return ".float 3.4028235e+38"
    f = as_float(struct.pack(">I", f_wd))
    if math.isnan(f):
        return f".word 0x{f_wd:08X}"
    return f".float {reduce_float(repr(f))}"
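# Rough usage note (values illustrative, not taken from the ROM): reduce_float() trims decimal
# digits from a repr() string for as long as the shorter literal still packs to the same IEEE-754
# bit pattern, so repr(as_float(struct.pack(">f", 0.1))) == "0.10000000149011612" should reduce to
# "0.1", while already-minimal literals like "0.5" pass through unchanged and digitless ones like
# "3" just gain a trailing ".0".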
def put_symbol(symbols_dict, setname, symbol):
    if setname in symbols_dict:
        symbols_dict[setname].add(symbol)
    else:
        symbols_dict[setname] = {symbol}

def put_symbols(symbols_dict, setname, symbols):
    if setname in symbols_dict:
        symbols_dict[setname].update(symbols)
    else:
        symbols_dict[setname] = symbols.copy()

def update_symbols_from_dict(symbols_dict):
    for setname, symbol in symbols_dict.items():
        if setname == "functions":
            functions.update(symbol)
        elif setname == "data_labels":
            data_labels.update(symbol)
        elif setname == "branch_labels":
            branch_labels.update(symbol)
        elif setname == "jtbls":
            jtbls.update(symbol)
        elif setname == "jtbl_labels":
            jtbl_labels.update(symbol)
        elif setname == "floats":
            floats.update(symbol)
        elif setname == "doubles":
            doubles.update(symbol)
        elif setname == "prospective_strings":
            prospective_strings.update(symbol)
        elif setname == "strings":
            strings.update(symbol)
        elif setname == "symbols":
            symbols.update(symbol)
        elif setname == "constants":
            constants.update(symbol)
        elif setname == "dwords":
            dwords.update(symbol)
        elif setname == "files":
            files.update(symbol)
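# The text scanner below recovers symbols by pairing each LUI (%hi) with the ADDIU/load/store (%lo)
# that consumes it, tracking registers across branches and delay slots. As a sketch (register and
# symbol names hypothetical), IDO emits a reference to D_80123456 as:
#     lui   $t0, %hi(D_80123456)        # lui_tracker[$t0] = (vaddr_of_lui, 0x8012)
#     addiu $t0, $t0, %lo(D_80123456)   # (0x8012 << 16) + 0x3456 -> 0x80123456
# and both instruction addresses end up in the "symbols" map so disassembly can re-emit %hi/%lo.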
def find_symbols_in_text(data, rodata, vram, vram_rodata, data_regions, relocs, segment_name):
    symbols_dict = dict()

    print(f"Finding symbols from .text in {segment_name}")

    if rodata is not None:
        rodata_words = as_word_list(rodata)
    raw_insns = as_word_list(data)

    assert vram % 0x10 == 0
    put_symbol(symbols_dict, "functions", vram)
    put_symbol(symbols_dict, "files", vram)

    # lui trackers are saved at branches to be restored when the branch target is reached
    saved_lui_trackers = {} # vram: tracker list

    def save_tracker(restore_at, tracker):
        if restore_at in saved_lui_trackers.keys():
            saved_lui_trackers[restore_at].update(tracker)
        else:
            saved_lui_trackers.update({restore_at: tracker})

    lui_tracker = {} # register : (addr, immediate)
    prospective_jtbls = set()
    func_branch_labels = set()
    func_jtbl_labels = set()
    individual_jtbl_labels = {}
    clobber_later = {} # addr : reg
    delay_slot = False
    delayed_insn = None
    next_jtbl_jr = 0

    def clobber_conditionally(insn):
        # Don't clobber immediately if in a branch likely's delay slot, as it only runs if the branch is taken;
        # instead clobber at the branch target.
        # This may appear suspicious at first since something could have set a good register in another code path,
        # however that does not make sense since all code paths must resolve a valid symbol, so if one code path clobbers,
        # that means no code path produces a valid symbol at the convergent code, unless the convergent code provides all
        # the new registers.

        # assert insn.value_forname(insn.fields[0]) is not None
        # print(f"insn: {insn.mnemonic}, rt: {insn.rt}, first: {insn.value_forname(insn.fields[0])}")
        # assert insn.id not in [MIPS_INS_ORI, MIPS_INS_ADDIU, *MIPS_LOAD_STORE_INSNS] or insn.rt == insn.value_forname(insn.fields[0])
        if delay_slot and delayed_insn is not None and delayed_insn.id in MIPS_BRANCH_LIKELY_INSNS:
            clobber_later.update({delayed_insn.offset : insn.value_forname(insn.fields[0])})
        else:
            lui_tracker.pop(insn.value_forname(insn.fields[0]), None)

    for region in data_regions:
        assert region[1] != 0
        put_symbol(symbols_dict, "functions", region[1])
        if region[1] % 0x10 == 0:
            put_symbol(symbols_dict, "files", region[1])

    if relocs is not None:
        """
        Relocation info is our friend. Get symbols via relocations first.
        """
        prev_hi = None
        hi_vram = -1
        for reloc in relocs:
            insn = decode_insn(raw_insns[reloc[2]//4], vram + reloc[2])
            if reloc[1] == 2: # R_MIPS_32
                assert False , "R_MIPS_32 in .text section?"
            elif reloc[1] == 4: # R_MIPS_26
                """
                Relocated jump targets give us functions in this section
                """
                assert insn.id == MIPS_INS_JAL , f"R_MIPS_26 applied to {insn.mnemonic} when it should be JAL"
                put_symbol(symbols_dict, "functions", insn.target)
            elif reloc[1] == 5: # R_MIPS_HI16
                """
                Relocated %hi gives us %hi values to match with associated %lo
                """
                assert insn.id == MIPS_INS_LUI , f"R_MIPS_HI16 applied to {insn.mnemonic} when it should be LUI"
                prev_hi = insn.imm
                hi_vram = vram + reloc[2]
            elif reloc[1] == 6: # R_MIPS_LO16
                """
                Relocated %lo + a %hi to match with gives us relocated symbols in data sections
                """
                assert insn.id == MIPS_INS_ADDIU or insn.id in MIPS_LOAD_STORE_INSNS , f"R_MIPS_LO16 applied to {insn.mnemonic} when it should be ADDIU or a load/store"
                symbol_value = (prev_hi << 0x10) + insn.imm
                put_symbols(symbols_dict, "symbols", {hi_vram : symbol_value})
                put_symbols(symbols_dict, "symbols", {vram + reloc[2] : symbol_value})
            else:
                assert False , "Invalid relocation type encountered"
    for i,raw_insn in enumerate(raw_insns,0):
        i *= 4
        vaddr = vram + i
        insn = decode_insn(raw_insn, vaddr)

        if insn.id == MIPS_INS_JR and insn.rs != MIPS_REG_RA:
            # It's hard to find when two jump tables next to each other end, so do a naive first pass
            # to try and find as many jump tables as possible.
            # Luckily IDO has a very homogeneous output for jump tables, which other instructions are
            # very unlikely to break up:
            #   lui  $at, %hi(jtbl)
            #   addu $at, $at, $reg
            #   lw   $reg, %lo(jtbl)($at)
            #   jr   $reg
            insn_m1 = decode_insn(raw_insns[i//4 - 1], vaddr - 0x4)
            insn_m2 = decode_insn(raw_insns[i//4 - 2], vaddr - 0x8)
            insn_m3 = decode_insn(raw_insns[i//4 - 3], vaddr - 0xC)
            if insn_m1.id == MIPS_INS_LW and insn_m2.id == MIPS_INS_ADDU and insn_m3.id == MIPS_INS_LUI:
                prospective_jtbls.add((insn_m3.imm << 0x10) + insn_m1.imm)

    for i,raw_insn in enumerate(raw_insns,0):
        i *= 4
        vaddr = vram + i
        insn = decode_insn(raw_insn, vaddr)

        # sometimes there's data embedded in .text in handwritten asm files that don't respect section contents
        in_data = False
        for region in data_regions:
            if vaddr in range(region[0], region[1], 4):
                in_data = True
                break
        if in_data:
            continue

        if vaddr in functions or vaddr in symbols_dict["functions"]:
            # add func branch labels to all branch labels
            put_symbols(symbols_dict, "branch_labels", func_branch_labels)
            func_branch_labels.clear()
            # add func jtbl labels to all jtbl labels
            put_symbols(symbols_dict, "jtbl_labels", func_jtbl_labels)
            func_jtbl_labels.clear()
            # clear individual jtbl labels
            individual_jtbl_labels.clear()
            # destroy lui tracking state
            lui_tracker.clear()
            saved_lui_trackers.clear()
        else:
            # restore lui tracking state if previously saved
            lui_tracker.update(saved_lui_trackers.pop(vaddr, {}))

        # register clobbering in branch likely delay slots
        cl = clobber_later.pop(vaddr, None)
        if cl is not None:
            lui_tracker.pop(cl, None)

        if insn.id in MIPS_BRANCH_INSNS:
            func_branch_labels.add(insn.offset)
            delayed_insn = insn
        elif insn.id == MIPS_INS_ERET:
            put_symbol(symbols_dict, "functions", vaddr + 4)
        elif insn.id in MIPS_JUMP_INSNS:
            if insn.id == MIPS_INS_JAL:
                # mark function at target
                put_symbol(symbols_dict, "functions", insn.target)
            elif insn.id == MIPS_INS_J:
                # mark label at target
                func_branch_labels.add(insn.target)
            elif insn.id == MIPS_INS_JR:
                # check if anything branches past it in either branch or jtbl labels
                if vaddr >= max(func_branch_labels, default=0) and vaddr >= max(func_jtbl_labels, default=0):
                    # vaddr being largest in list means nothing branches past here
                    # mark next function after delay slot and after any nop padding if present
                    if len(raw_insns) > i//4 + 2:
                        n_padding = 0
                        end = False
                        while (raw_insns[i//4 + 2 + n_padding] == 0):
                            n_padding += 1
                            if len(raw_insns) <= i//4 + 2 + n_padding:
                                end = True
                                break
                        if not end:
                            if n_padding > 0:
                                assert (vaddr + 8 + n_padding * 4) % 0x10 == 0 , f"padding to non-0x10 alignment?, 0x{vaddr + 8 + n_padding * 4:08X}"
                                put_symbol(symbols_dict, "functions", vaddr + 8 + n_padding * 4)
                            put_symbol(symbols_dict, "functions", vaddr + 8 + n_padding * 4)
            delayed_insn = insn
        elif insn.id == MIPS_INS_LUI:
            """
            Process LUI instruction

            All symbol loading involves LUI, and tracking how %hi values loaded via LUI instructions
            propagate through a function is key to identifying symbol references
            """
            # add lui to tracker
            if delay_slot and delayed_insn is not None and delayed_insn.id in MIPS_BRANCH_LIKELY_INSNS:
                # for branch likelies, the current tracker does not update but the lui is saved to be tracked at the branch target
                save_tracker(delayed_insn.offset, {insn.rt : (vaddr, insn.imm)})
            else:
                lui_tracker.update({insn.rt : (vaddr, insn.imm)})
        elif insn.id == MIPS_INS_ADDIU or insn.id in MIPS_LOAD_STORE_INSNS:
            # try match with tracked lui and mark symbol
            hi_vram, imm_value = lui_tracker.get(insn.rs if insn.id == MIPS_INS_ADDIU else insn.base, (None, None))
            # if a match was found, validate and record the symbol, TODO improve validation
            if hi_vram != None and ((((imm_value >> 0x8) & 0xF) != 0 and imm_value < 0x1000) or \
                    (imm_value >= 0x8000 and imm_value < 0x80D0) or \
                    (imm_value >= 0xA400 and imm_value < 0xA480) or \
                    (imm_value < 0x0400 and insn.id == MIPS_INS_ADDIU)):
                lo_vram = vaddr
                symbol_value = (imm_value << 0x10) + insn.imm
                put_symbols(symbols_dict, "symbols", {hi_vram : symbol_value})
                put_symbols(symbols_dict, "symbols", {lo_vram : symbol_value})

                if insn.id == MIPS_INS_LW:
                    # try find jr within the same block
                    cur_idx = i//4
                    lookahead_insn = decode_insn(raw_insns[cur_idx], vram + cur_idx * 4) # TODO fix vaddr here
                    # still in same block unless one of these instructions is found
                    while lookahead_insn.id not in [*MIPS_BRANCH_INSNS, MIPS_INS_JAL, MIPS_INS_JALR, MIPS_INS_J]:
                        if lookahead_insn.id == MIPS_INS_JR:
                            if lookahead_insn.rs == (insn.ft if insn.id in MIPS_FP_LOAD_STORE_INSNS else insn.rt):
                                # read the jtbl and add targets to func_jtbl_labels
                                assert rodata is not None , "A rodata section should be present if functions use jump tables"
                                next_jtbl_jr = vram + cur_idx * 4
                                individual_jtbl_labels.update({next_jtbl_jr : set()})
                                offset = (symbol_value - vram_rodata)//4
                                label = rodata_words[offset]
                                while label >= vram and label < vram + len(data) and \
                                        (vram_rodata + offset * 4 not in prospective_jtbls or vram_rodata + offset * 4 == symbol_value):
                                    func_jtbl_labels.add(label)
                                    individual_jtbl_labels[next_jtbl_jr].add(label)
                                    offset += 1
                                    if offset >= len(rodata_words):
                                        break
                                    label = rodata_words[offset]
                                put_symbol(symbols_dict, "jtbls", symbol_value)
                                # found a jr that matches, no need to keep searching
                                break
                        elif (insn.ft if insn.id in MIPS_FP_LOAD_STORE_INSNS else insn.rt) in [lookahead_insn.value_forname(field) for field in lookahead_insn.fields[1:]]:
                            # register clobbered, no need to keep searching
                            break
                        cur_idx += 1
                        # found end before finding jr insn, not a jtbl
                        if cur_idx >= len(raw_insns):
                            break
                        lookahead_insn = decode_insn(raw_insns[cur_idx], vram + cur_idx * 4) # TODO fix vaddr here
                elif insn.id == MIPS_INS_LD:
                    # doubleword loads
                    put_symbol(symbols_dict, "dwords", symbol_value)
                elif insn.id in [MIPS_INS_LWC1, MIPS_INS_SWC1]:
                    # float load/stores
                    # add float
                    put_symbol(symbols_dict, "floats", symbol_value)
                elif insn.id in [MIPS_INS_LDC1, MIPS_INS_SDC1]:
                    # double load/stores
                    # add double
                    put_symbol(symbols_dict, "doubles", symbol_value)
                elif insn.id == MIPS_INS_ADDIU and vaddr % 4 == 0:
                    # strings seem to only ever be 4-byte aligned
                    # add possible string
                    put_symbol(symbols_dict, "prospective_strings", symbol_value)

            # clear lui tracking state if register is clobbered by the addiu/load instruction itself
            # insn.rt == (insn.rs if insn.id == MIPS_INS_ADDIU else insn.base) and
            if insn.id not in MIPS_STORE_INSNS and insn.id not in MIPS_FP_LOAD_INSNS:
                clobber_conditionally(insn)
        elif insn.id == MIPS_INS_ORI:
            # try match with tracked lui and mark constant
            hi_vram, imm_value = lui_tracker.get(insn.rs, (None, None))
            if hi_vram != None:
                # found match
                lo_vram = vaddr
                const_value = (imm_value << 0x10) | insn.imm
                put_symbols(symbols_dict, "constants", {hi_vram : const_value})
                put_symbols(symbols_dict, "constants", {lo_vram : const_value})
            # clear lui tracking state if register is clobbered by the ori instruction itself
            # insn.rt == insn.rs or
            if insn.rt in lui_tracker.keys():
                clobber_conditionally(insn)
        else:
            # clear lui tracking if register is clobbered by something unrelated
            if insn.id == MIPS_INS_ADDU and insn.rs in lui_tracker.keys() and (insn.rd == insn.rs):
                # array accesses optimisation:
                #   lui  reg, %hi(symbol)
                #   addu reg, reg, idx
                #   lw   reg, %lo(symbol)(reg)
                # instead of clobbering, keep it if the second operand is also the first
                # if the output is not currently tracked it will behave as intended anyway
                pass
            # insns listed either write to fprs/cop0 or don't write to any
            elif insn.id not in [MIPS_INS_MTC0, MIPS_INS_MTC1, MIPS_INS_DMTC1, MIPS_INS_MULT, MIPS_INS_MULTU, MIPS_INS_DMULT,
                                 MIPS_INS_DMULTU, MIPS_INS_DIV, MIPS_INS_DIVU, MIPS_INS_DDIV, MIPS_INS_DDIVU, MIPS_INS_MTHI,
                                 MIPS_INS_MTLO, MIPS_INS_CTC1, MIPS_INS_NOP, MIPS_INS_BREAK, MIPS_INS_TLBP, MIPS_INS_TLBR,
                                 MIPS_INS_TLBWI, MIPS_INS_MOV_S, MIPS_INS_MOV_D, MIPS_INS_C_LT_S, MIPS_INS_C_LT_D,
                                 MIPS_INS_DIV_S, MIPS_INS_MUL_S, MIPS_INS_TRUNC_W_S, MIPS_INS_CVT_S_W, MIPS_INS_SUB_S,
                                 MIPS_INS_ADD_S]:
                clobber_conditionally(insn)

        if delay_slot and delayed_insn is not None:
            if delayed_insn.id == MIPS_INS_JAL or delayed_insn.id == MIPS_INS_JALR or \
                    (delayed_insn.id == MIPS_INS_JR and delayed_insn.rs == MIPS_REG_RA):
                # destroy lui tracking state
                lui_tracker.clear()
            elif delayed_insn.id == MIPS_INS_JR and vaddr == next_jtbl_jr + 4:
                # save lui tracking state for each jtbl label
                for label in individual_jtbl_labels[next_jtbl_jr]:
                    save_tracker(label, lui_tracker.copy())
                next_jtbl_jr = 0
            else:
                # save lui tracking state
                save_tracker(delayed_insn.offset, lui_tracker.copy())
                # destroy current lui tracking state for unconditional branches
                if delayed_insn.id == MIPS_INS_B:
                    lui_tracker.clear()
            delayed_insn = None

        delay_slot = delayed_insn is not None

    put_symbols(symbols_dict, "branch_labels", func_branch_labels)
    put_symbols(symbols_dict, "jtbl_labels", func_jtbl_labels)
    return symbols_dict

def find_symbols_in_data(data, vram, end, relocs, segment_name):
    symbols_dict = dict()

    print(f"Finding symbols from .data in {segment_name}")

    # read relocations for symbols
    if relocs is not None:
        for reloc in relocs:
            if reloc[1] == 2: # R_MIPS_32
                put_symbol(symbols_dict, "data_labels", as_word(data[reloc[2] : reloc[2] + 4]))
            elif reloc[1] == 4: # R_MIPS_26
                assert False , "R_MIPS_26 in .data section?"
            elif reloc[1] == 5: # R_MIPS_HI16
                assert False , "R_MIPS_HI16 in .data section?"
            elif reloc[1] == 6: # R_MIPS_LO16
                assert False , "R_MIPS_LO16 in .data section?"
            else:
                assert False , "Invalid relocation type encountered"
    return symbols_dict
    # TODO more

# matches several codepoint regions of EUC-JP
strings_regex = re.compile(rb'^(?:[\x00\x1A\x1B\n\t\x20-\x7E\x8C\x8D]|\x30[\x00-\xFF]|[\x4E-\x9F][\x00-\xFF]|[\xA4\xA5\xBB][\xA1-\xF6]|[\xA1-\xA3\xB0-\xBF\xC0-\xCF][\xA1-\xFE]|\xFF[\x00-\xEF])+$')
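# Rough reading of the pattern above: the one-byte class accepts NUL, tab/newline, printable ASCII
# (0x20-0x7E), 0x1A, and the 0x1B/0x8C/0x8D bytes that try_decode_string() escapes by hand, while
# the two-byte alternatives accept a handful of lead/trail byte ranges (0xA4/0xA5 lead bytes cover
# the EUC-JP kana rows). It is a permissive heuristic filter, not a strict EUC-JP validator.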
def find_symbols_in_rodata(data, vram, end, relocs, segment):
    # read relocations for symbols
    if relocs is not None:
        for reloc in relocs:
            if reloc[1] == 2: # R_MIPS_32
                data_labels.add(as_word(data[reloc[2] : reloc[2] + 4]))
            elif reloc[1] == 4: # R_MIPS_26
                assert False , "R_MIPS_26 in .rodata section?"
            elif reloc[1] == 5: # R_MIPS_HI16
                assert False , "R_MIPS_HI16 in .rodata section?"
            elif reloc[1] == 6: # R_MIPS_LO16
                assert False , "R_MIPS_LO16 in .rodata section?"
            else:
                assert False , "Invalid relocation type encountered"

    section_symbols = set([sym for sym in symbols.values() if sym >= vram and sym < end])
    section_symbols.add(vram)
    section_symbols.update(set([sym for sym in data_labels if sym >= vram and sym < end]))
    # TODO temp hack, move
    for data_file_st in [sym for sym in segment[4].keys() if sym >= vram and sym < end]:
        section_symbols.add(data_file_st)
    rodata_starts = [addr for addr,name in segment[4].items() if addr >= vram and addr < end]
    section_symbols.update(set(rodata_starts))
    section_symbols = list(section_symbols)
    section_symbols.sort()

    section_late_rodata = [sym for sym in section_symbols if sym in floats or sym in doubles or sym in jtbls]

    # string finding is best done per-file as the late_rodata sections are not merged per-segment, so you get stripes of
    # rodata - late_rodata -(file boundary)- rodata - late_rodata -(file boundary)- etc.

    # all rodata file starts
    for fi,rodata_start in enumerate(rodata_starts,0):
        next_rodata_start = rodata_starts[fi + 1] if fi < len(rodata_starts) - 1 else end
        # symbols in this file
        file_symbols = [sym for sym in section_symbols if sym >= rodata_start and sym < next_rodata_start]
        # the first (minimum st_value) symbol in section_late_rodata in this file is the start of late_rodata
        late_rodata_start = min([sym for sym in file_symbols if sym in section_late_rodata], default=next_rodata_start)

        for i,symbol in enumerate(file_symbols,0):
            next_symbol = file_symbols[i + 1] if i < len(file_symbols) - 1 else next_rodata_start
            # not late_rodata
            if symbol >= late_rodata_start:
                break
            else:
                if symbol in dwords or symbol % 4 != 0:
                    continue

                data_offset = symbol - vram
                data_size = next_symbol - symbol
                string_data = data[data_offset:data_offset+data_size]

                if len(string_data) > 0 and string_data[0] == 0:
                    # empty strings that are auto-detected are dubious
                    continue

                # charset validation
                if strings_regex.match(string_data) != None:
                    k = 0
                    done = False
                    for j,b in enumerate(string_data,0):
                        if b == 0 and (j + 1 >= len(string_data) or string_data[j + 1] != 0):
                            # try:
                            #     decoded = try_decode_string(string_data[k:string_data.index(0)])
                            # except UnicodeDecodeError:
                            #     # not a string, also suggests nothing past here is a string either
                            #     done = True
                            # if done:
                            #     break
                            data_labels.add(symbol + k)
                            strings.add(symbol + k)
                            data_labels.add(symbol + j + 1)
                            k = j + 1
    # TODO more

def asm_header(section_name):
    return f""".include "macro.inc"

# assembler directives
.set noat      # allow manual use of $at
.set noreorder # don't insert nops after branches
.set gp=64     # allow use of 64-bit general purpose registers

.section {section_name}

.balign 16
"""
def disassemble_text(data, vram, data_regions, segment):
    result = asm_header(".text")

    raw_insns = as_word_list(data)
    cur_file = ""
    segment_dirname = ("" if segment[2] != "overlay" else "overlays/") + segment[0]
    delayed_insn = None
    delay_slot = False

    for i,raw_insn in enumerate(raw_insns,0):
        i *= 4
        vaddr = vram + i
        insn = decode_insn(raw_insns[i//4], vaddr)
        mnemonic = insn.mnemonic
        op_str = insn.op_str

        if vaddr in segment[4].keys():
            if cur_file != "":
                os.makedirs(f"{ASM_OUT}/{segment_dirname}/", exist_ok=True)
                with open(f"{ASM_OUT}/{segment_dirname}/{cur_file}.text.s", "w") as outfile:
                    outfile.write(result)
                result = asm_header(".text")
            cur_file = segment[4][vaddr]
            if cur_file == "":
                cur_file = f"{segment[0]}_{vaddr:08X}"

        if cur_file == "[PADDING]":
            # workaround for assumed linker bug
            continue

        # DATA EMBEDDED IN TEXT
        in_data = False
        for region in data_regions:
            if vaddr in range(region[0], region[1], 4):
                in_data = True
                break
        if in_data:
            if vaddr in functions:
                # TODO not really a function if it falls in a data region...
                result += f"\nglabel {proper_name(vaddr, True)}\n"
            result += f"/* {i:06X} {vaddr:08X} {raw_insn:08X} */ .word 0x{raw_insn:08X}\n"
            continue

        # LABELS
        if vaddr in functions:
            result += f"\nglabel {proper_name(vaddr)}\n"
        if vaddr in jtbl_labels:
            result += f"glabel L{vaddr:08X}\n"
        if vaddr in branch_labels:
            result += f".L{vaddr:08X}:\n"

        # INSTRUCTIONS FORMATTING/CORRECTING
        if insn.id in MIPS_BRANCH_INSNS:
            op_str_parts = []
            for field in insn.fields:
                if field == 'offset':
                    op_str_parts.append(f".L{insn.offset:08X}")
                else:
                    op_str_parts.append(insn.format_field(field))
            op_str = ", ".join(op_str_parts)
            delayed_insn = insn
        elif insn.id in MIPS_JUMP_INSNS:
            op_str_parts = []
            for field in insn.fields:
                if field == 'target':
                    op_str_parts.append(proper_name(insn.target, is_symbol=True))
                else:
                    op_str_parts.append(insn.format_field(field))
            op_str = ", ".join(op_str_parts)
            delayed_insn = insn
        elif insn.id == MIPS_INS_LUI:
            symbol_value = symbols.get(vaddr, None)
            if symbol_value is not None:
                op_str = f"{mips_gpr_names[insn.rt]}, %hi({proper_name(symbol_value)})"
            else:
                constant_value = constants.get(vaddr, None)
                if constant_value is not None:
                    op_str = f"{mips_gpr_names[insn.rt]}, (0x{constant_value:08X} >> 16)"
        elif insn.id == MIPS_INS_ADDIU or insn.id in MIPS_LOAD_STORE_INSNS:
            symbol_value = symbols.get(vaddr, None)
            if symbol_value is not None:
                if insn.id == MIPS_INS_ADDIU:
                    op_str = f"{mips_gpr_names[insn.rt]}, {mips_gpr_names[insn.rs]}, %lo({proper_name(symbol_value)})"
                else:
                    op_str = f"{mips_fpr_names[insn.ft] if insn.id in MIPS_FP_LOAD_STORE_INSNS else mips_gpr_names[insn.rt]}, %lo({proper_name(symbol_value)})({mips_gpr_names[insn.base]})"
        elif insn.id == MIPS_INS_ORI:
            constant_value = constants.get(vaddr, None)
            if constant_value is not None:
                op_str = f"{mips_gpr_names[insn.rt]}, {mips_gpr_names[insn.rs]}, (0x{constant_value:08X} & 0xFFFF)"

        if delay_slot:
            mnemonic = " " + mnemonic
            delayed_insn = None
            delay_slot = False
        if delayed_insn is not None:
            delay_slot = True

        result += f"/* {i:06X} {vaddr:08X} {raw_insn:08X} */ {mnemonic:12}{op_str}\n"

    os.makedirs(f"{ASM_OUT}/{segment_dirname}/", exist_ok=True)
    with open(f"{ASM_OUT}/{segment_dirname}/{cur_file}.text.s", "w") as outfile:
        outfile.write(result)
result = asm_header(".data") cur_file = segment[4][symbol] if cur_file == "": cur_file = f"{segment[0]}_{symbol:08X}" data_offset = symbol - vram data_size = next_symbol - symbol if data_offset == len(data): continue result += f"\nglabel {proper_name(symbol, True)}\n" if symbol % 8 == 0 and data_size % 8 == 0 and symbol in doubles: result += "\n".join(\ [f"/* {data_offset + j * 8:06X} {symbol + j * 8:08X} {dbl_dwd:016X} */ {format_f64(dbl_dwd)}" for j,dbl_dwd in enumerate(as_dword_list(data[data_offset:data_offset + data_size]), 0)]) + "\n" elif symbol % 8 == 0 and data_size % 8 == 0 and symbol in dwords: result += "\n".join(\ [f"/* {data_offset + j * 8:06X} {symbol + j * 8:08X} */ .quad 0x{dword:08X}" for j,dword in enumerate(as_dword_list(data[data_offset:data_offset + data_size]), 0)]) + "\n" elif symbol % 4 == 0 and data_size % 4 == 0: if symbol in floats: result += "\n".join(\ [f"/* {data_offset + j * 4:06X} {symbol + j * 4:08X} {flt_wd:08X} */ {format_f32(flt_wd)}" for j,flt_wd in enumerate(as_word_list(data[data_offset:data_offset + data_size]), 0)]) + "\n" else: result += "\n".join(\ [f"/* {data_offset + j * 4:06X} {symbol + j * 4:08X} */ .word {lookup_name(symbol, word)}" for j,word in enumerate(as_word_list(data[data_offset:data_offset + data_size]), 0)]) + "\n" elif symbol % 2 == 0 and data_size % 2 == 0: result += "\n".join(\ [f"/* {data_offset + j * 2:06X} {symbol + j * 2:08X} */ .half 0x{hword:04X}" for j,hword in enumerate(as_hword_list(data[data_offset:data_offset + data_size]), 0)]) + "\n" else: result += "\n".join(\ [f"/* {data_offset + j:06X} {symbol + j:08X} */ .byte 0x{byte:02X}" for j,byte in enumerate(data[data_offset:data_offset + data_size], 0)]) + "\n" os.makedirs(f"{DATA_OUT}/{segment[0]}/", exist_ok=True) with open(f"{DATA_OUT}/{segment[0]}/{cur_file}.data.s", "w") as outfile: outfile.write(result) def disassemble_rodata(data, vram, end, segment): section_symbols = [sym for sym in symbols.values() if sym >= vram and sym < end] section_symbols.append(vram) section_symbols.extend([sym for sym in data_labels if sym >= vram and sym < end]) section_symbols.extend([sym for sym in variables_ast.keys() if sym >= vram and sym < end]) # TODO temp hack, move section_symbols.extend([sym for sym in segment[4].keys() if sym >= vram and sym < end]) # remove symbols that are addends of other symbols section_symbols = [sym for sym in section_symbols if " + 0x" not in proper_name(sym, True) and " - 0x" not in proper_name(sym, True)] section_symbols = list(set(section_symbols)) section_symbols.sort() segment_dirname = ("" if segment[2] != "overlay" else "overlays/") + segment[0] result = asm_header(".rodata") cur_file = "" force_ascii_str = False # hack for non-null-terminated strings in .data for i,symbol in enumerate(section_symbols,0): next_symbol = section_symbols[i + 1] if i < len(section_symbols) - 1 else end # assert symbol % 4 == 0 , f"Unaligned .data symbol 0x{symbol:08X}" if symbol in segment[4].keys(): if cur_file != "": os.makedirs(f"{DATA_OUT}/{segment_dirname}/", exist_ok=True) with open(f"{DATA_OUT}/{segment_dirname}/{cur_file}.rodata.s", "w") as outfile: outfile.write(result) result = asm_header(".rodata") cur_file = segment[4][symbol] if cur_file == "": cur_file = f"{segment[0]}_{symbol:08X}" data_offset = symbol - vram data_size = next_symbol - symbol if data_offset == len(data): continue force_ascii_str = symbol in [0x801D0708] result += f"\nglabel {proper_name(symbol, True)}\n" if symbol in strings: string_data = data[data_offset:data_offset+data_size] 
        if symbol in strings:
            string_data = data[data_offset:data_offset+data_size]
            string_data = string_data[:string_data.index(0)]
            # ensure strings don't have a null char midway through
            assert all([b != 0 for b in string_data[:-1]]) , f"{symbol:08X} , {data_size:X} , {string_data}"
            result += f"/* {data_offset:06X} {symbol:08X} */ {try_decode_string(string_data, force_ascii=force_ascii_str)}\n{STR_INDENT}.balign 4\n"
        elif symbol % 8 == 0 and data_size % 8 == 0 and symbol in doubles:
            result += "\n".join(
                [f"/* {data_offset + j * 8:06X} {symbol + j * 8:08X} {dbl_dwd:016X} */ {format_f64(dbl_dwd)}"
                    for j,dbl_dwd in enumerate(as_dword_list(data[data_offset:data_offset + data_size]), 0)]) + "\n"
        elif symbol % 4 == 0 and data_size % 4 == 0:
            if symbol in floats:
                result += "\n".join(
                    [f"/* {data_offset + j * 4:06X} {symbol + j * 4:08X} {flt_wd:08X} */ {format_f32(flt_wd)}"
                        for j,flt_wd in enumerate(as_word_list(data[data_offset:data_offset + data_size]), 0)]) + "\n"
            else:
                result += "\n".join(
                    [f"/* {data_offset + j * 4:06X} {symbol + j * 4:08X} */ .word {lookup_name(symbol, word)}"
                        for j,word in enumerate(as_word_list(data[data_offset:data_offset + data_size]), 0)]) + "\n"
        elif symbol % 2 == 0 and data_size % 2 == 0:
            result += "\n".join(
                [f"/* {data_offset + j * 2:06X} {symbol + j * 2:08X} */ .half 0x{hword:04X}"
                    for j,hword in enumerate(as_hword_list(data[data_offset:data_offset + data_size]), 0)]) + "\n"
        else:
            result += "\n".join(
                [f"/* {data_offset + j:06X} {symbol + j:08X} */ .byte 0x{byte:02X}"
                    for j,byte in enumerate(data[data_offset:data_offset + data_size], 0)]) + "\n"

    os.makedirs(f"{DATA_OUT}/{segment[0]}/", exist_ok=True)
    with open(f"{DATA_OUT}/{segment[0]}/{cur_file}.rodata.s", "w") as outfile:
        outfile.write(result)

def disassemble_bss(vram, end, segment):
    section_symbols = [sym for sym in symbols.values() if sym >= vram and sym < end]
    section_symbols.append(vram)
    section_symbols.extend([sym for sym in data_labels if sym >= vram and sym < end])
    section_symbols.extend([sym for sym in variables_ast.keys() if sym >= vram and sym < end])
    # TODO temp hack, move
    section_symbols.extend([sym for sym in segment[4].keys() if sym >= vram and sym < end])
    # remove symbols that are addends of other symbols
    section_symbols = [sym for sym in section_symbols if " + 0x" not in proper_name(sym, True) and " - 0x" not in proper_name(sym, True)]
    section_symbols = list(set(section_symbols))
    section_symbols.sort()

    # ("" if segment[2] != "overlay" else "overlays/"
    segment_dirname = segment[0]
    result = asm_header(".bss")
    cur_file = ""

    for i,symbol in enumerate(section_symbols,0):
        next_symbol = section_symbols[i + 1] if i < len(section_symbols) - 1 else end

        if symbol in segment[4].keys():
            if cur_file != "":
                os.makedirs(f"{DATA_OUT}/{segment_dirname}/", exist_ok=True)
                with open(f"{DATA_OUT}/{segment_dirname}/{cur_file}.bss.s", "w") as outfile:
                    outfile.write(result)
                result = asm_header(".bss")
            cur_file = segment[4][symbol]
            if cur_file == "":
                cur_file = f"{segment[0]}_{symbol:08X}"

        result += f"\nglabel {proper_name(symbol, True)}\n"
        result += f"/* {symbol - vram:06X} {symbol:08X} */ .space 0x{next_symbol - symbol:X}\n"

    os.makedirs(f"{DATA_OUT}/{segment_dirname}/", exist_ok=True)
    with open(f"{DATA_OUT}/{segment_dirname}/{cur_file}.bss.s", "w") as outfile:
        outfile.write(result)

def get_overlay_sections(vram, overlay):
    header_loc = len(overlay) - as_word_list(overlay)[-1]
    text_size, data_size, rodata_size, bss_size, n_relocs = as_word_list(overlay[header_loc:header_loc + 0x14])

    text_vram = vram
    data_vram = vram + text_size
    rodata_vram = data_vram + data_size
    bss_vram = vram + len(overlay)
    bss_end_vram = bss_vram + bss_size

    reloc = as_word_list(overlay[header_loc + 0x14: header_loc + 0x14 + n_relocs * 4])
    reloc = [((rel >> 30) & ((1 << 2) - 1), (rel >> 24) & ((1 << 6) - 1), (rel) & ((1 << 24) - 1)) for rel in reloc]

    rel_text = [r for r in reloc if r[0] == 1]
    rel_data = [r for r in reloc if r[0] == 2]
    rel_rodata = [r for r in reloc if r[0] == 3]
    rel_bss = [r for r in reloc if r[0] == 4]

    return [[text_vram, data_vram, 'text', None, overlay[0:text_size], rel_text], \
            [data_vram, rodata_vram, 'data', None, overlay[text_size:text_size+data_size], rel_data], \
            [rodata_vram, bss_vram, 'rodata', None, overlay[text_size+data_size:text_size+data_size+rodata_size], rel_rodata], \
            [bss_vram, bss_end_vram, 'bss', None, None, rel_bss], \
            [bss_end_vram, bss_end_vram, 'reloc', None, overlay[header_loc:], None]]
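# Each overlay relocation above is a single packed word: the top 2 bits select the section
# (1 = .text, 2 = .data, 3 = .rodata, 4 = .bss), the next 6 bits are the relocation type
# (2 = R_MIPS_32, 4 = R_MIPS_26, 5 = R_MIPS_HI16, 6 = R_MIPS_LO16), and the low 24 bits are the
# byte offset into that section. For example (made-up value), 0x45000030 unpacks to (1, 5, 0x30):
# an R_MIPS_HI16 applied 0x30 bytes into .text.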
# print("Setting Up")

files_spec = None
with open("tools/disasm/files.txt", "r") as infile:
    files_spec = ast.literal_eval(infile.read())

with open("tools/disasm/functions.txt", "r") as infile:
    functions_ast = ast.literal_eval(infile.read())

with open("tools/disasm/variables.txt", "r") as infile:
    variables_ast = ast.literal_eval(infile.read())

# Find floats, doubles, and strings
for var in sorted(variables_ast.keys()):
    var_type = variables_ast[var][1]
    if var_type == "f64":
        doubles.add(var)
    elif var_type == "f32":
        floats.add(var)
    elif var_type == "char" and variables_ast[var][2].startswith("["):
        strings.add(var)

# Read in binary and relocation data for each segment
for segment in files_spec:
    binary = None
    with open(segment[1] + "/" + segment[0],"rb") as infile:
        binary = bytes(infile.read())

    if segment[2] == "overlay":
        segment_start = list(segment[4])[0] # start addr of first file in segment
        segment[3] = get_overlay_sections(segment_start, binary)
        segment[4] = {}
        for section in segment[3]:
            segment[4].update({ section[0] : segment[0] })
    else:
        segment_start = segment[3][0][0] # start addr of first section of segment's sections
        # read section binary regions
        for section in segment[3]:
            if section[2] == 'bss':
                continue
            section.append(binary[section[0] - segment_start:section[1] - segment_start]) # section[4]
            section.append(None) # section[5]

print(f"Finding Symbols")

for segment in files_spec:
    if segment[2] == 'makerom':
        continue
    print(f"Finding segment positions in {segment[0]}")
    # vram segment start
    if segment[3][0][0] not in variables_ast:
        variables_ast.update({segment[3][0][0] : (f"_{segment[0]}SegmentStart","u8","[]",0x1)})
    # vram segment end
    if segment[3][-1][1] not in variables_ast:
        variables_ast.update({segment[3][-1][1] : (f"_{segment[0]}SegmentEnd","u8","[]",0x1)})

# Construct variable_addrs, now that variables_ast is fully constructed
variable_addrs = sorted(variables_ast.keys())

pool = get_context("fork").Pool(jobs)
# Find symbols for each segment
for segment in files_spec:
    if segment[2] == 'makerom':
        continue
    # print(f"Finding symbols from .text and .data in {segment[0]}")
    for section in segment[3]:
        if section[2] == 'text':
            data_regions = []
            if section[3] is not None:
                for override_region in section[3]:
                    if override_region[2] == 'data':
                        data_regions.append((override_region[0], override_region[1]))
            # try find a rodata section
            for find_section in segment[3]:
                if find_section[2] == 'rodata' and find_section[1] != find_section[0]:
                    rodata_section = find_section
                    break
            else:
                rodata_section = None
            pool.apply_async(find_symbols_in_text,
                             args=(section[4], rodata_section[4] if rodata_section is not None else None,
                                   section[0], rodata_section[0] if rodata_section is not None else None,
                                   data_regions, section[5], segment[0]),
                             callback=update_symbols_from_dict)
        elif section[2] == 'data':
            pool.apply_async(find_symbols_in_data,
                             args=(section[4], section[0], section[1], section[5], segment[0]),
                             callback=update_symbols_from_dict)
pool.close()
pool.join()

for segment in files_spec:
    if segment[2] == 'makerom':
        continue
    print(f"Finding symbols from .rodata and .dmadata in {segment[0]}")
    for section in segment[3]:
        if section[2] == 'rodata':
            find_symbols_in_rodata(section[4], section[0], section[1], section[5], segment)
        elif section[2] == 'dmadata':
            filenames = []
            with open("tools/disasm/dma_filenames.txt","r") as infile:
                filenames = ast.literal_eval(infile.read())

            dmadata = segment[3][0][4]
            i = 0
            dmadata_entry = dmadata[i*0x10:(i+1)*0x10]
            while any([word != 0 for word in as_word_list(dmadata_entry)]):
                vrom_start,vrom_end,prom_start,prom_end = as_word_list(dmadata_entry)
                # print(f"{vrom_start:08X},{vrom_end:08X},{prom_start:08X},{prom_end:08X} : {filenames[i]}")
                vrom_variables.append(("_" + filenames[i] + "SegmentRomStart", vrom_start))
                vrom_variables.append(("_" + filenames[i] + "SegmentRomEnd", vrom_end))
                i += 1
                dmadata_entry = dmadata[i*0x10:(i+1)*0x10]

# Construct vrom_addrs, now that vrom_variables is fully constructed
vrom_addrs = {addr for _, addr in vrom_variables}

def disassemble_makerom(segment):
    os.makedirs(f"{ASM_OUT}/makerom/", exist_ok=True)

    rom_header = segment[3][0][4]
    ipl3 = segment[3][1][4]
    entry = segment[3][2][4]

    pi_dom1_reg, clockrate, entrypoint, revision, \
        chksum1, chksum2, pad1, pad2, \
        rom_name, pad3, cart, cart_id, \
        region, version = struct.unpack(">IIIIIIII20sII2s1sB", rom_header)

    out = f"""/*
 * The Legend of Zelda: Majora's Mask ROM header
 */
.word 0x{pi_dom1_reg:08X} /* PI BSD Domain 1 register */
.word 0x{clockrate:08X} /* Clockrate setting */
.word 0x{entrypoint:08X} /* Entrypoint function (`entrypoint`) */
.word 0x{revision:08X} /* Revision */
.word 0x{chksum1:08X} /* Checksum 1 */
.word 0x{chksum2:08X} /* Checksum 2 */
.word 0x{pad1:08X} /* Unknown */
.word 0x{pad2:08X} /* Unknown */
.ascii "{rom_name.decode('ascii')}" /* Internal ROM name */
.word 0x{pad3:08X} /* Unknown */
.word 0x{cart:08X} /* Cartridge */
.ascii "{cart_id.decode('ascii')}" /* Cartridge ID */
.ascii "{region.decode('ascii')}" /* Region */
.byte 0x{version:02X} /* Version */
"""
    with open(ASM_OUT + "/makerom/rom_header.s", "w") as outfile:
        outfile.write(out)

    # TODO disassemble this eventually, low priority
    out = f"{asm_header('.text')}\n.incbin \"baserom/makerom\", 0x40, 0xFC0\n"
    with open(ASM_OUT + "/makerom/ipl3.s", "w") as outfile:
        outfile.write(out)

    # hack: add symbol relocations manually
    entry_addr = 0x80080000
    functions.add(entry_addr)
    symbols.update({entry_addr + 0x00 : 0x80099500,
                    entry_addr + 0x04 : 0x80099500,
                    entry_addr + 0x20 : 0x80080060,
                    entry_addr + 0x24 : 0x80099EF0,
                    entry_addr + 0x28 : 0x80080060,
                    entry_addr + 0x30 : 0x80099EF0})
    branch_labels.add(entry_addr + 0xC)
    disassemble_text(entry, entry_addr, [], segment)

    # manually set boot bss size...
entry_asm = "" with open(f"{ASM_OUT}/makerom/entry.text.s") as infile: entry_asm = infile.read() entry_asm = entry_asm.replace("0x63b0", "%lo(_bootSegmentBssSize)") with open(f"{ASM_OUT}/makerom/entry.s", "w") as outfile: outfile.write(entry_asm) os.remove(f"{ASM_OUT}/makerom/entry.text.s") #out = f"{asm_header('.text')}\n.incbin \"baserom/makerom\", 0x1000, 0x60\n" #with open(ASM_OUT + "/makerom/entry.s", "w") as outfile: # outfile.write(out) def disassemble_segment(segment): if segment[2] == 'dmadata': os.makedirs(f"{ASM_OUT}/dmadata/", exist_ok=True) out = f""".include "macro.inc" .macro DMA_TABLE_ENTRY segment .4byte _\segment\()SegmentRomStart .4byte _\segment\()SegmentRomEnd .4byte _\segment\()SegmentRomStart .4byte 0x00000000 .endm .macro DMA_TABLE_ENTRY_UNSET segment .4byte _\segment\()SegmentRomStart .4byte _\segment\()SegmentRomEnd .word 0xFFFFFFFF .word 0xFFFFFFFF .endm glabel {variables_ast[0x8009F8B0][0]} """ filenames = [] with open("tools/disasm/dma_filenames.txt","r") as infile: filenames = ast.literal_eval(infile.read()) dmadata = segment[3][0][4] i = 0 dmadata_entry = dmadata[i*0x10:(i+1)*0x10] while any([word != 0 for word in as_word_list(dmadata_entry)]): vrom_start,vrom_end,prom_start,prom_end = as_word_list(dmadata_entry) if prom_start == 0xFFFFFFFF and prom_end == 0xFFFFFFFF: out += f"DMA_TABLE_ENTRY_UNSET {filenames[i]}\n" else: out += f"DMA_TABLE_ENTRY {filenames[i]}\n" i += 1 dmadata_entry = dmadata[i*0x10:(i+1)*0x10] out += """ .space 0x100 .section .bss .space 0x10 """ with open(ASM_OUT + "/dmadata/dmadata.s", "w") as outfile: outfile.write(out) return for section in segment[3]: if (section[0] == section[1] and section[2] != 'reloc') or (section[2] != 'bss' and len(section[4]) == 0): continue print(f"Disassembling {segment[0]} .{section[2]}") if section[2] == 'text': data_regions = [] if section[3] is not None: for override_region in section[3]: if override_region[2] == 'data': data_regions.append((override_region[0], override_region[1])) disassemble_text(section[4], section[0], data_regions, segment) elif section[2] == 'data': disassemble_data(section[4], section[0], section[1], segment) elif section[2] == 'rodata': disassemble_rodata(section[4], section[0], section[1], segment) elif section[2] == 'bss': disassemble_bss(section[0], section[1], segment) elif section[2] == 'reloc': words = as_word_list(section[4]) # ("" if segment[2] != "overlay" else "overlays/") + segment_dirname = segment[0] result = asm_header(".rodata") result += f"\nglabel {segment[0]}_Reloc\n" lines = [words[i*8:(i+1)*8] for i in range(0, (len(words) // 8) + 1)] for line in [line for line in lines if len(line) != 0]: result += f" .word {', '.join([f'0x{word:08X}' for word in line])}\n" os.makedirs(f"{DATA_OUT}/{segment_dirname}/", exist_ok=True) with open(f"{DATA_OUT}/{segment_dirname}/{segment[0]}.reloc.s", "w") as outfile: outfile.write(result) print("Disassembling Segments") disassemble_makerom(next(segment for segment in files_spec if segment[2] == 'makerom')) # Textual disassembly for each segment with get_context("fork").Pool(jobs) as p: p.map(disassemble_segment, (segment for segment in files_spec if segment[2] != 'makerom')) print("Splitting text and migrating rodata") func_regex = re.compile(r'\n\nglabel \S+\n') rodata_symbols_regex = re.compile(r'(?<=\n)glabel (.+)(?=\n)') asm_symbols_regex = re.compile(r'%(?:lo|hi)\((.+?)\)') def rodata_block_size(block): def align(x, n): while (x % n != 0): x += 1 return x accumulator = 0 for part in block.split(' */ .'): part = part.strip() if 
def rodata_block_size(block):
    def align(x, n):
        while (x % n != 0):
            x += 1
        return x

    accumulator = 0
    for part in block.split(' */ .'):
        part = part.strip()
        if part.startswith('# '):
            continue
        elif part.startswith('asciz '):
            part = part[len('asciz '):]
            string = part[1:-1].encode('utf-8', 'ignore').decode('unicode_escape')
            accumulator += len(string.encode('euc-jp') + b'\x00')
        elif part.startswith('ascii'):
            part = part[len('ascii '):]
            string = part[1:-1].encode('utf-8', 'ignore').decode('unicode_escape')
            accumulator += len(string.encode('euc-jp'))
        elif part.startswith('balign '):
            part = part[len('balign '):]
            accumulator = align(accumulator, int(part, 16 if part.startswith('0x') else 10))
        elif part.startswith('double '):
            part = part[len('double '):]
            accumulator = align(accumulator, 8)
            accumulator += 8 * (part.count(',') + 1)
        elif part.startswith('float '):
            part = part[len('float '):]
            accumulator = align(accumulator, 4)
            accumulator += 4 * (part.count(',') + 1)
        elif part.startswith('word '):
            part = part[len('word '):]
            accumulator = align(accumulator, 4)
            accumulator += 4 * (part.count(',') + 1)
        elif part.startswith('half '):
            part = part[len('half '):]
            accumulator = align(accumulator, 2)
            accumulator += 2 * (part.count(',') + 1)
        elif part.startswith('byte '):
            part = part[len('byte '):]
            accumulator += 1 * (part.count(',') + 1)
    return accumulator

def late_rodata_size(blocks):
    return sum([rodata_block_size(block) for block in blocks])

def text_block_size(asm):
    return 4*len([line for line in asm.split("\n") if line.startswith("/*")])

def rodata_syms(rodata):
    return re.findall(rodata_symbols_regex, rodata)

def rodata_blocks(rodata):
    return ["glabel" + b for b in rodata.split("glabel")[1:]]
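# For reference, rodata_block_size() only understands the directives this script itself emits.
# A block like (hypothetical label and addresses):
#     glabel D_80123450
#     /* 000010 80123450 3F800000 */ .float 1.0
#     /* 000014 80123454 40000000 */ .float 2.0
# splits on ' */ .' into a label chunk (ignored) and two "float ..." chunks, sizing to 4 + 4 = 8 bytes.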
""" first = None for sym,body in rodata: if sym.startswith("jtbl_") and ".word L" in body: # jump tables are always late_rodata, so we can return early # floats and doubles are very rarely not late_rodata, this is mostly seen in libultra, so we cannot return early for those if first is not None: return first else: return sym elif (".float " in body or ".double " in body) and first is None: # may be late_rodata, but we aren't sure yet # it is confirmed either by reaching the end or by finding a jumptable, and it is proven wrong if something that is not late_rodata occurs after it first = sym elif (".asciz" in body or (".word " in body and ".float" not in body) or ".half " in body or ".byte " in body) and first is not None: # reset first if something that is definitely not late_rodata is found # word and not float is due to some floats needing to be output as words either due to being a specific NaN value, or GAS can't convert it properly first = None # May still be None at this point, that just means there is no late_rodata return first # Split files and migrate rodata that should be migrated for root,dirs,files in os.walk(ASM_OUT): for f in files: if "non_matchings" in root: continue asm_path = os.path.join(root,f) rodata_path = asm_path.replace(ASM_OUT + "overlays/", DATA_OUT).replace(ASM_OUT, DATA_OUT).replace(".text.s", ".rodata.s") print(asm_path) asm = "" with open(asm_path,"r") as infile: asm = infile.read() rodata = "" if os.path.exists(rodata_path): # print(rodata_path) with open(rodata_path, "r") as rodata_file: rodata = rodata_file.read() rodata_info = list(zip(rodata_syms(rodata), rodata_blocks(rodata))) # populate rdata and late_rodata lists first_late_rodata = find_late_rodata_start(rodata_info) rdata_info = [] late_rodata_info = [] target = rdata_info # first populate rdata for sym,block in rodata_info: if sym == first_late_rodata: # now populate late_rodata, if there is any target = late_rodata_info target.append((sym,block)) # print([(sym,block.split("\n")[1:]) for sym,block in zip(rdata_syms,rdata_blocks)]) function_info = list(zip([s.replace("glabel","").strip() for s in func_regex.findall(asm)], func_regex.split(asm)[1:])) rdata_map = {} late_rodata_map = {} # pass 1 to resolve rodata splits if rodata != "": last_fn = None referenced_in = {} # key=rodata label , value=function label for sym,_ in rodata_info: all_refs = [label for label,body in function_info if "%lo(" + sym + ")" in body] # ignore multiple refs, take only the first referenced_in.update({ sym : all_refs[0] if len(all_refs) != 0 else None }) def do_splits(out_dict, info): first = True last_ref = None cr = None for sym,block in info: ref = referenced_in[sym] if first: cr = ref or sym if ref is not None and not first: if ref != last_ref: # existing function cr = ref elif last_ref is not None: # new GLOBAL_ASM cr = sym # add to output if cr not in out_dict.keys(): out_dict.update({cr : []}) out_dict[cr].append((sym,block)) # setup next iter last_ref = ref first = False do_splits(rdata_map, rdata_info) do_splits(late_rodata_map, late_rodata_info) # all output files, order is irrelevant at this point all_output = set([label for label,_ in function_info]).union(set(rdata_map.keys()),set(late_rodata_map.keys())) # pass 2 to output splits for label in all_output: fn_body = dict(function_info).get(label, None) text_out = "" rodata_out = "" if fn_body is not None: text_out = "glabel " + label.strip() + "\n" + fn_body.strip() + "\n" if rodata != "": rdata = rdata_map.get(label, None) late_rodata = 
        # pass 2 to output splits
        for label in all_output:
            fn_body = dict(function_info).get(label, None)

            text_out = ""
            rodata_out = ""

            if fn_body is not None:
                text_out = "glabel " + label.strip() + "\n" + fn_body.strip() + "\n"

            if rodata != "":
                rdata = rdata_map.get(label, None)
                late_rodata = late_rodata_map.get(label, None)

                # check for late_rodata_alignment
                late_rodata_alignment = ""
                if fn_body is not None and late_rodata is not None:
                    ratio = late_rodata_size([block for _,block in late_rodata]) / text_block_size(fn_body)
                    if ratio > 1./3:
                        # print(f"{label} : {ratio}")
                        # TODO hack: getting the address from a comment
                        first_block_split = late_rodata[0][1].split(" */ .")
                        vaddr = None
                        if first_block_split[1].startswith("float") or first_block_split[1].startswith("double"):
                            vaddr = first_block_split[0].split(" ")[-2]
                        else:
                            vaddr = first_block_split[0].split(" ")[-1]
                        assert vaddr is not None
                        vaddr = int(vaddr,16)
                        if vaddr not in [0x801E0E28, 0x80C1C2B0, 0x80AA7820, 0x80AA3CC0, 0x80AA418C, 0x80BE0160, 0x80B591D8,
                                         0x80B59610, 0x80B59780, 0x80964E00, 0x80964F10, 0x80BB40A0, 0x80952038, 0x80AFB920,
                                         0x80AFBBFC, 0x80AFBE28, 0x80983320]:
                            # hacks for especially badly behaved rodata, TODO these are ALL jumptables associated with
                            # comparatively tiny functions, can we swat these programmatically?
                            late_rodata_alignment = f".late_rodata_alignment {'8' if vaddr % 8 == 0 else '4'}\n"

                rdata_out = ""
                if rdata is not None:
                    rdata_out = ".rdata\n" + "".join([block for _,block in rdata])

                late_rodata_out = ""
                if late_rodata is not None:
                    late_rodata_out = ".late_rodata\n" + late_rodata_alignment + "".join([block for _,block in late_rodata])

                rodata_out = rdata_out + late_rodata_out + ("\n.text\n" if ((rdata is not None or late_rodata is not None) and fn_body is not None) else "")

            all_out = rodata_out + text_out

            # write it out
            out_path = root.replace(ASM_OUT, f"{ASM_OUT}/non_matchings/") + "/" + ((f.replace(".text.s","") + "/") if "overlays" not in root else "")
            os.makedirs(out_path, exist_ok=True)
            with open(out_path + label + ".s", "w") as outfile:
                outfile.write(all_out.strip() + "\n")

        # remove rodata and unsplit file
        # TODO