Add ability to extract joint enums from bmd/bdl files (#2704)

* arc extraction will auto-create resource files

* Update enum extraction to extract joint enums

* Update enum extraction to use pathlib

* Move enum extraction to converters folder

* Added check to extract bmd file if src file is modded
roeming 2025-09-27 18:30:13 -04:00 committed by GitHub
parent 6242aa6e84
commit 3f37bad921
3 changed files with 325 additions and 167 deletions

binary_funcs.py

@@ -48,15 +48,14 @@ def read_f32(binary_file: BinaryIO) -> float:
return struct.unpack(">f", chunk)[0]
def read_bytes_until_null(binary_file: BinaryIO) -> bytes:
-    str_length = 0
-    while True:
-        char = binary_file.read(1)
-        if char == b"\0":
-            break
-        else:
-            str_length += 1
-    binary_file.seek(-(str_length+1), os.SEEK_CUR)
-    string = binary_file.read(str_length)
-    return string
+    begin = binary_file.tell()
+    while True:
+        b = binary_file.read(1)
+        if len(b) == 0:
+            raise EOFError("hit end of file before finding a null terminator")
+        if b[0] == 0:
+            break
+    str_length = binary_file.tell() - begin - 1
+    binary_file.seek(begin)
+    return binary_file.read(str_length)
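A quick sketch of the new behavior (the in-memory buffer and byte strings are illustrative; assumes binary_funcs is importable):

    import io
    from binary_funcs import read_bytes_until_null

    buf = io.BytesIO(b"cl_hat\x00rest")
    assert read_bytes_until_null(buf) == b"cl_hat"

    # unterminated input now raises instead of looping forever
    try:
        read_bytes_until_null(io.BytesIO(b"no terminator"))
    except EOFError:
        pass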

tools/converters/res_arc.py (new file, 314 lines)

@@ -0,0 +1,314 @@
#!/usr/bin/env python3
from binary_funcs import read_bytes_until_null, read_u32, read_u16, read_u8, skip_bytes
import subprocess
from os import walk, makedirs
from pathlib import Path
from typing import BinaryIO, NamedTuple
from collections import defaultdict
import re
DTK_PATH = str(Path("./build/tools/dtk.exe"))
OUT_PATH = Path("build/res")
INDENT = " " * 4
ADD_EXT_TO_ENUM = False
SKIP_STAGE_ARCS = True
SKIP_DEMO_ARCS = True
SKIP_FILES_WITH_AT_SIGN = True
# get when this file was modified last, used to detect if we should re-extract enums
THIS_MTIME = Path(__file__).stat().st_mtime
class ArcFile(NamedTuple):
file_name:str
index:int
id:int
class ArcNode(NamedTuple):
    node_type:str
name:str
inds:range
class ArcEnumValue(NamedTuple):
enum_value_name:str
value:int
class ArcEnum(NamedTuple):
enum_name:str
values:list[ArcEnumValue]
class JointParsedEnums(NamedTuple):
enums:list[ArcEnum]
def ensure_dir(path:Path)->None:
makedirs(path, exist_ok=True)
def bin_make_str(s:bytes)->str:
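    # swap the Shift-JIS full-width "x" (0x82 0x98) for an ASCII "x" so the
    # name can decode as ASCII (assumed reason for this byte pair appearing)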
s = s.replace(b"\x82\x98", b"x")
try:
s = s.decode("ascii")
except Exception as e:
raise AssertionError(f"{e}: {s}")
return s
def sanitize_string(s:str)->str:
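    # e.g. "cl.bmd" -> "cl_bmd", "boss@2" -> "boss_2" (hypothetical inputs)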
return re.sub(r"[\s@:\.,\-<>*%\"!&()|\+$]", "_", s)
def read_str(binf:BinaryIO)->str:
return bin_make_str(read_bytes_until_null(binf))
def make_enum(e:ArcEnum):
out = []
out.append(f"enum {e.enum_name} {{")
for v in e.values:
out.append(f"{INDENT}{v.enum_value_name}=0x{v.value:X},")
out.append("};")
out_enum = "\n".join(out)
return out_enum
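# For example (hypothetical names), make_enum(ArcEnum("CL_JNT",
# [ArcEnumValue("CL_JNT_CENTER_e", 0)])) would return:
#   enum CL_JNT {
#       CL_JNT_CENTER_e=0x0,
#   };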
def parse_bmd(src_path:Path):
with src_path.open("rb") as binf:
header_magic = binf.read(4)
known_J3D_magic = b"J3D2"
        assert(header_magic == known_J3D_magic), f"Attempted to parse {src_path} as bmd/bdl, but J3D header type doesn't match {known_J3D_magic}: {header_magic}"
bmd_magic = binf.read(4)
known_bmd_magics = (b"bmd3", b"bmd2", b"bdl4")
assert(bmd_magic in known_bmd_magics), f"Attempted to parse {src_path} as bmd/bdl, but bmd/bdl header type doesn't match any of {known_bmd_magics} : {bmd_magic}"
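        # header layout assumed here: after the two magics come a u32 file
        # size (skipped) and a u32 chunk count; the first chunk starts at 0x20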
skip_bytes(binf, 4)
chunk_count = read_u32(binf)
binf.seek(0x20)
for _ in range(chunk_count):
chunk_begin = binf.tell()
name = bin_make_str(binf.read(4))
size = read_u32(binf)
next_chunk = chunk_begin + size
if name == "JNT1":
skip_bytes(binf, 12)
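                # JNT1 layout assumed here: u16 joint count, u16 padding, u32
                # joint-data offset, u32 remap-table offset (the 12 bytes just
                # skipped), then the chunk-relative u32 name-table offset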
name_table = read_u32(binf) + chunk_begin
binf.seek(name_table)
num_strings = read_u16(binf)
found_enums = []
if ADD_EXT_TO_ENUM:
out_enum_name = sanitize_string(src_path.name.replace(".", "_")).upper() + "_JNT"
else:
out_enum_name = sanitize_string(src_path.name.split(".")[0]).upper() + "_JNT"
for i in range(num_strings):
binf.seek(name_table + 6 + i * 4)
string_offset = read_u16(binf)
binf.seek(name_table + string_offset)
joint_name = f"{out_enum_name}_{read_str(binf).upper()}_e"
found_enums.append(ArcEnumValue(joint_name, i))
# print(joint_name)
return ArcEnum(out_enum_name, found_enums)
binf.seek(next_chunk)
return None
def extract_joint_enums(src_path:Path):
if ((SKIP_FILES_WITH_AT_SIGN and "@" in str(src_path)) or
(SKIP_STAGE_ARCS and "Stage" in src_path.parts) or
(SKIP_DEMO_ARCS and any(x.startswith("Demo") for x in src_path.parts))):
return JointParsedEnums([])
out_jnt_enums:list[ArcEnum] = []
internal_files = subprocess.run([DTK_PATH, "vfs", "ls", "-r", f"{src_path}:"], stdout=subprocess.PIPE, text=True).stdout
output_folder = Path(str(src_path).replace(".", "__"))
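    # each listing line is assumed to hold three " | "-separated columns with
    # the internal file path in the middle (inferred from the parsing below)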
for line in internal_files.split("\n"):
parts = line.split(" | ")
if len(parts) != 3: continue
internal_file = parts[1].strip(" ")
internal_file_parts = internal_file.split(".")
if len(internal_file_parts) < 2: continue
extension = internal_file_parts[1]
if (extension not in ("bmd", "bdl")): continue
internal_file_path = output_folder / internal_file
# extract file from archive if either
# 1. output file doesn't exist
# 2. the archive file is newer than the output file (modded src)
if (not internal_file_path.exists() or
            src_path.stat().st_mtime > internal_file_path.stat().st_mtime):
ensure_dir(internal_file_path.parent)
subprocess.run([DTK_PATH, "vfs", "cp", f"{src_path}:{internal_file}", internal_file_path], stdout=subprocess.PIPE)
        out_enums = parse_bmd(internal_file_path)
        if out_enums is not None:
            out_jnt_enums.append(out_enums)
return JointParsedEnums(out_jnt_enums)
def convert_binary_to_resource_enum(src_path: Path, dest_path: Path) -> None:
joint_enums = extract_joint_enums(src_path)
with src_path.open("rb") as binf:
opening_bytes = binf.read(4)
assert(opening_bytes == b"RARC"), f"Not a rarc file: starts with bytes {opening_bytes}"
skip_bytes(binf, 4)
data_header_offset = read_u32(binf)
binf.seek(data_header_offset)
node_count = read_u32(binf)
node_offset = read_u32(binf) + data_header_offset
total_num_file_entries = read_u32(binf)
file_entries_list_offset = read_u32(binf) + data_header_offset
skip_bytes(binf, 4)
string_list_offset = read_u32(binf) + data_header_offset
skip_bytes(binf, 2)
sync_flag = read_u8(binf)
found_files:list[ArcFile] = []
all_file_names = []
for entry_index in range(total_num_file_entries):
binf.seek(file_entries_list_offset + entry_index * 0x14)
file_id = read_u16(binf)
skip_bytes(binf, 2)
type_and_name_offset = read_u32(binf)
entry_type = type_and_name_offset >> 24
name_offset = type_and_name_offset & 0x00FFFFFF
binf.seek(string_list_offset + name_offset)
file_name = read_str(binf)
            if entry_type & 1: # check to make sure it's a file
assert(not sync_flag or file_id == entry_index), \
f"Sync flag was set, but ID {file_id} does not match Index {entry_index} for file {file_name}"
found_files.append(ArcFile(file_name, entry_index, file_id))
all_file_names.append(file_name)
index_file_lookup = {x.index : x for x in found_files}
found_nodes:list[tuple[ArcNode, list[ArcFile]]] = []
for node_index in range(node_count):
binf.seek(node_offset + node_index * 0x10)
            this_node_type = bin_make_str(binf.read(4)).strip(" ") # node types are space-padded to 4 chars
this_node_name_offs = read_u32(binf)
skip_bytes(binf, 2)
this_node_index_count = read_u16(binf)
this_node_first_index = read_u32(binf)
this_node_inds = range(this_node_first_index, this_node_first_index + this_node_index_count)
binf.seek(string_list_offset + this_node_name_offs)
this_node_name = read_str(binf)
this_node = ArcNode(this_node_type, this_node_name, this_node_inds)
found_nodes.append((this_node, [index_file_lookup.get(x) for x in this_node.inds if x in index_file_lookup]))
out_lines:list[str] = []
file_stem = src_path.name.split(".")[0]
file_stem_upper = sanitize_string(file_stem.upper())
out_lines.append(f"#ifndef RES_{file_stem_upper}_H")
out_lines.append(f"#define RES_{file_stem_upper}_H\n")
out_ids:list[str] = []
out_idxs:list[str] = []
        appearance_count = defaultdict(int)
for node, files in found_nodes:
if len(files) == 0: continue
file_type_break = f"{INDENT}/* {node.node_type} */"
out_ids.append(file_type_break)
out_idxs.append(file_type_break)
for file in files:
parts = file.file_name.split(".")
                sanitized_file_name = sanitize_string(parts[0].upper()) # Sanitize identifier
                seen_count = appearance_count[sanitized_file_name]
                appearance_count[sanitized_file_name] += 1
ext = ""
if len(parts) > 1:
ext = sanitize_string(parts[1].upper())
duplicate_tag = "_"
if seen_count > 0:
duplicate_tag = f"_{seen_count}_"
# tiny optimization to do less string formatting
begin_part = f"{INDENT}dRes_"
                mid_part = f"_{file_stem_upper}_{ext}_{sanitized_file_name}{duplicate_tag}e=0x"
out_idxs.append(f"{begin_part}INDEX{mid_part}{file.index:X},")
out_ids.append(f"{begin_part}ID{mid_part}{file.id:X},")
out_lines.append(f"enum dRes_INDEX_{file_stem_upper} {{")
out_lines.extend(out_idxs)
out_lines.append("};\n")
out_lines.append(f"enum dRes_ID_{file_stem_upper} {{")
out_lines.extend(out_ids)
out_lines.append("};\n")
for joint_enum in joint_enums.enums:
out_lines.append(make_enum(joint_enum) + "\n")
out_lines.append(f"#endif /* !RES_{file_stem_upper}_H */")
out = "\n".join(out_lines)
ensure_dir(dest_path.parent)
with dest_path.open("w") as f:
f.write(out)
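# A generated header has roughly this shape (identifiers hypothetical):
#   #ifndef RES_CL_H
#   #define RES_CL_H
#   enum dRes_INDEX_CL { /* bmd */ dRes_INDEX_CL_BMD_CL_e=0x0, ... };
#   enum dRes_ID_CL { /* bmd */ dRes_ID_CL_BMD_CL_e=0x0, ... };
#   enum CL_JNT { CL_JNT_CENTER_e=0x0, ... };
#   #endif /* !RES_CL_H */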
def decompress_file(input_file:Path, output_file:Path) -> None:
# use pathlib to allow for unix+windows paths
subprocess.run([DTK_PATH, "yaz0", "decompress", input_file, "-o", output_file])
def extract_enum_from_file(src_path:Path, dst_path:Path) -> None:
assert(src_path.exists())
# we can skip extracting this file if all of the following are true
# 1. The output file exists
# 2. The src file is older than the output file (not modded)
# 3. This python file is older than the output file (no updates to how we extract enums)
if (dst_path.exists() and
src_path.stat().st_mtime < dst_path.stat().st_mtime and
THIS_MTIME < dst_path.stat().st_mtime):
return
# check the first bytes of the file
with src_path.open("rb") as f:
starting_bytes = f.read(4)
if starting_bytes == b"Yaz0": is_compressed = True
elif starting_bytes == b"RARC": is_compressed = False
        # not an arc file, even though it has the .arc extension
else: return
if is_compressed:
# if our file is compressed, then we should decompress it
# we only need to decompress if any of these are true
# 1. We've never decompressed this file before
# 2. The src file is newer than the output file (modded src)
new_src_path = src_path.with_suffix(src_path.suffix + ".decompressed")
if (not new_src_path.exists() or
new_src_path.stat().st_mtime < src_path.stat().st_mtime):
decompress_file(src_path, new_src_path)
src_path = new_src_path
convert_binary_to_resource_enum(src_path, dst_path)
def main() -> None:
for dir, dirnames, filenames in walk("./orig/"):
dirpath = Path(dir)
if "res" not in dirpath.parts: continue
for file in filenames:
file_path = dirpath / file
if file_path.suffix == ".arc":
# the version should be the second part of the path
# ./orig/ShieldD/...
version = file_path.parts[1]
# find the res folder, truncate the path to be the part after the res folder
out_path = Path("/".join(file_path.parts[file_path.parts.index("res") + 1:]))
                # set the output path to be the designated output + the version + the file's hierarchy
                out_path = OUT_PATH / version / out_path
# we're going to output to a header file, prefix it with "res_"
out_path = out_path.with_name("res_" + out_path.name).with_suffix(".h")
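                # e.g. ./orig/ShieldD/res/Object/cl.arc (hypothetical) becomes
                # build/res/ShieldD/Object/res_cl.h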
try:
extract_enum_from_file(file_path, out_path)
except AssertionError as e:
print(f"ERROR: {file_path} -> {out_path}\n{e}\n")
if __name__ == "__main__":
main()
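# Sketch of the intended invocation (assumes the repository root as the
# working directory so ./orig/ and the dtk binary resolve):
#   python tools/converters/res_arc.py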

tools/res_arc.py (deleted; superseded by tools/converters/res_arc.py)

@@ -1,155 +0,0 @@
#!/usr/bin/env python3
from argparse import ArgumentParser
from binary_funcs import read_bytes_until_null, read_u32, read_u16, read_u8, skip_bytes
import subprocess
from os.path import exists, getmtime
from pathlib import Path
from typing import NamedTuple
class ArcFile(NamedTuple):
file_name:str
index:int
id:int
class ArcNode(NamedTuple):
node_type:bytes
name:str
inds:range
def convert_binary_to_resource_enum(src_path: str, dest_path: str) -> None:
with open(src_path, "rb") as binf:
assert(binf.read(4) == b"RARC"), "Not a rarc file"
skip_bytes(binf, 4)
data_header_offset = read_u32(binf)
binf.seek(data_header_offset)
node_count = read_u32(binf)
node_offset = read_u32(binf) + data_header_offset
total_num_file_entries = read_u32(binf)
file_entries_list_offset = read_u32(binf) + data_header_offset
skip_bytes(binf, 4)
string_list_offset = read_u32(binf) + data_header_offset
skip_bytes(binf, 2)
sync_flag = read_u8(binf)
found_files:list[ArcFile] = []
for entry_index in range(total_num_file_entries):
binf.seek(file_entries_list_offset + entry_index * 0x14)
file_id = read_u16(binf)
skip_bytes(binf, 2)
type_and_name_offset = read_u32(binf)
entry_type = type_and_name_offset >> 24
name_offset = type_and_name_offset & 0x00FFFFFF
binf.seek(string_list_offset + name_offset)
file_name = read_bytes_until_null(binf).decode("ascii")
if entry_type & 1: # check to make sure its a file
assert(not sync_flag or file_id == entry_index), \
f"Sync flag was set, but ID {file_id} does not match Index {entry_index} for file {file_name}"
found_files.append(ArcFile(file_name, entry_index, file_id))
assert(len(set([x.file_name for x in found_files])) == len(found_files)), "duplicate file names found, unsupported"
index_file_lookup = {x.index : x for x in found_files}
found_nodes:list[tuple[ArcNode, list[ArcFile]]] = []
for node_index in range(node_count):
binf.seek(node_offset + node_index * 0x10)
this_node_type = binf.read(4).decode("ascii").strip(" ")
this_node_name_offs = read_u32(binf)
skip_bytes(binf, 2)
this_node_index_count = read_u16(binf)
this_node_first_index = read_u32(binf)
this_node_inds = range(this_node_first_index, this_node_first_index + this_node_index_count)
binf.seek(string_list_offset + this_node_name_offs)
this_node_name = read_bytes_until_null(binf).decode("ascii")
this_node = ArcNode(this_node_type, this_node_name, this_node_inds)
found_nodes.append((this_node, [index_file_lookup.get(x) for x in this_node.inds if x in index_file_lookup]))
out_lines:list[str] = []
file_stem = Path(src_path).name.split(".")[0]
file_stem_upper = file_stem.upper()
indent = " "
out_lines.append(f"#ifndef RES_{file_stem_upper}_H")
out_lines.append(f"#define RES_{file_stem_upper}_H\n")
out_ids:list[str] = []
out_idxs:list[str] = []
for node, files in found_nodes:
if len(files) == 0: continue
file_type_break = f"{indent}/* {node.node_type} */"
out_ids.append(file_type_break)
out_idxs.append(file_type_break)
for file in files:
parts = file.file_name.split(".")
santitized_file_name = parts[0].upper()
ext = parts[1].upper()
# tiny optimization to do less string formatting
begin_part = f"{indent}dRes_"
mid_part = f"_{file_stem_upper}_{ext}_{santitized_file_name}_e=0x"
out_idxs.append(f"{begin_part}INDEX{mid_part}{file.index:X},")
out_ids.append(f"{begin_part}ID{mid_part}{file.id:X},")
out_lines.append(f"enum dRes_INDEX_{file_stem_upper} {{")
out_lines.extend(out_idxs)
out_lines.append("};\n")
out_lines.append(f"enum dRes_ID_{file_stem_upper} {{")
out_lines.extend(out_ids)
out_lines.append("};\n")
out_lines.append(f"#endif /* !RES_{file_stem_upper}_H */")
out = "\n".join(out_lines)
with open(dest_path, "w") as f:
f.write(out)
def decompress_file(input_file:str, output_file:str) -> None:
# TODO: fix this path to be more flexible
# use pathlib to allow for unix+windows paths
dtk_path = str(Path("./build/tools/dtk.exe"))
subprocess.run([dtk_path, "yaz0", "decompress", input_file, "-o", output_file])
def main() -> None:
parser = ArgumentParser(
description="TODO"
)
parser.add_argument("src_path", type=str, help="Binary source file path")
parser.add_argument("dest_path", type=str, help="Destination C include file path")
args = parser.parse_args()
src_path = args.src_path
dst_path = args.dest_path
assert(exists(src_path))
# if we have already made this file, skip
# check if the src_file is newer than the output file, means modded arc, worth updating
if (exists(dst_path)) and (getmtime(dst_path) > getmtime(src_path)):
return
with open(src_path, "rb") as f:
is_compressed = (f.read(4) == b"Yaz0")
if is_compressed:
# if our file is compressed, then we should decompress it
# but skip decompressing if it exists
# check for modded src tho
new_src_path = src_path + ".decompressed"
if (not exists(new_src_path)) or (getmtime(new_src_path) < getmtime(src_path)):
decompress_file(src_path, new_src_path)
src_path = new_src_path
convert_binary_to_resource_enum(src_path, dst_path)
if __name__ == "__main__":
main()