# mm/tools/buildtools/makeyar.py

#!/usr/bin/env python3

# SPDX-FileCopyrightText: © 2023-2024 ZeldaRET
# SPDX-License-Identifier: MIT

# Program to generate compressed yar (Yaz0 ARchive) files.
#
# The program expects an .o elf file and outputs a raw yar binary file and a
# "symbols" elf.
#
# A yar file consists of multiple Yaz0 files compressed individually. The
# archive begins with a header of non-fixed size, which describes the
# location of each individual Yaz0 block within the archive itself. This
# header is followed by each Yaz0 file.
#
# The first word (a 4 byte group) of the header indicates the size in bytes of
# the header itself (also describes the offset of the first Yaz0 block). The
# rest of the header consists of one word per Yaz0 block, describing the offset
# at which that block ends (i.e. where the next block starts) relative to the
# end of the header. The start offset of the first Yaz0 block is always 0, so
# it is omitted from the header.
#
# Each Yaz0 block is 0xFF-padded to a multiple of 0x10 in size.
#
# The entire archive is 0-padded to a multiple of 0x10 in size.
#
# The program works by compressing each .data symbol in the input elf file as
# its own Yaz0 compressed file, appending them in order for the generated
# archive. Other elf sections are ignored for the resulting yar file.
#
# The program also outputs an elf file that's identical to the elf input,
# but with its .data section zeroed out completely. This "symbols" elf can be
# used for referencing each symbol as if the whole file were completely
# uncompressed.

from __future__ import annotations

import argparse
import dataclasses
from pathlib import Path
import struct
import crunch64

from elftools.elf.elffile import ELFFile
from elftools.elf.sections import SymbolTableSection


def write_word_as_bytes(buff: bytearray, offset: int, word: int):
    """Store `word` into `buff` at `offset` as a big-endian unsigned 32-bit value.

    The four bytes starting at `offset` are overwritten in place; `buff` must
    already be large enough to hold them.
    """
    big_endian_u32 = ">I"
    struct.pack_into(big_endian_u32, buff, offset, word)


@dataclasses.dataclass
class Symbol:
    """A symbol read from the symbol table of the input elf."""

    # Name of the symbol
    name: str
    # Value of the symbol (`st_value`); used as the start offset of the
    # symbol's bytes within the .data contents
    offset: int
    # Size of the symbol in bytes (`st_size`)
    size: int


def get_data_from_elf(elf_path: Path) -> tuple[bytearray, list[Symbol], int]:
    """Read the `.data` contents and the object symbols from an elf file.

    Returns a tuple of:
    - the bytes of the `.data` section (empty if there is none),
    - a `Symbol` (name, `st_value`, `st_size`) for every `.symtab` entry of
      type `STT_OBJECT` that is not undefined (`SHN_UNDEF`), in symbol table
      order,
    - the file offset (`sh_offset`) of the `.data` section, or -1 if no
      `.data` section was found.
    """
    data_bytes = bytearray()
    symbols: list[Symbol] = []
    data_file_offset = -1

    with elf_path.open("rb") as stream:
        for section in ELFFile(stream).iter_sections():
            section_name = section.name

            if section_name == ".data":
                # No .data contents may have been collected before this one
                assert not data_bytes
                data_bytes += section.data()
                assert len(data_bytes) == section["sh_size"]
                data_file_offset = section["sh_offset"]
                continue

            if section_name != ".symtab":
                continue

            assert isinstance(section, SymbolTableSection)
            # Keep only defined symbols that describe data objects
            symbols.extend(
                Symbol(sym.name, sym["st_value"], sym["st_size"])
                for sym in section.iter_symbols()
                if sym["st_shndx"] != "SHN_UNDEF"
                and sym["st_info"]["type"] == "STT_OBJECT"
            )

    return data_bytes, symbols, data_file_offset


def align_16(val: int) -> int:
    """Round `val` up to the nearest multiple of 0x10."""
    # (-val & 0xF) is the distance from `val` to the next 0x10 boundary
    # (0 when `val` is already aligned)
    return val + (-val & 0xF)


def create_archive(
    uncompressed_data: bytearray, symbol_list: list[Symbol]
) -> bytearray:
    """Build a yar archive holding one Yaz0 block per symbol.

    For each symbol, in list order, the bytes
    `uncompressed_data[sym.offset : sym.offset + sym.size]` are zero-padded to
    a multiple of 0x10, compressed with `crunch64.yaz0.compress`, and the
    compressed block is 0xFF-padded to a multiple of 0x10 before being
    appended to the archive.

    The archive starts with a header of `len(symbol_list) + 1` big-endian
    words: word 0 is the size of the header, and word `k` (for `k >= 1`) is
    the offset at which block `k - 1` ends (i.e. where block `k` starts),
    relative to the end of the header.

    The whole archive is zero-padded to a multiple of 0x10. With an empty
    `symbol_list` the result is just the (padded) header, whose first word
    still holds the header size.
    """
    # One word for the header size plus one end-offset word per entry
    first_entry_offset = (len(symbol_list) + 1) * 4

    # Reserve the header, zero-filled until each word gets written
    archive = bytearray(first_entry_offset)
    write_word_as_bytes(archive, 0, first_entry_offset)

    for header_index, sym in enumerate(symbol_list, start=1):
        input_buf = uncompressed_data[sym.offset : sym.offset + sym.size]
        # Make sure to pad each entry to a 0x10 boundary
        input_buf.extend(bytes(align_16(sym.size) - sym.size))

        compressed = bytearray(crunch64.yaz0.compress(input_buf))
        # Pad the compressed block to 0x10
        compressed.extend(b"\xFF" * (align_16(len(compressed)) - len(compressed)))

        archive.extend(compressed)

        # Everything after the header is compressed data, so the current
        # archive length gives the end of this block. Writing inside the loop
        # means an empty symbol list never touches word 0 again.
        write_word_as_bytes(
            archive, header_index * 4, len(archive) - first_entry_offset
        )

    # Pad the whole archive to 0x10
    archive.extend(bytes(align_16(len(archive)) - len(archive)))

    return archive


def main():
    """Command-line entry point.

    Reads the input elf, writes the compressed yar archive built from its
    .data symbols to `out_bin`, and writes a copy of the input elf with its
    .data section zeroed out to `out_sym`.
    """
    parser = argparse.ArgumentParser(
        description="Program to generate compressed yar (Yaz0 ARchive) files from a built C file. Said file must only contain data symbols that do not reference other symbols (i.e. textures)."
    )
    parser.add_argument("in_file", help="Path to built .o file")
    parser.add_argument(
        "out_bin", help="Output path for the generated compressed yar binary"
    )
    parser.add_argument("out_sym", help="Output path for the generated syms elf file")

    args = parser.parse_args()

    in_path = Path(args.in_file)
    out_bin_path = Path(args.out_bin)
    out_sym_path = Path(args.out_sym)

    # Delete output files if they already exist, so a failed run does not
    # leave outputs from a previous run behind
    out_bin_path.unlink(missing_ok=True)
    out_sym_path.unlink(missing_ok=True)

    # Mutable copy of the whole input elf; becomes the "symbols" elf
    elf_bytes = bytearray(in_path.read_bytes())

    uncompressed_data, symbol_list, data_offset = get_data_from_elf(in_path)
    assert len(uncompressed_data) > 0, "no .data contents found in the input elf"
    assert len(symbol_list) > 0, "no object symbols found in the input elf"
    assert data_offset > 0, "no .data section found in the input elf"

    archive = create_archive(uncompressed_data, symbol_list)

    # Write the compressed archive file as a raw binary
    out_bin_path.write_bytes(archive)

    # Zero out data with a single slice assignment instead of a per-byte loop
    data_size = len(uncompressed_data)
    data_end = data_offset + data_size
    if data_end > len(elf_bytes):
        # A slice assignment past the end would silently grow the buffer
        raise IndexError(".data section extends past the end of the input elf")
    elf_bytes[data_offset:data_end] = bytes(data_size)

    out_sym_path.write_bytes(elf_bytes)


# Run the tool only when this file is executed as a script, not when imported
if __name__ == "__main__":
    main()