From 052119c7a79f7aa0caa50cc03d6ee6a193a79f31 Mon Sep 17 00:00:00 2001
From: Erin Moon
Date: Sat, 2 Jan 2021 01:22:57 -0600
Subject: [PATCH] a new tool: splitter (#39)

* splitter: v0.1

* basic demangle stuff

* splitter: v0.2

- add from, to options to select line range in .s to process
- infer referred labels from both loads and stores
- dump our own function labels in addition to externs into functions.h, to provide forward declarations for labels that other functions might use.
- fix off-by-one which was eating the last instruction of some functions

* splitter: v0.3

merged a bunch of work lepelog did, including:
- demangling support
- better function identification
- automatic FORCEACTIVE

and did a little bit of cleanup

* splitter: improve sda hack and format

* splitter: fix comment_out(), patch GQR references

* splitter: some speed optimizations

* remove debug print

* splitter: forceactive options

* refactor demangler, add support for more operators and more mangling symbols

* array and member (still one non working case)

* fix some operands in demangler

* make parents for funcs_out

* splitter: fix off-by-one in last line of last function in some .s files

Co-authored-by: lepelog
Co-authored-by: Pheenoh
---
 tools/splitter/demangle.py | 300 +++++++++++++++++++++++++++++
 tools/splitter/parser.py   | 159 ++++++++++++++++
 tools/splitter/split.py    | 373 +++++++++++++++++++++++++++++++++++++
 tools/splitter/util.py     |  17 ++
 4 files changed, 849 insertions(+)
 create mode 100644 tools/splitter/demangle.py
 create mode 100644 tools/splitter/parser.py
 create mode 100644 tools/splitter/split.py
 create mode 100644 tools/splitter/util.py

diff --git a/tools/splitter/demangle.py b/tools/splitter/demangle.py
new file mode 100644
index 00000000000..e0fb342c1e0
--- /dev/null
+++ b/tools/splitter/demangle.py
@@ -0,0 +1,300 @@
+from typing import List, Optional
+from pathlib import Path
+from dataclasses import dataclass, field
+import re
+
+operator_func_re = re.compile(r'^__([a-z]+)')
+
+types = {
+    'i': 'int',
+    'l': 'long',
+    's': 'short',
+    'c': 'char',
+    'f': 'float',
+    'd': 'double',
+    'v': 'void',
+    'x': 'long long',
+    'b': 'bool',
+    'e': 'varargs...',
+}
+
+# {'defctor', 'ops',}
+
+special_funcs = {
+    'eq': 'operator==',
+    'as': 'operator=',
+    'ne': 'operator!=',
+    'dv': 'operator/',
+    'pl': 'operator+',
+    'mi': 'operator-',
+    'ml': 'operator*',
+    'adv': 'operator/=',
+    'apl': 'operator+=',
+    'ami': 'operator-=',
+    'amu': 'operator*=',
+    'lt': 'operator<',
+    'gt': 'operator>',
+    'cl': 'operator()',
+    'dla': 'operator delete[]',
+    'nwa': 'operator new[]',
+    'dl': 'operator delete',
+    'nw': 'operator new',
+}
+
+
+@dataclass
+class Param:
+    name: str = ''
+    pointer_lvl: int = 0
+    is_const: bool = False
+    is_ref: bool = False
+    is_unsigned: bool = False
+    is_signed: bool = False
+
+    def to_str(self) -> str:
+        ret = ''
+        if self.is_const:
+            ret += 'const '
+        if self.is_unsigned:
+            ret += 'unsigned '
+        ret += self.name
+        for _ in range(self.pointer_lvl):
+            ret += '*'
+        if self.is_ref:
+            ret += '&'
+        return ret
+
+
+@dataclass
+class FuncParam:
+    ret_type: Optional[str] = None
+    params: List[str] = field(default_factory=list)
+
+    def to_str(self) -> str:
+        ret = ''
+        if self.ret_type is None:
+            ret += 'void'
+        else:
+            ret += self.ret_type
+        ret += ' (*)('
+        ret += ', '.join(self.params)
+        ret += ')'
+        return ret
+
+
+class ParseError(Exception):
+    ...
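+
+
+# Illustration (hypothetical example): the demangler below targets
+# CodeWarrior-style names.  A symbol such as 'draw__9dAcOarc_cFv' is split at
+# the '__' before the final 'F': 'draw' is the function name, '9dAcOarc_c' is
+# a length-prefixed class name, and 'Fv' marks a function taking void, so
+# ParseCtx should render it as 'dAcOarc_c::draw(void)'.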
+
+
+class ParseCtx:
+    def __init__(self, mangled: str):
+        self.mangled = mangled
+        self.index = 0
+        self.demangled = []
+        self.cur_type = None
+        self.class_name = None
+        self.is_const = False
+        self.func_name = None
+
+    def demangle(self):
+        # this split is still not accurate, but good enough for most cases
+        last_f = self.mangled.rfind('F')
+        if last_f == -1:
+            return
+        split_pos = self.mangled.rfind('__', 0, last_f)
+        if split_pos == -1 or split_pos == 0:
+            return
+        self.func_name = self.mangled[:split_pos]
+        self.mangled = self.mangled[split_pos+2:]
+        if self.func_name.startswith('__'):
+            match = operator_func_re.match(self.func_name)
+            if match:
+                special_func_name = match.group(1)
+                if special_func_name in special_funcs:
+                    self.func_name = special_funcs[special_func_name]
+                else:
+                    if special_func_name == 'ct':
+                        self.func_name = '.ctor'
+                    elif special_func_name == 'dt':
+                        self.func_name = '.dtor'
+        self.demangle_first_class()
+        while self.index < len(self.mangled):
+            self.demangled.append(self.demangle_next_type())
+        if self.func_name == '.ctor':
+            self.func_name = self.class_name
+        if self.func_name == '.dtor':
+            self.func_name = '~' + self.class_name
+
+    def demangle_first_class(self):
+        if self.peek_next_char().isdecimal():
+            self.class_name = self.demangle_class()
+            if self.peek_next_char() == 'C':
+                self.is_const = True
+                self.index += 1
+            assert self.consume_next_char() == 'F', 'next char should be F!'
+        elif self.peek_next_char() == 'Q':
+            self.index += 1
+            self.class_name = self.demangle_qualified_name()
+            if self.peek_next_char() == 'C':
+                self.is_const = True
+                self.index += 1
+            assert self.consume_next_char() == 'F', 'next char should be F!'
+        else:
+            assert self.consume_next_char() == 'F', 'next char should be F!'
+
+    def demangle_next_type(self) -> str:
+        cur_type = Param()
+        while True:
+            cur_char = self.peek_next_char()
+            if cur_char.isdecimal():
+                class_name = self.demangle_class()
+                cur_type.name = class_name
+                return cur_type.to_str()
+            elif cur_char in types:
+                type_name = self.demangle_prim_type()
+                cur_type.name = type_name
+                return cur_type.to_str()
+            elif cur_char == 'U':
+                cur_type.is_unsigned = True
+                self.index += 1
+            elif cur_char == 'S':
+                cur_type.is_signed = True
+                self.index += 1
+            elif cur_char == 'C':
+                cur_type.is_const = True
+                self.index += 1
+            elif cur_char == 'P':
+                cur_type.pointer_lvl += 1
+                self.index += 1
+            elif cur_char == 'R':
+                cur_type.is_ref = True
+                self.index += 1
+            elif cur_char == 'F':
+                self.index += 1
+                func = self.demangle_function()
+                return func.to_str()
+            elif cur_char == 'Q':
+                self.index += 1
+                return self.demangle_qualified_name()
+            elif cur_char == 'A':
+                if cur_type.pointer_lvl < 1 and not cur_type.is_ref:
+                    raise ParseError("pointer level for array is wrong!")
+                # decrease pointer level by one, cause one is already handled in the array demangle
+                if not cur_type.is_ref:
+                    cur_type.pointer_lvl -= 1
+                cur_type.name = self.demangle_array()
+                return cur_type.to_str()
+
+            else:
+                raise ParseError(f'unexpected character {cur_char}')
+
+    def demangle_array(self) -> str:
+        sizes = []
+        while self.peek_next_char() == 'A':
+            self.index += 1
+            sizes.append(self.read_next_int())
+            if self.consume_next_char() != '_':
+                raise ParseError("Need to have '_' after Array size!")
+        array_type = self.demangle_next_type()
+        return f'{array_type} []' + ''.join(f'[{i}]' for i in sizes)
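+
+    # Illustration (hypothetical examples): with the 'A' handling above, a
+    # parameter mangled as 'PA3_i' (pointer to an array of three ints) should
+    # come out of demangle_next_type() as 'int [][3]', and 'PA2_A3_i' as
+    # 'int [][2][3]'.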
+
+    def demangle_function(self) -> FuncParam:
+        func_param = FuncParam()
+        while True:
+            cur_char = self.peek_next_char()
+            if cur_char == '_':
+                self.index += 1
+                func_param.ret_type = self.demangle_next_type()
+                return func_param
+            func_param.params.append(self.demangle_next_type())
+
+    def demangle_qualified_name(self) -> str:
+        part_count = int(self.consume_next_char())
+        parts = []
+        for _ in range(part_count):
+            parts.append(self.demangle_class())
+        return '::'.join(parts)
+
+    def read_next_int(self) -> int:
+        class_len_str = ''
+        cur_char = self.peek_next_char()
+        while cur_char.isdecimal():
+            class_len_str += cur_char
+            self.index += 1
+            cur_char = self.peek_next_char()
+        return int(class_len_str)
+
+    def demangle_class(self) -> str:
+        if not self.peek_next_char().isdecimal():
+            raise ParseError(f'class mangling must start with number')
+        class_len = self.read_next_int()
+        class_name = self.mangled[self.index : self.index + class_len]
+        self.index += class_len
+        if self.peek_next_char() == 'M':
+            self.index += 1
+            class_name += '::' + self.demangle_class()
+        return class_name
+
+    def demangle_prim_type(self) -> str:
+        ret = types[self.consume_next_char()]
+        return ret
+
+    def consume_next_char(self) -> str:
+        next_char = self.mangled[self.index]
+        self.index += 1
+        return next_char
+
+    def peek_next_char(self) -> str:
+        if self.index >= len(self.mangled):
+            return None
+        return self.mangled[self.index]
+
+    def to_str(self) -> str:
+        if self.func_name is None:
+            return ''
+        elif self.class_name is None:
+            return self.func_name + '(' + ', '.join(self.demangled) + ')'
+        else:
+            return self.class_name + '::' + self.func_name + '(' + ', '.join(self.demangled) + ')' + (' const' if self.is_const else '')
+
+
+def demangle(s):
+    p = ParseCtx(s)
+    p.demangle()
+    return p.to_str()
+
+
+def parse_framework_map(path: Path):
+    address_funcname = {}
+    with path.open() as f:
+        for line in f.readlines():
+            if line.startswith('.ctors'):
+                return address_funcname
+            if not line.startswith(' '):
+                continue
+            funcname = line[30:].split(' ', 1)[0]
+            address = line[18:26]
+            address_funcname[address] = funcname
+    return address_funcname
+
+# def try_demangle_all():
+#     with open('frameworkF.map') as f:
+#         for line in f.readlines():
+#             if line.startswith('.ctors'):
+#                 return
+#             if not line.startswith(' '):
+#                 continue
+#             line = line[30:]
+#             line_spl = line.split(' ',1)[0]
+#             try:
+#                 d = demangle(line_spl)
+#                 if d:
+#                     print(d)
+#             # except NotImplementedError:
+#             #     pass
+#             except Exception as e:
+#                 # print(f'could not demangle {line_spl}: {repr(e)}')
+#                 # raise e
+#                 pass
+
+# try_demangle_all()
\ No newline at end of file
diff --git a/tools/splitter/parser.py b/tools/splitter/parser.py
new file mode 100644
index 00000000000..d5c723f2437
--- /dev/null
+++ b/tools/splitter/parser.py
@@ -0,0 +1,159 @@
+from dataclasses import dataclass
+from parsy import string, regex, seq, generate, line_info
+from typing import Optional, List, Union, Protocol
+
+
+class Emittable(Protocol):
+    def emit(self) -> str:
+        ...
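+
+
+# Every node below satisfies Emittable, so a parsed file can be re-emitted;
+# for example, BlockComment('x').emit() gives '/*x*/' and
+# Instruction('lwz', ['r3', '0(r4)']).emit() gives 'lwz r3, 0(r4)'.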
+
+
+@dataclass
+class BlockComment:
+    text: str
+
+    def emit(self) -> str:
+        return f'/*{self.text}*/'
+
+
+@dataclass
+class TrailingComment:
+    text: str
+
+    def emit(self) -> str:
+        return f'# {self.text}'
+
+
+@dataclass
+class Include:
+    file: str
+
+    def emit(self) -> str:
+        return f'.include "{self.file}"'
+
+
+@dataclass
+class Section:
+    name: str
+    flags: Optional[str]
+
+    def emit(self) -> str:
+        directive = f'.section .{self.name}'
+        if self.flags is not None:
+            directive += f', "{self.flags}"'
+
+        return directive
+
+
+@dataclass
+class Global:
+    symbol: str
+
+    def emit(self) -> str:
+        return f'.global {self.symbol}'
+
+
+@dataclass
+class Label:
+    symbol: str
+
+    def emit(self) -> str:
+        return f'{self.symbol}:'
+
+
+@dataclass
+class Instruction:
+    opcode: str
+    operands: List[str]
+
+    def emit(self) -> str:
+        instr = self.opcode
+        if len(self.operands) > 0:
+            instr += ' ' + ', '.join(self.operands)
+
+        return instr
+
+
+@dataclass
+class Line:
+    index: int
+    content: List[
+        Union[
+            BlockComment, TrailingComment, Instruction, Global, Section, Include, Label
+        ]
+    ]
+    body: Optional[Union[Global, Section, Include, Label, Instruction]]
+
+    def emit(self) -> str:
+        return ' '.join([x.emit() for x in self.content])
+
+
+space = regex(r'[ \t]+')
+line_ending = regex('(\n)|(\r\n)').desc('newline')
+pad = regex(r'[ \t]*')
+
+
+block_comment = (
+    string('/*') >> regex(r'[\w\s]*').map(BlockComment) << string('*/')
+).desc('block comment')
+trailing_comment = (
+    string('#') >> pad >> regex(r'[^\n\r]*').map(TrailingComment)
+).desc('trailing comment')
+
+symbolname = regex(r'[a-zA-Z._$][a-zA-Z0-9._$?]*')
+label = (symbolname.map(Label) << string(':')).desc('label')
+
+delimited_string = (string('"') >> regex(r'[^"]*') << string('"')).desc(
+    'double-quote delimited string'
+)
+
+directive_include = string('include') >> space >> delimited_string.map(Include)
+directive_section = seq(
+    name=string('section')
+    >> space
+    >> string('.')
+    >> regex(r'[a-z]+'),
+    flags=(pad >> string(',') >> space >> delimited_string).optional(),
+).combine_dict(Section)
+directive_global = string('global') >> space >> symbolname.map(Global)
+directive = (
+    string('.')
+    >> (
+        directive_include
+        | directive_section
+        | directive_global
+        | string('text').result(Section('text', flags=None))
+        | string('data').result(Section('data', flags=None))
+    )
+).desc('directive')
+
+opcode = regex(r'[a-z_0-9]+\.?').concat().desc('opcode')
+operand = regex(r'[^,#\s]+')
+operands = operand.sep_by(string(',') << pad)
+@generate
+def instruction():
+    op = yield opcode
+    sp = yield space.optional()
+    if sp:
+        oprs = yield operands
+    else:
+        oprs = []
+    return Instruction(op, oprs)
+
+@generate
+def line():
+    line, _ = yield line_info
+
+    content = yield (pad >> block_comment << pad).many()
+    body = yield (directive | label | instruction).optional() << pad
+    if body:
+        content.append(body)
+    content += yield (pad >> block_comment).many()
+    trailing = yield (pad >> trailing_comment).optional()
+    if trailing:
+        content.append(trailing)
+
+    return Line(line, content, body)
+
+
+asm = line.sep_by(line_ending)
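+
+# Sketch of intended usage (hypothetical path):
+#
+#     lines = asm.parse(open('asm/example.s').read())
+#     for l in lines:
+#         print(l.emit())
+#
+# Each Line's .body is a Label, Instruction, directive or None, and .content
+# also carries any surrounding comments.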
diff --git a/tools/splitter/split.py b/tools/splitter/split.py
new file mode 100644
index 00000000000..45260bba916
--- /dev/null
+++ b/tools/splitter/split.py
@@ -0,0 +1,373 @@
+"""
+split.py - 202x erin moon for zeldaret
+"""
+
+from typing import Iterable, List
+from dataclasses import dataclass
+from pathlib import Path, PosixPath
+from textwrap import dedent
+from loguru import logger
+from datetime import datetime
+import re
+import click
+from parser import asm, Emittable, Global, Label, Line, BlockComment, Instruction
+from demangle import parse_framework_map, demangle
+from util import PathPath, pairwise
+from pprint import pprint
+import pickle
+import IPython
+
+SDA_BASE = 0x80458580
+SDA2_BASE = 0x80459A00
+
+__version__ = 'v0.3'
+
+
+def function_global_search(lines: List[Line]) -> Iterable[Line]:
+    i = 0
+    while i < len(lines):
+        if isinstance(lines[i].body, Global):
+            sym = lines[i].body.symbol
+            if isinstance(lines[i + 1].body, Label) and lines[i + 1].body.symbol == sym:
+                yield lines[i]
+            i += 2
+        else:
+            i += 1
+
+
+def emit_lines(lines: List[Line]) -> str:
+    return '\n'.join([line.emit() for line in lines])
+
+
+def comment_out(line: Line) -> Line:
+    return Line(line.index, [BlockComment(line.emit())], None)
+
+
+def fix_sda_base_add(line: Line) -> Line:
+    if 'SDA' in line.body.operands[2]:
+        ops = line.body.operands[2].split('-')
+        lbl_addr = int(ops[0][4:], 16)
+        if ops[1] == '_SDA_BASE_':
+            sda_addr = SDA_BASE
+        elif ops[1] == '_SDA2_BASE_':
+            sda_addr = SDA2_BASE
+        else:
+            logger.error('Unknown SDABASE!')
+            return line
+
+        line.content.append(
+            BlockComment(f'SDA HACK; original: {line.body.operands[2]}')
+        )
+        line.body.operands[2] = f'0x{lbl_addr:X} - 0x{sda_addr:X}'
+    return line
+
+QUANT_REG_RE = re.compile(r'qr(\d+)')
+def patch_gqrs(line: Line) -> Line:
+    line.body.operands = [QUANT_REG_RE.sub(r'\1', o) for o in line.body.operands]
+
+    return line
+
+@dataclass
+class Function:
+    name: str
+    addr: int
+    lines: List[Line]
+
+    @property
+    def line_count(self):
+        return len(self.lines)
+
+    @property
+    def filename(self) -> str:
+        return f'func_{self.addr:X}.s'
+
+    def include_path(self, base: Path) -> str:
+        return str(PosixPath(base) / PosixPath(self.filename))
+
+
+def find_functions(lines: List[Line], framework_map) -> Iterable[Function]:
+    for func_global_line, next_func_global_line in pairwise(
+        function_global_search(lines)
+    ):
+        # some blocks weren't properly split, use the map to find missing functions
+        fr = func_global_line.index + 2
+        # if no next global, to = None => end of file
+        to = next_func_global_line and next_func_global_line.index
+        func_lines = lines[fr:to]
+        func_idx = []
+        for idx, line in enumerate(func_lines):
+            if isinstance(line.body, Instruction):
+                addr = int(line.content[0].text.strip().split()[0], 16)
+                if f'{addr:x}' in framework_map:
+                    func_idx.append(idx)
+
+        for start_idx, end_idx in pairwise(func_idx):
+            sub_func_lines = func_lines[
+                start_idx : (len(func_lines) if end_idx == None else end_idx)
+            ]
+
+            addr = int(sub_func_lines[0].content[0].text.strip().split()[0], 16)
+
+            yield Function(
+                name=func_global_line.body.symbol
+                if start_idx == 0
+                else f'func_{addr:X}',
+                addr=addr,
+                lines=sub_func_lines,
+            )
+
+
+def emit_cxx_asmfn(inc_base: Path, func: Function) -> str:
+    return dedent(
+        '''\
+        asm void {name}(void) {{
+            nofralloc
+        #include "{inc}"
+        }}'''.format(
+            name=func.name, inc=func.include_path(inc_base)
+        )
+    )
+
+
+def emit_cxx_extern_fns(tu_file: str, labels: Iterable[str]) -> str:
+    def decl(label):
+        return f'void {label}(void);'
+
+    defs = '\n    '.join(decl(label) for label in labels)
+
+    return (
+        f'// additional symbols needed for {tu_file}\n'
+        f'// autogenerated by split.py {__version__} at {datetime.utcnow()}\n'
+        'extern "C" {\n'
+        '    ' + defs + '\n}'
+    )
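+
+
+# For a hypothetical TU d_example.cpp with one missing label, the block
+# emitted above should look roughly like:
+#
+#   // additional symbols needed for d_example.cpp
+#   // autogenerated by split.py v0.3 at <timestamp>
+#   extern "C" {
+#       void func_800ABCDE(void);
+#   }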
+
+
+def emit_cxx_extern_vars(tu_file: str, labels: Iterable[str]) -> str:
+    def decl(label):
+        return f'extern u8 {label};'
+
+    return (
+        f'// additional symbols needed for {tu_file}\n'
+        f'// autogenerated by split.py {__version__} at {datetime.utcnow()}\n'
+        + '\n'.join(decl(label) for label in labels)
+        + '\n'
+    )
+
+
+@click.command()
+@click.argument('src', type=PathPath(file_okay=True, dir_okay=False, exists=True))
+@click.argument('cxx_out', type=PathPath(file_okay=True, dir_okay=False))
+@click.option(
+    '--funcs-out',
+    type=PathPath(file_okay=False, dir_okay=True),
+    default='include/funcs',
+)
+@click.option('--s-include-base', type=str, default='funcs')
+@click.option(
+    '--extern-functions-file',
+    type=PathPath(file_okay=True, dir_okay=False),
+    default='include/functions.h',
+)
+@click.option(
+    '--extern-variables-file',
+    type=PathPath(file_okay=True, dir_okay=False),
+    default='include/variables.h',
+)
+@click.option(
+    '--framework-map-file',
+    type=PathPath(file_okay=True, dir_okay=False),
+    default='frameworkF.map',
+)
+@click.option(
+    '--ldscript-file',
+    type=PathPath(file_okay=True, dir_okay=False),
+    default='ldscript.lcf',
+)
+@click.option('--from-line', type=int)
+@click.option('--to-line', type=int)
+@click.option('--preparsed', is_flag=True)
+@click.option('--forceactive',
+    type=click.Choice(['all', 'none', 'missingfunc']), default='missingfunc')
+def split(
+    src,
+    cxx_out,
+    funcs_out,
+    s_include_base,
+    extern_functions_file,
+    extern_variables_file,
+    framework_map_file,
+    ldscript_file,
+    from_line,
+    to_line,
+    preparsed,
+    forceactive
+):
+    funcs_out.mkdir(exist_ok=True, parents=True)
+
+    if preparsed:
+        logger.info('loading preparsed assembly')
+        with src.open('rb') as f:
+            lines = pickle.load(f)
+    else:
+        logger.info('parsing assembly')
+        lines = asm.parse(src.read_text())
+    lines = lines[
+        (from_line - 1 if from_line else 0) : (to_line - 1 if to_line else -1)
+    ]
+
+    logger.info('reading extern func/vars files')
+    extern_funcs_src = extern_functions_file.read_text()
+    extern_vars_src = extern_variables_file.read_text()
+
+    logger.info('parsing map file')
+    framework_map = parse_framework_map(framework_map_file)
+    logger.debug(f'loaded {len(framework_map)} symbols from map')
+
+    logger.info('reading ldscript')
+    ldscript_file_content = ldscript_file.read_text()
+    new_ldfuncs = []
+
+    # -- get all defined labels and jump targets
+    jumped_labels = set()
+    defined_labels = set()
+
+    logger.info('scanning for branch targets')
+    for line in lines:
+        if isinstance(line.body, Label):
+            defined_labels.add(line.body.symbol)
+        if isinstance(line.body, Instruction):
+            if line.body.opcode[0] == 'b' and line.body.operands != []:  # branch
+                jumped_labels.add(line.body.operands[0])  # jump target
+
+    # -- find everything of the form lbl_[hex] that's in an operand on the RHS of a l* instruction
+    # this is a relatively okay assumption given that any var that's *not* of the form lbl_ has
+    # probably already been renamed and thus is exported in variables.h
+    LBL_RE = re.compile(r'lbl_[0-9A-F]+')
+
+    def find_labels_in_operands(operands):
+        for operand in operands:
+            if match := LBL_RE.search(operand):
+                yield match.group()
+
+    logger.info('scanning for load/store target labels')
+    loaded_labels = set()
+    for line in lines:
+        if isinstance(line.body, Instruction):
+            if line.body.opcode[0] in {'l', 's'}:  # load and store instructions, ish
+                loaded_labels |= set(find_labels_in_operands(line.body.operands))
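+
+    # Illustration (hypothetical operand): 'lbl_80450A18(r13)' on a lwz or stw
+    # matches LBL_RE above, so lbl_80450A18 is collected here and, if it is
+    # not already present in variables.h, declared as
+    # 'extern u8 lbl_80450A18;' below.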
+
+    # -- dump new variable labels to variables.h
+    logger.info('dumping variable labels to extern vars header')
+    vars_new = set()
+    for label in loaded_labels:
+        if label not in extern_vars_src:
+            logger.debug(f'adding extern var {label} to {extern_variables_file}')
+            vars_new.add(label)
+
+    if len(vars_new) > 0:
+        with open(extern_variables_file, 'a') as f:
+            f.write('\n\n')
+            f.write(emit_cxx_extern_vars(cxx_out.name, vars_new))
+
+    # -- find all defined functions and split them
+    functions = list(find_functions(lines, framework_map))
+    logger.info('splitting functions')
+    for func in functions:
+
+        logger.debug(
+            f'working on function {func.name} @ {func.addr:X} with {func.line_count} lines'
+        )
+
+        # comment out .globals
+        func.lines = [
+            comment_out(line) if isinstance(line.body, Global) else line
+            for line in func.lines
+        ]
+
+        # fix SDA_BASE addi
+        func.lines = [
+            fix_sda_base_add(line)
+            if isinstance(line.body, Instruction)
+            and line.body.opcode == 'addi'
+            else line
+            for line in func.lines
+        ]
+
+        # remove GQR mnemonics
+        func.lines = [
+            patch_gqrs(line)
+            if isinstance(line.body, Instruction)
+            and line.body.opcode.startswith('psq_')
+            else line
+            for line in func.lines
+        ]
+
+        # check if needs to be defined in ldscript
+        if forceactive != 'none':
+            if not func.name in ldscript_file_content and (forceactive=='all' or func.name.startswith('func_')):
+                new_ldfuncs.append(func.name)
+
+        with open(out_path := funcs_out / func.filename, 'w') as f:
+            logger.debug(f'emitting {out_path}')
+            f.write(emit_lines(func.lines))
+
+    # -- dump new labels to functions.h
+    logger.info('dumping labels to extern functions header')
+    func_labels = jumped_labels - defined_labels
+    # add in everything we def to make sure asm can get backrefs
+    for func in functions:
+        func_labels.add(func.name)
+
+    # get rid of stuff already in functions.h. extremely hacky
+    funcs_new_labels = set()
+    for label in func_labels:
+        if label not in extern_funcs_src:
+            logger.info(f'adding extern func {label} to {extern_functions_file}')
+            funcs_new_labels.add(label)
+
+    if len(funcs_new_labels) > 0:
+        with open(extern_functions_file, 'a') as f:
+            f.write('\n\n')
+            f.write(emit_cxx_extern_fns(cxx_out.name, funcs_new_labels))
+
+    # -- write asm stubs to cxx_out (could've done this as part of previous loop but imo this is cleaner)
+    logger.info(f'emitting c++ asm stubs to {cxx_out}')
+    with open(cxx_out, 'w') as f:
+        f.write(
+            f'/* {cxx_out.name} autogenerated by split.py {__version__} at {datetime.utcnow()} */\n\n'
+        )
+        f.write('#include "global.h"\n\n')
+
+        f.write('extern "C" {\n')
+        for func in functions:
+            logger.debug(f'emitting asm stub for {func.name}')
+            mangled_func_name = framework_map[f'{func.addr:x}']
+            f.write(f'// {mangled_func_name}\n')
+            try:
+                demangled_func_name = demangle(mangled_func_name)
+                f.write(f'// {demangled_func_name}\n')
+            except Exception as e:
+                logger.warning(f"could not demangle symbol '{mangled_func_name}': {e}")
+
+            f.write(emit_cxx_asmfn(s_include_base, func))
+            f.write('\n\n')
+
+        f.write('};\n')  # extern C end
+
+    # -- make defined functions FORCEACTIVE in ldscript.lcf
+    logger.info(f'writing to FORCEACTIVE in linker script')
+    forceactive_start = ldscript_file_content.find('FORCEACTIVE')
+    forceactive_end = ldscript_file_content.find('}', forceactive_start)
+    for func in new_ldfuncs:
+        ldscript_file_content = (
+            ldscript_file_content[:forceactive_end]
+            + func
+            + '\n'
+            + ldscript_file_content[forceactive_end:]
+        )
+    ldscript_file.write_text(ldscript_file_content)
+
+
+if __name__ == '__main__':
+    split()
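+
+# Sketch of a typical invocation (hypothetical file names):
+#
+#   python tools/splitter/split.py asm/d_example.s src/d_example.cpp \
+#       --framework-map-file frameworkF.map --ldscript-file ldscript.lcf
+#
+# src is the disassembled .s file to split; cxx_out is the C++ TU that
+# receives the generated asm stubs.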
diff --git a/tools/splitter/util.py b/tools/splitter/util.py
new file mode 100644
index 00000000000..260f6d6a4e0
--- /dev/null
+++ b/tools/splitter/util.py
@@ -0,0 +1,17 @@
+import itertools
+import click
+from pathlib import Path
+
+
+def pairwise(iterable):
+    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
+    a, b = itertools.tee(iterable)
+    next(b, None)
+    return itertools.zip_longest(a, b)
+
+
+class PathPath(click.Path):
+    """A Click path argument that returns a pathlib Path, not a string"""
+
+    def convert(self, value, param, ctx):
+        return Path(super().convert(value, param, ctx))
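+
+
+# Unlike the itertools recipe, this pairwise() uses zip_longest, so the final
+# element is paired with None; e.g. pairwise('ABC') yields ('A', 'B'),
+# ('B', 'C'), ('C', None).  find_functions() in split.py relies on that
+# trailing None to let the last function run to the end of its block.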