# 007/tools/ge-stats.py
import datetime
import errno
import functools
import getopt
import os
import re
import shlex
import socket
import subprocess
import sys
import time
"""
re-writing the original ge-stats py. - Ben Burns, Dec 18, 2021
The general approach is to scan for source files in the specified directories (*.s, *.c).
Each file is evaluated, and a list of assembly function definitions is generated.
The map file is then parsed, giving a list of all known functions.
Stats are then computed for asm functions out of all known functions.
The logic to determine whether a function has an assembly definition is rather simple.
It doesn't handle any c preprocessor macros, so the results are not guaranteed to
be accurate. If a line begins with the text "glabel" and is only followed by one
word, that is assumed to be an assembly function definition. For the purposes
of this script, any assembly definition for any ROM version will count against
the total.
"""
# supported ROM version codes, lower case
__supported_versions = ['us', 'jp', 'eu']
# any function read from .map file exceeding this size (in bytes) will throw an
# exception; sanity guard against mis-parsed map addresses
__invalid_func_length_size = 1000000
# attempt to filter out extra glabels that are not function definitions. This is for jump table entries,
# which are labels of the form ".L" followed by 8 hex digits.
__re_jpt_label = re.compile(r"\.L[0-9a-fA-F]{8}")
# report output paths (joined with os.path.sep so they display natively on Windows)
__report_dir = "tools" + os.path.sep + "report"
# native report-generator binary invoked by generate_report
__report_bin = __report_dir + os.path.sep + "report"
# html template consumed by the report binary
__report_template = __report_dir + os.path.sep + "template.html"
# per-version html output files
__report_out_us = __report_dir + os.path.sep + "index.html"
__report_out_jp = __report_dir + os.path.sep + "JPN.htm"
__report_out_eu = __report_dir + os.path.sep + "EU.htm"
# default fallback if recently modified timestamps don't work
__mtime_fallback_filename = 'ge.u.z64'
class FunctionInfo:
    """
    Per-function metadata gathered from source scans and the build map.
    """

    def __init__(self, name: str):
        # symbol name as it appears in the glabel line / map file
        self.name = name
        # True when an assembly (glabel) definition still exists for this function
        self.nonmatching = False
        # byte length, derived later from consecutive map-file addresses
        self.length = 0
        # owning SourceFileContent; assigned by the map-file walk
        self.parent = None

    def __str__(self):
        return self.name

    # repr intentionally mirrors str: just the bare function name
    __repr__ = __str__
class SourceFileContent:
    """
    Per-file record: asm definitions found by scanning the file, plus the
    functions attributed to it by the build map.
    """

    def __init__(self, path: str):
        # file path relative to the repo root
        self.path = path
        # names of asm function definitions (glabel) discovered in this file
        self.asm_functions = set()
        # FunctionInfo entries attached while walking the map file
        self.all_functions = []
        # strip the 2-char extension so map entries (.o) line up with sources (.c/.s)
        if path.endswith((".c", ".s", ".o")):
            self.path_without_ext = path[:-2]
        else:
            self.path_without_ext = path
        # owning SearchDir; assigned by the directory scan
        self.parent = None
        # last modified time, in seconds since epoch (0 until resolved)
        self.mtime = 0

    def __str__(self):
        return self.path

    def __repr__(self):
        return self.path

    def __eq__(self, other):
        # only comparable against other SourceFileContent instances
        if isinstance(other, SourceFileContent):
            return self.path == other.path
        return NotImplemented
class SearchDir:
    """
    Describes one directory to scan, plus per-directory running totals.
    """

    def __init__(self, path: str, recurse: bool, ignore=None, completed=None):
        """
        default constructor
        :param str path: folder path, relative to root where script is run. Should not
            contain leading or trailing slash.
        :param bool recurse: whether or not to recursively search contents of path.
        :param ignore: optional files to ignore. List of strings; explicit file names,
            no wildcards.
        :param completed: optional files to mark as completed. List of strings; explicit
            file names, no wildcards.
        """
        # cosmetic separator fix for Windows (scan works either way)
        self.path = path.replace('/', os.path.sep)
        self.recurse = bool(recurse)
        # (config option) ignored files won't be added to source files
        self.ignore = [] if ignore is None else ignore
        # (config option) completed files won't have asm definitions added
        self.completed = [] if completed is None else completed
        # runtime stats, accumulated while scanning:
        self.function_count = 0            # functions found in/under this directory
        self.function_byte_count = 0       # total byte length of those functions
        self.nonmatching_count = 0         # non-matching functions found
        self.nonmatching_byte_count = 0    # byte length of non-matching functions
        self.file_count = 0                # files found
        self.completed_file_count = 0      # files with no asm definitions

    def __str__(self):
        return self.path

    def __repr__(self):
        return self.path

    def __eq__(self, other):
        # only comparable against other SearchDir instances
        if isinstance(other, SearchDir):
            return self.path == other.path
        return NotImplemented
class StatResults:
    """Aggregate results for one full stats run."""

    def __init__(self):
        # "now" starting point; git-log lookups only consider commits before
        # this datetime (OS modified time is unaffected).
        self.now = datetime.datetime.now()
        # SearchDir list, populated by process_source_files
        self.search_dirs = []
        # SourceFileContent with the highest mtime, if timestamps are trustworthy
        self.last_modified_file = None
        # False until enough distinct mtimes are seen to trust them
        self.last_mtime_valid = False
        # grand totals across every search dir:
        self.total_function_count = 0          # count of all functions
        self.total_function_byte_count = 0     # byte length of all functions
        self.total_nonmatching_count = 0       # count of non-matching functions
        self.total_nonmatching_byte_count = 0  # byte length of non-matching functions
        self.total_completed_file_count = 0    # files with no asm definitions
        # every SourceFileContent discovered by the scan
        self.source_files = []
def mtime_os(file, now, qlocal=False):
    """OS-based modified-time resolver: epoch seconds from the filesystem.

    The *now* argument is unused; it only keeps the signature interchangeable
    with mtime_git.
    """
    if not qlocal:
        print('using os')
    mtime = os.path.getmtime(file)
    return int(mtime)
def mtime_git(file, now, qlocal=False):
    """Git-based modified-time resolver.

    Returns the commit timestamp (epoch seconds) of the last commit touching
    *file* at or before *now*, or 0 when git has no history for the file.
    Exits with status 7 when git itself cannot be run.
    """
    if not qlocal:
        print('using git')
    # computed before the try block: previously a strftime failure would have
    # left date_str unbound and crashed the except handler with a NameError
    date_str = now.strftime('%Y-%m-%dT%H:%M:%S%z')
    try:
        # %ct is deliberately wrapped in literal quotes; they are stripped below
        result = subprocess.run(
            ['git', 'log', '-1', '--format="%ct"', '--date=local',
             '--before="' + date_str + '"', '--', file],
            stdout=subprocess.PIPE, universal_newlines=True)
    except (OSError, subprocess.SubprocessError):
        # narrowed from a bare `except:` so real programming errors propagate
        print ('fatal error reading git log history, maybe use OS modified time resolver, --mtime_os option. File: "' + file + '", date_str: "' + date_str + '"')
        sys.exit(7)
    log_out = result.stdout.strip().replace('"', '')
    if not log_out:
        # no history for this file (e.g. untracked)
        return 0
    return int(log_out)
def process_source_files(search, stats: StatResults, mtime_resolver, qlocal=False):
    """Scan each SearchDir for *.c / *.s sources and record asm definitions.

    For every matching file, a SourceFileContent is appended to
    stats.source_files carrying its modification time (via *mtime_resolver*)
    and the set of `glabel` assembly function definitions it contains.
    Returns *stats* for convenience.
    """
    stats.search_dirs = search
    # regular expression: a line that starts with text "glabel", followed by
    # a single space, followed by (captured) anything that is not whitespace
    # one or more times, followed by end of line.
    p1 = re.compile(r"^glabel (\S+)$")
    # iterate each search location, looking inside source files for asm function definitions
    for s in search:
        if not os.path.isdir(s.path):
            print('fatal: directory not found:', s.path)
            sys.exit(3)
        for root, dir_names, filenames in os.walk(s.path):
            for filename in filenames:
                file = os.path.join(root, filename)
                if not (filename.lower().endswith(".c") or filename.lower().endswith(".s")):
                    continue
                # NOTE(review): `ignore` is documented as possibly carrying a
                # folder prefix, but only the bare filename is compared here --
                # confirm which form the config actually uses.
                if filename in s.ignore:
                    continue
                sfc = SourceFileContent(file)
                sfc.parent = s
                sfc.mtime = mtime_resolver(file, stats.now, qlocal)
                if not qlocal:
                    # fix: only format the timestamp when it is actually printed
                    # (previously also re-imported datetime inside this loop)
                    x = datetime.datetime.fromtimestamp(sfc.mtime)
                    print(sfc.path, ' = ', x.strftime('%c'))
                # The `completed` list is manually configured to specify which files should not be
                # counted against the total. This is done by simply not adding asm function
                # definitions into the asm_functions property.
                if filename not in s.completed:
                    with open(file) as fp:
                        # The "in" operator will be faster than regex here, but will include stuff
                        # we don't want. Do an initial pass with "in", then refine results with
                        # regex.
                        # The container is a hashset for cases where the function is
                        # declared multiple times (e.g., different versions)
                        file_glabel_lines = [line for line in fp.readlines() if "glabel" in line]
                        for line in file_glabel_lines:
                            r1 = p1.findall(line)
                            if r1 and len(r1) == 1 and not re.match(__re_jpt_label, r1[0]):
                                sfc.asm_functions.add(r1[0])
                stats.source_files.append(sfc)
            if not s.recurse:
                break
    return stats
def apply_build_map(stats: StatResults, version: str):
    """
    This iterates over the build map file.
    It scans for the start of a `.text` definition indicating a new file.
    Then it parses each following line to get function names within the file
    until a new section is encountered.

    Each function's byte length is the gap between its address and the next
    symbol's address; the final function in a section is sized against the
    section end (filestart + filelen).
    """
    # single-letter version code ('u', 'j', 'e') selects build/<c>/ge007.<c>.map
    version_code = version[0]
    map_file_path = 'build' + os.path.sep + version_code + os.path.sep + 'ge007.' + version_code + '.map'
    # True while inside a .text section that maps to a file found by the scan
    infile = False
    # address of the previously seen symbol, used to compute function lengths
    prevromaddr = 0
    # regex:
    # a line that starts with text " .text", followed by any whitespace (at least one), then a 64 bit hex address.
    # the lower 32 bits of the address are captured in a group. Followed by any whitespace (at least one),
    # followed by a hex address (capturing everything after "0x"), followed by any whitespace (at least one).
    # It then matches a build path, based on version, capturing everything after the version and slash
    # up to the file extension, followed by end of line.
    p1 = re.compile(r"^ \.text\s+0x00000000([0-9a-f]{8})\s+0x([0-9a-f]+)\s+build\/" + re.escape(version_code) + r"\/([^\s]+)$")
    # regex to match subsequent lines after the above regex matches. This will
    # match any whitespace, then 64 bit address (capturing the lower 32 bits), then
    # any whitespace (at least one), then capture the remaining non-whitespace-text until end of line (in capture group).
    # That last capture group is the function name.
    p2 = re.compile(r"^\s+0x00000000([0-9a-f]{8})\s+(\S+)$")
    # SourceFileContent for the section currently being parsed
    current_sfc = None
    # FunctionInfo for the most recent symbol line; its length is only known
    # once the NEXT symbol (or the section end) is reached
    current_fi = None
    with open(map_file_path, 'r') as map_file:
        lines = map_file.readlines()
        for line in lines:
            m1 = p1.findall(line)
            m2 = p2.findall(line)
            # m1 match means this is a new .text section (new file)
            if m1:
                filestart = int(m1[0][0], 16)
                filelen = int(m1[0][1], 16)
                # normalize paths
                segment = (m1[0][2]).replace('/', os.path.sep)
                infile = True
                prevromaddr = 0
                prevfuncname = None
                current_fi = None
                # drop the 2-char extension (.c/.s/.o) to match against scanned sources
                if segment.endswith(".c") or segment.endswith(".s") or segment.endswith(".o"):
                    segment_without_ext = segment[:-2]
                else:
                    segment_without_ext = segment
                # get the SourceFileContent container for this file listing
                current_sfc = next((x for x in stats.source_files if x.path_without_ext == segment_without_ext), None)
                # if there's no existing reference, that means this function is listed
                # in the map file but not found when searching directories previously.
                # In that case, this file/function should be ignored.
                if current_sfc is None:
                    infile = False
            elif infile and m2:
                # symbol line inside a tracked .text section
                romaddr = int(m2[0][0], 16)
                funcname = m2[0][1]
                # skip jump-table labels (".L" + 8 hex digits)
                if re.match(__re_jpt_label, funcname):
                    continue
                if current_sfc is None:
                    print('invalid state: current_sfc not set')
                    sys.exit(4)
                # close out the previous function: its length is the address gap
                if current_fi is not None:
                    current_fi.length = romaddr - prevromaddr
                    # sanity guard against mis-parsed addresses
                    if current_fi.length > __invalid_func_length_size:
                        raise ValueError("function length too large: " + str(funcname) + " len = " + str(current_fi.length))
                # get the function info for this function, or create a new one
                current_fi = next((x for x in current_sfc.all_functions if x.name == funcname), None)
                if current_fi is None:
                    current_fi = FunctionInfo(funcname)
                    current_fi.parent = current_sfc
                    # check to see if there was an asm definition for this function
                    if funcname in current_sfc.asm_functions:
                        current_fi.nonmatching = True
                    current_sfc.all_functions.append(current_fi)
                prevromaddr = romaddr
                prevfuncname = funcname
            elif infile:
                # first non-matching line ends the section; size the last function
                infile = False
                # this is a one line definition in the map file
                # NOTE(review): `funcname` here is left over from the previous
                # section if this section contained no symbol lines (and would
                # be unbound on the very first section) -- confirm map sections
                # always list at least one symbol.
                if current_fi is None:
                    current_fi = FunctionInfo(funcname)
                    current_fi.parent = current_sfc
                    # check to see if there was an asm definition for this function
                    if funcname in current_sfc.asm_functions:
                        current_fi.nonmatching = True
                    # since this is only one entry, use the entire size
                    current_fi.length = filelen
                    current_sfc.all_functions.append(current_fi)
                else:
                    current_fi.length = (filestart + filelen) - prevromaddr
                    if current_fi.length > __invalid_func_length_size:
                        raise ValueError("function length too large: " + str(funcname) + " len = " + str(current_fi.length))
def percent_complete(num, den):
    """Return num/den prepared for display.

    Returns 1 when complete or when *den* is not positive, 0 when nothing is
    done, and never rounds a partial result to exactly 0% or 100% (values are
    floored/ceiled at 0.01 and 0.99). Raises ValueError when the fraction
    exceeds 1.
    """
    if den <= 0 or num == den:
        return 1
    frac = num / den
    if frac <= 0:
        return 0
    if frac < 0.01:
        # partially done: never display a flat 0%
        return 0.01
    if frac <= 0.99:
        return frac
    if frac < 1:
        # almost done: never display 100% until truly complete
        return 0.99
    raise ValueError("Can't compute percent complete: " + str(num) + " / " + str(den))
def generate_default_stats(stats: StatResults):
    """
    Walk every discovered file and its functions, accumulating per-SearchDir
    and grand-total counts/byte sizes, and pick the most recently modified file.
    """
    # distinct modification times seen; used below as a trust signal
    seen_mtime = set()
    for file in stats.source_files:
        # entries known only from the .map file (no parent SearchDir) are skipped
        if file.parent is None:
            continue
        if stats.last_modified_file is None or stats.last_modified_file.mtime < file.mtime:
            stats.last_modified_file = file
        seen_mtime.add(file.mtime)
        funcs = file.all_functions
        nonmatching = [f for f in funcs if f.nonmatching]
        byte_total = sum(f.length for f in funcs)
        nonmatching_bytes = sum(f.length for f in nonmatching)
        file.parent.function_count += len(funcs)
        file.parent.function_byte_count += byte_total
        stats.total_function_count += len(funcs)
        stats.total_function_byte_count += byte_total
        file.parent.nonmatching_count += len(nonmatching)
        file.parent.nonmatching_byte_count += nonmatching_bytes
        stats.total_nonmatching_count += len(nonmatching)
        stats.total_nonmatching_byte_count += nonmatching_bytes
        file.parent.file_count += 1
        if not file.asm_functions:
            # no asm definitions left: file counts as fully decompiled
            file.parent.completed_file_count += 1
            stats.total_completed_file_count += 1
    # Heuristic for trusting timestamps: a fresh clone gives every file the
    # same mtime, so require more than 10 (arbitrary) distinct values before
    # believing them. Otherwise fall back to the default display file.
    if len(seen_mtime) > 10:
        stats.last_mtime_valid = True
    if not stats.last_mtime_valid:
        # couldn't determine the recently modified file (i.e., github actions)
        stats.last_modified_file = SourceFileContent(__mtime_fallback_filename)
def print_default_stats(stats: StatResults, version):
    """Print the default human-readable summary (files / functions / bytes)."""
    print()
    print('~> Decomp Statistics for ' + version.upper() + ' version')
    print('__________________________________________________________________')
    print()
    print('FILES')
    print('\trecently modified:')
    if stats.last_mtime_valid:
        # five most recently touched source files
        recent_files = sorted(stats.source_files, key=lambda x: x.mtime, reverse=True)[:5]
        for rf in recent_files:
            print('\t\t' + rf.path)
    else:
        # timestamps unusable (e.g. fresh clone); show the fallback name instead
        print('\t\t' + __mtime_fallback_filename)
    print()
    for s in stats.search_dirs:
        print('\t{:12}\t{:5,} / {:5,} \t{:6.2f}%'.format(s.path, int(s.completed_file_count), int(s.file_count), (percent_complete(s.completed_file_count, s.file_count) * 100)))
    print()
    print('\ttotal \t{:5,} / {:5,} \t{:6.2f}%'.format(int(stats.total_completed_file_count), len(stats.source_files), (percent_complete(stats.total_completed_file_count, len(stats.source_files)) * 100)))
    print('__________________________________________________________________')
    print()
    print('FUNCTIONS')
    for s in stats.search_dirs:
        d_completed_func_count = s.function_count - s.nonmatching_count
        print('\t{:12}\t{:7,} / {:7,} \t{:6.2f}%'.format(s.path, int(d_completed_func_count), int(s.function_count), (percent_complete(d_completed_func_count, s.function_count) * 100)))
    print()
    # fix: this totals row previously printed total_nonmatching_count as the
    # "completed" figure, contradicting the per-directory rows above (which
    # use function_count - nonmatching_count); with everything matching it
    # would have displayed 0 / N at 0.00%
    total_completed_func_count = stats.total_function_count - stats.total_nonmatching_count
    print('\ttotal \t{:7,} / {:7,} \t{:6.2f}%'.format(int(total_completed_func_count), int(stats.total_function_count), (percent_complete(total_completed_func_count, stats.total_function_count) * 100)))
    print('__________________________________________________________________')
    print()
    print('BYTES')
    for s in stats.search_dirs:
        d_completed_byte_count = s.function_byte_count - s.nonmatching_byte_count
        print('\t{:12}\t{:11,} / {:11,} \t{:6.2f}%'.format(s.path, int(d_completed_byte_count), int(s.function_byte_count), (percent_complete(d_completed_byte_count, s.function_byte_count) * 100)))
    print()
    completed_byte_count = stats.total_function_byte_count - stats.total_nonmatching_byte_count
    print('\ttotal \t{:11,} / {:11,} \t{:6.2f}%'.format(int(completed_byte_count), int(stats.total_function_byte_count), (percent_complete(completed_byte_count, stats.total_function_byte_count) * 100)))
    print('__________________________________________________________________')
    print()
def print_non_matching_stats(stats: StatResults, version):
    """
    List all non matching functions. Sort by byte length.
    Print as csv of filename, function name, function length.

    A broken pipe (e.g. output piped into `head`) exits quietly instead of
    dumping a traceback; other OS errors still propagate.
    """
    functions = [func
                 for file in stats.source_files
                 for func in file.all_functions
                 if func.nonmatching]
    functions.sort(key=lambda x: x.length, reverse=True)
    try:
        print("filename, function, length")
        for f in functions:
            print(f.parent.path + ", " + f.name + ", " + str(f.length))
    except BrokenPipeError:
        # was: `except socket.error` plus a manual errno.EPIPE check;
        # BrokenPipeError is the dedicated OSError subclass for exactly this,
        # and non-EPIPE errors now propagate naturally instead of re-raising
        sys.exit(0)
def bytes_to_words(b):
    # Rounds a byte count DOWN to a multiple of 4 by clearing the low two bits.
    # NOTE(review): despite the name, the result is still in BYTES, not 32-bit
    # words (that would be b >> 2) -- confirm what the report tool expects.
    return int(b) & ~3
def _completed_total_words(stats: StatResults, path):
    """Return (completed_words, total_words) for the SearchDir at *path*.

    *path* is given with forward slashes and normalized with os.path.sep,
    since SearchDir normalizes its own path the same way (the original
    compared against the raw forward-slash string, which could not match
    on Windows). Raises ValueError when no such SearchDir exists.
    """
    wanted = path.replace('/', os.path.sep)
    sd = next((x for x in stats.search_dirs if x.path == wanted), None)
    if sd is None:
        raise ValueError('Could not find "' + path + '" SearchDir')
    return (bytes_to_words(sd.function_byte_count - sd.nonmatching_byte_count),
            bytes_to_words(sd.function_byte_count))
def generate_report(stats: StatResults, version):
    """
    Send values over to the c report app.
    Arguments must be added in the following order:
    note: changing from "libultra" to "libultrare"
    SRC_DIR: int, size in 32-bit words of completed functions, in src/ directory
    SRC_DIR_MAX: int, size in 32-bit words of all tracked functions, in src/ directory
    GAME_DIR: int, size in 32-bit words of completed functions, in src/game/ directory
    GAME_DIR_MAX: int, size in 32-bit words of all tracked functions, in src/game/ directory
    INFLATE_DIR: int, size in 32-bit words of completed functions, in src/inflate/ directory
    INFLATE_DIR_MAX: int, size in 32-bit words of all tracked functions, in src/inflate/ directory
    LIBULTRA_DIR: int, size in 32-bit words of completed functions, in src/libultrare/ directory
    LIBULTRA_DIR_MAX: int, size in 32-bit words of all tracked functions, in src/libultrare/ directory
    DECOMPILED_WORDS: int, size in 32-bit words of all completed functions
    DECOMPILED_WORDS_MAX: int, size in 32-bit words of all tracked functions
    DECOMPILED_FILES: int, count of all fully completed files
    DECOMPILED_FILES_MAX: int, count of all tracked files
    HTML_TEMPLATE: string: path to html template input file
    HTML_OUTPUT: string: path to generated html file
    LAST_MODIFIED_FILE: string: name of last modified file
    LOG_LEVEL: int (enum): LOG_MIN = 0, LOG_DEF, LOG_MAX
    """
    src_dir, src_dir_max = _completed_total_words(stats, 'src')
    game_dir, game_dir_max = _completed_total_words(stats, 'src/game')
    inflate_dir, inflate_dir_max = _completed_total_words(stats, 'src/inflate')
    # note: libultrare
    libultrare_dir, libultrare_dir_max = _completed_total_words(stats, 'src/libultrare')
    decompiled_words = bytes_to_words(stats.total_function_byte_count - stats.total_nonmatching_byte_count)
    decompiled_words_max = bytes_to_words(stats.total_function_byte_count)
    decompiled_files = stats.total_completed_file_count
    decompiled_files_max = len(stats.source_files)
    if version == 'us':
        html_output = __report_out_us
    elif version == 'jp':
        html_output = __report_out_jp
    elif version == 'eu':
        html_output = __report_out_eu
    else:
        print('fatal: version', version, 'not supported! Supported versions are: ', ', '.join(__supported_versions))
        sys.exit(2)
    # fix: previously the command was joined into one string and re-split on
    # whitespace, which broke if any path (template, output, last modified
    # file) contained a space; build the argv list directly instead
    args = [
        __report_bin,
        str(src_dir), str(src_dir_max),
        str(game_dir), str(game_dir_max),
        str(inflate_dir), str(inflate_dir_max),
        str(libultrare_dir), str(libultrare_dir_max),
        str(decompiled_words), str(decompiled_words_max),
        str(decompiled_files), str(decompiled_files_max),
        __report_template,
        html_output,
        stats.last_modified_file.path,
        '0',  # LOG_LEVEL: LOG_MIN
    ]
    subprocess.Popen(args)
def print_help():
    """Print the usage text for this script."""
    lines = [
        'Usage: python3 ge-stats2.py [OPTION]',
        'Generate Decompilation Stats for Goldeneye 007',
        'Example: python3 ge-stats2.py --version us',
        ' ',
        ' This script expects to be run from the root directory. There must be',
        ' a map file in ./build/u/ge007.u.map (or equivalent version). Source files',
        ' must be located in a folder in the root directory.',
        ' ',
        ' This script does not attempt to parse preprocessor definitions and',
        ' ignores version differences in source files. For the purposes of these',
        ' stats, any version with an asm definition of a function will count',
        ' against completion. However, a version is still required to specify',
        ' which build map to reference against.',
        ' ',
        'Options:',
        ' -v, --version=CODE generate decomp stats for version CODE (us, jp, or eu). Default is us',
        ' -r, --report generate html report using report tool',
        ' -n, --non_matching print csv of all non matching function definitions',
        ' --mtime_os use OS last modified time instead of git log',
        ' -h, --help display this help text and exit',
        ' --qlocal quiet "local" version. Disable all the extra print statements included for github actions.',
    ]
    # one print of the joined text emits byte-identical output to the
    # original per-line prints
    print('\n'.join(lines))
def main():
    """Entry point: parse options, gather stats, then print and/or report."""
    # default to US version
    version = 'us'
    run_report = False
    print_method = 'default'
    mtime_use_os = False
    qlocal = False
    if len(sys.argv) > 1:
        arguments, values = getopt.getopt(sys.argv[1:], "hv:rn", ["help", "version=", "report", "non_matching", "mtime_os", "qlocal"])
        for current_argument, current_value in arguments:
            if current_argument in ("-h", "--help"):
                print_help()
                sys.exit()
            elif current_argument in ("-v", "--version"):
                version = current_value.lower()
            elif current_argument in ("-r", "--report"):
                run_report = True
            elif current_argument in ("-n", "--non_matching"):
                print_method = 'non_matching'
            # fix: these two were `in ("--mtime_os")` / `in ("--qlocal")` --
            # substring tests against a plain string (a one-element "tuple"
            # without the trailing comma), which only worked by accident
            elif current_argument == "--mtime_os":
                mtime_use_os = True
            elif current_argument == "--qlocal":
                qlocal = True
    if version not in __supported_versions:
        print('fatal: version', version, 'not supported! Supported versions are: ', ', '.join(__supported_versions))
        sys.exit(2)
    if run_report:
        if not os.path.isfile(__report_bin):
            print('fatal: file not found: ' + __report_bin)
            sys.exit(6)
    # if this is just getting status on nonmatching files, ignore modified time lookup.
    if print_method == 'non_matching' and not run_report:
        mtime_use_os = True
    if not qlocal:
        # fix: stdout was not captured before, so this always printed "None".
        # NOTE(review): the quoted '"@{10 minutes ago}"' argument looks wrong
        # for a list-form subprocess call (git receives the quotes literally)
        # -- left untouched; confirm intent before changing it.
        result = subprocess.run(['git', 'diff', 'origin/master', '--name-only', '"@{10 minutes ago}"'], stdout=subprocess.PIPE, universal_newlines=True)
        print(result.stdout)
    # Default to using git log to get the file's modified date.
    # Git log will be much slower, but cloning a new repo (i.e., github actions online)
    # will reset all the modified timestamps to the same value, so will need to
    # resolve the last modified time from git history.
    mtime_resolver = mtime_os
    if not mtime_use_os:
        if os.path.isdir('.git'):
            mtime_resolver = mtime_git
        else:
            # ok, do one last check (for github actions)
            try:
                result = subprocess.run(['git', 'log', '-1', '--format="%ct"', '--', 'readme.md'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
                if result.returncode == 0:
                    mtime_resolver = mtime_git
            except OSError:
                # git unavailable: keep the OS timestamp resolver
                pass
    # files to count as complete, in src/ directory
    src_completed_list = [
        '_start.s',
        'aspboot.s',
        'bootcode.s',
        'getra.s',
        'gspboot.s',
        'osMapTLB.s',
        'rom_header.s',
        'rspboot.s',
        'tlb_hardwire.s',
        'tlb_resolve.s']
    search = [
        SearchDir('src', False, completed=src_completed_list),
        SearchDir('src/game', False),
        SearchDir('src/inflate', False),
        SearchDir('src/libultrare', True),
    ]
    stats = StatResults()
    process_source_files(search, stats, mtime_resolver, qlocal)
    apply_build_map(stats, version)
    generate_default_stats(stats)
    if print_method == 'default':
        print_default_stats(stats, version)
    elif print_method == 'non_matching':
        print_non_matching_stats(stats, version)
    if run_report:
        generate_report(stats, version)
if __name__ == '__main__':
    # uncomment the two timing lines to measure a full run:
    # start_time = time.time()
    main()
    # print("--- %s seconds ---" % (time.time() - start_time))