From fdf0ed245bc9736dee8cbb2c1ade1006313c2b9e Mon Sep 17 00:00:00 2001 From: Zach Banks Date: Thu, 13 Jan 2022 01:25:39 -0500 Subject: [PATCH] Improve `find_duplicates.py` performance, ~30x (#618) - Use length heuristic to avoid computing true Lev ratio - Prebuild `sym_bytes` map for `sym_name` -> `query_bytes` --- tools/find_duplicates.py | 49 +++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/tools/find_duplicates.py b/tools/find_duplicates.py index bbe223da12..0695d9ebe1 100755 --- a/tools/find_duplicates.py +++ b/tools/find_duplicates.py @@ -124,6 +124,11 @@ def diff_syms(qb, tb): if len(tb[1]) < 8: return 0 + # The minimum edit distance for two strings of different lengths is `abs(l1 - l2)` + # Quickly check if it's impossible to beat the threshold. If it is, then return 0 + l1, l2 = len(qb[0]), len(tb[0]) + if abs(l1 - l2) / (l1 + l2) > 1.0 - args.threshold: + return 0 r = ratio(qb[0], tb[0]) if r == 1.0 and qb[1] != tb[1]: @@ -261,36 +266,38 @@ def do_cross_query(): ccount = Counter() clusters = [] + sym_bytes = {} for sym_name in map_syms: if not sym_name.startswith("D_") and \ not sym_name.startswith("_binary") and \ not sym_name.startswith("jtbl_") and \ not re.match(r"L[0-9A-F]{8}_[0-9A-F]{5,6}", sym_name): if get_symbol_length(sym_name) > 16: - query_bytes = get_symbol_bytes(map_offsets, sym_name) + sym_bytes[sym_name] = get_symbol_bytes(map_offsets, sym_name) - cluster_match = False - for cluster in clusters: - cluster_first = cluster[0] - cluster_score = get_pair_score(query_bytes, cluster_first) - if cluster_score >= args.threshold: - cluster_match = True - if sym_name.startswith("func") and not cluster_first.startswith("func"): - ccount[sym_name] = ccount[cluster_first] - del ccount[cluster_first] - cluster_first = sym_name - cluster.insert(0, cluster_first) - else: - cluster.append(sym_name) + for sym_name, query_bytes in sym_bytes.items(): + cluster_match = False + for cluster in clusters: + cluster_first = cluster[0] + cluster_score = diff_syms(query_bytes, sym_bytes[cluster_first]) + if cluster_score >= args.threshold: + cluster_match = True + if sym_name.startswith("func") and not cluster_first.startswith("func"): + ccount[sym_name] = ccount[cluster_first] + del ccount[cluster_first] + cluster_first = sym_name + cluster.insert(0, cluster_first) + else: + cluster.append(sym_name) - if cluster_first.startswith("func"): - ccount[cluster_first] += 1 + if cluster_first.startswith("func"): + ccount[cluster_first] += 1 - #if len(cluster) % 10 == 0 and len(cluster) >= 10: - print(f"Cluster {cluster_first} grew to size {len(cluster)} - {sym_name}: {str(cluster_score)}") - break - if not cluster_match: - clusters.append([sym_name]) + #if len(cluster) % 10 == 0 and len(cluster) >= 10: + print(f"Cluster {cluster_first} grew to size {len(cluster)} - {sym_name}: {str(cluster_score)}") + break + if not cluster_match: + clusters.append([sym_name]) print(ccount.most_common(100))