clusterfuzz
clusterfuzz copied to clipboard
question about Crash deduplication logic
I’m extracting ClusterFuzz’s crash deduplication logic into a standalone Python script.
When I parse an ASAN log (A.asan.log) with clusterfuzz.stacktraces.StackParser().parse(), I get a CrashInfo object containing four main fields:
-
crash_type: the sanitizer error kind (e.g. buffer-overflow)
-
crash_state: a signature derived from the “top” user-code frames (line numbers)
-
address: the faulting memory address
-
stacktrace: the full stack trace
I’d like to know exactly what logic ClusterFuzz uses to decide that two instances represent the same crash for deduplication purposes.
Does the script I wrote match ClusterFuzz’s logic?
if is_similar(pool.crash_type, corpus.crash_type) and is_similar(pool.crash_state, corpus.crash_state)\
and is_similar(pool.crash_stacktrace, corpus.crash_stacktrace):
full script is as follows
import os
import sys
# Make the vendored ClusterFuzz checkout importable: this expects a
# 'clusterfuzz/src' tree sitting next to this script.
script_dir = os.path.dirname(__file__)
cf_src = os.path.abspath(os.path.join(script_dir, 'clusterfuzz', 'src'))
sys.path.insert(0, cf_src)
import collections  # NOTE(review): unused in this file — candidate for removal
# These two imports only resolve because of the sys.path.insert above.
from clusterfuzz._internal.crash_analysis.crash_comparer import CrashComparer
import clusterfuzz.stacktraces
# Root directory scanned for per-bug subdirectories of ASAN logs.
BUG_DIR = "/ASAN_LOG"
def _load_test_data(path):
with open(path, 'r', encoding='utf-8', errors='ignore') as f:
return f.read()
def collect_log_files(directory):
    """Recursively collect sanitizer log files under *directory*.

    Args:
        directory: root directory to walk.

    Returns:
        A sorted list of paths for every file ending in ``.log``.
        Note: ``'.asan.log'`` also ends with ``'.log'``, so the original
        two-suffix check was redundant — a single suffix test is equivalent.
    """
    files = []
    for root, _, names in os.walk(directory):
        for name in names:
            if name.endswith('.log'):  # covers '.asan.log' as well
                files.append(os.path.join(root, name))
    return sorted(files)
def is_similar(data1, data2):
    """Delegate similarity of two crash attributes to ClusterFuzz's CrashComparer."""
    return CrashComparer(data1, data2).is_similar()
def deduplicate_logs(corpuses):
    """Cluster CrashInfo objects, keeping one representative per unique crash.

    ClusterFuzz's own grouping compares only ``crash_type`` and
    ``crash_state`` (via CrashComparer); it does NOT compare full
    stacktraces.  The full trace varies between runs of the same bug
    (ASLR'd addresses, incidental frames), so including it — as the
    original script did — makes deduplication far too strict and splits
    one bug into many "unique" crashes.  The stacktrace comparison has
    therefore been dropped here.

    Args:
        corpuses: iterable of parsed CrashInfo objects.

    Returns:
        List of CrashInfo objects, one per distinct crash cluster
        (the first instance seen of each cluster).
    """
    clusters = []
    for corpus in corpuses:
        for representative in clusters:
            if (is_similar(representative.crash_type, corpus.crash_type)
                    and is_similar(representative.crash_state, corpus.crash_state)):
                break  # matches an existing cluster; not a new crash
        else:
            clusters.append(corpus)
    return clusters
def covert_asan_log_into_data(files):
    """Parse each log file into a ClusterFuzz CrashInfo object.

    Files whose content does not yield a CrashInfo are skipped.

    Args:
        files: iterable of log file paths.

    Returns:
        List of CrashInfo objects, one per successfully parsed file.
    """
    crash_infos = []
    for path in files:
        with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
            log_text = handle.read()
        # NOTE(review): passing file_path= is a local modification of
        # StackParser made by the script author — confirm against the
        # upstream constructor signature.
        parser = clusterfuzz.stacktraces.StackParser(file_path=path)
        info = parser.parse(log_text)
        if info:
            crash_infos.append(info)
    return crash_infos
if __name__ == '__main__':
    # Produce one deduplication report per bug subdirectory under BUG_DIR.
    for sub in sorted(os.listdir(BUG_DIR)):
        subdir = os.path.join(BUG_DIR, sub)
        if os.path.isdir(subdir):
            print(f"=== Directory: {sub} ===")
            log_files = collect_log_files(subdir)
            print(f"{len(log_files)} log files Found")
            crash_infos = covert_asan_log_into_data(log_files)
            print(f"{len(crash_infos)} crash info was collected\n")
            clusters = deduplicate_logs(crash_infos)
            print(f"{len(clusters)} crash Found.\n")