protofuzz
protofuzz copied to clipboard
Improve performance to limit file reads when generating fuzzed values.
There is a pretty obvious performance bottleneck: the functions _fuzzdb_get_strings and _fuzzdb_integers read the entire fuzzdb files from disk on every call.
# Original (pre-caching) implementation: every call re-walks the whole
# fuzzdb tree and re-reads every wordlist file from disk.
for subdir in os.listdir(_get_fuzzdb_path()):
    # "integer-overflow" is in `ignored`; that directory has its own reader.
    if subdir in ignored:
        continue
    subdir_abs_path = _get_fuzzdb_path() / Path(subdir)
    try:
        listing = os.listdir(subdir_abs_path)
    except NotADirectoryError:
        # Plain files at the fuzzdb top level are skipped.
        continue
    for filename in listing:
        # Only .txt wordlists carry fuzz payloads.
        if not filename.endswith(".txt"):
            continue
        subdir_abs_path_filename = subdir_abs_path / Path(filename)
        with open(subdir_abs_path_filename, "rb") as source:
            for line in source:
                string = line.decode("utf-8").strip()
                # Skip blank lines and '#' comment lines.
                if not string or string.startswith("#"):
                    continue
                # NOTE(review): this measures the raw line in bytes,
                # including the trailing newline, rather than the decoded
                # stripped string yielded below — looks like a latent
                # off-by-a-few-bytes bug; confirm intent.
                if max_len != 0 and len(line) > max_len:
                    continue
                yield string
This causes a performance bottleneck because the files get read on every call. Instead of reading the files every time, a much better strategy is to load all of the values into a list once and then yield them from that in-memory list instead of reading from disk constantly.
# Module-level cache of ALL fuzzdb strings, unfiltered.  It must be cached
# unfiltered: if the first caller's max_len were baked into the cache, every
# later call with a different max_len would silently get the wrong subset.
strings = []


def _fuzzdb_get_strings(max_len: int = 0) -> Generator:
    """Yield attack strings from the fuzzdb wordlists.

    The fuzzdb files are read from disk only once; the full, unfiltered
    set of strings is cached in the module-level ``strings`` list and all
    subsequent calls are served from memory.  The ``max_len`` filter is
    applied per call so that callers with different limits each get
    correct results.

    :param max_len: if non-zero, skip strings longer than this many
        characters.
    """
    if not strings:
        ignored = ["integer-overflow"]  # integers have their own reader
        base = _get_fuzzdb_path()
        for subdir in os.listdir(base):
            if subdir in ignored:
                continue
            subdir_abs_path = base / Path(subdir)
            try:
                listing = os.listdir(subdir_abs_path)
            except NotADirectoryError:
                # Plain files at the fuzzdb top level are skipped.
                continue
            for filename in listing:
                # Only .txt wordlists carry fuzz payloads.
                if not filename.endswith(".txt"):
                    continue
                with open(subdir_abs_path / Path(filename), "rb") as source:
                    for line in source:
                        string = line.decode("utf-8").strip()
                        # Skip blank lines and '#' comment lines.
                        if not string or string.startswith("#"):
                            continue
                        strings.append(string)
    for string in strings:
        # Filter on the decoded, stripped string.  The original filtered on
        # len(line) — the raw bytes including the trailing newline — which
        # over-counted by the newline width.
        if max_len != 0 and len(string) > max_len:
            continue
        yield string
We can obviously do the same for the integers:
# Module-level cache of ALL integer-overflow values, unlimited.  It must be
# cached without a limit: if the first caller's `limit` were baked into the
# cache, later calls with a larger (or zero) limit would be short-changed.
integers = []


def _fuzzdb_integers(limit: int = 0) -> Generator:
    """Yield boundary integers from fuzzdb's integer-overflows list.

    The file is read from disk once in full and cached in the module-level
    ``integers`` list; the ``limit`` is applied per call, so callers with
    different limits each get correct results.

    :param limit: if non-zero, yield at most this many integers.
    """
    if not integers:
        path = _get_fuzzdb_path() / Path("integer-overflow/integer-overflows.txt")
        with open(path, "rb") as stream:
            for line in stream:
                # Base 0 lets int() honor 0x/0o/0b prefixes in the wordlist.
                integers.append(int(line.decode("utf-8"), 0))
    # Per-call limit; limit == 0 means "no limit", matching _limit_helper's
    # default — presumably its exact semantics, TODO confirm against it.
    for count, integer in enumerate(integers):
        if limit != 0 and count >= limit:
            break
        yield integer
When parsing around four thousand protobuf messages and then mutating them originally the cProfile was this:
['60786110', '50.667', '0.000', '100.909', '0.000', 'values.py:90(_fuzzdb_get_strings)']
['60973618/60973382', '15.971', '0.000', '116.882', '0.000', 'values.py:72(_limit_helper)']
['62370202', '10.454', '0.000', '10.454', '0.000', '{method', "'decode'", 'of', "'bytes'", 'objects}']
['60981956', '8.501', '0.000', '8.501', '0.000', '{method', "'startswith'", 'of', "'str'", 'objects}']
The second column is the total time spent in the function itself.
after the optimization it is now this:
['60790067/60790036', '12.243', '0.000', '17.458', '0.000', 'values.py:69(_limit_helper)']
['60786110', '5.203', '0.000', '5.215', '0.000', 'values.py:97(_fuzzdb_get_strings)']
['4630', '4.368', '0.001', '20.797', '0.004', 'protofuzz.py:48(_string_generator)']
['265', '0.984', '0.004', '2.028', '0.008', 'protofuzz.py:55(<listcomp>)']
['371565/34403', '0.318', '0.000', '0.875', '0.000', 'gen.py:188(step_generator)']
['48027', '0.286', '0.000', '0.482', '0.000', 'python_message.py:469(init)']
So just by making this simple optimization I have cut the processing time down by at least 45 seconds.
I have attached a .zip file containing a .diff file that implements this change. Apply it to commit a4fd0936ae501dba0fc89d3af4ead8ef3aaad1d1 and you are done. stuff.zip