GrammarReducer not working as expected
Describe the bug
I tried the GrammarReducer (version 0.8.1 from PyPI) from the "Reducing Failure-Inducing Inputs" chapter on an example that processes CSV files, and got a really strange result: instead of the reduced input, I get the original input with additional nonterminals inserted.
Precisely:
I have an input
"Field 1";"Field 2";"Field 3"
17;23;42
5648;13459;"some
multiline
text"
1;2;"now:";4
100;200;300
where the second-to-last line contains one more field than defined in the header; this raises an IndexError. My homegrown reduction reduces the input to
""
"";
which really is quite minimal. From the GrammarReducer, I get
<spaces-1>"Field 1<escaped-character-1>"<spaces-2>;<spaces-1>"Field 2<escaped-character-1>"<spaces-2>;<spaces-1>"Field 3<escaped-character-1>"<spaces-2>
<spaces-1>17<simple-character-1><spaces-2>;<spaces-1>23<simple-character-1><spaces-2>;<spaces-1>42<simple-character-1><spaces-2>
<spaces-1>5648<simple-character-1><spaces-2>;<spaces-1>13459<simple-character-1><spaces-2>;<spaces-1>"some
multiline
text<escaped-character-1>"<spaces-2>
<spaces-1>1<simple-character-1><spaces-2>;<spaces-1>2<simple-character-1><spaces-2>;<spaces-1>"now:<escaped-character-1>"<spaces-2>;<spaces-1>4<simple-character-1><spaces-2>
<spaces-1>100<simple-character-1><spaces-2>;<spaces-1>200<simple-character-1><spaces-2>;<spaces-1>300<simple-character-1><spaces-2>
<csv-record-1>
To Reproduce To reproduce the behavior, run the following code:
import string
from fuzzingbook.Fuzzer import Runner
from fuzzingbook.GrammarFuzzer import tree_to_string
from fuzzingbook.Grammars import convert_ebnf_grammar, srange
from fuzzingbook.Parser import EarleyParser
from fuzzingbook.Reducer import GrammarReducer
# EBNF grammar for semicolon-separated CSV files. The `*` and `?` operators
# used below are handed to convert_ebnf_grammar() before parsing.
CSV_EBNF_GRAMMAR = {
    "<start>": ["<csv-file>"],
    # A file is any number of records; every record ends with a newline.
    "<csv-file>": ["<csv-record>*"],
    "<csv-record>": ["<csv-string-list>\n"],
    # Fields are separated by ";" (right-recursive list).
    "<csv-string-list>": ["<raw-string>", "<raw-string>;<csv-string-list>"],
    # A field is either only spaces, or a value with optional spaces around it.
    "<raw-string>": ["<spaces>", "<spaces>?<raw-field><spaces>?"],
    "<raw-field>": ["<simple-field>", "<quoted-field>"],
    "<simple-field>": ["<simple-character>*"],
    # Unquoted fields: any printable character except newline, ";", '"',
    # space, tab and carriage return.
    "<simple-character>": [
        char
        for char in srange(string.printable)
        if char not in ["\n", ";", '"', " ", "\t", "\r"]
    ],
    "<quoted-field>": ['"<escaped-field>"'],
    "<escaped-field>": ["<escaped-character>*"],
    # Quoted fields: any printable character except the double quote itself.
    "<escaped-character>": [
        char
        for char in srange(string.printable)
        if char not in ['"']
    ],
    # One or more blanks.
    "<spaces>": [" ", " <spaces>"],
}

# Grammar derived from the EBNF definition above, and the module-level parser
# built from it.
CSV_GRAMMAR = convert_ebnf_grammar(CSV_EBNF_GRAMMAR)
CSV_PARSER = EarleyParser(CSV_GRAMMAR)
class CSVFile:
    """In-memory CSV table: a header plus one dict per data line.

    Every data line is stored as a dict that maps the header name at
    position ``i`` to the line's entry at position ``i``.
    """

    def __init__(self, header: list, lines: list[list]):
        """Build the table from ``header`` and the data ``lines``.

        Raises:
            IndexError: if a data line has more entries than ``header``.
                This is intentionally not guarded against; ReadCSVFileRunner
                in this file treats that IndexError as the failure signal.
        """
        self.header = header
        # header[i] is looked up for every entry, so an over-long line
        # raises IndexError here.
        self.lines = [
            {header[i]: entry for i, entry in enumerate(line)}
            for line in lines
        ]

    def __getitem__(self, item):
        """Return a column for a string key, otherwise the line(s) at ``item``.

        A string selects the value stored under that header name in every
        line; anything else is used to index the list of line dicts.
        """
        # isinstance (rather than an exact type check) also accepts str
        # subclasses as column names.
        if isinstance(item, str):
            return [line[item] for line in self.lines]
        return self.lines[item]

    def __iter__(self):
        """Iterate over the line dicts."""
        return iter(self.lines)

    def __str__(self):
        """Render the header and all lines, fields joined by " ; "."""
        rows = [" ; ".join(self.header)]
        rows.extend(" ; ".join(line.values()) for line in self.lines)
        return "\n".join(rows)
def dfs(tree, action=print):
    """Call ``action`` on every node of ``tree`` in depth-first pre-order.

    ``tree`` is a ``(node, children)`` pair whose children are pairs of the
    same shape. ``action`` receives the whole pair, not just the node.

    The traversal keeps an explicit stack instead of recursing, so deeply
    nested trees do not run into Python's recursion limit.
    """
    pending = [tree]
    while pending:
        current = pending.pop()
        _, children = current
        action(current)
        # Push in reverse so the leftmost child is popped (visited) first.
        pending.extend(reversed(list(children)))
def read_csv_file(inp):
    """Parse ``inp`` with CSV_PARSER and return its contents as a CSVFile.

    The first parse tree is walked with dfs(). Every ``<csv-record>`` node
    contributes one list of field strings; the first such list becomes the
    header and the remaining ones the data lines.

    Raises:
        SyntaxError: if the parse tree contains no record at all.
    Exceptions raised by the parser or by CSVFile propagate unchanged.
    """
    parse_tree = list(CSV_PARSER.parse(inp))[0]
    records = []

    def fields_below(subtree):
        """Collect the text of the first child of each ``<csv-string-list>``."""
        found = []

        def visit(t):
            symbol, kids = t
            if symbol == "<csv-string-list>":
                found.append(tree_to_string(kids[0]))

        dfs(subtree, visit)
        return found

    def collect_records(t):
        symbol, kids = t
        if symbol != "<csv-record>":
            return
        for kid in kids:
            found = fields_below(kid)
            # Children without any field (e.g. the trailing newline) are skipped.
            if found:
                records.append(found)

    dfs(parse_tree, collect_records)
    if not records:
        raise SyntaxError("CSV file must contain at least a header line!")
    return CSVFile(records[0], records[1:])
# Failure-inducing input: the record `1;2;"now:";4` has four fields, while the
# header line declares only three.
buggy_input = """"Field 1";"Field 2";"Field 3"
17;23;42
5648;13459;"some
multiline
text"
1;2;"now:";4
100;200;300
"""
class ReadCSVFileRunner(Runner):
    """Runner that feeds an input to read_csv_file() and classifies the result."""

    def run(self, inp):
        """Return ``(inp, outcome)`` for one call of read_csv_file(inp).

        The outcome is Runner.PASS if no exception is raised,
        Runner.UNRESOLVED on SyntaxError and Runner.FAIL on IndexError.
        Any other exception propagates to the caller.
        """
        try:
            read_csv_file(inp)
        except SyntaxError:
            outcome = Runner.UNRESOLVED
        except IndexError:
            outcome = Runner.FAIL
        else:
            outcome = Runner.PASS
        return inp, outcome
# Runner that classifies each candidate input.
read_csv_eval = ReadCSVFileRunner()
# The reducer gets its own EarleyParser instance built from CSV_GRAMMAR.
grammar_reducer = GrammarReducer(read_csv_eval, EarleyParser(CSV_GRAMMAR))
# Print whatever the reducer returns for the failure-inducing input.
print(grammar_reducer.reduce(buggy_input))
Expected behavior The expected printed output should somehow resemble
""
"";
Desktop (please complete the following information):
- OS: macOS Big Sur 11.2.2
- Browser: Does not apply (using PyPi version)
- Python version: 3.9
Additional context
I developed an alternative implementation of GrammarReducer without knowing of the existence of GrammarReducer. Now, I tried to replace my own implementation, but unfortunately did not obtain the desired behavior.
The cause is not the version on PyPI: I just executed a cell with the following (only slightly changed) content in the interactive notebook for the current fuzzing book version, and got the same result:
import string
from Fuzzer import Runner
from GrammarFuzzer import tree_to_string
from Grammars import convert_ebnf_grammar, srange
from Parser import EarleyParser
from Reducer import GrammarReducer
# EBNF grammar for semicolon-separated CSV files. The `*` and `?` operators
# used below are handed to convert_ebnf_grammar() before parsing.
CSV_EBNF_GRAMMAR = {
    "<start>": ["<csv-file>"],
    # A file is any number of records; every record ends with a newline.
    "<csv-file>": ["<csv-record>*"],
    "<csv-record>": ["<csv-string-list>\n"],
    # Fields are separated by ";" (right-recursive list).
    "<csv-string-list>": ["<raw-string>", "<raw-string>;<csv-string-list>"],
    # A field is either only spaces, or a value with optional spaces around it.
    "<raw-string>": ["<spaces>", "<spaces>?<raw-field><spaces>?"],
    "<raw-field>": ["<simple-field>", "<quoted-field>"],
    "<simple-field>": ["<simple-character>*"],
    # Unquoted fields: any printable character except newline, ";", '"',
    # space, tab and carriage return.
    "<simple-character>": [
        char
        for char in srange(string.printable)
        if char not in ["\n", ";", '"', " ", "\t", "\r"]
    ],
    "<quoted-field>": ['"<escaped-field>"'],
    "<escaped-field>": ["<escaped-character>*"],
    # Quoted fields: any printable character except the double quote itself.
    "<escaped-character>": [
        char
        for char in srange(string.printable)
        if char not in ['"']
    ],
    # One or more blanks.
    "<spaces>": [" ", " <spaces>"],
}

# Grammar derived from the EBNF definition above, and the module-level parser
# built from it.
CSV_GRAMMAR = convert_ebnf_grammar(CSV_EBNF_GRAMMAR)
CSV_PARSER = EarleyParser(CSV_GRAMMAR)
class CSVFile:
    """In-memory CSV table: a header plus one dict per data line.

    Every data line is stored as a dict that maps the header name at
    position ``i`` to the line's entry at position ``i``.
    """

    def __init__(self, header, lines):
        """Build the table from ``header`` and the data ``lines``.

        Raises:
            IndexError: if a data line has more entries than ``header``.
                This is intentionally not guarded against; ReadCSVFileRunner
                in this file treats that IndexError as the failure signal.
        """
        self.header = header
        # header[i] is looked up for every entry, so an over-long line
        # raises IndexError here.
        self.lines = [
            {header[i]: entry for i, entry in enumerate(line)}
            for line in lines
        ]

    def __getitem__(self, item):
        """Return a column for a string key, otherwise the line(s) at ``item``.

        A string selects the value stored under that header name in every
        line; anything else is used to index the list of line dicts.
        """
        # isinstance (rather than an exact type check) also accepts str
        # subclasses as column names.
        if isinstance(item, str):
            return [line[item] for line in self.lines]
        return self.lines[item]

    def __iter__(self):
        """Iterate over the line dicts."""
        return iter(self.lines)

    def __str__(self):
        """Render the header and all lines, fields joined by " ; "."""
        rows = [" ; ".join(self.header)]
        rows.extend(" ; ".join(line.values()) for line in self.lines)
        return "\n".join(rows)
def dfs(tree, action=print):
    """Call ``action`` on every node of ``tree`` in depth-first pre-order.

    ``tree`` is a ``(node, children)`` pair whose children are pairs of the
    same shape. ``action`` receives the whole pair, not just the node.

    The traversal keeps an explicit stack instead of recursing, so deeply
    nested trees do not run into Python's recursion limit.
    """
    pending = [tree]
    while pending:
        current = pending.pop()
        _, children = current
        action(current)
        # Push in reverse so the leftmost child is popped (visited) first.
        pending.extend(reversed(list(children)))
def read_csv_file(inp):
    """Parse ``inp`` with CSV_PARSER and return its contents as a CSVFile.

    The first parse tree is walked with dfs(). Every ``<csv-record>`` node
    contributes one list of field strings; the first such list becomes the
    header and the remaining ones the data lines.

    Raises:
        SyntaxError: if the parse tree contains no record at all.
    Exceptions raised by the parser or by CSVFile propagate unchanged.
    """
    parse_tree = list(CSV_PARSER.parse(inp))[0]
    records = []

    def fields_below(subtree):
        """Collect the text of the first child of each ``<csv-string-list>``."""
        found = []

        def visit(t):
            symbol, kids = t
            if symbol == "<csv-string-list>":
                found.append(tree_to_string(kids[0]))

        dfs(subtree, visit)
        return found

    def collect_records(t):
        symbol, kids = t
        if symbol != "<csv-record>":
            return
        for kid in kids:
            found = fields_below(kid)
            # Children without any field (e.g. the trailing newline) are skipped.
            if found:
                records.append(found)

    dfs(parse_tree, collect_records)
    if not records:
        raise SyntaxError("CSV file must contain at least a header line!")
    return CSVFile(records[0], records[1:])
# Failure-inducing input: the record `1;2;"now:";4` has four fields, while the
# header line declares only three.
buggy_input = """"Field 1";"Field 2";"Field 3"
17;23;42
5648;13459;"some
multiline
text"
1;2;"now:";4
100;200;300
"""
class ReadCSVFileRunner(Runner):
    """Runner that feeds an input to read_csv_file() and classifies the result."""

    def run(self, inp):
        """Return ``(inp, outcome)`` for one call of read_csv_file(inp).

        The outcome is Runner.PASS if no exception is raised,
        Runner.UNRESOLVED on SyntaxError and Runner.FAIL on IndexError.
        Any other exception propagates to the caller.
        """
        try:
            read_csv_file(inp)
        except SyntaxError:
            outcome = Runner.UNRESOLVED
        except IndexError:
            outcome = Runner.FAIL
        else:
            outcome = Runner.PASS
        return inp, outcome
# Runner that classifies each candidate input.
read_csv_eval = ReadCSVFileRunner()
# The reducer gets its own EarleyParser instance built from CSV_GRAMMAR.
grammar_reducer = GrammarReducer(read_csv_eval, EarleyParser(CSV_GRAMMAR))
# Print whatever the reducer returns for the failure-inducing input.
print(grammar_reducer.reduce(buggy_input))