GrammarReducer not working as expected

Open rindPHI opened this issue 4 years ago • 1 comments

Describe the bug: I tried the GrammarReducer (version 0.8.1 from PyPI) from the "Reducing Failure-Inducing Inputs" chapter on an example that processes CSV files, and got a really strange result. Instead of the reduced input, I get the original input with additional nonterminals inserted.

Precisely:

I have an input

"Field 1";"Field 2";"Field 3"
17;23;42
5648;13459;"some
multiline
text"
1;2;"now:";4
100;200;300

where the second-to-last line contains one more field than defined in the header; this raises an IndexError. My homegrown reduction reduces the input to

""
"";

which really is quite minimal. From the GrammarReducer, I get

<spaces-1>"Field 1<escaped-character-1>"<spaces-2>;<spaces-1>"Field 2<escaped-character-1>"<spaces-2>;<spaces-1>"Field 3<escaped-character-1>"<spaces-2>
<spaces-1>17<simple-character-1><spaces-2>;<spaces-1>23<simple-character-1><spaces-2>;<spaces-1>42<simple-character-1><spaces-2>
<spaces-1>5648<simple-character-1><spaces-2>;<spaces-1>13459<simple-character-1><spaces-2>;<spaces-1>"some
multiline
text<escaped-character-1>"<spaces-2>
<spaces-1>1<simple-character-1><spaces-2>;<spaces-1>2<simple-character-1><spaces-2>;<spaces-1>"now:<escaped-character-1>"<spaces-2>;<spaces-1>4<simple-character-1><spaces-2>
<spaces-1>100<simple-character-1><spaces-2>;<spaces-1>200<simple-character-1><spaces-2>;<spaces-1>300<simple-character-1><spaces-2>
<csv-record-1>

To Reproduce To reproduce the behavior, run the following code:

import string

from fuzzingbook.Fuzzer import Runner
from fuzzingbook.GrammarFuzzer import tree_to_string
from fuzzingbook.Grammars import convert_ebnf_grammar, srange
from fuzzingbook.Parser import EarleyParser
from fuzzingbook.Reducer import GrammarReducer

# EBNF-style grammar for the CSV dialect used in this report: fields are
# separated by ";" and every record is terminated by a newline.
CSV_EBNF_GRAMMAR = {
    "<start>": ["<csv-file>"],
    "<csv-file>": ["<csv-record>*"],
    "<csv-record>": ["<csv-string-list>\n"],
    "<csv-string-list>": ["<raw-string>", "<raw-string>;<csv-string-list>"],
    "<raw-string>": ["<spaces>", "<spaces>?<raw-field><spaces>?"],
    "<raw-field>": ["<simple-field>", "<quoted-field>"],
    "<simple-field>": ["<simple-character>*"],
    # Unquoted characters: the listed separators, quote and whitespace
    # characters are filtered out of string.printable.
    "<simple-character>": [c for c in srange(string.printable) if c not in ["\n", ";", '"', " ", "\t", "\r"]],
    "<quoted-field>": ['"<escaped-field>"'],
    "<escaped-field>": ["<escaped-character>*"],
    # Inside quotes only the double quote itself is filtered out, so quoted
    # fields may contain newlines and ";".
    "<escaped-character>": [c for c in srange(string.printable) if c not in ['"']],
    "<spaces>": [" ", " <spaces>"],
}

# NOTE(review): presumably rewrites the EBNF operators ("*", "?") into plain
# grammar rules -- confirm against the Grammars chapter.
CSV_GRAMMAR = convert_ebnf_grammar(CSV_EBNF_GRAMMAR)

# Module-level parser used by read_csv_file().
CSV_PARSER = EarleyParser(CSV_GRAMMAR)


class CSVFile:
    """In-memory CSV table: a header plus one dict per data row.

    Each row dict maps the header name at position ``i`` to the row's entry
    at position ``i``.
    """

    def __init__(self, header: list, lines: list[list]):
        """Build the row dicts from `header` and the raw `lines`.

        Raises:
            IndexError: if a row has more entries than `header`; the
                reproduction in this report relies on that failure.
        """
        self.header = header
        self.lines = [
            {header[position]: value for position, value in enumerate(row)}
            for row in lines
        ]

    def __getitem__(self, item):
        """Return a column (for a `str` key) or a single row dict (otherwise)."""
        if type(item) is not str:
            return self.lines[item]
        return [row[item] for row in self.lines]

    def __iter__(self):
        """Iterate over the row dicts."""
        return iter(self.lines)

    def __str__(self):
        """Render header and rows, entries joined by " ; ", one row per line."""
        header_text = " ; ".join(self.header)
        separator = "\n" if self.lines else ""
        row_texts = [" ; ".join(row.values()) for row in self.lines]
        return header_text + separator + "\n".join(row_texts)


def dfs(tree, action=print):
    """Call `action` on every node of `tree` in depth-first pre-order.

    A node is a ``(symbol, children)`` pair; `action` receives the whole
    pair and is invoked on a node before any of its children.

    The walk uses an explicit stack instead of recursion, so the depth of
    the tree is not bounded by Python's recursion limit.

    Args:
        tree: Root node, a ``(symbol, children)`` pair.
        action: Callable applied to each node; defaults to `print`.
    """
    pending = [tree]
    while pending:
        current = pending.pop()
        _, children = current
        action(current)
        # Push children reversed so the leftmost child is popped first.
        pending.extend(reversed(list(children)))


def read_csv_file(inp):
    """Parse the CSV text `inp` and return it as a `CSVFile`.

    The first parse tree produced by `CSV_PARSER` is walked with `dfs`.
    Every child of a `<csv-record>` node that contains at least one
    `<csv-string-list>` contributes one list of field strings. The first
    such list becomes the header, the remaining ones the data rows.

    Raises:
        SyntaxError: if no record was collected.
        IndexError: propagated from `CSVFile` when a row has more fields
            than the header.
    """
    tree = list(CSV_PARSER.parse(inp))[0]
    records = []

    def fields_below(subtree):
        # One field per <csv-string-list> node: the text of its first child.
        collected = []

        def grab_field(t):
            symbol, kids = t
            if symbol == "<csv-string-list>":
                collected.append(tree_to_string(kids[0]))

        dfs(subtree, grab_field)
        return collected

    def grab_record(t):
        symbol, kids = t
        if symbol != "<csv-record>":
            return
        for kid in kids:
            row = fields_below(kid)
            if row:
                records.append(row)

    dfs(tree, grab_record)

    if not records:
        raise SyntaxError("CSV file must contain at least a header line!")

    header, *rows = records
    return CSVFile(header, rows)


# Failure-inducing input: the row `1;2;"now:";4` has four fields, one more
# than the three-column header.
buggy_input = """"Field 1";"Field 2";"Field 3"
17;23;42
5648;13459;"some
multiline
text"
1;2;"now:";4
100;200;300
"""


class ReadCSVFileRunner(Runner):
    """Runner that classifies an input by how `read_csv_file` handles it."""

    def run(self, inp):
        """Return ``(inp, outcome)``.

        The outcome is `Runner.PASS` when `read_csv_file` succeeds,
        `Runner.UNRESOLVED` on SyntaxError and `Runner.FAIL` on IndexError.
        Any other exception propagates.
        """
        try:
            read_csv_file(inp)
        except SyntaxError:
            outcome = Runner.UNRESOLVED
        except IndexError:
            outcome = Runner.FAIL
        else:
            outcome = Runner.PASS
        return inp, outcome


# Reduce the failing input with GrammarReducer. Per this report, the printed
# result is the original input with nonterminals inserted rather than a
# reduced string.
read_csv_eval = ReadCSVFileRunner()
# The reducer gets its own EarleyParser built from the same grammar.
grammar_reducer = GrammarReducer(
    read_csv_eval,
    EarleyParser(CSV_GRAMMAR))
print(grammar_reducer.reduce(buggy_input))

Expected behavior The expected printed output should somehow resemble

""
"";

Desktop (please complete the following information):

OS: macOS Big Sur 11.2.2
Browser: Does not apply (using PyPi version)
Python version: 3.9

Additional context I developed an alternative implementation of GrammarReducer without knowing of the existence of GrammarReducer. Now, I tried to replace my own implementation, but unfortunately did not obtain the desired behavior.

Mar 11 '21 10:03 rindPHI

The cause is not the version on PyPI: I just executed a cell with the following (only slightly changed) content in the interactive notebook for the current fuzzing book version and got the same result:

import string

from Fuzzer import Runner
from GrammarFuzzer import tree_to_string
from Grammars import convert_ebnf_grammar, srange
from Parser import EarleyParser
from Reducer import GrammarReducer

# EBNF-style grammar for the CSV dialect used in this report: fields are
# separated by ";" and every record is terminated by a newline.
CSV_EBNF_GRAMMAR = {
    "<start>": ["<csv-file>"],
    "<csv-file>": ["<csv-record>*"],
    "<csv-record>": ["<csv-string-list>\n"],
    "<csv-string-list>": ["<raw-string>", "<raw-string>;<csv-string-list>"],
    "<raw-string>": ["<spaces>", "<spaces>?<raw-field><spaces>?"],
    "<raw-field>": ["<simple-field>", "<quoted-field>"],
    "<simple-field>": ["<simple-character>*"],
    # Unquoted characters: the listed separators, quote and whitespace
    # characters are filtered out of string.printable.
    "<simple-character>": [c for c in srange(string.printable) if c not in ["\n", ";", '"', " ", "\t", "\r"]],
    "<quoted-field>": ['"<escaped-field>"'],
    "<escaped-field>": ["<escaped-character>*"],
    # Inside quotes only the double quote itself is filtered out, so quoted
    # fields may contain newlines and ";".
    "<escaped-character>": [c for c in srange(string.printable) if c not in ['"']],
    "<spaces>": [" ", " <spaces>"],
}

# NOTE(review): presumably rewrites the EBNF operators ("*", "?") into plain
# grammar rules -- confirm against the Grammars chapter.
CSV_GRAMMAR = convert_ebnf_grammar(CSV_EBNF_GRAMMAR)

# Module-level parser used by read_csv_file().
CSV_PARSER = EarleyParser(CSV_GRAMMAR)


class CSVFile:
    """In-memory CSV table: a header plus one dict per data row.

    Each row dict maps the header name at position ``i`` to the row's entry
    at position ``i``.
    """

    def __init__(self, header, lines):
        """Build the row dicts from `header` and the raw `lines`.

        Raises:
            IndexError: if a row has more entries than `header`; the
                reproduction in this report relies on that failure.
        """
        self.header = header
        self.lines = [
            {header[position]: value for position, value in enumerate(row)}
            for row in lines
        ]

    def __getitem__(self, item):
        """Return a column (for a `str` key) or a single row dict (otherwise)."""
        if type(item) is not str:
            return self.lines[item]
        return [row[item] for row in self.lines]

    def __iter__(self):
        """Iterate over the row dicts."""
        return iter(self.lines)

    def __str__(self):
        """Render header and rows, entries joined by " ; ", one row per line."""
        header_text = " ; ".join(self.header)
        separator = "\n" if self.lines else ""
        row_texts = [" ; ".join(row.values()) for row in self.lines]
        return header_text + separator + "\n".join(row_texts)


def dfs(tree, action=print):
    """Call `action` on every node of `tree` in depth-first pre-order.

    A node is a ``(symbol, children)`` pair; `action` receives the whole
    pair and is invoked on a node before any of its children.

    The walk uses an explicit stack instead of recursion, so the depth of
    the tree is not bounded by Python's recursion limit.

    Args:
        tree: Root node, a ``(symbol, children)`` pair.
        action: Callable applied to each node; defaults to `print`.
    """
    pending = [tree]
    while pending:
        current = pending.pop()
        _, children = current
        action(current)
        # Push children reversed so the leftmost child is popped first.
        pending.extend(reversed(list(children)))


def read_csv_file(inp):
    """Parse the CSV text `inp` and return it as a `CSVFile`.

    The first parse tree produced by `CSV_PARSER` is walked with `dfs`.
    Every child of a `<csv-record>` node that contains at least one
    `<csv-string-list>` contributes one list of field strings. The first
    such list becomes the header, the remaining ones the data rows.

    Raises:
        SyntaxError: if no record was collected.
        IndexError: propagated from `CSVFile` when a row has more fields
            than the header.
    """
    tree = list(CSV_PARSER.parse(inp))[0]
    records = []

    def fields_below(subtree):
        # One field per <csv-string-list> node: the text of its first child.
        collected = []

        def grab_field(t):
            symbol, kids = t
            if symbol == "<csv-string-list>":
                collected.append(tree_to_string(kids[0]))

        dfs(subtree, grab_field)
        return collected

    def grab_record(t):
        symbol, kids = t
        if symbol != "<csv-record>":
            return
        for kid in kids:
            row = fields_below(kid)
            if row:
                records.append(row)

    dfs(tree, grab_record)

    if not records:
        raise SyntaxError("CSV file must contain at least a header line!")

    header, *rows = records
    return CSVFile(header, rows)


# Failure-inducing input: the row `1;2;"now:";4` has four fields, one more
# than the three-column header.
buggy_input = """"Field 1";"Field 2";"Field 3"
17;23;42
5648;13459;"some
multiline
text"
1;2;"now:";4
100;200;300
"""


class ReadCSVFileRunner(Runner):
    """Runner that classifies an input by how `read_csv_file` handles it."""

    def run(self, inp):
        """Return ``(inp, outcome)``.

        The outcome is `Runner.PASS` when `read_csv_file` succeeds,
        `Runner.UNRESOLVED` on SyntaxError and `Runner.FAIL` on IndexError.
        Any other exception propagates.
        """
        try:
            read_csv_file(inp)
        except SyntaxError:
            outcome = Runner.UNRESOLVED
        except IndexError:
            outcome = Runner.FAIL
        else:
            outcome = Runner.PASS
        return inp, outcome


# Reduce the failing input with GrammarReducer. Per this report, the printed
# result is the original input with nonterminals inserted rather than a
# reduced string.
read_csv_eval = ReadCSVFileRunner()
# The reducer gets its own EarleyParser built from the same grammar.
grammar_reducer = GrammarReducer(
    read_csv_eval,
    EarleyParser(CSV_GRAMMAR))
print(grammar_reducer.reduce(buggy_input))

Mar 11 '21 10:03 rindPHI