substrait icon indicating copy to clipboard operation
substrait copied to clipboard

Add CSV FileFormat in Substrait

Open sanjibansg opened this issue 2 years ago • 2 comments

Following up on https://github.com/substrait-io/substrait/issues/138, we can add support for the CSV file format by defining the messages below. (Prototype code can be found here.)

// Options that govern how parsed CSV text is converted into typed values.
//
// NOTE(review): per-field semantics below are inferred from the field names;
// confirm each against the prototype implementation. If this is compiled as
// proto3, unset scalar fields read back as false / 0 / "" and cannot be told
// apart from an explicitly supplied default.
message CSVConvertOptions {
  // Negated flag: presumably UTF-8 validation runs unless this is set.
  bool ignore_check_utf8 = 1;

  // Spellings presumably recognized as null.
  repeated string null_values = 2;

  // Spellings presumably recognized as boolean true.
  repeated string true_values = 3;

  // Spellings presumably recognized as boolean false.
  repeated string false_values = 4;

  // Presumably lets string columns yield nulls -- TODO confirm.
  bool strings_can_be_null = 5;

  // Negated flag: presumably quoted strings may be null unless this is set.
  bool quoted_strings_cannot_be_null = 6;

  // Presumably enables automatic dictionary encoding -- TODO confirm.
  bool auto_dict_encode = 7;

  // Presumably the cardinality cap used with auto_dict_encode; units and
  // valid range are not visible here.
  int32 auto_dict_max_cardinality = 8;

  // Presumably the character used as the decimal separator.
  string decimal_point = 9;

  // Presumably the names of the columns to keep in the output.
  repeated string include_columns = 10;

  // Presumably controls handling of names listed in include_columns that are
  // absent from the file -- TODO confirm.
  bool include_missing_columns = 11;
}

// Options that govern how the CSV input is read before parsing.
//
// NOTE(review): per-field semantics below are inferred from the field names;
// confirm each against the prototype implementation. If this is compiled as
// proto3, unset scalar fields read back as false / 0 and cannot be told apart
// from an explicitly supplied default.
message CSVReadOptions {
  // Negated flag: presumably multi-threaded reading is used unless this is
  // set.
  bool no_use_threads = 1;

  // Size of a read block; presumably in bytes -- TODO confirm units.
  int32 block_size = 2;

  // Presumably the number of leading rows to skip.
  int32 skip_rows = 3;

  // Presumably the number of rows to skip after the column names.
  int32 skip_rows_after_names = 4;

  // Presumably explicit column names for the file.
  repeated string column_names = 5;

  // Presumably requests generated column names -- TODO confirm interaction
  // with column_names.
  bool autogenerate_column_names = 6;
}

// Options that govern how raw CSV text is split into rows and fields.
//
// NOTE(review): per-field semantics below are inferred from the field names;
// confirm each against the prototype implementation. If this is compiled as
// proto3, unset scalar fields read back as false / "" and cannot be told apart
// from an explicitly supplied default, so a reader must decide what an empty
// delimiter, quote_char, or escape_char means.
message CSVParseOptions {
  // Presumably the field separator.
  string delimiter = 1;

  // Presumably enables quote handling; see quote_char.
  bool quoting = 2;

  // Presumably the quote character used when quoting is set.
  string quote_char = 3;

  // Presumably treats a doubled quote inside a quoted value as one literal
  // quote -- TODO confirm.
  bool double_quote = 4;

  // Presumably enables escape handling; see escape_char.
  bool escaping = 5;

  // Presumably the escape character used when escaping is set.
  string escape_char = 6;

  // Presumably allows values to contain line breaks.
  bool newlines_in_values = 7;

  // Presumably skips empty lines in the input.
  bool ignore_empty_lines = 8;
}

// Bundle of all CSV option groups: parsing, conversion, and reading.
//
// Each field is message-typed, so a reader can distinguish an absent group
// from one that is present but empty.
message CSVOptions {
  // Parse-stage options (see CSVParseOptions).
  CSVParseOptions parse_options = 1;

  // Conversion-stage options (see CSVConvertOptions).
  CSVConvertOptions convert_options = 2;

  // Read-stage options (see CSVReadOptions).
  CSVReadOptions read_options = 3;
}

The file_type can then be defined with a oneof:

      // Selects how the file's format is described. At most one member is set;
      // setting one clears the other.
      oneof file_type {
        // Format selector of type FileFormat (declared outside this snippet).
        FileFormat format = 5;

        // CSV format, carrying its options inline.
        CSVOptions csv_options = 6;
      }

We can proceed with this first, and then develop a generic implementation using google.protobuf.Any, with separate .proto files defining each file format.

sanjibansg avatar Apr 26 '22 05:04 sanjibansg