Use minimal required fields

Open kevinjqliu opened this issue 1 year ago • 0 comments

This change uses the minimal required schema for the V1 manifest list, as described in https://iceberg.apache.org/spec/#manifest-lists

Output of `make test` (short summary, followed by a stack trace):

============================================================================ short test summary info ============================================================================
FAILED tests/utils/test_manifest.py::test_read_manifest_list - pyiceberg.exceptions.ResolveError: 504: added_files_count: required int is non-optional, and not part of the file schema
FAILED tests/utils/test_manifest.py::test_read_manifest_v1 - pyiceberg.exceptions.ResolveError: 504: added_files_count: required int is non-optional, and not part of the file schema
FAILED tests/utils/test_manifest.py::test_write_manifest[1] - pyiceberg.exceptions.ResolveError: 504: added_files_count: required int is non-optional, and not part of the file schema
FAILED tests/utils/test_manifest.py::test_write_manifest_list[19-1] - pyiceberg.exceptions.ResolveError: 504: added_files_count: required int is non-optional, and not part of the file schema
FAILED tests/utils/test_manifest.py::test_write_manifest_list[None-1] - pyiceberg.exceptions.ResolveError: 504: added_files_count: required int is non-optional, and not part of the file schema
FAILED tests/utils/test_schema_conversion.py::test_avro_to_iceberg - assert Schema(Nested..._field_ids=[]) == Schema(Nested..._field_ids=[])

`test_read_manifest_list` fails with the same error as #1194:

____________________________________________________________________________ test_read_manifest_list ____________________________________________________________________________

generated_manifest_file_file_v1 = '/var/folders/f1/3_vzsn7x1jq9hszb3z9y6f0m0000gn/T/tmpdhm82vv4/manifest.avro'

    def test_read_manifest_list(generated_manifest_file_file_v1: str) -> None:
        input_file = PyArrowFileIO().new_input(generated_manifest_file_file_v1)
>       manifest_list = list(read_manifest_list(input_file))[0]

tests/utils/test_manifest.py:193: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
pyiceberg/manifest.py:651: in read_manifest_list
    with AvroFile[ManifestFile](
pyiceberg/avro/file.py:177: in __enter__
    self.reader = resolve_reader(self.schema, self.read_schema, self.read_types, self.read_enums)
pyiceberg/avro/resolver.py:235: in resolve_reader
    return visit_with_partner(file_schema, read_schema, ReadSchemaResolver(read_types, read_enums), SchemaPartnerAccessor())  # type: ignore
../../.pyenv/versions/3.12.8/lib/python3.12/functools.py:909: in wrapper
    return dispatch(args[0].__class__)(*args, **kw)
pyiceberg/schema.py:626: in _
    return visitor.schema(schema, partner, visit_with_partner(schema.as_struct(), struct_partner, visitor, accessor))  # type: ignore
../../.pyenv/versions/3.12.8/lib/python3.12/functools.py:909: in wrapper
    return dispatch(args[0].__class__)(*args, **kw)
pyiceberg/schema.py:641: in _
    return visitor.struct(struct, partner, field_results)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <pyiceberg.avro.resolver.ReadSchemaResolver object at 0x16f759ef0>
struct = StructType(fields=(NestedField(field_id=500, name='manifest_path', field_type=StringType(), required=True), NestedFiel...erType(), required=True), NestedField(field_id=503, name='added_snapshot_id', field_type=LongType(), required=False),))
expected_struct = StructType(fields=(NestedField(field_id=500, name='manifest_path', field_type=StringType(), required=True), NestedFiel...ired=True), required=False), NestedField(field_id=519, name='key_metadata', field_type=BinaryType(), required=False),))
field_readers = [StringReader(), IntegerReader(), IntegerReader(), OptionReader(option=IntegerReader())]

    def struct(self, struct: StructType, expected_struct: Optional[IcebergType], field_readers: List[Reader]) -> Reader:
        read_struct_id = self.context[STRUCT_ROOT] if len(self.context) > 0 else STRUCT_ROOT
        struct_callable = self.read_types.get(read_struct_id, Record)
    
        if not expected_struct:
            return StructReader(tuple(enumerate(field_readers)), struct_callable, struct)
    
        if not isinstance(expected_struct, StructType):
            raise ResolveError(f"File/read schema are not aligned for struct, got {expected_struct}")
    
        expected_positions: Dict[int, int] = {field.field_id: pos for pos, field in enumerate(expected_struct.fields)}
    
        # first, add readers for the file fields that must be in order
        results: List[Tuple[Optional[int], Reader]] = [
            (
                expected_positions.get(field.field_id),
                # Check if we need to convert it to an Enum
                result_reader if not (enum_type := self.read_enums.get(field.field_id)) else EnumReader(enum_type, result_reader),
            )
            for field, result_reader in zip(struct.fields, field_readers)
        ]
    
        file_fields = {field.field_id for field in struct.fields}
        for pos, read_field in enumerate(expected_struct.fields):
            if read_field.field_id not in file_fields:
                if isinstance(read_field, NestedField) and read_field.initial_default is not None:
                    # The field is not in the file, but there is a default value
                    # and that one can be required
                    results.append((pos, DefaultReader(read_field.initial_default)))
                elif read_field.required:
>                   raise ResolveError(f"{read_field} is non-optional, and not part of the file schema")
E                   pyiceberg.exceptions.ResolveError: 504: added_files_count: required int is non-optional, and not part of the file schema

pyiceberg/avro/resolver.py:399: ResolveError

Jan 04 '25 19:01 kevinjqliu