iceberg-python
use minimal required fields
Use the minimal required schema for the V1 manifest list, as described in https://iceberg.apache.org/spec/#manifest-lists.
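For reference, a minimal sketch (not necessarily the exact change in this PR) of what the reduced V1 manifest-list schema could look like in pyiceberg terms; field IDs 500–503 and their types follow the spec and match the file schema printed in the traceback below:

```python
from pyiceberg.schema import Schema
from pyiceberg.types import IntegerType, LongType, NestedField, StringType

# Sketch of the minimal required V1 manifest-list fields per
# https://iceberg.apache.org/spec/#manifest-lists. Field IDs 500-503
# correspond to the file schema shown in the traceback below.
MINIMAL_MANIFEST_LIST_SCHEMA_V1 = Schema(
    NestedField(500, "manifest_path", StringType(), required=True),
    NestedField(501, "manifest_length", LongType(), required=True),
    NestedField(502, "partition_spec_id", IntegerType(), required=True),
    NestedField(503, "added_snapshot_id", LongType(), required=False),
)
```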
Stack trace from `make test`:
============================================================================ short test summary info ============================================================================
FAILED tests/utils/test_manifest.py::test_read_manifest_list - pyiceberg.exceptions.ResolveError: 504: added_files_count: required int is non-optional, and not part of the file schema
FAILED tests/utils/test_manifest.py::test_read_manifest_v1 - pyiceberg.exceptions.ResolveError: 504: added_files_count: required int is non-optional, and not part of the file schema
FAILED tests/utils/test_manifest.py::test_write_manifest[1] - pyiceberg.exceptions.ResolveError: 504: added_files_count: required int is non-optional, and not part of the file schema
FAILED tests/utils/test_manifest.py::test_write_manifest_list[19-1] - pyiceberg.exceptions.ResolveError: 504: added_files_count: required int is non-optional, and not part of the file schema
FAILED tests/utils/test_manifest.py::test_write_manifest_list[None-1] - pyiceberg.exceptions.ResolveError: 504: added_files_count: required int is non-optional, and not part of the file schema
FAILED tests/utils/test_schema_conversion.py::test_avro_to_iceberg - assert Schema(Nested..._field_ids=[]) == Schema(Nested..._field_ids=[])
test_read_manifest_list fails with the same error as #1194:
____________________________________________________________________________ test_read_manifest_list ____________________________________________________________________________
generated_manifest_file_file_v1 = '/var/folders/f1/3_vzsn7x1jq9hszb3z9y6f0m0000gn/T/tmpdhm82vv4/manifest.avro'
def test_read_manifest_list(generated_manifest_file_file_v1: str) -> None:
input_file = PyArrowFileIO().new_input(generated_manifest_file_file_v1)
> manifest_list = list(read_manifest_list(input_file))[0]
tests/utils/test_manifest.py:193:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
pyiceberg/manifest.py:651: in read_manifest_list
with AvroFile[ManifestFile](
pyiceberg/avro/file.py:177: in __enter__
self.reader = resolve_reader(self.schema, self.read_schema, self.read_types, self.read_enums)
pyiceberg/avro/resolver.py:235: in resolve_reader
return visit_with_partner(file_schema, read_schema, ReadSchemaResolver(read_types, read_enums), SchemaPartnerAccessor()) # type: ignore
../../.pyenv/versions/3.12.8/lib/python3.12/functools.py:909: in wrapper
return dispatch(args[0].__class__)(*args, **kw)
pyiceberg/schema.py:626: in _
return visitor.schema(schema, partner, visit_with_partner(schema.as_struct(), struct_partner, visitor, accessor)) # type: ignore
../../.pyenv/versions/3.12.8/lib/python3.12/functools.py:909: in wrapper
return dispatch(args[0].__class__)(*args, **kw)
pyiceberg/schema.py:641: in _
return visitor.struct(struct, partner, field_results)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <pyiceberg.avro.resolver.ReadSchemaResolver object at 0x16f759ef0>
struct = StructType(fields=(NestedField(field_id=500, name='manifest_path', field_type=StringType(), required=True), NestedFiel...erType(), required=True), NestedField(field_id=503, name='added_snapshot_id', field_type=LongType(), required=False),))
expected_struct = StructType(fields=(NestedField(field_id=500, name='manifest_path', field_type=StringType(), required=True), NestedFiel...ired=True), required=False), NestedField(field_id=519, name='key_metadata', field_type=BinaryType(), required=False),))
field_readers = [StringReader(), IntegerReader(), IntegerReader(), OptionReader(option=IntegerReader())]
def struct(self, struct: StructType, expected_struct: Optional[IcebergType], field_readers: List[Reader]) -> Reader:
read_struct_id = self.context[STRUCT_ROOT] if len(self.context) > 0 else STRUCT_ROOT
struct_callable = self.read_types.get(read_struct_id, Record)
if not expected_struct:
return StructReader(tuple(enumerate(field_readers)), struct_callable, struct)
if not isinstance(expected_struct, StructType):
raise ResolveError(f"File/read schema are not aligned for struct, got {expected_struct}")
expected_positions: Dict[int, int] = {field.field_id: pos for pos, field in enumerate(expected_struct.fields)}
# first, add readers for the file fields that must be in order
results: List[Tuple[Optional[int], Reader]] = [
(
expected_positions.get(field.field_id),
# Check if we need to convert it to an Enum
result_reader if not (enum_type := self.read_enums.get(field.field_id)) else EnumReader(enum_type, result_reader),
)
for field, result_reader in zip(struct.fields, field_readers)
]
file_fields = {field.field_id for field in struct.fields}
for pos, read_field in enumerate(expected_struct.fields):
if read_field.field_id not in file_fields:
if isinstance(read_field, NestedField) and read_field.initial_default is not None:
# The field is not in the file, but there is a default value
# and that one can be required
results.append((pos, DefaultReader(read_field.initial_default)))
elif read_field.required:
> raise ResolveError(f"{read_field} is non-optional, and not part of the file schema")
E pyiceberg.exceptions.ResolveError: 504: added_files_count: required int is non-optional, and not part of the file schema
pyiceberg/avro/resolver.py:399: ResolveError
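The resolver only tolerates a required read-schema field that is missing from the file schema when that field carries an initial_default (the DefaultReader branch above); otherwise it raises the ResolveError shown. Below is a hedged sketch of one way to satisfy that branch for added_files_count (field 504), assuming NestedField accepts initial_default as a constructor argument (the attribute is read by the resolver above):

```python
from pyiceberg.types import IntegerType, NestedField

# Sketch only: if the read schema's added_files_count (504) carried a
# default, the resolver would take the DefaultReader branch instead of
# raising ResolveError when the field is absent from a minimal V1 file.
added_files_count = NestedField(
    field_id=504,
    name="added_files_count",
    field_type=IntegerType(),
    required=True,
    initial_default=0,  # assumption: NestedField exposes initial_default as a kwarg
)
```

Alternatively, the read schema could mark the count fields as optional for V1 manifest lists; either way the goal is that a file written with only the minimal required fields resolves without error.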