iceberg-python icon indicating copy to clipboard operation
iceberg-python copied to clipboard

int64() is converted to float for nullable field when None is provided + nan returned when None is expected

Open Co0olCat opened this issue 2 years ago • 2 comments

Apache Iceberg version

0.6.0 (latest release)

Please describe the bug 🐞

To reproduce, add the test below to https://github.com/apache/iceberg-python/blob/main/tests/catalog/test_glue.py

Here is the failing test:

# NOTE(review): `mock_aws` is presumably moto's AWS-mocking decorator; its import is not shown here.
@mock_aws
def test_create_table_with_pyarrow_schema(
    _bucket_initialize: None,
    moto_endpoint_url: str,
    database_name: str,
    table_name: str,
) -> None:
    """Reproduce the int64 -> float conversion reported for a nullable column.

    Creates a table in a ``GlueCatalog`` from a pyarrow schema, appends two
    batches of two rows each, and compares ``table.scan().to_pandas()``
    against the expected records.

    According to the pytest diff quoted in the report, the final assertion
    fails: the ``n_legs`` values come back as floats (e.g. ``6.0``) and the
    missing value comes back as ``nan`` instead of ``None``.

    The parameters are presumably pytest fixtures supplied by the surrounding
    test suite (their definitions are not visible here -- confirm):

    Args:
        _bucket_initialize: Unused in the body; requested only for its setup
            side effect (presumably creating the bucket).
        moto_endpoint_url: Passed to the catalog as the ``s3.endpoint`` property.
        database_name: Namespace to create and to place the table in.
        table_name: Name of the table to create.
    """
    catalog_name = "glue"
    identifier = (database_name, table_name)
    test_catalog = GlueCatalog(catalog_name, **{"s3.endpoint": moto_endpoint_url})
    test_catalog.create_namespace(namespace=database_name)

    # `year` is required; `n_legs` (int64) and `animals` (string) are nullable.
    pa_schema = pa.schema([
        pa.field('year', pa.int64(), nullable=False),
        pa.field('n_legs', pa.int64(), nullable=True),
        pa.field('animals', pa.string(), nullable=True)
    ])

    # The table is created directly from the pyarrow schema.
    table = test_catalog.create_table(
        identifier=identifier,
        schema=pa_schema,
        location=f"s3://{BUCKET_NAME}/{database_name}.db/{table_name}",
    )
    assert table.identifier == (catalog_name,) + identifier
    assert TABLE_METADATA_LOCATION_REGEX.match(table.metadata_location)
    assert test_catalog._parse_metadata_version(table.metadata_location) == 0

    # First batch: one null in each of the two nullable columns.
    table.append(
        pa.Table.from_pylist(
            [
                {"year": 2001, "n_legs": 2, "animals": None},
                {"year": 2002, "n_legs": None, "animals": "Horse"},
            ], schema=pa_schema
        )
    )

    assert len(table.scan().to_arrow()) == 2

    # Second batch: no nulls.
    table.append(
        pa.Table.from_pylist(
            [
                {"year": 2003, "n_legs": 6, "animals": "Cicada"},
                {"year": 2004, "n_legs": 8, "animals": "Spider"},
            ], schema=pa_schema
        )
    )

    assert len(table.scan().to_arrow()) == 4

    # This is the assertion the report shows failing. The expected list puts
    # the second batch's rows before the first batch's, and expects `n_legs`
    # as Python ints with None for the missing value.
    assert table.scan().to_pandas().to_dict("records") == [
        {"animals": "Cicada", "n_legs": 6, "year": 2003},
        {"animals": "Spider", "n_legs": 8, "year": 2004},
        {"animals": None, "n_legs": 2, "year": 2001},
        {"animals": "Horse", "n_legs": None, "year": 2002},
    ]

Error part:

E         Full diff:
E           [
E         -  {'animals': 'Cicada', 'n_legs': 6, 'year': 2003},
E         +  {'animals': 'Cicada', 'n_legs': 6.0, 'year': 2003},
E         ?                                   ++
E         -  {'animals': 'Spider', 'n_legs': 8, 'year': 2004},
E         +  {'animals': 'Spider', 'n_legs': 8.0, 'year': 2004},
E         ?                                   ++
E         -  {'animals': None, 'n_legs': 2, 'year': 2001},
E         +  {'animals': None, 'n_legs': 2.0, 'year': 2001},
E         ?                               ++
E         -  {'animals': 'Horse', 'n_legs': None, 'year': 2002},
E         ?                                 -- ^
E         +  {'animals': 'Horse', 'n_legs': nan, 'year': 2002},
E         ?                                  ^^
E           ]

Co0olCat avatar Apr 03 '24 03:04 Co0olCat

It looks like a pyarrow issue. scan() returns the correct data and type.

pyiceberg.io.pyarrow.project_table() does the conversion...

Co0olCat avatar Apr 03 '24 03:04 Co0olCat

This issue has been automatically marked as stale because it has been open for 180 days with no activity. It will be closed in the next 14 days if no further activity occurs. To permanently prevent this issue from being considered stale, add the label 'not-stale', but commenting on the issue is preferred when possible.

github-actions[bot] avatar Oct 01 '24 00:10 github-actions[bot]

This issue has been closed because it has not received any activity in the last 14 days since being marked as 'stale'

github-actions[bot] avatar Oct 16 '24 00:10 github-actions[bot]