
GeoParquet: Ensure on dataset creation that all files have same schema

Open · kylebarron opened this issue 6 months ago · 1 comment

from obstore.auth.planetary_computer import PlanetaryComputerCredentialProvider
from obstore.store import AzureStore
from geoarrow.rust.io import GeoParquetDataset

credential_provider = PlanetaryComputerCredentialProvider(
    account_name="pcstacitems",
    container_name="items",
)
store = AzureStore(credential_provider=credential_provider)

files = store.list("naip.parquet").collect()
dataset = GeoParquetDataset(files, store=store)
all_data = dataset.read(parse_to_native=False)

This raises:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[25], line 1
----> 1 all_data = dataset.read(parse_to_native=False)

TypeError: All batches must have same schema

kylebarron · Jun 17 '25, 15:06

I think this is an upstream issue.

I downloaded all the files to local storage with:

import asyncio
from pathlib import Path

from obstore.auth.planetary_computer import PlanetaryComputerCredentialProvider
from obstore.store import AzureStore, LocalStore


async def download_files():
    credential_provider = PlanetaryComputerCredentialProvider(
        account_name="pcstacitems",
        container_name="items",
    )
    store = AzureStore(credential_provider=credential_provider)

    local_store = LocalStore(Path())
    files = store.list("naip.parquet").collect()
    for file in files:
        path = file["path"]

        # This only constructs the stream; it doesn't materialize the data in memory
        resp = await store.get_async(path)
        # A streaming upload copies the file into the local store
        await local_store.put_async(path, resp)


def main():
    asyncio.run(download_files())


if __name__ == "__main__":
    main()

Then read the files with pyarrow:

from pathlib import Path
import pyarrow.parquet as pq

path = Path("/Users/kyle/github/developmentseed/obstore/tests/naip.parquet")
path1 = path / "part-000_2010-04-22T00:00:00+00:00_2010-11-16T00:00:00+00:00.parquet"
path2 = path / "part-001_2011-04-19T00:00:00+00:00_2011-10-17T00:00:00+00:00.parquet"
schema1 = pq.read_schema(path1)
schema2 = pq.read_schema(path2)
schema1 == schema2 # False

for i in range(len(schema1)):
    print(schema1.field(i) == schema2.field(i))
# False
# True
# True
# True
# True
# True
# True
# True
# True
# True
# True
# True
# True
# True
# True
# True
# True

schema1.names[0] # assets

schema1.field(0).type.names
# ['image', 'rendered_preview', 'thumbnail', 'tilejson']

schema2.field(0).type.names
# ['image', 'metadata', 'rendered_preview', 'thumbnail', 'tilejson']
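
So the mismatch is in the first field: the assets struct in the second file has an extra metadata child. For reference, the differing child can be isolated with a set difference over the same schema1/schema2 objects from the snippet above (a small illustrative sketch, not part of the original report):

# Compare the child names of the first (struct) field in each schema
names1 = set(schema1.field(0).type.names)
names2 = set(schema2.field(0).type.names)
print(names2 - names1)  # {'metadata'}
print(names1 - names2)  # set()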

We should probably check that all files have the same schema on dataset creation.
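
A minimal Python-side sketch of what that check could look like (hypothetical helper using pyarrow on local paths, reusing path1/path2 from above; the real check would presumably live in the Rust dataset constructor, not this function):

import pyarrow.parquet as pq

def assert_uniform_schema(paths):
    # Read only the Parquet footers and fail fast, naming the first file
    # whose schema differs from the first file's schema.
    expected = pq.read_schema(paths[0])
    for path in paths[1:]:
        schema = pq.read_schema(path)
        if not schema.equals(expected):
            raise ValueError(f"Schema of {path} does not match schema of {paths[0]}")

assert_uniform_schema([path1, path2])  # would raise for the two NAIP files above

Failing here, at dataset creation, would point at the offending file instead of surfacing a generic "All batches must have same schema" error at read time.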

kylebarron · Jul 08 '25, 19:07