GeoParquet: Ensure on dataset creation that all files have same schema
from obstore.auth.planetary_computer import PlanetaryComputerCredentialProvider
from obstore.store import AzureStore

from geoarrow.rust.io import GeoParquetDataset

credential_provider = PlanetaryComputerCredentialProvider(
    account_name="pcstacitems",
    container_name="items",
)
store = AzureStore(credential_provider=credential_provider)

# List all Parquet files under the naip.parquet prefix and build a dataset from them
files = store.list("naip.parquet").collect()
dataset = GeoParquetDataset(files, store=store)

all_data = dataset.read(parse_to_native=False)
This raises:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[25], line 1
----> 1 all_data = dataset.read(parse_to_native=False)
TypeError: All batches must have same schema
I think this is an upstream issue.
I downloaded all the files to local storage with:
import asyncio
from pathlib import Path

from obstore.auth.planetary_computer import PlanetaryComputerCredentialProvider
from obstore.store import AzureStore, LocalStore


async def download_files():
    credential_provider = PlanetaryComputerCredentialProvider(
        account_name="pcstacitems",
        container_name="items",
    )
    store = AzureStore(credential_provider=credential_provider)
    local_store = LocalStore(Path())

    files = store.list("naip.parquet").collect()
    for file in files:
        path = file["path"]
        # This only constructs the stream; it doesn't materialize the data in memory
        resp = await store.get_async(path)
        # A streaming upload copies the file into the local store under the same path
        await local_store.put_async(path, resp)


def main():
    asyncio.run(download_files())


if __name__ == "__main__":
    main()
Then I read two of the files with pyarrow:
from pathlib import Path
import pyarrow.parquet as pq
path = Path("/Users/kyle/github/developmentseed/obstore/tests/naip.parquet")
path1 = path / "part-000_2010-04-22T00:00:00+00:00_2010-11-16T00:00:00+00:00.parquet"
path2 = path / "part-001_2011-04-19T00:00:00+00:00_2011-10-17T00:00:00+00:00.parquet"
schema1 = pq.read_schema(path1)
schema2 = pq.read_schema(path2)
schema1 == schema2 # False
for i in range(len(schema1)):
    print(schema1.field(i) == schema2.field(i))
# False
# True (×16 — every other field matches)
schema1.names[0] # assets
schema1.field(0).type.names
# ['image', 'rendered_preview', 'thumbnail', 'tilejson']
schema2.field(0).type.names
# ['image', 'metadata', 'rendered_preview', 'thumbnail', 'tilejson']
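To see which partitions carry the extra metadata asset, something like this would work (an untested sketch with pyarrow, pointed at the local download directory from the script above):

# Hypothetical helper: print the asset keys recorded in each partition's schema
from pathlib import Path
import pyarrow.parquet as pq

naip_dir = Path("naip.parquet")  # local download directory from the script above
for part in sorted(naip_dir.glob("*.parquet")):
    print(part.name, pq.read_schema(part).field("assets").type.names)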
We should probably check that all files have the same schema on dataset creation.
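A check along these lines (a Python-side sketch using pyarrow; the actual validation would presumably live in the Rust GeoParquetDataset constructor) would turn the late TypeError into an early, descriptive error:

# Hypothetical validation sketch, not the actual geoarrow-rs implementation
import pyarrow.parquet as pq

def check_matching_schemas(paths):
    schemas = [(p, pq.read_schema(p)) for p in paths]
    first_path, first_schema = schemas[0]
    for path, schema in schemas[1:]:
        if not schema.equals(first_schema):
            raise ValueError(
                f"Schema of {path} does not match schema of {first_path}; "
                "all files in a GeoParquet dataset must share the same schema"
            )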