Timestamps aren't handled well
If you have a time series you'd like to push to Snowflake, you might have a bad time. The data push to the database successfully, but the date/time column is populated with "invalid times".
When you try to fetch that data, you get an error:
import os

import dotenv
import pandas
from dask.dataframe import from_pandas
from dask_snowflake import to_snowflake, read_snowflake
from distributed import Client, LocalCluster

dotenv.load_dotenv()

cnxn = dict(
    user=os.environ["SNOWFLAKE_USER"],
    password=os.environ["SNOWFLAKE_PASSWORD"],
    account=os.environ["SNOWFLAKE_ACCOUNT"],
    database=os.environ.get("SNOWFLAKE_DATABASE", "testdb"),
    schema=os.environ.get("SNOWFLAKE_SCHEMA", "public"),
    warehouse=os.environ["SNOWFLAKE_WAREHOUSE"],
    role=os.environ["SNOWFLAKE_ROLE"],
)

times = pandas.date_range(start='2000-01-01', periods=4, freq="5min")
data = pandas.DataFrame({
    'dewp': [12.0, 12.5, 12.4, 12.6],
    'wind': [0.1, 0.2, 0.3, 0.4],
}, index=pandas.Index(times, name="time"))

if __name__ == "__main__":
    with LocalCluster(n_workers=4) as cluster:
        with Client(cluster) as client:
            ddf = from_pandas(data, npartitions=2)
            print(ddf.compute())

            to_snowflake(ddf.reset_index(), name="_test_dates", connection_kwargs=cnxn)
            dates = read_snowflake("select * from _test_dates", connection_kwargs=cnxn)
            print(dates.compute())
And the error you get:
---------------------------------------------------------------------------
ArrowInvalid Traceback (most recent call last)
Cell In[1], line 2
1 to_snowflake(ddf.reset_index(), name="_test_dates", connection_kwargs=cnxn)
----> 2 dates = read_snowflake("select * from _test_dates", connection_kwargs=cnxn)
3 print(dates.compute())
File ~/work/sources/dask-snowflake/dask_snowflake/core.py:289, in read_snowflake(query, connection_kwargs, arrow_options, execute_params, partition_size, npartitions)
280 raise RuntimeError(
281 f"Currently only `ArrowResultBatch` are supported, but received batch types {batch_types}"
282 )
284 # Read the first non-empty batch to determine meta, which is useful for a
285 # better size estimate when partitioning. We could also allow empty meta
286 # here, which should involve less data transfer to the client, at the
287 # cost of worse size estimates. Batches seem less than 1MiB in practice,
288 # so this is likely okay right now, but could be revisited.
--> 289 meta = batches[0].to_pandas(**arrow_options)
291 batches_partitioned = _partition_batches(
292 batches, meta, npartitions=npartitions, partition_size=partition_size
293 )
295 # Create Blockwise layer
File ~/mambaforge/envs/sflake/lib/python3.10/site-packages/snowflake/connector/result_batch.py:671, in ArrowResultBatch.to_pandas(self, connection, **kwargs)
669 self._check_can_use_pandas()
670 table = self.to_arrow(connection=connection)
--> 671 return table.to_pandas(**kwargs)
File ~/mambaforge/envs/sflake/lib/python3.10/site-packages/pyarrow/array.pxi:823, in pyarrow.lib._PandasConvertible.to_pandas()
File ~/mambaforge/envs/sflake/lib/python3.10/site-packages/pyarrow/table.pxi:3913, in pyarrow.lib.Table._to_pandas()
File ~/mambaforge/envs/sflake/lib/python3.10/site-packages/pyarrow/pandas_compat.py:818, in table_to_blockmanager(options, table, categories, ignore_metadata, types_mapper)
816 _check_data_column_metadata_consistency(all_columns)
817 columns = _deserialize_column_index(table, all_columns, column_indexes)
--> 818 blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes)
820 axes = [columns, index]
821 return BlockManager(blocks, axes)
File ~/mambaforge/envs/sflake/lib/python3.10/site-packages/pyarrow/pandas_compat.py:1168, in _table_to_blocks(options, block_table, categories, extension_columns)
1163 def _table_to_blocks(options, block_table, categories, extension_columns):
1164 # Part of table_to_blockmanager
1165
1166 # Convert an arrow table to Block from the internal pandas API
1167 columns = block_table.column_names
-> 1168 result = pa.lib.table_to_blocks(options, block_table, categories,
1169 list(extension_columns.keys()))
1170 return [_reconstruct_block(item, columns, extension_columns)
1171 for item in result]
File ~/mambaforge/envs/sflake/lib/python3.10/site-packages/pyarrow/table.pxi:2602, in pyarrow.lib.table_to_blocks()
File ~/mambaforge/envs/sflake/lib/python3.10/site-packages/pyarrow/error.pxi:100, in pyarrow.lib.check_status()
ArrowInvalid: Casting from timestamp[us] to timestamp[ns] would result in out of bounds timestamp: 5900852212012867584
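For context on that final error: pandas stores datetimes at nanosecond resolution, which only covers roughly the years 1677–2262, so the enormous microsecond value coming back from Snowflake can't be represented. You can reproduce just the ArrowInvalid with pyarrow alone (minimal sketch using the raw value from the traceback):

import pyarrow as pa

# 5900852212012867584 microseconds is tens of thousands of years past the epoch,
# far outside what datetime64[ns] can hold, so the default (safe) cast refuses it.
arr = pa.array([5900852212012867584], type=pa.timestamp("us"))
arr.cast(pa.timestamp("ns"))  # raises ArrowInvalid: ... out of bounds timestamp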
Here's how the data appear in Snowflake (screenshot: the timestamp column shows "Invalid Date"):
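If you want to poke at the raw values without dask in the loop, the plain snowflake-connector-python client works with the same connection kwargs (rough sketch):

import snowflake.connector

# Inspect what actually landed in the table, bypassing dask entirely.
ctx = snowflake.connector.connect(**cnxn)
try:
    cur = ctx.cursor()
    cur.execute("select * from _test_dates")
    for row in cur.fetchall():
        print(row)
finally:
    ctx.close()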
There's probably some way around this, but I haven't conjured it yet. I wanted to get this bug report up sooner rather than later, but I'll keep digging.
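One thing I haven't tried properly yet: the traceback shows read_snowflake forwarding arrow_options straight into pyarrow's to_pandas, so something like the following might at least let the fetch go through without raising, even though the stored values would still be nonsense (untested sketch):

dates = read_snowflake(
    "select * from _test_dates",
    connection_kwargs=cnxn,
    # safe=False tells pyarrow not to error on the out-of-bounds cast to
    # datetime64[ns]; the values are still garbage, it just won't raise
    arrow_options={"safe": False},
)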
I'm getting the same "Invalid date" when uploading with dask-snowflake. @phobson have you figured out why, or found a workaround?
Explicitly formatting the timestamp as a string works:
df['target_date'] = df['target_date'].dt.strftime('%Y-%m-%d %H:%M:%S.%f')
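Applied to the repro above, that would look something like this (untested sketch, assuming the datetime column is named "time" as in the example):

ddf_out = ddf.reset_index()
# Render the datetimes as strings before upload so Snowflake parses them back
# into TIMESTAMP values itself rather than receiving raw epoch integers.
ddf_out["time"] = ddf_out["time"].dt.strftime("%Y-%m-%d %H:%M:%S.%f")
to_snowflake(ddf_out, name="_test_dates", connection_kwargs=cnxn)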