Arrays/structs break when loading pandas df into `duckdb`
What happens?
When loading a pandas df with array columns into DuckDB, the arrays appear to be converted into strings.

So an array `[1234, 5678]` becomes the string `"[1234, 5678]"`, and indexing into the column then returns single characters of that string:

- `array[0]` gives you a value of `""`
- `array[1]` gives you a value of `[`
- `array[2]` gives you `1`, etc.

It's similar behaviour to this, where pandas objects aren't being transformed correctly into their corresponding DuckDB types.
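A minimal illustration of the effect (a sketch, assuming an affected DuckDB version; on a fixed version `arr[1]` would return `1234`):

```python
import duckdb
import pandas as pd

con = duckdb.connect()
df = pd.DataFrame({"arr": [[1234, 5678]]})
con.register("df", df)

# on an affected version the list arrives as the string "[1234, 5678]",
# so (1-based) indexing returns single characters of that string
print(con.execute("select typeof(arr), arr[1], arr[2] from df").fetchall())
```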
To Reproduce
DuckDB -> Pandas -> DuckDB
```python
import duckdb
import pandas as pd

con = duckdb.connect()

# create a table with two list columns natively in duckdb
sql = """
CREATE TABLE testing AS (
    SELECT ['tom', NULL, 'robin'] as a,
           ['thomas', 'robyn'] as b
)
"""
con.execute(sql)

# round-trip: pull the table into pandas, then register the frame back
test = con.execute("select * from testing").fetch_df()
con.register("test_df", test)

sql = """
select
    a, b,
    a[0] as a_0,
    a[1] as a_1,
    b[0] as b_0,
    b[1] as b_1
from test_df
"""
con.execute(sql).fetch_df()
```
| | a | b | a_0 | a_1 | b_0 | b_1 |
|---|------------------------|---------------------|-----|-----|-----|-----|
| 0 | ['tom', None, 'robin'] | ['thomas', 'robyn'] | [ | ' | [ | ' |
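To back up the claim that the lists arrive as strings, `typeof` can be used on the registered frame (a quick check; on an affected version these come back as `VARCHAR` rather than `VARCHAR[]`):

```python
# inspect the types duckdb assigned to the registered pandas frame
print(con.execute("select typeof(a) as type_a, typeof(b) as type_b from test_df").fetchall())
```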
Pandas -> DuckDB
```python
import duckdb
import pandas as pd
import numpy as np

# "b" holds numpy arrays; "c" mixes NaN floats with numpy arrays
test = {
    "a": [1, 2, 3, 4, 5],
    "b": [np.array([1, 2, 3]), np.array([2]), np.array([3]), np.array([4, 6]), np.array([5])],
    "c": [np.nan, np.nan, np.array([4, 5, 6]), np.nan, np.array([1, 2, 3])],
}
test_df = pd.DataFrame(test)

con = duckdb.connect()
con.register("test_df", test_df)

sql = """
select b, c,
    b[0] as b_0, b[1] as b_1, b[2] as b_2,
    c[0] as c_0, c[1] as c_1, c[2] as c_2
from test_df
"""
con.execute(sql).fetch_df()
```
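For what it's worth, routing the same frame through pyarrow first appears to sidestep the problem (a sketch; column `c` is left out because it mixes `NaN` floats with arrays, which Arrow may refuse to infer a list type for):

```python
import pyarrow as pa

# convert the unambiguous columns to an Arrow table; the numpy arrays
# in "b" survive as a proper list type
test_arrow = pa.Table.from_pandas(test_df[["a", "b"]])
con.execute("select b[1] as b_1, b[2] as b_2 from test_arrow").fetch_df()
```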
Parquet -> DuckDB (works correctly!)
```python
import duckdb
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, Row

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

data_list = [
    {"a": ['robin', 'john'], "b": ['robyn', 'james']},
]
df = spark.createDataFrame(Row(**x) for x in data_list)
df.createOrReplaceTempView("df")

# write out to parquet and let duckdb scan the files directly
df = df.repartition(1)
df.write.mode('overwrite').parquet("test_parquet/")

con = duckdb.connect()
sql = """
select
    a, b,
    a[0] as a_0, a[1] as a_1,
    b[0] as b_0, b[1] as b_1
from 'test_parquet/*.parquet';
"""
con.execute(sql).fetch_df()
```
| | a | b | a_0 | a_1 | b_0 | b_1 |
|---:|:------------------|:-------------------|------:|:------|------:|:------|
| 0 | ['robin', 'john'] | ['robyn', 'james'] | nan | robin | nan | robyn |
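(The `nan` in `a_0` and `b_0` is expected: DuckDB list indexing is 1-based, so index 0 is out of range and the first element is `a[1]`.)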
Spark -> Pandas -> DuckDB (does not work)
```python
import duckdb
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, Row

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

data_list = [
    {"a": ['robin', 'john'], "b": ['robyn', 'james']},
]
df = spark.createDataFrame(Row(**x) for x in data_list)
df.createOrReplaceTempView("df")

# this time, hand the data to duckdb via a pandas frame
df_as_pandas = df.toPandas()
con = duckdb.connect()
con.register("test_df", df_as_pandas)

sql = """
select
    a, b,
    a[0] as a_0, a[1] as a_1,
    b[0] as b_0, b[1] as b_1
from test_df;
"""
print(con.execute(sql).fetch_df().to_markdown())
```
| | a | b | a_0 | a_1 | b_0 | b_1 |
|---:|:------------------|:-------------------|:------|:------|:------|:------|
| 0 | ['robin', 'john'] | ['robyn', 'james'] | | [ | | [ |
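As the next two examples show, converting the pandas frame to an Arrow table before querying avoids this.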
Parquet -> Arrow -> DuckDB (works as intended!)
```python
import duckdb
import pyarrow.parquet as pq

con = duckdb.connect()

# write the same list columns out to parquet from duckdb itself
sql = """
COPY (
    SELECT ['tom', NULL, 'robin'] as a,
           ['thomas', 'robyn'] as b
) TO 'test.parquet'
"""
con.execute(sql)

# read the file into an Arrow table and query that directly
f = "test.parquet"
test_arrow = pq.read_table(f)

sql = """
select
    a, b,
    a[0] as a_0,
    a[1] as a_1,
    b[0] as b_0,
    b[1] as b_1
from test_arrow
"""
con.execute(sql).fetch_df()
```
| | a | b | a_0 | a_1 | b_0 | b_1 |
|---|------------------------|---------------------|-----|-----|--------|-------|
| 0 | ['tom', None, 'robin'] | ['thomas', 'robyn'] | tom | | thomas | robyn |
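As a cross-check (reusing `con` and `test_arrow` from above), the types on the Arrow path come through as proper lists:

```python
# on the Arrow path the columns keep their list type (e.g. VARCHAR[])
print(con.execute("select typeof(a) as type_a, typeof(b) as type_b from test_arrow").fetchall())
```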
DuckDB -> Pandas -> Arrow -> DuckDB (also works!)
```python
import duckdb
import pandas as pd
import pyarrow as pa

con = duckdb.connect()
sql = """
CREATE TABLE testing AS (
    SELECT ['tom', NULL, 'robin'] as a,
           ['thomas', 'robyn'] as b
)
"""
con.execute(sql)

# pull into pandas as before, but convert on to Arrow before querying
test = con.execute("select * from testing").fetch_df()
test_arrow = pa.Table.from_pandas(test)

sql = """
select
    a, b,
    a[0] as a_0,
    a[1] as a_1,
    b[0] as b_0,
    b[1] as b_1
from test_arrow
"""
con.execute(sql).fetch_df()
```
| | a | b | a_0 | a_1 | b_0 | b_1 |
|---|------------------------|---------------------|-----|-----|--------|-------|
| 0 | ['tom', None, 'robin'] | ['thomas', 'robyn'] | tom | | thomas | robyn |
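Until this is fixed, a possible workaround is to always route pandas frames through Arrow before handing them to DuckDB. A minimal sketch (`register_df` is a hypothetical helper, not part of splink or duckdb):

```python
import pyarrow as pa

def register_df(con, name, df):
    # hypothetical helper: convert through Arrow so list columns keep
    # their LIST type instead of being stringified
    con.register(name, pa.Table.from_pandas(df))
```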
OS:
iOS
Splink version:
3.0.1
Have you tried this on the latest master branch?
- [X] I agree
Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?
- [X] I agree