
Arrays/structs break when loading pandas df into `duckdb`


What happens?

When loading a pandas df with array columns into duckdb, the array columns appear to be converted into strings.

So an array [1234, 5678] becomes the string "[1234, 5678]", and indexing then operates on characters: Array[0] gives "", Array[1] gives "[", Array[2] gives "1", and so on.

It's similar behaviour to this, where pandas objects are being incorrectly converted into their corresponding duckdb types.
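
A quick way to confirm what duckdb has inferred for a registered list column is duckdb's typeof() function. A minimal sketch (the commented output is what I would expect if the conversion goes wrong as described):

import duckdb
import pandas as pd

con = duckdb.connect()
df = pd.DataFrame({"a": [[1234, 5678]]})
con.register("df", df)

# reports VARCHAR when the bug is present; a correct conversion
# should report a LIST type instead
print(con.execute("select typeof(a) from df").fetchone()[0])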

To Reproduce

DuckDB -> Pandas -> DuckDB

import duckdb
import pandas as pd

con = duckdb.connect()

sql = """
CREATE TABLE testing AS (
    SELECT ['tom', NULL, 'robin'] as a,
    ['thomas', 'robyn'] as b
)
"""

con.execute(sql)
test = con.execute("select * from testing").fetch_df()
con.register("test_df", test)

sql = """
select
a, b,
a[0] as a_0,
a[1] as a_1,
b[0] as b_0,
b[1] as b_1
from test_df
"""

con.execute(sql).fetch_df()
|   | a                      | b                   | a_0 | a_1 | b_0 | b_1 |
|---|------------------------|---------------------|-----|-----|-----|-----|
| 0 | ['tom', None, 'robin'] | ['thomas', 'robyn'] | [   | '   | [   | '   |
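
For what it's worth, the round-tripped frame stores the lists in object-dtype columns, which is presumably what the registration step mishandles. A quick check (assuming the snippet above has just been run):

print(test.dtypes)              # both a and b come back as object columns
print(type(test["a"].iloc[0]))  # how the list is materialised (e.g. a numpy array)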

Pandas -> DuckDB

import duckdb
import pandas as pd
import numpy as np

test = {
    "a": [1, 2, 3, 4, 5],
    "b": [np.array([1,2,3]), np.array([2]), np.array([3]), np.array([4,6]), np.array([5])],
    "c": [np.nan, np.nan, np.array([4,5,6]), np.nan, np.array([1,2,3])],
}

test_df = pd.DataFrame(test)

con = duckdb.connect()
con.register("test_df", test_df)

sql = """
select b, c, 
b[0] as b_0, b[1] as b_1, b[2] as b_2,
c[0] as c_0, c[1] as c_1, c[2] as c_2
from test_df
"""

con.execute(sql).fetch_df()
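
Note that column c mixes float np.nan values with arrays, so pandas has to store it as an object column. A sketch of how to see what duckdb inferred for these columns, again via typeof():

print(test_df.dtypes)  # b and c are object-dtype columns
# the types duckdb assigned to the registered columns
print(con.execute("select typeof(b) as b_type, typeof(c) as c_type from test_df limit 1").fetch_df())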

Parquet -> DuckDB (works correctly!)

from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
import duckdb

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

data_list = [
    {"a": ['robin', 'john'], "b": ['robyn', 'james']},
]

df = spark.createDataFrame(Row(**x) for x in data_list)
df.createOrReplaceTempView("df")
df = df.repartition(1)
df.write.mode('overwrite').parquet("test_parquet/")

con = duckdb.connect()

sql = """
select 
a, b,
a[0] as a_0, a[1] as a_1,
b[0] as b_0, b[1] as b_1
from 'test_parquet/*.parquet';
"""

con.execute(sql).fetch_df()
|    | a                 | b                  |   a_0 | a_1   |   b_0 | b_1   |
|---:|:------------------|:-------------------|------:|:------|------:|:------|
|  0 | ['robin', 'john'] | ['robyn', 'james'] |   nan | robin |   nan | robyn |
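
Presumably this path works because duckdb reads the parquet file natively, so the list types come straight from the parquet schema rather than through pandas. A sketch to confirm the inferred schema (assuming DESCRIBE works on a SELECT in this version):

print(con.execute("DESCRIBE SELECT * FROM 'test_parquet/*.parquet'").fetch_df())
# I'd expect column_type values along the lines of VARCHAR[] for a and b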

Spark -> Pandas -> DuckDB (does not work)

from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
import duckdb

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

data_list = [
    {"a": ['robin', 'john'], "b": ['robyn', 'james']},
]

df = spark.createDataFrame(Row(**x) for x in data_list)
df.createOrReplaceTempView("df")

df_as_pandas = df.toPandas()
con = duckdb.connect()
con.register("test_df", df_as_pandas)

sql = """
select 
a, b,
a[0] as a_0, a[1] as a_1,
b[0] as b_0, b[1] as b_1
from test_df;
"""

print(con.execute(sql).fetch_df().to_markdown())
|    | a                 | b                  | a_0   | a_1   | b_0   | b_1   |
|---:|:------------------|:-------------------|:------|:------|:------|:------|
|  0 | ['robin', 'john'] | ['robyn', 'james'] |       | [     |       | [     |
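
A workaround that seems consistent with the Arrow examples below: convert the Spark-derived pandas frame to a pyarrow table before handing it to duckdb.

import pyarrow as pa

test_arrow = pa.Table.from_pandas(df_as_pandas)
# duckdb can scan the local variable test_arrow directly
print(con.execute("select a, b from test_arrow").fetch_df())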

Parquet -> Arrow -> DuckDB (works as intended!)

import duckdb
import pyarrow.parquet as pq

con = duckdb.connect()
sql = """
COPY (
    SELECT ['tom', NULL, 'robin'] as a,
    ['thomas', 'robyn'] as b
) TO 'test.parquet'
"""
con.execute(sql)

f = "test.parquet"
test_arrow = pq.read_table(f)

sql = """
select
a, b,
a[0] as a_0,
a[1] as a_1,
b[0] as b_0,
b[1] as b_1
from test_arrow
"""
con.execute(sql).fetch_df()
|   | a                      | b                   | a_0 | a_1 | b_0    | b_1   |
|---|------------------------|---------------------|-----|-----|--------|-------|
| 0 | ['tom', None, 'robin'] | ['thomas', 'robyn'] | tom |     | thomas | robyn |

DuckDB -> Pandas -> Arrow -> DuckDB (also works!)

import duckdb
import pandas as pd
import pyarrow as pa

con = duckdb.connect()

sql = """
CREATE TABLE testing AS (
    SELECT ['tom', NULL, 'robin'] as a,
    ['thomas', 'robyn'] as b
)
"""

con.execute(sql)
test = con.execute("select * from testing").fetch_df()
test_arrow = pa.Table.from_pandas(test)

sql = """
select
a, b,
a[0] as a_0,
a[1] as a_1,
b[0] as b_0,
b[1] as b_1
from test_arrow
"""
con.execute(sql).fetch_df()
|   | a                      | b                   | a_0 | a_1 | b_0    | b_1   |
|---|------------------------|---------------------|-----|-----|--------|-------|
| 0 | ['tom', None, 'robin'] | ['thomas', 'robyn'] | tom |     | thomas | robyn |
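
Given the above, a small helper can route every pandas frame through pyarrow before it reaches duckdb. A sketch (register_via_arrow is a made-up name, not part of either library's API):

import pyarrow as pa

def register_via_arrow(con, name, df):
    # hypothetical workaround: convert to an Arrow table first so that
    # list/struct columns keep their nested types when duckdb scans them
    con.register(name, pa.Table.from_pandas(df))

register_via_arrow(con, "test_df", test)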

OS:

macOS

Splink version:

3.0.1

Have you tried this on the latest master branch?

  • [X] I agree

Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?

  • [X] I agree

ThomasHepworth, Aug 01 '22 11:08