
Arrays/structs break when loading pandas df into `duckdb`


What happens?

When loading a pandas df with array columns into duckdb, the array columns appear to be converted into strings.

So an array [1234, 5678] becomes the string "[1234, 5678]", and indexing then operates on characters: Array[0] gives "", Array[1] gives "[", Array[2] gives "1", and so on.

It's similar behaviour to this, where pandas objects are being incorrectly converted into their corresponding duckdb types.
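
A quick way to confirm what duckdb has inferred for a registered list column is duckdb's typeof() function. A minimal sketch (the commented output is what I would expect if the conversion goes wrong as described):

import duckdb
import pandas as pd

con = duckdb.connect()
df = pd.DataFrame({"a": [[1234, 5678]]})
con.register("df", df)

# reports VARCHAR when the bug is present; a correct conversion
# should report a LIST type instead
print(con.execute("select typeof(a) from df").fetchone()[0])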

To Reproduce

DuckDB -> Pandas -> DuckDB

import duckdb
import pandas as pd

con = duckdb.connect()

sql = """
CREATE TABLE testing AS (
    SELECT ['tom', NULL, 'robin'] as a,
    ['thomas', 'robyn'] as b
)
"""

con.execute(sql)
test = con.execute("select * from testing").fetch_df()
con.register("test_df", test)

sql = """
select
a, b,
a[0] as a_0,
a[1] as a_1,
b[0] as b_0,
b[1] as b_1
from test_df
"""

con.execute(sql).fetch_df()
|   | a                      | b                   | a_0 | a_1 | b_0 | b_1 |
|---|------------------------|---------------------|-----|-----|-----|-----|
| 0 | ['tom', None, 'robin'] | ['thomas', 'robyn'] | [   | '   | [   | '   |
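
For what it's worth, the round-tripped frame stores the lists in object-dtype columns, which is presumably what the registration step mishandles. A quick check (assuming the snippet above has just been run):

print(test.dtypes)              # both a and b come back as object columns
print(type(test["a"].iloc[0]))  # how the list is materialised (e.g. a numpy array)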

Pandas -> DuckDB

import duckdb
import pandas as pd
import numpy as np

test = {
    "a": [1, 2, 3, 4, 5],
    "b": [np.array([1,2,3]), np.array([2]), np.array([3]), np.array([4,6]), np.array([5])],
    "c": [np.nan, np.nan, np.array([4,5,6]), np.nan, np.array([1,2,3])],
}

test_df = pd.DataFrame(test)

con = duckdb.connect()
con.register("test_df", test_df)

sql = """
select b, c, 
b[0] as b_0, b[1] as b_1, b[2] as b_2,
c[0] as c_0, c[1] as c_1, c[2] as c_2
from test_df
"""

con.execute(sql).fetch_df()
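
Note that column c mixes float np.nan values with arrays, so pandas has to store it as an object column. A sketch of how to see what duckdb inferred for these columns, again via typeof():

print(test_df.dtypes)  # b and c are object-dtype columns
# the types duckdb assigned to the registered columns
print(con.execute("select typeof(b) as b_type, typeof(c) as c_type from test_df limit 1").fetch_df())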

Parquet -> DuckDB (works correctly!)

from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
import duckdb

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

data_list = [
    {"a": ['robin', 'john'], "b": ['robyn', 'james']},
]

df = spark.createDataFrame(Row(**x) for x in data_list)
df.createOrReplaceTempView("df")
df = df.repartition(1)
df.write.mode('overwrite').parquet("test_parquet/")

con = duckdb.connect()

sql = """
select 
a, b,
a[0] as a_0, a[1] as a_1,
b[0] as b_0, b[1] as b_1
from 'test_parquet/*.parquet';
"""

con.execute(sql).fetch_df()
|    | a                 | b                  |   a_0 | a_1   |   b_0 | b_1   |
|---:|:------------------|:-------------------|------:|:------|------:|:------|
|  0 | ['robin', 'john'] | ['robyn', 'james'] |   nan | robin |   nan | robyn |
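
Presumably this path works because duckdb reads the parquet file natively, so the list types come straight from the parquet schema rather than through pandas. A sketch to confirm the inferred schema (assuming DESCRIBE works on a SELECT in this version):

print(con.execute("DESCRIBE SELECT * FROM 'test_parquet/*.parquet'").fetch_df())
# I'd expect column_type values along the lines of VARCHAR[] for a and b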

Spark -> Pandas -> DuckDB (does not work)

from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
import duckdb

sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

data_list = [
    {"a": ['robin', 'john'], "b": ['robyn', 'james']},
]

df = spark.createDataFrame(Row(**x) for x in data_list)
df.createOrReplaceTempView("df")

df_as_pandas = df.toPandas()
con = duckdb.connect()
con.register("test_df", df_as_pandas)

sql = """
select 
a, b,
a[0] as a_0, a[1] as a_1,
b[0] as b_0, b[1] as b_1
from test_df;
"""

print(con.execute(sql).fetch_df().to_markdown())
|    | a                 | b                  | a_0   | a_1   | b_0   | b_1   |
|---:|:------------------|:-------------------|:------|:------|:------|:------|
|  0 | ['robin', 'john'] | ['robyn', 'james'] |       | [     |       | [     |
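
A workaround that seems consistent with the Arrow examples below: convert the Spark-derived pandas frame to a pyarrow table before handing it to duckdb.

import pyarrow as pa

test_arrow = pa.Table.from_pandas(df_as_pandas)
# duckdb can scan the local variable test_arrow directly
print(con.execute("select a, b from test_arrow").fetch_df())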

Parquet -> Arrow -> DuckDB (works as intended!)

import duckdb
import pyarrow.parquet as pq

con = duckdb.connect()
sql = """
COPY (
    SELECT ['tom', NULL, 'robin'] as a,
    ['thomas', 'robyn'] as b
) TO 'test.parquet'
"""
con.execute(sql)

f = "test.parquet"
test_arrow = pq.read_table(f)

sql = """
select
a, b,
a[0] as a_0,
a[1] as a_1,
b[0] as b_0,
b[1] as b_1
from test_arrow
"""
con.execute(sql).fetch_df()
|   | a                      | b                   | a_0 | a_1 | b_0    | b_1   |
|---|------------------------|---------------------|-----|-----|--------|-------|
| 0 | ['tom', None, 'robin'] | ['thomas', 'robyn'] | tom |     | thomas | robyn |

DuckDB -> Pandas -> Arrow -> DuckDB (also works!)

import duckdb
import pandas as pd
import pyarrow as pa

con = duckdb.connect()

sql = """
CREATE TABLE testing AS (
    SELECT ['tom', NULL, 'robin'] as a,
    ['thomas', 'robyn'] as b
)
"""

con.execute(sql)
test = con.execute("select * from testing").fetch_df()
test_arrow = pa.Table.from_pandas(test)

sql = """
select
a, b,
a[0] as a_0,
a[1] as a_1,
b[0] as b_0,
b[1] as b_1
from test_arrow
"""
con.execute(sql).fetch_df()
|   | a                      | b                   | a_0 | a_1 | b_0    | b_1   |
|---|------------------------|---------------------|-----|-----|--------|-------|
| 0 | ['tom', None, 'robin'] | ['thomas', 'robyn'] | tom |     | thomas | robyn |
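
Given the above, a small helper can route every pandas frame through pyarrow before it reaches duckdb. A sketch (register_via_arrow is a made-up name, not part of either library's API):

import pyarrow as pa

def register_via_arrow(con, name, df):
    # hypothetical workaround: convert to an Arrow table first so that
    # list/struct columns keep their nested types when duckdb scans them
    con.register(name, pa.Table.from_pandas(df))

register_via_arrow(con, "test_df", test)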

OS:

macOS

Splink version:

3.0.1

Have you tried this on the latest master branch?

  • [X] I agree

Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?

  • [X] I agree

ThomasHepworth, Aug 01 '22 11:08