bug After a chain join, executing mutate will cause an error.
What happened?
after chain join
df = df_user.join(df_portrait, df_user.user_id == df_portrait.user_id, how='inner') \
.join(df_user_shift_7d, _.user_id == df_user_shift_7d.user_id_shift_7d, how='left') \
.join(df_portrait_shift_7d, _.user_id == df_portrait_shift_7d.user_id_shift_7d, how='left')
DatabaseError: :HTTPDriver for http://cc-j6c9l15edn66o817k.public.clickhouse.ads.aliyuncs.com:8123/ returned response code 404) Code: 47. DB::Exception: There's no column 't10.user_id' in table 't10': While processing t10.user_id. (UNKNOWN_IDENTIFIER) (version 22.8.5.29)
What version of ibis are you using?
9.0.0
What backend(s) are you using, if any?
clickhouse
Relevant log output
No response
Code of Conduct
- [X] I agree to follow this project's Code of Conduct
Thanks for reporting @stereoF !
Could you provide some sample tables as text for us to use to help debug this? And if possible, if you can paste the tracebacks, etc, as text, it's much easier for us to read and dig into.
Thanks!
@gforsyth
df_user = ibis.memtable({"user_id": [1, 2, 3, 4], "account_id": ['1', '2', '3', '4']})
df_portrait = ibis.memtable({"user_id": [1, 2, 3, 4], "account_id": ['1', '2', '3', '4']})
df_user_shift_7d = ibis.memtable({"user_id_shift_7d": [1, 2, 3, 4], "account_id_shift_7d": ['1', '2', '3', '4']})
df_portrait_shift_7d = ibis.memtable({"user_id_shift_7d": [1, 2, 3, 4], "account_id_shift_7d": ['1', '2', '3', '4']})
df = df_user.join(df_portrait, df_user.user_id == df_portrait.user_id, how='inner') \
.join(df_user_shift_7d, _.user_id == df_user_shift_7d.user_id_shift_7d, how='left') \
.join(df_portrait_shift_7d, _.user_id == df_portrait_shift_7d.user_id_shift_7d, how='left')
df_user.mutate(id2=df_user.account_id).head()
┏━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━┓ ┃ user_id ┃ account_id ┃ id2 ┃ ┡━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━┩ │ int64 │ string │ string │ ├─────────┼────────────┼────────┤ │ 1 │ 1 │ 1 │ │ 2 │ 2 │ 2 │ │ 3 │ 3 │ 3 │ │ 4 │ 4 │ 4 │ └─────────┴────────────┴────────┘
df.mutate(id2=df_user.account_id).head()
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮ │ /home/zhongminhu/.pyenv/versions/3.11.7/envs/venv311/lib/python3.11/site-packages/ibis/expr/type │ │ s/core.py:99 in __rich_console__ │ │ │ │ 96 │ │ │ │ 97 │ │ try: │ │ 98 │ │ │ if opts.interactive: │ │ ❱ 99 │ │ │ │ rich_object = to_rich(self, console_width=console_width) │ │ 100 │ │ │ else: │ │ 101 │ │ │ │ rich_object = Text(self._noninteractive_repr()) │ │ 102 │ │ except Exception as e: │ │ │ │ /home/zhongminhu/.pyenv/versions/3.11.7/envs/venv311/lib/python3.11/site-packages/ibis/expr/type │ │ s/pretty.py:271 in to_rich │ │ │ │ 268 │ │ │ expr, max_length=max_length, max_string=max_string, max_depth=max_depth │ │ 269 │ │ ) │ │ 270 │ else: │ │ ❱ 271 │ │ return _to_rich_table( │ │ 272 │ │ │ expr, │ │ 273 │ │ │ max_rows=max_rows, │ │ 274 │ │ │ max_columns=max_columns, │ │ │ │ /home/zhongminhu/.pyenv/versions/3.11.7/envs/venv311/lib/python3.11/site-packages/ibis/expr/type │ │ s/pretty.py:342 in _to_rich_table │ │ │ │ 339 │ │ if orig_ncols > len(computed_cols): │ │ 340 │ │ │ table = table.select(*computed_cols) │ │ 341 │ │ │ ❱ 342 │ result = table.limit(max_rows + 1).to_pyarrow() │ │ 343 │ # Now format the columns in order, stopping if the console width would │ │ 344 │ # be exceeded. │ │ 345 │ col_info = [] │ │ │ │ /home/zhongminhu/.pyenv/versions/3.11.7/envs/venv311/lib/python3.11/site-packages/ibis/expr/type │ │ s/core.py:486 in to_pyarrow │ │ │ │ 483 │ │ Table │ │ 484 │ │ │ A pyarrow table holding the results of the executed expression. │ │ 485 │ │ """ │ │ ❱ 486 │ │ return self._find_backend(use_default=True).to_pyarrow( │ │ 487 │ │ │ self, params=params, limit=limit, **kwargs │ │ 488 │ │ ) │ │ 489 │ │ │ │ /home/zhongminhu/.pyenv/versions/3.11.7/envs/venv311/lib/python3.11/site-packages/ibis/backends/ │ │ clickhouse/__init__.py:290 in to_pyarrow │ │ │ │ 287 │ │ │ external_tables=external_tables, │ │ 288 │ │ │ **kwargs, │ │ 289 │ │ ) as reader: │ │ ❱ 290 │ │ │ table = reader.read_all() │ │ 291 │ │ │ │ 292 │ │ return expr.__pyarrow_result__(table) │ │ 293 │ │ │ │ in pyarrow.lib.RecordBatchReader.read_all:757 │ │ │ │ in pyarrow.lib._datatype_to_pep3118:88 │ │ │ │ /home/zhongminhu/.pyenv/versions/3.11.7/envs/venv311/lib/python3.11/site-packages/ibis/backends/ │ │ clickhouse/__init__.py:357 in batcher │ │ │ │ 354 │ │ │ # readonly != 1 means that the server setting is writable │ │ 355 │ │ │ if self.con.server_settings["max_block_size"].readonly != 1: │ │ 356 │ │ │ │ settings["max_block_size"] = chunk_size │ │ ❱ 357 │ │ │ with self.con.query_column_block_stream( │ │ 358 │ │ │ │ sql, external_data=external_data, settings=settings │ │ 359 │ │ │ ) as blocks: │ │ 360 │ │ │ │ yield from map( │ │ │ │ /home/zhongminhu/.pyenv/versions/3.11.7/envs/venv311/lib/python3.11/site-packages/clickhouse_con │ │ nect/driver/client.py:212 in query_column_block_stream │ │ │ │ 209 │ │ parameters, see the create_query_context method. │ │ 210 │ │ :return: StreamContext -- Iterable stream context that returns column oriented b │ │ 211 │ │ """ │ │ ❱ 212 │ │ return self._context_query(locals(), use_numpy=False, streaming=True).column_blo │ │ 213 │ │ │ 214 │ def query_row_block_stream(self, │ │ 215 │ │ │ │ │ │ │ query: str = None, │ │ │ │ /home/zhongminhu/.pyenv/versions/3.11.7/envs/venv311/lib/python3.11/site-packages/clickhouse_con │ │ nect/driver/client.py:721 in _context_query │ │ │ │ 718 │ │ kwargs = lcls.copy() │ │ 719 │ │ kwargs.pop('self') │ │ 720 │ │ kwargs.update(overrides) │ │ ❱ 721 │ │ return self._query_with_context((self.create_query_context(**kwargs))) │ │ 722 │ │ │ 723 │ def __enter__(self): │ │ 724 │ │ return self │ │ │ │ /home/zhongminhu/.pyenv/versions/3.11.7/envs/venv311/lib/python3.11/site-packages/clickhouse_con │ │ nect/driver/httpclient.py:213 in _query_with_context │ │ │ │ 210 │ │ │ body = final_query │ │ 211 │ │ │ fields = None │ │ 212 │ │ │ headers['Content-Type'] = 'text/plain; charset=utf-8' │ │ ❱ 213 │ │ response = self._raw_request(body, │ │ 214 │ │ │ │ │ │ │ │ │ params, │ │ 215 │ │ │ │ │ │ │ │ │ headers, │ │ 216 │ │ │ │ │ │ │ │ │ stream=True, │ │ │ │ /home/zhongminhu/.pyenv/versions/3.11.7/envs/venv311/lib/python3.11/site-packages/clickhouse_con │ │ nect/driver/httpclient.py:437 in _raw_request │ │ │ │ 434 │ │ │ elif error_handler: │ │ 435 │ │ │ │ error_handler(response) │ │ 436 │ │ │ else: │ │ ❱ 437 │ │ │ │ self._error_handler(response) │ │ 438 │ │ │ 439 │ def ping(self): │ │ 440 │ │ """ │ │ │ │ /home/zhongminhu/.pyenv/versions/3.11.7/envs/venv311/lib/python3.11/site-packages/clickhouse_con │ │ nect/driver/httpclient.py:361 in _error_handler │ │ │ │ 358 │ │ if err_content: │ │ 359 │ │ │ err_msg = common.format_error(err_content.decode(errors='backslashreplace')) │ │ 360 │ │ │ err_str = f':{err_str}\n {err_msg}' │ │ ❱ 361 │ │ raise OperationalError(err_str) if retried else DatabaseError(err_str) from None │ │ 362 │ │ │ 363 │ def _raw_request(self, │ │ 364 │ │ │ │ │ data, │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ DatabaseError: :HTTPDriver for http://cc-j6c9l15edn66o817k.public.clickhouse.ads.aliyuncs.com:8123 returned response code 404) Code: 47. DB::Exception: There's no column 't8.user_id' in table 't8': While processing t8.user_id. (UNKNOWN_IDENTIFIER) (version 22.8.5.29)
@gforsyth Not only mutate, group or other functions are also raise this kind of error after chain join
@gforsyth Hi, Is there a temporary solution before the fixing?
Hi @stereoF -- it's not clear to me that there's a bug here, except that maybe we should be raising an error message?
This line:
df.mutate(id2=df_user.account_id).head()
isn't going to work because df and df_user are different tables.
Hi @stereoF -- it's not clear to me that there's a bug here, except that maybe we should be raising an error message?
This line:
df.mutate(id2=df_user.account_id).head()isn't going to work because
dfanddf_userare different tables.
sorry, it's a clerical error. But It raise the same error when using
df.mutate(id2=df.account_id).head()
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮ │ /home/zhongminhu/.pyenv/versions/3.11.7/envs/venv311/lib/python3.11/site-packages/ibis/expr/type │ │ s/core.py:99 in __rich_console__ │ │ │ │ 96 │ │ │ │ 97 │ │ try: │ │ 98 │ │ │ if opts.interactive: │ │ ❱ 99 │ │ │ │ rich_object = to_rich(self, console_width=console_width) │ │ 100 │ │ │ else: │ │ 101 │ │ │ │ rich_object = Text(self._noninteractive_repr()) │ │ 102 │ │ except Exception as e: │ │ │ │ /home/zhongminhu/.pyenv/versions/3.11.7/envs/venv311/lib/python3.11/site-packages/ibis/expr/type │ │ s/pretty.py:271 in to_rich │ │ │ │ 268 │ │ │ expr, max_length=max_length, max_string=max_string, max_depth=max_depth │ │ 269 │ │ ) │ │ 270 │ else: │ │ ❱ 271 │ │ return _to_rich_table( │ │ 272 │ │ │ expr, │ │ 273 │ │ │ max_rows=max_rows, │ │ 274 │ │ │ max_columns=max_columns, │ │ │ │ /home/zhongminhu/.pyenv/versions/3.11.7/envs/venv311/lib/python3.11/site-packages/ibis/expr/type │ │ s/pretty.py:342 in _to_rich_table │ │ │ │ 339 │ │ if orig_ncols > len(computed_cols): │ │ 340 │ │ │ table = table.select(*computed_cols) │ │ 341 │ │ │ ❱ 342 │ result = table.limit(max_rows + 1).to_pyarrow() │ │ 343 │ # Now format the columns in order, stopping if the console width would │ │ 344 │ # be exceeded. │ │ 345 │ col_info = [] │ │ │ │ /home/zhongminhu/.pyenv/versions/3.11.7/envs/venv311/lib/python3.11/site-packages/ibis/expr/type │ │ s/core.py:486 in to_pyarrow │ │ │ │ 483 │ │ Table │ │ 484 │ │ │ A pyarrow table holding the results of the executed expression. │ │ 485 │ │ """ │ │ ❱ 486 │ │ return self._find_backend(use_default=True).to_pyarrow( │ │ 487 │ │ │ self, params=params, limit=limit, **kwargs │ │ 488 │ │ ) │ │ 489 │ │ │ │ /home/zhongminhu/.pyenv/versions/3.11.7/envs/venv311/lib/python3.11/site-packages/ibis/backends/ │ │ clickhouse/__init__.py:290 in to_pyarrow │ │ │ │ 287 │ │ │ external_tables=external_tables, │ │ 288 │ │ │ **kwargs, │ │ 289 │ │ ) as reader: │ │ ❱ 290 │ │ │ table = reader.read_all() │ │ 291 │ │ │ │ 292 │ │ return expr.__pyarrow_result__(table) │ │ 293 │ │ │ │ in pyarrow.lib.RecordBatchReader.read_all:757 │ │ │ │ in pyarrow.lib._datatype_to_pep3118:88 │ │ │ │ /home/zhongminhu/.pyenv/versions/3.11.7/envs/venv311/lib/python3.11/site-packages/ibis/backends/ │ │ clickhouse/__init__.py:357 in batcher │ │ │ │ 354 │ │ │ # readonly != 1 means that the server setting is writable │ │ 355 │ │ │ if self.con.server_settings["max_block_size"].readonly != 1: │ │ 356 │ │ │ │ settings["max_block_size"] = chunk_size │ │ ❱ 357 │ │ │ with self.con.query_column_block_stream( │ │ 358 │ │ │ │ sql, external_data=external_data, settings=settings │ │ 359 │ │ │ ) as blocks: │ │ 360 │ │ │ │ yield from map( │ │ │ │ /home/zhongminhu/.pyenv/versions/3.11.7/envs/venv311/lib/python3.11/site-packages/clickhouse_con │ │ nect/driver/client.py:212 in query_column_block_stream │ │ │ │ 209 │ │ parameters, see the create_query_context method. │ │ 210 │ │ :return: StreamContext -- Iterable stream context that returns column oriented b │ │ 211 │ │ """ │ │ ❱ 212 │ │ return self._context_query(locals(), use_numpy=False, streaming=True).column_blo │ │ 213 │ │ │ 214 │ def query_row_block_stream(self, │ │ 215 │ │ │ │ │ │ │ query: str = None, │ │ │ │ /home/zhongminhu/.pyenv/versions/3.11.7/envs/venv311/lib/python3.11/site-packages/clickhouse_con │ │ nect/driver/client.py:721 in _context_query │ │ │ │ 718 │ │ kwargs = lcls.copy() │ │ 719 │ │ kwargs.pop('self') │ │ 720 │ │ kwargs.update(overrides) │ │ ❱ 721 │ │ return self._query_with_context((self.create_query_context(**kwargs))) │ │ 722 │ │ │ 723 │ def __enter__(self): │ │ 724 │ │ return self │ │ │ │ /home/zhongminhu/.pyenv/versions/3.11.7/envs/venv311/lib/python3.11/site-packages/clickhouse_con │ │ nect/driver/httpclient.py:213 in _query_with_context │ │ │ │ 210 │ │ │ body = final_query │ │ 211 │ │ │ fields = None │ │ 212 │ │ │ headers['Content-Type'] = 'text/plain; charset=utf-8' │ │ ❱ 213 │ │ response = self._raw_request(body, │ │ 214 │ │ │ │ │ │ │ │ │ params, │ │ 215 │ │ │ │ │ │ │ │ │ headers, │ │ 216 │ │ │ │ │ │ │ │ │ stream=True, │ │ │ │ /home/zhongminhu/.pyenv/versions/3.11.7/envs/venv311/lib/python3.11/site-packages/clickhouse_con │ │ nect/driver/httpclient.py:437 in _raw_request │ │ │ │ 434 │ │ │ elif error_handler: │ │ 435 │ │ │ │ error_handler(response) │ │ 436 │ │ │ else: │ │ ❱ 437 │ │ │ │ self._error_handler(response) │ │ 438 │ │ │ 439 │ def ping(self): │ │ 440 │ │ """ │ │ │ │ /home/zhongminhu/.pyenv/versions/3.11.7/envs/venv311/lib/python3.11/site-packages/clickhouse_con │ │ nect/driver/httpclient.py:361 in _error_handler │ │ │ │ 358 │ │ if err_content: │ │ 359 │ │ │ err_msg = common.format_error(err_content.decode(errors='backslashreplace')) │ │ 360 │ │ │ err_str = f':{err_str}\n {err_msg}' │ │ ❱ 361 │ │ raise OperationalError(err_str) if retried else DatabaseError(err_str) from None │ │ 362 │ │ │ 363 │ def _raw_request(self, │ │ 364 │ │ │ │ │ data, │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ DatabaseError: :HTTPDriver for http://cc-j6c9l15edn66o817k.public.clickhouse.ads.aliyuncs.com:8123 returned response code 404) Code: 47. DB::Exception: There's no column 't8.user_id' in table 't8': While processing t8.user_id. (UNKNOWN_IDENTIFIER) (version 22.8.5.29)
Hey @stereoF -- sorry for the delay, we were at PyCon last week and it was pretty busy.
This is the SQL we're generating for Clickhouse for adding an id2 column:
[ins] In [62]: expr = df.mutate(id2=df.account_id)
[ins] In [63]: ibis.to_sql(expr)
Out[63]:
SELECT
"t8"."user_id",
"t8"."account_id",
"t8"."account_id_right",
"t8"."user_id_shift_7d",
"t8"."account_id_shift_7d",
"t8"."user_id_shift_7d_right",
"t8"."account_id_shift_7d_right",
"t8"."account_id" AS "id2"
FROM (
SELECT
"t4"."user_id",
"t4"."account_id",
"t5"."account_id" AS "account_id_right",
"t6"."user_id_shift_7d",
"t6"."account_id_shift_7d",
"t7"."user_id_shift_7d" AS "user_id_shift_7d_right",
"t7"."account_id_shift_7d" AS "account_id_shift_7d_right"
FROM "ibis_pandas_memtable_ctp55e5sc5cxfkyb7moevbl6f4" AS "t4"
INNER JOIN "ibis_pandas_memtable_7eg6rbbl2rf75a5sweeerkinuu" AS "t5"
ON "t4"."user_id" = "t5"."user_id"
LEFT OUTER JOIN "ibis_pandas_memtable_etbsqsk6rrfr5njw5yzdgnpc7e" AS "t6"
ON "t4"."user_id" = "t6"."user_id_shift_7d"
LEFT OUTER JOIN "ibis_pandas_memtable_p3vq4uorpbgppkkj24ohpsb3ti" AS "t7"
ON "t4"."user_id" = "t7"."user_id_shift_7d"
) AS "t8"
I'm going to have to do some reading, but I know Clickhouse has uncommon behavior around subqueries and I suspect that's the core of the issue here.
This is the same issue that I recently fixed #9525 and is ultimately an upstream clickhouse bug.
Here's Ibis main:
In [12]: from ibis.interactive import *
...:
...: df_user = ibis.memtable({"user_id": [1, 2, 3, 4], "account_id": ['1', '2', '3', '4']})
...: df_portrait = ibis.memtable({"user_id": [1, 2, 3, 4], "account_id": ['1', '2', '3', '4']})
...: df_user_shift_7d = ibis.memtable({"user_id_shift_7d": [1, 2, 3, 4], "account_id_shift_7d": ['1', '2', '3', '4']})
...: df_portrait_shift_7d = ibis.memtable({"user_id_shift_7d": [1, 2, 3, 4], "account_id_shift_7d": ['1', '2', '3', '4']})
...:
...: df = df_user.join(df_portrait, df_user.user_id == df_portrait.user_id, how='inner') \
...: .join(df_user_shift_7d, _.user_id == df_user_shift_7d.user_id_shift_7d, how='left') \
...: .join(df_portrait_shift_7d, _.user_id == df_portrait_shift_7d.user_id_shift_7d, how='left')
In [13]: expr = df.mutate(id2=df.account_id)
In [14]: ibis.to_sql(expr, dialect="clickhouse")
Out[14]:
SELECT
"t8"."user_id" AS "user_id",
"t8"."account_id" AS "account_id",
"t8"."account_id_right" AS "account_id_right",
"t8"."user_id_shift_7d" AS "user_id_shift_7d",
"t8"."account_id_shift_7d" AS "account_id_shift_7d",
"t8"."user_id_shift_7d_right" AS "user_id_shift_7d_right",
"t8"."account_id_shift_7d_right" AS "account_id_shift_7d_right",
"t8"."account_id" AS "id2"
FROM (
SELECT
"t4"."user_id" AS "user_id",
"t4"."account_id" AS "account_id",
"t5"."account_id" AS "account_id_right",
"t6"."user_id_shift_7d" AS "user_id_shift_7d",
"t6"."account_id_shift_7d" AS "account_id_shift_7d",
"t7"."user_id_shift_7d" AS "user_id_shift_7d_right",
"t7"."account_id_shift_7d" AS "account_id_shift_7d_right"
FROM "ibis_pandas_memtable_jgk62zahc5fufhp24rltzg6jq4" AS "t4"
INNER JOIN "ibis_pandas_memtable_bt4ph3dj65bmbnflys5lxr4rzi" AS "t5"
ON "t4"."user_id" = "t5"."user_id"
LEFT OUTER JOIN "ibis_pandas_memtable_qvr3juuhh5brvgba32t6cfkjuq" AS "t6"
ON "t4"."user_id" = "t6"."user_id_shift_7d"
LEFT OUTER JOIN "ibis_pandas_memtable_2dj4c7mssvcrza6qowdcwf46cy" AS "t7"
ON "t4"."user_id" = "t7"."user_id_shift_7d"
) AS "t8"
In [15]: con = ibis.clickhouse.connect()
In [16]: con.execute(expr)
Out[16]:
user_id account_id account_id_right user_id_shift_7d account_id_shift_7d user_id_shift_7d_right account_id_shift_7d_right id2
0 1 1 1 1 1 1 1 1
1 2 2 2 2 2 2 2 2
2 3 3 3 3 3 3 3 3
3 4 4 4 4 4 4 4 4