ibis-bigquery
ibis-bigquery copied to clipboard
BigQuery: groupby can generate LEFT SEMI JOIN, which is not supported syntax
Failing test
https://github.com/ibis-project/ibis/blob/a70d443c7931cb8bb47c52f97999589566e03cb2/ibis/tests/all/test_aggregation.py#L266-L293
Test output
$ pytest ibis/tests/all/test_aggregation.py::test_topk_filter_op[BigQuery-string_col_filter_top3]
Output:
======================================================= test session starts =======================================================
platform darwin -- Python 3.7.8, pytest-5.4.3, py-1.9.0, pluggy-0.13.1
rootdir: /Users/swast/src/ibis, inifile: setup.cfg
plugins: forked-1.2.0, mock-3.1.1, cov-2.10.0, xdist-1.34.0
collected 1 item
ibis/tests/all/test_aggregation.py F [100%]
============================================================ FAILURES =============================================================
______________________________________ test_topk_filter_op[BigQuery-string_col_filter_top3] _______________________________________
backend = <ibis.tests.backends.BigQuery object at 0x7f818a7bb710>
alltypes = BigQueryTable[table]
name: swast-scratch.testing.functional_alltypes
schema:
index : int64
Unnamed_0 : int...4
date_string_col : string
string_col : string
timestamp_col : timestamp
year : int64
month : int64
df = index Unnamed_0 id bool_col tinyint_col ... date_string_col string_col timestamp_col year m... False 9 ... 01/31/10 9 2010-01-31 05:09:13.860 2010 1
[7300 rows x 15 columns]
result_fn = <function <lambda> at 0x7f818a803560>, expected_fn = <function <lambda> at 0x7f818a8035f0>
@pytest.mark.parametrize(
('result_fn', 'expected_fn'),
[
param(
lambda t: t[t.string_col.topk(3)],
lambda t: t[
t.string_col.isin(
t.groupby('string_col')['string_col'].count().head(3).index
)
],
id='string_col_filter_top3',
)
],
)
@pytest.mark.xfail_unsupported
# Issues ibis-project/ibis#2133 ibis-project/ibis#2132# ibis-project/ibis#2133
@pytest.mark.xfail_backends([Clickhouse, MySQL, Postgres])
@pytest.mark.skip_backends([SQLite], reason='Issue ibis-project/ibis#2128')
def test_topk_filter_op(backend, alltypes, df, result_fn, expected_fn):
# TopK expression will order rows by "count" but each backend
# can have different result for that.
# Note: Maybe would be good if TopK could order by "count"
# and the field used by TopK
t = alltypes.sort_by(alltypes.string_col)
df = df.sort_values('string_col')
> result = result_fn(t).execute()
ibis/tests/all/test_aggregation.py:291:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
ibis/expr/types.py:219: in execute
self, limit=limit, timecontext=timecontext, params=params, **kwargs
ibis/client.py:368: in execute
return backend.execute(expr, limit=limit, params=params, **kwargs)
ibis/client.py:221: in execute
result = self._execute_query(query, **kwargs)
ibis/client.py:228: in _execute_query
return query.execute()
ibis/bigquery/client.py:194: in execute
query_parameters=self.query_parameters,
ibis/bigquery/client.py:475: in _execute
query.result() # blocks until finished
../../miniconda3/envs/ibis-dev/lib/python3.7/site-packages/google/cloud/bigquery/job.py:3207: in result
super(QueryJob, self).result(retry=retry, timeout=timeout)
../../miniconda3/envs/ibis-dev/lib/python3.7/site-packages/google/cloud/bigquery/job.py:812: in result
return super(_AsyncJob, self).result(timeout=timeout)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <google.cloud.bigquery.job.QueryJob object at 0x7f818aa0f110>, timeout = None
def result(self, timeout=None):
"""Get the result of the operation, blocking if necessary.
Args:
timeout (int):
How long (in seconds) to wait for the operation to complete.
If None, wait indefinitely.
Returns:
google.protobuf.Message: The Operation's result.
Raises:
google.api_core.GoogleAPICallError: If the operation errors or if
the timeout is reached before the operation completes.
"""
self._blocking_poll(timeout=timeout)
if self._exception is not None:
# pylint: disable=raising-bad-type
# Pylint doesn't recognize that this is valid in this case.
> raise self._exception
E google.api_core.exceptions.BadRequest: 400 Syntax error: Expected keyword JOIN but got identifier "SEMI" at [7:8]
E
E (job ID: fa67b0d0-eae0-4c80-9fff-3a9ece611d55)
E
E -----Query Job SQL Follows-----
E
E | . | . | . | . | . |
E 1:SELECT t0.*
E 2:FROM (
E 3: SELECT *
E 4: FROM `swast-scratch.testing.functional_alltypes`
E 5: ORDER BY `string_col`
E 6:) t0
E 7: LEFT SEMI JOIN (
E 8: SELECT *
E 9: FROM (
E 10: SELECT `string_col`, count(`string_col`) AS `count`
E 11: FROM `swast-scratch.testing.functional_alltypes`
E 12: GROUP BY 1
E 13: ORDER BY `string_col`
E 14: ) t2
E 15: ORDER BY `count` DESC
E 16: LIMIT 3
E 17: ) t1
E 18: ON t0.`string_col` = t1.`string_col`
E 19:LIMIT 10000
E | . | . | . | . | . |
../../miniconda3/envs/ibis-dev/lib/python3.7/site-packages/google/api_core/future/polling.py:130: BadRequest
======================================================== warnings summary =========================================================
ibis/tests/all/test_aggregation.py::test_topk_filter_op[BigQuery-string_col_filter_top3]
/Users/swast/src/ibis/ibis/bigquery/client.py:545: PendingDeprecationWarning: Client.dataset is deprecated and will be removed in a future version. Use a string like 'my_project.my_dataset' or a cloud.google.bigquery.DatasetReference object, instead.
table_ref = self.client.dataset(dataset, project=project).table(name)
ibis/tests/all/test_aggregation.py::test_topk_filter_op[BigQuery-string_col_filter_top3]
/Users/swast/src/ibis/ibis/bigquery/client.py:432: PendingDeprecationWarning: Client.dataset is deprecated and will be removed in a future version. Use a string like 'my_project.my_dataset' or a cloud.google.bigquery.DatasetReference object, instead.
dataset_ref = self.client.dataset(dataset, project=project)
-- Docs: https://docs.pytest.org/en/latest/warnings.html
===================================================== short test summary info =====================================================
FAILED ibis/tests/all/test_aggregation.py::test_topk_filter_op[BigQuery-string_col_filter_top3] - google.api_core.exceptions.Bad...
================================================== 1 failed, 2 warnings in 4.02s ==================================================
Found in ibis-project/ibis#2353