Ibis support in ui.dataframe should output Python code
Description
In #2188, SQL output was introduced to ui.dataframe. That is a very cool feature. But SQL output is only available in Ibis for backends that support SQL (Ibis presentation). If e.g. Polars is set as the backend, no SQL output is possible.
Suggested solution
To be useful for all backends, and to allow further use of the generated code in the notebook (as for Pandas/Polars), an output tab for Python code would be great.
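A minimal sketch of the limitation, assuming Polars is set as the default Ibis backend (the table and column names here are made up for illustration):

```python
import ibis

# Polars as the default backend; hypothetical in-memory table.
ibis.set_backend("polars")
t = ibis.memtable({"a": [1, 2, 3]}, name="t")
expr = t.filter(t["a"] > 1)

# Compiling to SQL works for SQL backends such as DuckDB, but Polars
# is not a SQL backend, so this is expected to fail:
# ibis.to_sql(expr)

# A Python-code tab could instead show the equivalent expression, e.g.:
# t.filter(t['a'] > 1)
```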
Alternative
No response
Additional context
(Was on holiday last week.)
I started to write some code/tests to output Ibis as Python. I can finish this off.
Just to be clear, you want the output Python code in Ibis syntax, correct? And not as Polars/Pandas? Because if you pass a Polars/Pandas dataframe, you will get the output using the Polars/Pandas syntax.
Hi @mscolnick
in my understanding, the syntax should match the input dataframe. So Ibis Python code for Ibis, and so on.
Hello @mscolnick, I played around a little with your commented-out code and got it working with the following function. I hope it helps to finalize it. I haven't actually contributed a pull request yet.
```python
from typing import List

from typing_extensions import assert_never

# Transform, TransformType, Condition, _as_literal and _list_of_strings are
# assumed to come from marimo's existing dataframe-transform module (the same
# helpers used by the pandas/polars printers).


def python_print_ibis(
    df_name: str, all_columns: List[str], transform: Transform
) -> str:
    def generate_where_clause_ibis(df_name: str, where: Condition) -> str:
        column_id, operator, value = (
            where.column_id,
            where.operator,
            where.value,
        )
        if operator in ("==", "equals"):
            return f"{df_name}[{_as_literal(column_id)}] == {_as_literal(value)}"  # noqa: E501
        elif operator in ("does_not_equal", "!="):
            return f"{df_name}[{_as_literal(column_id)}] != {_as_literal(value)}"  # noqa: E501
        elif operator == "contains":
            return f"{df_name}[{_as_literal(column_id)}].contains({_as_literal(value)})"  # noqa: E501
        elif operator == "regex":
            return f"{df_name}[{_as_literal(column_id)}].re_search({_as_literal(value)})"  # noqa: E501
        elif operator == "starts_with":
            return f"{df_name}[{_as_literal(column_id)}].startswith({_as_literal(value)})"  # noqa: E501
        elif operator == "ends_with":
            return f"{df_name}[{_as_literal(column_id)}].endswith({_as_literal(value)})"  # noqa: E501
        elif operator == "in":
            return f"{df_name}[{_as_literal(column_id)}].isin({_list_of_strings(value)})"  # noqa: E501
        elif operator in (">", ">=", "<", "<="):
            return f"{df_name}[{_as_literal(column_id)}] {operator} {_as_literal(value)}"  # noqa: E501
        elif operator == "is_nan":
            return f"{df_name}[{_as_literal(column_id)}].isnull()"
        elif operator == "is_not_nan":
            return f"{df_name}[{_as_literal(column_id)}].notnull()"
        elif operator == "is_true":
            return f"{df_name}[{_as_literal(column_id)}] == True"
        elif operator == "is_false":
            return f"{df_name}[{_as_literal(column_id)}] == False"
        else:
            raise ValueError(f"Unknown operator: {operator}")

    if transform.type == TransformType.COLUMN_CONVERSION:
        column_id, data_type, errors = (
            transform.column_id,
            transform.data_type,
            transform.errors,
        )
        if errors == "ignore":
            # Fall back to the original value when the cast fails.
            return (
                f"{df_name}.select('*', "
                f"ibis.coalesce("
                f"{df_name}[{_as_literal(column_id)}].cast(ibis.dtype({_as_literal(data_type)})), "  # noqa: E501
                f"{df_name}[{_as_literal(column_id)}]"
                f").name({_as_literal(column_id)}))"
            )
        else:
            return (
                f"{df_name}.select('*', "
                f"{df_name}[{_as_literal(column_id)}]"
                f".cast(ibis.dtype({_as_literal(data_type)}))"
                f".name({_as_literal(column_id)}))"
            )
    elif transform.type == TransformType.RENAME_COLUMN:
        column_id, new_column_id = (
            transform.column_id,
            transform.new_column_id,
        )
        # ibis.Table.rename maps new name -> old name.
        return f"{df_name}.rename({{{_as_literal(new_column_id)}: {_as_literal(column_id)}}})"  # noqa: E501
    elif transform.type == TransformType.SORT_COLUMN:
        column_id, ascending = transform.column_id, transform.ascending
        return f"{df_name}.order_by([{df_name}[{_as_literal(column_id)}].{'asc' if ascending else 'desc'}()])"  # noqa: E501
    elif transform.type == TransformType.FILTER_ROWS:
        operation, where = transform.operation, transform.where
        if not where:
            return df_name
        where_clauses = [
            generate_where_clause_ibis(df_name, condition)
            for condition in where
        ]
        if operation == "keep_rows" and len(where_clauses) == 1:
            return f"{df_name}.filter({where_clauses[0]})"
        expression = " & ".join(f"({clause})" for clause in where_clauses)
        return (
            f"{df_name}.filter({expression})"
            if operation == "keep_rows"
            else f"{df_name}.filter(~({expression}))"
        )
    elif transform.type == TransformType.AGGREGATE:
        column_ids, aggregations = (
            transform.column_ids,
            transform.aggregations,
        )
        agg_dict: dict[str, str] = {}
        for col in column_ids:
            for agg in aggregations:
                agg_dict[f"{col}_{agg}"] = (
                    f"{df_name}[{_as_literal(col)}].{agg}()"
                )
        return f"{df_name}.agg({{{', '.join(f'{_as_literal(k)}: {v}' for k, v in agg_dict.items())}}})"  # noqa: E501
    elif transform.type == TransformType.GROUP_BY:
        column_ids, agg = transform.column_ids, transform.aggregation
        agg_exprs: dict[str, str] = {}
        for col in all_columns:
            # Names in all_columns already contain the aggregation suffix.
            col = col.replace("_" + agg, "")
            if col not in column_ids:
                agg_exprs[f"{col}_{agg}"] = (
                    f"{df_name}[{_as_literal(col)}].{agg}()"
                )
        return (
            f"{df_name}.group_by({_list_of_strings(column_ids)})"
            f".agg({{{', '.join(f'{_as_literal(k)}: {v}' for k, v in agg_exprs.items())}}})"  # noqa: E501
        )
    elif transform.type == TransformType.SELECT_COLUMNS:
        column_ids = transform.column_ids
        return f"{df_name}.select({_list_of_strings(column_ids)})"
    elif transform.type == TransformType.SAMPLE_ROWS:
        n, seed = transform.n, transform.seed
        return f"{df_name}.sample({n} / {df_name}.count().execute(), method='row', seed={seed})"  # noqa: E501
    elif transform.type == TransformType.SHUFFLE_ROWS:
        return f"{df_name}.order_by(ibis.random())"
    elif transform.type == TransformType.EXPLODE_COLUMNS:
        column_ids = transform.column_ids
        return f"{df_name}.unnest({_list_of_strings(column_ids)})"
    elif transform.type == TransformType.EXPAND_DICT:
        column_id = transform.column_id
        return f"{df_name}.unpack({_as_literal(column_id)})"
    assert_never(transform.type)
```
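For illustration, a hypothetical usage (the `filter_transform` object below is assumed to be a FILTER_ROWS transform built from marimo's transform types, not a real API call):

```python
# Hypothetical: given a FILTER_ROWS transform keeping rows where a > 1,
# the printer emits the equivalent Ibis expression as a string.
code = python_print_ibis("df", ["a", "b"], filter_transform)
print(code)
# Expected output, roughly: df.filter(df['a'] > 1)
```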
@szst11 - it would be great if you could make the contribution! We always appreciate it.
If you get busy, let us know and we can pick it back up.