
Ibis support in ui.dataframe should output python code

Open szst11 opened this issue 1 year ago • 4 comments

Description

In #2188, SQL output was introduced to ui.dataframe. That is a very cool feature. But SQL output is only available in Ibis for backends that support SQL (Ibis presentation). If, for example, Polars is set as the backend, no SQL output is possible.
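For illustration, a minimal sketch of the limitation (assuming Ibis's public API; the Polars backend has no SQL compiler):

import ibis

t = ibis.memtable({"a": [1, 2, 3]})

# With a SQL backend (e.g. the default DuckDB), SQL output works:
ibis.to_sql(t)

# With Polars set as the backend, there is no SQL compiler:
ibis.set_backend("polars")
# ibis.to_sql(t)  # expected to raise, since Polars is not a SQL backend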

Suggested solution

To be useful for all backends, and to allow further use of the generated code in the notebook (as for Pandas/Polars), an output tab for Python code would be great.
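For example, a Python tab could emit Ibis code mirroring what the SQL tab does today; a hypothetical output for a filter plus column selection applied in the UI might look like:

# Hypothetical output of a "Python" tab for an Ibis table named df:
df_next = df.filter(df["price"] > 10).select(["name", "price"])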

Alternative

No response

Additional context

(was on holiday last week)

szst11 avatar Sep 09 '24 13:09 szst11

I started to write some code/tests to output Ibis as Python. I can finish this off.

Just to be clear, you want the output Python code in Ibis syntax, correct, and not as Polars/Pandas? Because if you pass a Polars/Pandas dataframe, you will get the output in Polars/Pandas syntax.

mscolnick avatar Sep 09 '24 13:09 mscolnick

Hi @mscolnick
in my understanding, the syntax should match the input dataframe. So Ibis Python for Ibis, and so on.

szst11 avatar Sep 09 '24 16:09 szst11

Hello @mscolnick, I played around a bit with your commented-out code and got it working with the following function. I hope it helps to finalize it. I haven't contributed a pull request yet.

# Note: this assumes the helpers already used by marimo's Pandas/Polars
# printers (Transform, TransformType, Condition, _as_literal,
# _list_of_strings, assert_never) are in scope.
from typing import List


def python_print_ibis(
    df_name: str, all_columns: List[str], transform: Transform
) -> str:

    def generate_where_clause_ibis(df_name: str, where: Condition) -> str:
        column_id, operator, value = (
            where.column_id,
            where.operator,
            where.value,
        )

        if operator == "==" or operator == "equals":
            return f"{df_name}[{_as_literal(column_id)}] == {_as_literal(value)}"  # noqa: E501
        elif operator == "does_not_equal" or operator == "!=":
            return f"{df_name}[{_as_literal(column_id)}] != {_as_literal(value)}"  # noqa: E501
        elif operator == "contains":
            return f"{df_name}[{_as_literal(column_id)}].contains({_as_literal(value)})"  # noqa: E501
        elif operator == "regex":
            return f"{df_name}[{_as_literal(column_id)}].re_search({_as_literal(value)})"  # noqa: E501
        elif operator == "starts_with":
            return f"{df_name}[{_as_literal(column_id)}].startswith({_as_literal(value)})"  # noqa: E501
        elif operator == "ends_with":
            return f"{df_name}[{_as_literal(column_id)}].endswith({_as_literal(value)})"  # noqa: E501
        elif operator == "in":
            return f"{df_name}[{_as_literal(column_id)}].isin({_list_of_strings(value)})"  # noqa: E501
        elif operator in [">", ">=", "<", "<="]:
            return f"{df_name}[{_as_literal(column_id)}] {operator} {_as_literal(value)}"  # noqa: E501
        elif operator == "is_nan":
            return f"{df_name}[{_as_literal(column_id)}].isnull()"
        elif operator == "is_not_nan":
            return f"{df_name}[{_as_literal(column_id)}].notnull()"
        elif operator == "is_true":
            return f"{df_name}[{_as_literal(column_id)}] == True"
        elif operator == "is_false":
            return f"{df_name}[{_as_literal(column_id)}] == False"
        else:
            raise ValueError(f"Unknown operator: {operator}")

    if transform.type == TransformType.COLUMN_CONVERSION:
        column_id, data_type, errors = (
            transform.column_id,
            transform.data_type,
            transform.errors,
        )
        if errors == "ignore":
            return (
                f"{df_name}.select('*', "
                f"ibis.coalesce("
                f"{df_name}[{_as_literal(column_id)}].cast(ibis.dtype({_as_literal(data_type)})), "  # noqa: E501
                f"{df_name}[{_as_literal(column_id)}]"
                f").name({_as_literal(column_id)}))"
            )
        else:
            return (
                f"{df_name}.select('*', "
                f"{df_name}[{_as_literal(column_id)}]"
                f".cast(ibis.dtype({_as_literal(data_type)}))"
                f".name({_as_literal(column_id)}))"
            )

    elif transform.type == TransformType.RENAME_COLUMN:
        column_id, new_column_id = (
            transform.column_id,
            transform.new_column_id,
        )  # noqa: E501
        return f"{df_name}.rename({{{_as_literal(new_column_id)}: {_as_literal(column_id)}}})"  # noqa: E501

    elif transform.type == TransformType.SORT_COLUMN:
        column_id, ascending = transform.column_id, transform.ascending
        return f"{df_name}.order_by([{df_name}[{_as_literal(column_id)}].{'asc' if ascending else 'desc'}()])"  # noqa: E501

    elif transform.type == TransformType.FILTER_ROWS:
        operation, where = transform.operation, transform.where
        if not where:
            return df_name
        where_clauses = [
            generate_where_clause_ibis(df_name, condition)
            for condition in where
        ]
        if operation == "keep_rows" and len(where_clauses) == 1:
            return f"{df_name}.filter({where_clauses[0]})"
        expression = " & ".join(f"({clause})" for clause in where_clauses)
        return (
            f"{df_name}.filter({expression})"
            if operation == "keep_rows"
            else f"{df_name}.filter(~({expression}))"
        )

    elif transform.type == TransformType.AGGREGATE:
        column_ids, aggregations = (
            transform.column_ids,
            transform.aggregations,
        )  # noqa: E501
        agg_dict: dict[str, str] = {}
        for col in column_ids:
            for agg in aggregations:
                agg_dict[f"{col}_{agg}"] = (
                    f"{df_name}[{_as_literal(col)}].{agg}()"
                )
        return f"{df_name}.agg({{{', '.join(f'{_as_literal(k)}: {v}' for k, v in agg_dict.items())}}})"  # noqa: E501

    elif transform.type == TransformType.GROUP_BY:
        column_ids, agg = transform.column_ids, transform.aggregation
        agg_dict: dict[str, str] = {}
        for col in all_columns:
            col = col.replace(
                "_" + agg, ""
            )  # names in all_columns already contain aggregation
            if col not in column_ids:
                agg_dict[f"{col}_{agg}"] = (
                    f"{df_name}[{_as_literal(col)}].{agg}()"
                )
        return (
            f"{df_name}.group_by({_list_of_strings(column_ids)})"
            + f".agg({{{', '.join(f'{_as_literal(k)}: {v}' for k, v in agg_dict.items())}}})"
        )  # noqa: E501

    elif transform.type == TransformType.SELECT_COLUMNS:
        column_ids = transform.column_ids
        return f"{df_name}.select({_list_of_strings(column_ids)})"

    elif transform.type == TransformType.SAMPLE_ROWS:
        n, seed = transform.n, transform.seed
        return f"{df_name}.sample({n} / {df_name}.count().execute(), method='row', seed={seed})"  # noqa: E501

    elif transform.type == TransformType.SHUFFLE_ROWS:
        return f"{df_name}.order_by(ibis.random())"

    elif transform.type == TransformType.EXPLODE_COLUMNS:
        column_ids = transform.column_ids
        return f"{df_name}.unnest({_list_of_strings(column_ids)})"

    elif transform.type == TransformType.EXPAND_DICT:
        column_id = transform.column_id
        return f"{df_name}.unpack({_as_literal(column_id)})"

    assert_never(transform.type)
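
For reference, here are example outputs read directly off the f-strings above (assuming df_name is "df" and that _as_literal produces single-quoted strings):

# SORT_COLUMN, column_id="price", ascending=True:
#   df.order_by([df['price'].asc()])
#
# FILTER_ROWS, keep_rows with the single condition price > 10:
#   df.filter(df['price'] > 10)
#
# SELECT_COLUMNS, column_ids=["name", "price"]:
#   df.select(['name', 'price'])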

szst11 avatar Sep 25 '24 13:09 szst11

@szst11 - it would be great if you could make the contribution! We always appreciate it.

If you get busy, let us know and we can pick it back up.

mscolnick avatar Sep 25 '24 13:09 mscolnick