sqlglot icon indicating copy to clipboard operation
sqlglot copied to clipboard

Optimizer relies on subquery's order to infer correct expression type with a multiple tables query

Open gabrielteotonio opened this issue 1 year ago • 1 comments

The Optimizer cannot infer the correct type for a given expression when analyzing a query with a UNION ALL clause with multiple tables (3 or more). It relies on the order of the SELECT.

SQL

The following code will result in a DataType(this=Type.NULL) for the some_id_feature expression.

SELECT
  __source_table,
  some_id AS some_id_feature,
FROM (
  SELECT
    __source_table,
    some_id
  FROM (
    SELECT
      __source_table,
      NULL AS some_id
    FROM (
      SELECT
        'table_1' AS __source_table
      FROM table_1
    )
    UNION ALL
    SELECT
      __source_table,
      some_id
    FROM (
      SELECT
        table_2.some_id,
        'table_2' AS __source_table
      FROM table_2
    )
    UNION ALL
    SELECT
      __source_table,
      NULL AS some_id
    FROM (
      SELECT
        'table_3' AS __source_table
      FROM table_3
    )
  )
)

Reproducible code

from sqlglot import parse_one
from sqlglot.optimizer import optimize

node = parse_one(provided_sql, dialect='spark')

optimized = optimize(
    node,
    schema={
        'table_1': {
            'column_a': 'LONG', 'column_b': 'VARCHAR'}, 
        'table_2': {
            'some_id': 'VARCHAR'}, 
        'table_3': {
            'column_a': 'LONG', 'column_b': 'VARCHAR'}
    }
)

optimized.args["expressions"]

The code outputs:

[Alias(
  this=Column(
    this=Identifier(this=__source_table, quoted=True),
    table=Identifier(this=_q_3, quoted=True),
    _type=DataType(this=Type.VARCHAR)),
  alias=Identifier(this=__source_table, quoted=True, _type=DataType(this=Type.UNKNOWN)),
  _type=DataType(this=Type.VARCHAR)), Alias(
  this=Column(
    this=Identifier(this=some_id, quoted=True),
    table=Identifier(this=_q_3, quoted=True),
    _type=DataType(this=Type.NULL)),
  alias=Identifier(this=some_id_feature, quoted=True, _type=DataType(this=Type.UNKNOWN)),
  _type=DataType(this=Type.NULL))]

However, if we change the order of the SELECT when using the UNION ALL, the correct VARCHAR will be inferred for some_id_feature as long as the table_2's SELECT remains as the first or last one in the subquery:

SELECT
  __source_table,
  some_id AS some_id_feature,
FROM (
  SELECT
    __source_table,
    some_id
  FROM (
    SELECT
      __source_table,
      some_id
    FROM (
      SELECT
        table_2.some_id,
        'table_2' AS __source_table
      FROM table_2
    )
    UNION ALL
    SELECT
      __source_table,
      NULL AS some_id
    FROM (
      SELECT
        'table_1' AS __source_table
      FROM table_1
    )
    UNION ALL
    SELECT
      __source_table,
      NULL AS some_id
    FROM (
      SELECT
        'table_3' AS __source_table
      FROM table_3
    )
  )
)

It results:

[Alias(
  this=Column(
    this=Identifier(this=__source_table, quoted=True),
    table=Identifier(this=_q_3, quoted=True),
    _type=DataType(this=Type.VARCHAR)),
  alias=Identifier(this=__source_table, quoted=True, _type=DataType(this=Type.UNKNOWN)),
  _type=DataType(this=Type.VARCHAR)), Alias(
  this=Column(
    this=Identifier(this=some_id, quoted=True),
    table=Identifier(this=_q_3, quoted=True),
    _type=DataType(this=Type.VARCHAR)),
  alias=Identifier(this=some_id_feature, quoted=True, _type=DataType(this=Type.UNKNOWN)),
  _type=DataType(this=Type.VARCHAR))]

And,

SELECT
  __source_table,
  some_id AS some_id_feature,
FROM (
  SELECT
    __source_table,
    some_id
  FROM (
    SELECT
      __source_table,
      NULL AS some_id
    FROM (
      SELECT
        'table_1' AS __source_table
      FROM table_1
    )
    UNION ALL
    SELECT
      __source_table,
      NULL AS some_id
    FROM (
      SELECT
        'table_3' AS __source_table
      FROM table_3
    )
    UNION ALL
    SELECT
      __source_table,
      some_id
    FROM (
      SELECT
        table_2.some_id,
        'table_2' AS __source_table
      FROM table_2
    )
  )
)

It results:

[Alias(
  this=Column(
    this=Identifier(this=__source_table, quoted=True),
    table=Identifier(this=_q_3, quoted=True),
    _type=DataType(this=Type.VARCHAR)),
  alias=Identifier(this=__source_table, quoted=True, _type=DataType(this=Type.UNKNOWN)),
  _type=DataType(this=Type.VARCHAR)), Alias(
  this=Column(
    this=Identifier(this=some_id, quoted=True),
    table=Identifier(this=_q_3, quoted=True),
    _type=DataType(this=Type.VARCHAR)),
  alias=Identifier(this=some_id_feature, quoted=True, _type=DataType(this=Type.UNKNOWN)),
  _type=DataType(this=Type.VARCHAR))]

gabrielteotonio avatar Oct 17 '24 18:10 gabrielteotonio

Appreciate the detailed report, we'll take a look shortly.

georgesittas avatar Oct 17 '24 22:10 georgesittas