sqlx icon indicating copy to clipboard operation
sqlx copied to clipboard

[query macros] faulty null inference involving left joins in Postgres

Open abonander opened this issue 4 years ago • 9 comments

https://discordapp.com/channels/665528275556106240/694697474689859614/717103397848875049

CREATE TABLE IF NOT EXISTS colors (
color_uuid UUID NOT NULL DEFAULT uuid_generate_v4(),
color TEXT
);

insert into colors (color_uuid, color) values ('5ab49fea-1b9e-4ae1-ad7d-9fd9e0f48852','Red');
insert into colors (color_uuid, color) values ('b7bf2e01-6bdb-4620-b437-a5784a70380a','White');
insert into colors (color_uuid, color) values ('f07f0b82-7008-4d25-993b-3e2ba80d1131','Black');

CREATE TABLE IF NOT EXISTS objects (
object_uuid UUID NOT NULL DEFAULT uuid_generate_v4(),
colors UUID[]
);

 insert into objects (colors) values ('{"5ab49fea-1b9e-4ae1-ad7d-9fd9e0f48852","b7bf2e01-6bdb-4620-b437-a5784a70380a"}');
 insert into objects (colors) values ('{"5ab49fea-1b9e-4ae1-ad7d-9fd9e0f48852"}');
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let pool = PgPool::new(&env::var("DATABASE_URL")?).await?;
    let recs = sqlx::query!(
    r#"SELECT o.object_uuid, o.colors,
    c1.color_uuid as color1_uuid, c1.color as color1_color,
    c2.color_uuid as color2_uuid, c2.color as color2_color
    FROM objects o
    LEFT JOIN colors c1 ON c1.color_uuid = o.colors[1]
    LEFT JOIN colors c2 ON c2.color_uuid = o.colors[2]
    "#)
        .fetch_all(&pool)
        .await?;

    for rec in recs {
        println!("{}", rec.object_uuid);
    }

    Ok(())
}

Causes a Decode(UnexpectedNullError) because the second column has a null even though it was inferred to be not-null.

This probably affects any query with a left join where the columns selected from the join have NOT NULL constraints.

abonander avatar Jun 01 '20 21:06 abonander

I don't think we can fix this without analyzing the SQL query ourselves, Postgres just doesn't tell us enough here.

All it tells us is that c2 comes from the colors table and that it's the second column, and when we pull the schema of colors we can see the second column has a not null constraint.

We could try issuing a warning if the query contains a LEFT JOIN but there's no stable interface for that yet.

abonander avatar Jun 01 '20 21:06 abonander

@mehcode suggests using explain verbose, which using JSON format gives us output like this:

postgres=# explain (verbose true, format json) select * from null_join_test left join null_join_test njt on false;
                     QUERY PLAN                     
----------------------------------------------------
 [                                                 +
   {                                               +
     "Plan": {                                     +
       "Node Type": "Nested Loop",                 +
       "Parallel Aware": false,                    +
       "Join Type": "Left",                        +
       "Startup Cost": 0.00,                       +
       "Total Cost": 61.00,                        +
       "Plan Rows": 2550,                          +
       "Plan Width": 8,                            +
       "Output": ["null_join_test.value", "value"],+
       "Inner Unique": false,                      +
       "Join Filter": "false",                     +
       "Plans": [                                  +
         {                                         +
           "Node Type": "Seq Scan",                +
           "Parent Relationship": "Outer",         +
           "Parallel Aware": false,                +
           "Relation Name": "null_join_test",      +
           "Schema": "public",                     +
           "Alias": "null_join_test",              +
           "Startup Cost": 0.00,                   +
           "Total Cost": 35.50,                    +
           "Plan Rows": 2550,                      +
           "Plan Width": 4,                        +
           "Output": ["null_join_test.value"]      +
         },                                        +
         {                                         +
           "Node Type": "Result",                  +
           "Parent Relationship": "Inner",         +
           "Parallel Aware": false,                +
           "Startup Cost": 0.00,                   +
           "Total Cost": 0.00,                     +
           "Plan Rows": 0,                         +
           "Plan Width": 4,                        +
           "Output": ["value"],                    +
           "One-Time Filter": "false"              +
         }                                         +
       ]                                           +
     }                                             +
   }                                               +
 ]
(1 row)

This may be enough for simple cases but I'll need to evaluate this more.

abonander avatar Jun 01 '20 22:06 abonander

For the above query:

[
  {
    "Plan": {
      "Node Type": "Merge Join",
      "Parallel Aware": false,
      "Join Type": "Right",
      "Startup Cost": 672.10,
      "Total Cost": 1136.80,
      "Plan Rows": 30623,
      "Plan Width": 144,
      "Output": ["o.object_uuid", "o.colors", "c1.color_uuid", "c1.color", "c2.color_uuid", "c2.color"],
      "Inner Unique": false,
      "Merge Cond": "(c2.color_uuid = (o.colors[2]))",
      "Plans": [
        {
          "Node Type": "Sort",
          "Parent Relationship": "Outer",
          "Parallel Aware": false,
          "Startup Cost": 74.54,
          "Total Cost": 77.21,
          "Plan Rows": 1070,
          "Plan Width": 48,
          "Output": ["c2.color_uuid", "c2.color"],
          "Sort Key": ["c2.color_uuid"],
          "Plans": [
            {
              "Node Type": "Seq Scan",
              "Parent Relationship": "Outer",
              "Parallel Aware": false,
              "Relation Name": "colors",
              "Schema": "public",
              "Alias": "c2",
              "Startup Cost": 0.00,
              "Total Cost": 20.70,
              "Plan Rows": 1070,
              "Plan Width": 48,
              "Output": ["c2.color_uuid", "c2.color"]
            }
          ]
        },
        {
          "Node Type": "Sort",
          "Parent Relationship": "Inner",
          "Parallel Aware": false,
          "Startup Cost": 597.56,
          "Total Cost": 611.87,
          "Plan Rows": 5724,
          "Plan Width": 96,
          "Output": ["o.object_uuid", "o.colors", "c1.color_uuid", "c1.color", "(o.colors[2])"],
          "Sort Key": ["(o.colors[2])"],
          "Plans": [
            {
              "Node Type": "Merge Join",
              "Parent Relationship": "Outer",
              "Parallel Aware": false,
              "Join Type": "Left",
              "Startup Cost": 149.08,
              "Total Cost": 240.30,
              "Plan Rows": 5724,
              "Plan Width": 96,
              "Output": ["o.object_uuid", "o.colors", "c1.color_uuid", "c1.color", "o.colors[2]"],
              "Inner Unique": false,
              "Merge Cond": "((o.colors[1]) = c1.color_uuid)",
              "Plans": [
                {
                  "Node Type": "Sort",
                  "Parent Relationship": "Outer",
                  "Parallel Aware": false,
                  "Startup Cost": 74.54,
                  "Total Cost": 77.21,
                  "Plan Rows": 1070,
                  "Plan Width": 48,
                  "Output": ["o.object_uuid", "o.colors", "(o.colors[1])"],
                  "Sort Key": ["(o.colors[1])"],
                  "Plans": [
                    {
                      "Node Type": "Seq Scan",
                      "Parent Relationship": "Outer",
                      "Parallel Aware": false,
                      "Relation Name": "objects",
                      "Schema": "public",
                      "Alias": "o",
                      "Startup Cost": 0.00,
                      "Total Cost": 20.70,
                      "Plan Rows": 1070,
                      "Plan Width": 48,
                      "Output": ["o.object_uuid", "o.colors", "o.colors[1]"]
                    }
                  ]
                },
                {
                  "Node Type": "Sort",
                  "Parent Relationship": "Inner",
                  "Parallel Aware": false,
                  "Startup Cost": 74.54,
                  "Total Cost": 77.21,
                  "Plan Rows": 1070,
                  "Plan Width": 48,
                  "Output": ["c1.color_uuid", "c1.color"],
                  "Sort Key": ["c1.color_uuid"],
                  "Plans": [
                    {
                      "Node Type": "Seq Scan",
                      "Parent Relationship": "Outer",
                      "Parallel Aware": false,
                      "Relation Name": "colors",
                      "Schema": "public",
                      "Alias": "c1",
                      "Startup Cost": 0.00,
                      "Total Cost": 20.70,
                      "Plan Rows": 1070,
                      "Plan Width": 48,
                      "Output": ["c1.color_uuid", "c1.color"]
                    }
                  ]
                }
              ]
            }
          ]
        }
      ]
    }
  }
]

abonander avatar Jun 02 '20 03:06 abonander

Potential strategy:

  • Given the outermost plan, use the Outputs key to map (potentially aliased) columns to their original names
  • Perform a depth-first traversal of the plan tree, adding all columns produced by a left or right join to a HashSet
  • Mark columns in the query as nullable if they appear in this set

abonander avatar Jun 02 '20 04:06 abonander

Would it make sense if we were able to use a struct like:

users_details would be None if the LEFT JOIN didn't return anything.

struct UsersDetails {
    field1: String,
    field2: Option<String>
}

struct Users {
    user_id: i32,
    users_details: Option<UsersDetails>
}
CREATE TABLE users (
    user_id integer NOT NULL
);

CREATE TABLE users_details (
    user_id integer NOT NULL,
    field1 text NOT NULL
    field2 text
);

SELECT
users.user_id,
users_details.field1,
users_details.field2
FROM users
LEFT JOIN users_details USING (user_id)

bbigras avatar Sep 06 '20 16:09 bbigras

Am I correct in thinking this should be solved in 0.5? I am using 0.5.1 and still encountering this, should I raise a bug?

cameron-martin avatar Mar 08 '21 23:03 cameron-martin

Am I correct in thinking this should be solved in 0.5? I am using 0.5.1 and still encountering this, should I raise a bug?

@cameron-martin Isn't a bug you have to implement what @abonander say, for example in your query you should add an alias ("Mark columns in the query as nullable if they appear in this set"):

async fn main() -> Result<(), Box<dyn std::error::Error>> { let pool = PgPool::new(&env::var("DATABASE_URL")?).await?; let recs = sqlx::query!( r#"SELECT o.object_uuid, o.colors, c1.color_uuid as "color_1?" c2.color_uuid as "color2?" FROM objects o LEFT JOIN colors c1 ON c1.color_uuid = o.colors[1] LEFT JOIN colors c2 ON c2.color_uuid = o.colors[2] "#) .fetch_all(&pool) .await?;

for rec in recs {
    println!("{}", rec.object_uuid);
}

Ok(())

}

If you use as "color_1?" inference that is null, If you use as "color_1!" inference that is not null,

ajosecueto avatar Mar 15 '21 23:03 ajosecueto