efcore.pg icon indicating copy to clipboard operation
efcore.pg copied to clipboard

GroupBy with joined table produces sub optimal query

Open domagojmedo opened this issue 3 months ago • 0 comments

When aggregating on values from a joined table EF produces what appears to be not optimal query. Repro code:

using Microsoft.EntityFrameworkCore;

var ctx = new BloggingContext();

var q1 = ctx.Posts
    .Where(x => x.Blog.Rating > 5)
    .GroupBy(x => x.CreatedDate.Date)
    .Select(x =>
        new
        {
            x.Key,
            PostRating = x.Sum(x => x.Rating)
        })
    .ToQueryString();

var q2 = ctx.Posts
    .Where(x => x.Blog.Rating > 5)
    .GroupBy(x => x.CreatedDate.Date)
    .Select(x =>
        new
        {
            x.Key,
            PostRating = x.Sum(x => x.Rating),
            BlogRating = x.Sum(x => x.Blog.Rating)
        })
    .ToQueryString();

Console.WriteLine("Hello, World!");

public class BloggingContext : DbContext
{
    public DbSet<Blog> Blogs { get; set; }
    public DbSet<Post> Posts { get; set; }

    protected override void OnConfiguring(DbContextOptionsBuilder optionsBuilder)
        => optionsBuilder.UseNpgsql("Host=my_host;Database=my_db;Username=my_user;Password=my_pw");
}

public class Blog
{
    public int BlogId { get; set; }
    public string Url { get; set; }
    public int Rating { get; set; }

    public List<Post> Posts { get; set; }
}

public class Post
{
    public int PostId { get; set; }
    public string Title { get; set; }
    public string Content { get; set; }

    public int Rating { get; set; }

    public DateTime CreatedDate { get; set; }

    public int BlogId { get; set; }
    public Blog Blog { get; set; }
}

q1 is

SELECT t."Key", COALESCE(sum(t."Rating"), 0)::int AS "PostRating"
FROM (
    SELECT p."Rating", date_trunc('day', p."CreatedDate", 'UTC') AS "Key"
    FROM "Posts" AS p
    INNER JOIN "Blogs" AS b ON p."BlogId" = b."BlogId"
    WHERE b."Rating" > 5
) AS t
GROUP BY t."Key"

but q2 is

SELECT t."Key", COALESCE(sum(t."Rating"), 0)::int AS "PostRating", (
    SELECT COALESCE(sum(b0."Rating"), 0)::int
    FROM (
        SELECT p0."PostId", p0."BlogId", p0."Content", p0."CreatedDate", p0."Rating", p0."Title", b1."BlogId" AS "BlogId0", b1."Rating" AS "Rating0", b1."Url", date_trunc('day', p0."CreatedDate", 'UTC') AS "Key"
        FROM "Posts" AS p0
        INNER JOIN "Blogs" AS b1 ON p0."BlogId" = b1."BlogId"
        WHERE b1."Rating" > 5
    ) AS t0
    INNER JOIN "Blogs" AS b0 ON t0."BlogId" = b0."BlogId"
    WHERE t."Key" = t0."Key" OR (t."Key" IS NULL AND t0."Key" IS NULL)) AS "BlogRating"
FROM (
    SELECT p."Rating", date_trunc('day', p."CreatedDate", 'UTC') AS "Key"
    FROM "Posts" AS p
    INNER JOIN "Blogs" AS b ON p."BlogId" = b."BlogId"
    WHERE b."Rating" > 5
) AS t
GROUP BY t."Key"

Isn't filtering twice not optimal? Shouldn't q2 be something like

SELECT t."Key", COALESCE(sum(t."Rating"), 0)::int AS "PostRating", COALESCE(sum(t."BlogRating"), 0)::int AS "BlogRating"
FROM (
    SELECT p."Rating", b."Rating" AS "BlogRating", date_trunc('day', p."CreatedDate", 'UTC') AS "Key"
    FROM "Posts" AS p
    INNER JOIN "Blogs" AS b ON p."BlogId" = b."BlogId"
    WHERE b."Rating" > 5
) AS t
GROUP BY t."Key"

We ran into this issue with a bigger table (30m rows), ended up doing 2 queries where we group values from each table separately and merge it in code.

domagojmedo avatar Apr 26 '24 12:04 domagojmedo