[BUG] mixed join can hang
Describe the bug
We recently had a hang with a customer query. We were able to reproduce it locally with fake data.
Steps/Code to reproduce bug
#include <cudf/ast/expressions.hpp>
#include <cudf/column/column_factories.hpp>
// Include the correct header for mixed_join
#include <cudf/join/mixed_join.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/export.hpp>
#include <cudf/utilities/error.hpp>    // CUDF_CUDA_TRY
#include <cudf_test/base_fixture.hpp>  // cudf::test::BaseFixture and gtest's TEST_F
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/device_uvector.hpp>
#include <iostream>
#include <memory>
#include <vector>
#include <string>
...
struct MyTests : public cudf::test::BaseFixture {};
// Helper function to convert a vector of unique_ptr<column> to a vector of column_view
std::vector<cudf::column_view> get_column_views(
std::vector<std::unique_ptr<cudf::column>> const& columns)
{
std::vector<cudf::column_view> views;
views.reserve(columns.size());
for (auto const& col : columns) {
views.push_back(col->view());
}
return views;
}
// Helper to create a strings column from a host vector of strings
std::unique_ptr<cudf::column> make_strings_column_from_host(std::vector<std::string> const& data)
{
std::vector<cudf::size_type> offsets;
offsets.push_back(0);
std::string chars;
for (auto const& s : data) {
chars.append(s);
offsets.push_back(static_cast<cudf::size_type>(chars.size()));
}
rmm::device_uvector<cudf::size_type> d_offsets(offsets.size(), rmm::cuda_stream_default);
CUDF_CUDA_TRY(cudaMemcpy(d_offsets.data(),
offsets.data(),
offsets.size() * sizeof(cudf::size_type),
cudaMemcpyHostToDevice));
rmm::device_buffer d_chars(chars.data(), chars.size(), rmm::cuda_stream_default);
auto offsets_col =
std::make_unique<cudf::column>(std::move(d_offsets), rmm::device_buffer{}, 0);
return cudf::make_strings_column(
data.size(), std::move(offsets_col), std::move(d_chars), 0, rmm::device_buffer{});
}
TEST_F(MyTests, join_hang)
{
// --- 1. Create the Left Table (Single Row) ---
std::vector<std::unique_ptr<cudf::column>> left_columns;
{
std::vector<std::string> imsi_data{"310260250298289"};
left_columns.push_back(make_strings_column_from_host(imsi_data));
std::vector<int32_t> hour_data{0};
rmm::device_buffer hour_buffer(hour_data.data(), hour_data.size() * sizeof(int32_t), rmm::cuda_stream_default);
// Construct the column directly from the data buffer; the make_numeric_column
// overload that takes a buffer interprets it as the null mask, not the data.
left_columns.push_back(std::make_unique<cudf::column>(
cudf::data_type{cudf::type_id::INT32}, 1, std::move(hour_buffer), rmm::device_buffer{}, 0));
std::vector<int64_t> time_data{1759115400L};
rmm::device_buffer time_buffer(time_data.data(), time_data.size() * sizeof(int64_t), rmm::cuda_stream_default);
left_columns.push_back(std::make_unique<cudf::column>(
cudf::data_type{cudf::type_id::INT64}, 1, std::move(time_buffer), rmm::device_buffer{}, 0));
}
cudf::table_view left_table(get_column_views(left_columns));
std::cout << "Left table created. Rows: " << left_table.num_rows()
<< ", Columns: " << left_table.num_columns() << std::endl;
// --- 2. Create the Right Table (Many Rows) ---
constexpr cudf::size_type num_rows = 25445819;
std::string join_imsi = "310260250298289";
std::vector<std::unique_ptr<cudf::column>> right_columns;
{
std::vector<std::string> imsi_data(num_rows, join_imsi);
right_columns.push_back(make_strings_column_from_host(imsi_data));
std::vector<int32_t> hour_data(num_rows, 0); // Matching hour_part = 0
rmm::device_buffer hour_buffer(hour_data.data(), hour_data.size() * sizeof(int32_t), rmm::cuda_stream_default);
// As above, build the columns directly from the data buffers.
right_columns.push_back(std::make_unique<cudf::column>(
cudf::data_type{cudf::type_id::INT32}, num_rows, std::move(hour_buffer), rmm::device_buffer{}, 0));
std::vector<int64_t> start_time_data(num_rows, 1759113600L); // 2025-09-29 00:00:00 UTC
rmm::device_buffer start_time_buffer(start_time_data.data(), start_time_data.size() * sizeof(int64_t), rmm::cuda_stream_default);
right_columns.push_back(std::make_unique<cudf::column>(
cudf::data_type{cudf::type_id::INT64}, num_rows, std::move(start_time_buffer), rmm::device_buffer{}, 0));
std::vector<int64_t> end_time_data(num_rows, 1759117199L); // 2025-09-29 00:59:59 UTC
rmm::device_buffer end_time_buffer(end_time_data.data(), end_time_data.size() * sizeof(int64_t), rmm::cuda_stream_default);
right_columns.push_back(std::make_unique<cudf::column>(
cudf::data_type{cudf::type_id::INT64}, num_rows, std::move(end_time_buffer), rmm::device_buffer{}, 0));
}
cudf::table_view right_table(get_column_views(right_columns));
std::cout << "Right table created. Rows: " << right_table.num_rows()
<< ", Columns: " << right_table.num_columns() << std::endl;
// --- 3. Execute the Join ---
cudf::ast::tree expr_tree;
auto const& ge_expr = expr_tree.emplace<cudf::ast::operation>(
cudf::ast::ast_operator::GREATER_EQUAL,
expr_tree.emplace<cudf::ast::column_reference>(2, cudf::ast::table_reference::LEFT),
expr_tree.emplace<cudf::ast::column_reference>(2, cudf::ast::table_reference::RIGHT));
auto const& le_expr = expr_tree.emplace<cudf::ast::operation>(
cudf::ast::ast_operator::LESS_EQUAL,
expr_tree.emplace<cudf::ast::column_reference>(2, cudf::ast::table_reference::LEFT),
expr_tree.emplace<cudf::ast::column_reference>(3, cudf::ast::table_reference::RIGHT));
auto const& and_expr =
expr_tree.emplace<cudf::ast::operation>(cudf::ast::ast_operator::LOGICAL_AND, ge_expr, le_expr);
std::cout << "Starting join... this is where it will hang." << std::endl;
// Use mixed_left_join for joins with both equality and non-equality conditions.
// It takes the equality tables, the conditional tables, and the AST predicate;
// here the full tables are passed for both (see the discussion below).
auto result =
cudf::mixed_left_join(left_table, right_table, left_table, right_table, and_expr);
std::cout << "Join finished. Result has " << result.first->size() << " rows." << std::endl;
}
Expected behavior
No hang; the join just completes.
I am going to try to find out whether this is a real hang or whether it is just taking forever.
Environment overview (please complete the following information)
I ran this with cuDF 25.10 on Ubuntu.
@PointKernel would you please check out this repro? Seems like a nice lightweight repro. Thank you for your help.
This might actually be a performance issue rather than a hang. To the customer it looks the same, but it is quite different. If I vary the number of rows in the right-hand table, the runtime does not grow proportionally with the row count.
Is this just a high-multiplicity build table? like https://github.com/rapidsai/cudf/issues/16025?
Probably, but there are additional issues in the given example. First, we are comparing a 3-column table against a 4-column table, which should throw at runtime due to incompatible table shapes. I will open a PR to ensure that the two-table comparator constructor throws in this scenario. Once that is fixed, and if we use only the first two columns for the equality check, the single-row left table will match all 25 million keys in the hash table, resulting in a multiplicity of 25 million. This could explain the extremely long execution time, and in this case, a sort-merge join is likely the more efficient approach.
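Assuming that interpretation (equality keys = the first two columns, imsi and hour), the corrected call in the repro would look something like this, reusing the repro's variables:

// Equality tables restricted to the key columns so both sides have the same
// shape; the full tables are still used for the AST condition.
auto left_keys  = left_table.select({0, 1});   // {imsi, hour}
auto right_keys = right_table.select({0, 1});  // {imsi, hour}
auto result =
  cudf::mixed_left_join(left_keys, right_keys, left_table, right_table, and_expr);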
@PointKernel what is the issue with 3 columns vs 4 columns? Is this the equality part of the join? If so, I think that is just an issue with the test. When we do a real join, I believe we just pull out the join keys.
That said, there is clearly a performance problem here. In my testing with 100,000 rows in the right table, which is not that large, the GPU run is 7x slower than the CPU: about 900 ms on the GPU vs 130 ms on the CPU. It gets worse as the right side grows: at 4 million rows the CPU finishes in 1.5 seconds, while the GPU takes about 106.5 seconds, 71x slower. At this point it is much faster for us to fall back to the CPU for the processing. And as a side note, because there is only one row in the left table, that 71x gap is 1 entire GPU against 1 CPU core.
I also don't see how a sort-merge join is going to fix this. Is the issue that fundamentally we are using a single thread to check all 25 million rows? Or what is it that causes this code to scale so horribly?
Is the issue that fundamentally we are using a single thread to check all 25 million rows?
Exactly
what is it that causes this code to scale so horribly.
Inserting the same key {imsi, hour} 25 million times into a hash table would result in 312,499,987,500,000 hash collisions, arguably one of the most extreme stress tests we have had so far for hash table insertion.
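For n copies of a single key, that is the pairwise collision count:

$$\binom{n}{2} = \frac{n(n-1)}{2} = \frac{25{,}000{,}000 \times 24{,}999{,}999}{2} = 312{,}499{,}987{,}500{,}000$$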
Do we already have a sort_merge_join implementation for mixed_left_join that also takes an AST?
Okay, so effectively when we look at the build table and do a count for each unique key group, if we see that any single key has more than a few hundred rows then we should just give up, because it is not going to work, at least until we have the sort-merge join code.
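In libcudf terms, such a guard might look roughly like this sketch; the function name, the `build_keys` view, and the threshold are placeholders, and a real implementation would want something cheaper than a full groupby before every join:

#include <cudf/aggregation.hpp>
#include <cudf/groupby.hpp>
#include <cudf/reduction.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/table/table_view.hpp>
#include <vector>

// Returns true when any single key group in `build_keys` has more than
// `threshold` rows, i.e. when a hash-based mixed join is likely to degrade
// into the pathological case described above.
bool build_side_is_high_multiplicity(cudf::table_view const& build_keys,
                                     cudf::size_type threshold)
{
  // Count the rows in each unique key group.
  cudf::groupby::groupby gb(build_keys);
  std::vector<cudf::groupby::aggregation_request> requests(1);
  requests[0].values = build_keys.column(0);
  requests[0].aggregations.push_back(
    cudf::make_count_aggregation<cudf::groupby_aggregation>(cudf::null_policy::INCLUDE));
  auto agg_result = gb.aggregate(requests);

  // Take the maximum of the per-group counts.
  auto counts  = agg_result.second[0].results[0]->view();
  auto max_agg = cudf::make_max_aggregation<cudf::reduce_aggregation>();
  auto max_cnt = cudf::reduce(counts, *max_agg, counts.type());

  using count_scalar = cudf::numeric_scalar<cudf::size_type>;
  return static_cast<count_scalar const&>(*max_cnt).value() > threshold;
}

A cheaper variant could sample the build keys instead of grouping all of them; the point is only that the multiplicity is knowable before committing to the hash-based kernel.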
Do we already have a sort_merge_join implementation for mixed_left_join that also takes an AST?
No, we don't.
@GregoryKimball @shrshi Is it worthwhile to add a sort-based mixed join implementation to libcudf?
@GregoryKimball @shrshi Is it worthwhile to add a sort-based mixed join implementation to libcudf?
If not we will have to do it in spark-rapids because the performance is so bad that we have no other choice.
Thank you for the discussion.
sort-based mixed join implementation
Can we just do the sort-based hash join and then post-filter?
if we see that any single key has more than a few hundred rows
then we should be using a sort-based join anyway
Can we just do the sort-based hash join and then post-filter?
This is a left outer join, so not without some extra work. Also I don't want to materialize 25 million rows to throw them all away. I'll see what I can come up with.
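For what it's worth, here is a rough sketch of the join-then-post-filter shape against the repro's tables. This is not an existing cudf API; the function name and structure are made up, header paths follow the split layout used in the repro, and the LEFT-join reconciliation described above is only noted in the final comment:

#include <cudf/ast/expressions.hpp>
#include <cudf/copying.hpp>            // cudf::gather
#include <cudf/join/join.hpp>          // cudf::inner_join (path may vary by version)
#include <cudf/stream_compaction.hpp>  // cudf::apply_boolean_mask
#include <cudf/table/table.hpp>
#include <cudf/transform.hpp>          // cudf::compute_column
#include <memory>
#include <vector>

std::unique_ptr<cudf::table> join_then_filter(cudf::table_view const& left_table,
                                              cudf::table_view const& right_table)
{
  // 1. Plain hash inner join on the equality keys {imsi, hour}.
  auto [l_idx, r_idx] =
    cudf::inner_join(left_table.select({0, 1}), right_table.select({0, 1}));

  // Wrap the row-index vectors as column_views so they can drive a gather.
  cudf::column_view l_map{cudf::data_type{cudf::type_id::INT32},  // size_type
                          static_cast<cudf::size_type>(l_idx->size()),
                          l_idx->data(), nullptr, 0};
  cudf::column_view r_map{cudf::data_type{cudf::type_id::INT32},
                          static_cast<cudf::size_type>(r_idx->size()),
                          r_idx->data(), nullptr, 0};

  // 2. Materialize the candidate pairs. This is the step that explodes when
  //    one key has 25 million duplicates on the build side.
  auto left_rows  = cudf::gather(left_table, l_map);
  auto right_rows = cudf::gather(right_table, r_map);

  // 3. Evaluate the range predicate on the combined columns. Left column 2
  //    stays at index 2; right columns 2 and 3 shift to indices 5 and 6.
  std::vector<cudf::column_view> combined;
  for (auto const& c : left_rows->view()) combined.push_back(c);
  for (auto const& c : right_rows->view()) combined.push_back(c);
  cudf::table_view pairs{combined};

  cudf::ast::tree t;
  auto const& pred = t.emplace<cudf::ast::operation>(
    cudf::ast::ast_operator::LOGICAL_AND,
    t.emplace<cudf::ast::operation>(cudf::ast::ast_operator::GREATER_EQUAL,
                                    t.emplace<cudf::ast::column_reference>(2),
                                    t.emplace<cudf::ast::column_reference>(5)),
    t.emplace<cudf::ast::operation>(cudf::ast::ast_operator::LESS_EQUAL,
                                    t.emplace<cudf::ast::column_reference>(2),
                                    t.emplace<cudf::ast::column_reference>(6)));
  auto mask = cudf::compute_column(pairs, pred);

  // 4. Keep the pairs that pass. For LEFT join semantics one more pass is
  //    needed: every left row with no surviving match must be appended with
  //    nulls on the right side -- the "extra work" mentioned above.
  return cudf::apply_boolean_mask(pairs, mask->view());
}

Step 2 is exactly the materialization concern raised above: with a multiplicity of 25 million, the candidate-pair gather blows up before the filter can prune anything, which is why this shape alone is not a fix.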
Thank you @revans2. Please let me know what options you see. I'm very interested in providing new options in cuDF to process filter joins without using the "mixed" join kernels.
Just FYI I used AI over the weekend to make a stopgap version of the missing join functionality.
https://github.com/NVIDIA/spark-rapids-jni/pull/3858
It is not a great solution, but it does work. As a side note, 5 million rows before took about 158 seconds, whereas this new join takes about 2 seconds for 25 million rows. This is not an apples-to-apples comparison, but the order of magnitude should be enough to say that we are going down the right path. I would appreciate reviews for this, but I do not see it as a final solution; I think that needs to be in cudf. I do plan on using this as a starting point in the meantime.