velox
velox copied to clipboard
Add parquet bloom filter read support for int,bigint,string columns
This PR adds support to read a column's bloom filter if it exists and tests it to discard rowgroups that don't satisfy the filter. For e.g., if the query has a filter c1 = 100 and a bloom filter for column c1 exists, the test is done on bloom filter whether value 100 exists in the bloom filter. If it returns true, then Parquet reader proceeds to read the column chunks for the rowgroup, else it discards that rowgroup.
This feature is supported using a config property hive.parquet.use-bloom-filter and session property hive_parquet_read_bloom_filter.
Deploy Preview for meta-velox canceled.
| Name | Link |
|---|---|
| Latest commit | f47318d497cbef525e5d8b8ed8af87c71ca547f9 |
| Latest deploy log | https://app.netlify.com/sites/meta-velox/deploys/67d7f83f9d96170008161b17 |
@Yuhta Thanks for the review. I have addressed your comments. Please review.
For the config/session property, I thought we would need a feature flag to disable/enable when wanted. The session property helps in testing.
@Yuhta I have addressed your comments. Request to please review again.
This pull request has been automatically marked as stale because it has not had recent activity. If you'd still like this PR merged, please comment on the PR, make sure you've addressed reviewer comments, and rebase on the latest main. Thank you for your contributions!
Reopening
Ran some performance tests on Mac with file hive metastore and data in local storage. Results below. Data size is 1M rows in a single parquet file with 27 rowgroups. Very good improvement in datasize/second.
-- QUERY WITH EQUALITY FILTER ON int32 COLUMN
select * from nobloom1m where i32=350480;
id | i64 | uuid_str | i32
--------+----------------------+--------------------------------------+--------
1851 | -5381273289709368977 | 2dc70313-0da3-4572-8e46-dd64a1a14a4b | 350480
199765 | -7648116877464198269 | 2b8abc14-3c3b-4e34-bcb7-5092d78683d9 | 350480
762966 | 2045830541054823720 | 0901964b-c5c6-47f1-a0f8-63108210a9d3 | 350480
(3 rows)
Query 20250306_083844_00001_5mbwz, FINISHED, 1 node
Splits: 3 total, 3 done (100.00%)
w/o BloomFilter [Latency: client-side: 0:02, server-side: 0:01] [1000K rows, 31.9MB] [736K rows/s, 23.5MB/s]
select * from bloom1m where i32=350480;
id | i64 | uuid_str | i32
--------+----------------------+--------------------------------------+--------
762966 | 2045830541054823720 | 0901964b-c5c6-47f1-a0f8-63108210a9d3 | 350480
1851 | -5381273289709368977 | 2dc70313-0da3-4572-8e46-dd64a1a14a4b | 350480
199765 | -7648116877464198269 | 2b8abc14-3c3b-4e34-bcb7-5092d78683d9 | 350480
(3 rows)
Query 20250306_083901_00002_5mbwz, FINISHED, 1 node
Splits: 3 total, 3 done (100.00%)
with BloomFilter [Latency: client-side: 0:01, server-side: 387ms] [1000K rows, 50.2MB] [2.58M rows/s, 130MB/s]
-- QUERY WITH EQUALITY FILTER ON int64 COLUMN
select * from nobloom1m where i64 = -4795602564334896251;
id | i64 | uuid_str | i32
--------+----------------------+--------------------------------------+--------
500000 | -4795602564334896251 | 81fb9e09-1c96-4ed6-ae73-fc321845b40f | 729885
(1 row)
w/o BloomFilter [Latency: client-side: 451ms, server-side: 356ms] [1000K rows, 30.1MB] [2.81M rows/s, 84.5MB/s]
select * from bloom1m where i64 = -4795602564334896251;
id | i64 | uuid_str | i32
--------+----------------------+--------------------------------------+--------
500000 | -4795602564334896251 | 81fb9e09-1c96-4ed6-ae73-fc321845b40f | 729885
(1 row)
with BloomFilter [Latency: client-side: 429ms, server-side: 341ms] [1000K rows, 50.2MB] [2.93M rows/s, 147MB/s]
-- QUERY WITH EQUALITY FILTER ON string COLUMN
select * from nobloom1m where uuid_str = '579144d3-41b9-45b6-b087-5ee3a6f138c5';
id | i64 | uuid_str | i32
--------+---------------------+--------------------------------------+--------
700021 | 6693176875950799448 | 579144d3-41b9-45b6-b087-5ee3a6f138c5 | 266864
(1 row)
w/o BloomFilter [Latency: client-side: 0:01, server-side: 428ms] [1000K rows, 46.4MB] [2.34M rows/s, 108MB/s]
select * from bloom1m where uuid_str = '579144d3-41b9-45b6-b087-5ee3a6f138c5';
id | i64 | uuid_str | i32
--------+---------------------+--------------------------------------+--------
700021 | 6693176875950799448 | 579144d3-41b9-45b6-b087-5ee3a6f138c5 | 266864
(1 row)
with BloomFilter [Latency: client-side: 403ms, server-side: 326ms] [1000K rows, 50.2MB] [3.07M rows/s, 154MB/s]
-- QUERY WITH IN LIST FILTER ON int32 COLUMN
select * from nobloom1m where i32 IN (861238,960184,175196,415315, 874289,817028, 583611, 140830, 928513, 473720);
id | i64 | uuid_str | i32
--------+----------------------+--------------------------------------+--------
624073 | 4821509976325655273 | 89162f92-2ee2-4d5c-b842-b3181d506656 | 140830
...(17 rows)
w/o BloomFilter [Latency: client-side: 0:01, server-side: 0:01] [1000K rows, 35.8MB] [1.51M rows/s, 54.1MB/s]
select * from bloom1m where i32 IN (861238,960184,175196,415315, 874289,817028, 583611, 140830, 928513, 473720);
id | i64 | uuid_str | i32
--------+----------------------+--------------------------------------+--------
538981 | 4016403845055264990 | 1b6f65f4-359d-4e98-80c8-f747c7f9fe76 | 960184
...(17 rows)
with BloomFilter [Latency: client-side: 0:01, server-side: 434ms] [1000K rows, 50.2MB] [2.3M rows/s, 116MB/s]
-- QUERY WITH IN LIST FILTER ON int64 COLUMN
select * from nobloom1m where i64
IN (4844306939353580699, 8165227391461865716, 5700831961382649896, 6029090564549764056,
3019438974541197522, 7782354359169432543, 7042319073133563630, -7548203853298777064,
-2014299987200530584, 252587141269205362);
id | i64 | uuid_str | i32
--------+----------------------+--------------------------------------+--------
200000 | 4844306939353580699 | 4518058f-128e-4e3f-80f9-c9f49c5fd339 | 197612
...(10 rows)
w/o BloomFilter [Latency: client-side: 0:01, server-side: 0:01] [1000K rows, 28.2MB] [1.54M rows/s, 43.4MB/s]
select * from bloom1m where i64
IN (4844306939353580699, 8165227391461865716, 5700831961382649896, 6029090564549764056,
3019438974541197522, 7782354359169432543, 7042319073133563630, -7548203853298777064,
-2014299987200530584, 252587141269205362);
id | i64 | uuid_str | i32
--------+----------------------+--------------------------------------+--------
200000 | 4844306939353580699 | 4518058f-128e-4e3f-80f9-c9f49c5fd339 | 197612
...(10 rows)
with BloomFilter [Latency: client-side: 0:01, server-side: 445ms] [1000K rows, 50.2MB] [2.25M rows/s, 113MB/s]
-- QUERY WITH IN LIST FILTER ON string COLUMN
select * from nobloom1m where uuid_str
IN
('37501ac3-9b18-4d81-bf57-09e525e07f6e',
'18d66a8a-989b-4693-a4b6-014eee13bff8',
'94dcb860-607d-40c2-a5ee-a26bc602c61e',
'db02e668-b810-4dd1-a0d8-b965d980527f',
'7cef903f-8f12-421e-8b2f-70eda2c4b14b',
'12a10da0-0a64-4d89-8279-c619c1dfba54',
'b8ce2fc7-19cf-4987-82ac-a318717aec34',
'a1a8e76b-5bbf-4120-a8b1-3ecdf627a028',
'f98d9537-4d04-43ea-9c48-5fa28a92cd14',
'2cc077d6-91e2-43d6-864d-fb0f5d193f39');
id | i64 | uuid_str | i32
--------+----------------------+--------------------------------------+--------
458753 | -3380685551391314659 | 37501ac3-9b18-4d81-bf57-09e525e07f6e | 553140
...(10 rows)
w/o BloomFilter [Latency: client-side: 0:01, server-side: 0:01] [1000K rows, 45.7MB] [1.33M rows/s, 60.7MB/s]
select * from bloom1m where uuid_str
IN
('37501ac3-9b18-4d81-bf57-09e525e07f6e',
'18d66a8a-989b-4693-a4b6-014eee13bff8',
'94dcb860-607d-40c2-a5ee-a26bc602c61e',
'db02e668-b810-4dd1-a0d8-b965d980527f',
'7cef903f-8f12-421e-8b2f-70eda2c4b14b',
'12a10da0-0a64-4d89-8279-c619c1dfba54',
'b8ce2fc7-19cf-4987-82ac-a318717aec34',
'a1a8e76b-5bbf-4120-a8b1-3ecdf627a028',
'f98d9537-4d04-43ea-9c48-5fa28a92cd14',
'2cc077d6-91e2-43d6-864d-fb0f5d193f39');
id | i64 | uuid_str | i32
--------+----------------------+--------------------------------------+--------
458753 | -3380685551391314659 | 37501ac3-9b18-4d81-bf57-09e525e07f6e | 553140
...(10 rows)
with BloomFilter [Latency: client-side: 0:01, server-side: 0:01] [1000K rows, 50.2MB] [1.61M rows/s, 80.7MB/s]
@Yuhta @yingsu00 Can you please review this?
@rui-mo Can you please review this?
This pull request has been automatically marked as stale because it has not had recent activity. If you'd still like this PR merged, please comment on the PR, make sure you've addressed reviewer comments, and rebase on the latest main. Thank you for your contributions!