[Not confirmed] SELECT time_bucket('0s',...) ... GROUP BY ... query segfaults on Debian
Relevant system information:
- OS: Debian 10
- PostgreSQL version (output of postgres --version): postgres (PostgreSQL) 13.4 (Debian 13.4-1.pgdg100+1)
- TimescaleDB version (output of \dx in psql): 2.5.0
- Installation method: apt
Describe the bug During a query, postgres segfaulted. Core dump indicates it was during a libpq call from async_response_result_close
To Reproduce Steps to reproduce the behavior:
create table test (time timestamp, v int);
select create_distributed_hypertable('test','time');
insert into test values ('2021-10-29T18:00:00+00', 0);
select time_bucket('0s',"time") from test GROUP BY "time";
Expected behavior No segfault
Actual behavior Segfault
Screenshots If applicable, add screenshots to help explain your problem.
Additional context
While this value for time_bucket interval might not make sense, it was generated by Grafana, so not within my control.
(gdb) bt full
#0 0x0000000000000000 in ?? ()
No symbol table info available.
#1 0x00007f1631375c3d in PQclear (res=<optimized out>) at ./build/../src/interfaces/libpq/fe-exec.c:710
evt = {result = 0x55a80211d1f0}
block = <optimized out>
i = 0
block = <optimized out>
i = <optimized out>
evt = <optimized out>
#2 PQclear (res=0x55a80211d1f0) at ./build/../src/interfaces/libpq/fe-exec.c:694
block = <optimized out>
i = <optimized out>
evt = <optimized out>
#3 0x00007f1631413345 in async_response_result_close (res=0x55a80226ce28) at ./tsl/src/remote/async.c:338
No locals.
#4 0x00007f1631418d7e in cursor_fetcher_fetch_data_complete (cursor=0x55a802261080) at ./tsl/src/remote/cursor_fetcher.c:352
_save_exception_stack = 0x7ffd90bead50
_save_context_stack = 0x7ffd90beb420
_local_sigjmp_buf = {{__jmpbuf = {94180078915712, -8368000476335463812, 94180044825024, 94180078907888, 94180078915712, 1, -8368000476421447044, -8500086793593052548}, __mask_was_saved = 0, __saved_mask = {__val = {8878361712873700608, 94180078915891, 94180078731696, 94180079158752, 94180078907888, 94180078915712, 1, 140727031868336, 139733292388054, 206158430248, 140727031870496, 140727031868752, 94180078915712, 94180078453208, 94180078963320, 0}}}}
_do_rethrow = <optimized out>
response = 0x55a80226ce28
oldcontext = <optimized out>
numrows = 0
format = 0
__func__ = "cursor_fetcher_fetch_data_complete"
#5 0x00007f1631419705 in data_fetcher_get_tuple (row=<optimized out>, df=0x55a802261080) at ./tsl/src/remote/data_fetcher.c:80
No locals.
#6 data_fetcher_get_next_tuple (df=0x55a802261080) at ./tsl/src/remote/data_fetcher.c:97
tuple = <optimized out>
#7 0x00007f1631401653 in fdw_scan_iterate (ss=ss@entry=0x55a80225f548, fsstate=fsstate@entry=0x55a80225f658) at ./tsl/src/fdw/scan_exec.c:273
slot = 0x55a80225f948
tuple = <optimized out>
fetcher = <optimized out>
#8 0x00007f1631407926 in data_node_scan_next (node=node@entry=0x55a80225f548) at ./tsl/src/fdw/data_node_scan_exec.c:65
sss = 0x55a80225f548
econtext = <optimized out>
oldcontext = 0x55a80225f1f0
slot = <optimized out>
__func__ = "data_node_scan_next"
__errno_location = <optimized out>
#9 0x000055a7ffcbaf89 in ExecScanFetch (recheckMtd=0x7f16314077d0 <data_node_scan_recheck>, accessMtd=0x7f16314078f0 <data_node_scan_next>, node=0x55a80225f548) at ./build/../src/backend/executor/execScan.c:133
estate = 0x55a80225f308
estate = <optimized out>
epqstate = <optimized out>
scanrelid = <optimized out>
slot = <optimized out>
slot = <optimized out>
slot = <optimized out>
slot = <optimized out>
#10 ExecScan (node=0x55a80225f548, accessMtd=0x7f16314078f0 <data_node_scan_next>, recheckMtd=0x7f16314077d0 <data_node_scan_recheck>) at ./build/../src/backend/executor/execScan.c:182
econtext = <optimized out>
qual = 0x0
projInfo = 0x0
#11 0x000055a7ffcb0c83 in ExecProcNode (node=0x55a80225f548) at ./build/../src/include/executor/executor.h:248
No locals.
#12 ExecutePlan (execute_once=<optimized out>, dest=0x55a8020c83e8, direction=<optimized out>, numberTuples=501, sendTuples=<optimized out>, operation=CMD_SELECT, use_parallel_mode=<optimized out>, planstate=0x55a80225f548, estate=0x55a80225f308) at ./build/../src/backend/executor/execMain.c:1632
slot = <optimized out>
current_tuple_count = 0
slot = <optimized out>
current_tuple_count = <optimized out>
#13 standard_ExecutorRun (queryDesc=0x55a802230548, direction=<optimized out>, count=501, execute_once=<optimized out>) at ./build/../src/backend/executor/execMain.c:350
estate = 0x55a80225f308
operation = CMD_SELECT
dest = 0x55a8020c83e8
sendTuples = <optimized out>
oldcontext = 0x55a802230320
__func__ = "standard_ExecutorRun"
#14 0x00007f16b720109d in pgss_ExecutorRun (queryDesc=0x55a802230548, direction=ForwardScanDirection, count=501, execute_once=<optimized out>) at ./build/../contrib/pg_stat_statements/pg_stat_statements.c:1045
_save_exception_stack = 0x7ffd90beae60
_save_context_stack = 0x7ffd90beb420
_local_sigjmp_buf = {{__jmpbuf = {94180077952616, -8368000476526304644, 94180078716232, 501, 501, 1, -8368000476559859076, -8500354236980157828}, __mask_was_saved = 0, __saved_mask = {__val = {8878361712873700608, 94180077588976, 94180077588976, 140727031868944, 0, 140727031868896, 1, 0, 8878361712873700608, 94180077588976, 139733292489664, 140727031869104, 94180078747480, 1, 140727031869216, 0}}}}
_do_rethrow = false
#15 0x00007f16b71f39bd in ?? () from /usr/lib/postgresql/13/lib/pg_stat_kcache.so
No symbol table info available.
#16 0x00007f16b71dfccd in ?? () from /usr/lib/postgresql/13/lib/pg_qualstats.so
No symbol table info available.
#17 0x000055a7ffe2b54c in PortalRunSelect (portal=portal@entry=0x55a802175e68, forward=forward@entry=true, count=count@entry=501, dest=dest@entry=0x55a8020c83e8) at ./build/../src/backend/tcop/pquery.c:919
queryDesc = 0x55a802230548
direction = <optimized out>
nprocessed = <optimized out>
__func__ = "PortalRunSelect"
#18 0x000055a7ffe2cb38 in PortalRun (portal=0x55a802175e68, count=501, isTopLevel=<optimized out>, run_once=<optimized out>, dest=0x55a8020c83e8, altdest=0x55a8020c83e8, qc=0x7ffd90beb380) at ./build/../src/backend/tcop/pquery.c:763
_save_exception_stack = 0x7ffd90beb440
_save_context_stack = 0x7ffd90beb420
_local_sigjmp_buf = {{__jmpbuf = {0, -8368000477203684740, 3, 94180077241320, 94180077952616, 0, -8368000477247724932, -2347996780806036868}, __mask_was_saved = 0, __saved_mask = {__val = {16098747145842565756, 94180042866688, 94180042207816, 140727031869952, 94180037632023, 140727031869840, 140727031946207, 140727031869888, 112, 94180077240000, 80, 94180077952616, 0, 94180078925896, 94180077241320, 94180077952616}}}}
_do_rethrow = <optimized out>
result = <optimized out>
nprocessed = <optimized out>
saveTopTransactionResourceOwner = 0x55a802133988
saveTopTransactionContext = 0x55a8021962b0
saveActivePortal = 0x0
saveResourceOwner = 0x55a802133988
savePortalContext = 0x0
saveMemoryContext = 0x55a8020c7ec0
__func__ = "PortalRun"
#19 0x000055a7ffe29d23 in PostgresMain (argc=<optimized out>, argv=<optimized out>, dbname=<optimized out>, username=<optimized out>) at ./build/../src/backend/tcop/postgres.c:2195
firstchar = <optimized out>
input_message = {data = 0x55a8020c7fd8 "", len = 5, maxlen = 1024, cursor = 5}
local_sigjmp_buf = {{__jmpbuf = {140727031870400, -8368000477346291076, 1, 94180077653272, 3, 94180077617664, -8368000477199490436, -2347996778577026436}, __mask_was_saved = 1, __saved_mask = {__val = {0, 139637976727552, 139735551210936, 140727031870896, 8878361712873700608, 140727031870864, 94180042484015, 206158430248, 140727031870880, 140727031870688, 94180077240000, 1024, 140727031870976, 94180077653608, 94180077617664, 140727031870752}}}}
send_ready_for_query = false
disable_idle_in_transaction_timeout = <optimized out>
__func__ = "PostgresMain"
#20 0x000055a7ffda9bcd in BackendRun (port=0x55a802124200, port=0x55a802124200) at ./build/../src/backend/postmaster/postmaster.c:4526
av = 0x55a80212ce68
maxac = <optimized out>
ac = 1
i = 1
av = <optimized out>
maxac = <optimized out>
ac = <optimized out>
i = <optimized out>
__func__ = "BackendRun"
__errno_location = <optimized out>
__errno_location = <optimized out>
__errno_location = <optimized out>
#21 BackendStartup (port=0x55a802124200) at ./build/../src/backend/postmaster/postmaster.c:4210
bn = <optimized out>
pid = <optimized out>
bn = <optimized out>
pid = <optimized out>
__func__ = "BackendStartup"
__errno_location = <optimized out>
__errno_location = <optimized out>
save_errno = <optimized out>
__errno_location = <optimized out>
__errno_location = <optimized out>
#22 ServerLoop () at ./build/../src/backend/postmaster/postmaster.c:1739
port = 0x55a802124200
i = <optimized out>
rmask = {fds_bits = {64, 0 <repeats 15 times>}}
selres = <optimized out>
now = <optimized out>
readmask = {fds_bits = {192, 0 <repeats 15 times>}}
nSockets = 8
last_lockfile_recheck_time = 1635532472
last_touch_time = 1635531617
__func__ = "ServerLoop"
#23 0x000055a7ffdaab41 in PostmasterMain (argc=17, argv=<optimized out>) at ./build/../src/backend/postmaster/postmaster.c:1412
opt = <optimized out>
status = <optimized out>
userDoption = <optimized out>
listen_addr_saved = true
i = <optimized out>
output_config_variable = <optimized out>
__func__ = "PostmasterMain"
#24 0x000055a7ffaf4f4f in main (argc=17, argv=0x55a8020c1c30) at ./build/../src/backend/main/main.c:210
I get an exception:
ERROR: period must be greater than 0
Is there anything particular with your set-up? How many data-nodes? What timescaledb version is in the data-nodes?
2 data nodes. version 2.5.0
I tried with 2 data nodes, version 2.5.0 and PG 13.4. Both Debug and Release builds on macOS. Get the same error:
postgres=# select time_bucket('0s',"time") from test GROUP BY "time";
ERROR: period must be greater than 0
Also tried PG versions 13.2, 12.8 and 12.6
I set up a Debian 10 VM, with the same packages, and reproducing the steps in the description does not make it crash for some reason. Maybe there is additional state that needs to be there, that doesn't exist in a vanilla installation.
I only get that period must be greater than 0 error when GROUP BY "time" is not present in the query.
I'm not sure what the difference in our setups would be.
I tried attaching GDB to the access node and putting a breakpoint on ts_timestamp_bucket, but the breakpoint doesn't even trigger.
So below is my stack trace when I put a breakpoint on ts_timestamp_bucket and I omit the GROUP BY "time" (so that I get the expected error):
#0 ts_timestamp_bucket (fcinfo=0x55a80220b628) at ./src/time_bucket.c:157
#1 0x000055a7ffcaca8e in ExecInterpExpr (state=0x55a80220b550, econtext=0x55a80220b1e8, isnull=<optimized out>) at ./build/../src/backend/executor/execExprInterp.c:704
#2 0x000055a7ffcbaf4a in ExecEvalExprSwitchContext (isNull=0x7ffd90beaba7, econtext=0x55a80220b1e8, state=0x55a80220b550) at ./build/../src/include/executor/executor.h:322
#3 ExecProject (projInfo=0x55a80220b548) at ./build/../src/include/executor/executor.h:356
#4 ExecScan (node=<optimized out>, accessMtd=0x7f16314078f0 <data_node_scan_next>, recheckMtd=0x7f16314077d0 <data_node_scan_recheck>) at ./build/../src/backend/executor/execScan.c:238
#5 0x000055a7ffcb0c83 in ExecProcNode (node=0x55a80220afd8) at ./build/../src/include/executor/executor.h:248
#6 ExecutePlan (execute_once=<optimized out>, dest=0x55a80220ea50, direction=<optimized out>, numberTuples=0, sendTuples=<optimized out>, operation=CMD_SELECT, use_parallel_mode=<optimized out>, planstate=0x55a80220afd8, estate=0x55a80220ad98) at ./build/../src/backend/executor/execMain.c:1632
#7 standard_ExecutorRun (queryDesc=0x55a80220a988, direction=<optimized out>, count=0, execute_once=<optimized out>) at ./build/../src/backend/executor/execMain.c:350
#8 0x00007f16b720109d in pgss_ExecutorRun (queryDesc=0x55a80220a988, direction=ForwardScanDirection, count=0, execute_once=<optimized out>) at ./build/../contrib/pg_stat_statements/pg_stat_statements.c:1045
#9 0x00007f16b71f39bd in ?? () from /usr/lib/postgresql/13/lib/pg_stat_kcache.so
#10 0x00007f16b71dfccd in ?? () from /usr/lib/postgresql/13/lib/pg_qualstats.so
#11 0x000055a7ffe2b54c in PortalRunSelect (portal=portal@entry=0x55a80216c988, forward=forward@entry=true, count=0, count@entry=9223372036854775807, dest=dest@entry=0x55a80220ea50) at ./build/../src/backend/tcop/pquery.c:919
#12 0x000055a7ffe2cb38 in PortalRun (portal=0x55a80216c988, count=9223372036854775807, isTopLevel=<optimized out>, run_once=<optimized out>, dest=0x55a80220ea50, altdest=0x55a80220ea50, qc=0x7ffd90beb190) at ./build/../src/backend/tcop/pquery.c:763
#13 0x000055a7ffe28595 in exec_simple_query (query_string=<optimized out>) at ./build/../src/backend/tcop/postgres.c:1239
#14 0x000055a7ffe2a143 in PostgresMain (argc=<optimized out>, argv=<optimized out>, dbname=<optimized out>, username=<optimized out>) at ./build/../src/backend/tcop/postgres.c:4339
#15 0x000055a7ffda9bcd in BackendRun (port=0x55a802124200, port=0x55a802124200) at ./build/../src/backend/postmaster/postmaster.c:4526
#16 BackendStartup (port=0x55a802124200) at ./build/../src/backend/postmaster/postmaster.c:4210
#17 ServerLoop () at ./build/../src/backend/postmaster/postmaster.c:1739
#18 0x000055a7ffdaab41 in PostmasterMain (argc=17, argv=<optimized out>) at ./build/../src/backend/postmaster/postmaster.c:1412
#19 0x000055a7ffaf4f4f in main (argc=17, argv=0x55a8020c1c30) at ./build/../src/backend/main/main.c:210
When compared against the stack trace that segfaults, we can see it deviates in the ExecScan frame. In the expected result, it's frame #4 execScan.c:238. In the segfault it's frame #10 execScan.c:182.
The condition that causes the branch is that in the segfault case projInfo is null, whereas in the error case it is defined. Though I'm not sure whether this is expected due to the GROUP BY or not.
@phemmer thanks for providing more details and sorry for the delay. I will investigate this.
So far I was unable to reproduce the issue. I used the master branch and PG 14.1. I modified continuous_aggs_variable_size_buckets.sql test like this:
...
-- Test caggs with monthly buckets on top of distributed hypertable
\c :TEST_DBNAME :ROLE_CLUSTER_SUPERUSER
\set DATA_NODE_1 :TEST_DBNAME _1
\set DATA_NODE_2 :TEST_DBNAME _2
\set DATA_NODE_3 :TEST_DBNAME _3
SELECT (add_data_node (name, host => 'localhost', DATABASE => name)).*
FROM (VALUES (:'DATA_NODE_1'), (:'DATA_NODE_2'), (:'DATA_NODE_3')) v (name);
GRANT USAGE ON FOREIGN SERVER :DATA_NODE_1, :DATA_NODE_2, :DATA_NODE_3 TO PUBLIC;
SET ROLE :ROLE_DEFAULT_PERM_USER;
CREATE TABLE conditions_dist(
day TIMESTAMP NOT NULL, -- type changed as in the issue
temperature INT NOT NULL);
SELECT table_name FROM create_distributed_hypertable('conditions_dist', 'day', chunk_time_interval => INTERVAL '1 day');
INSERT INTO conditions_dist(day, temperature)
SELECT ts, date_part('month', ts)*100 + date_part('day', ts)
FROM generate_series('2010-01-01' :: TIMESTAMP, '2010-03-01' :: TIMESTAMP - interval '1 day', '1 day') as ts;
-- same query as in the issue
select time_bucket('0s', day) FROM conditions_dist GROUP BY day;
Executing cd build && make -j8 && make install && TESTS=continuous_aggs_variable_size_buckets make installcheck resulted in:
$ cat /Users/eax/projects/c/timescaledb/build/tsl/test/regression.diffs
...
+FROM generate_series('2010-01-01' :: TIMESTAMP, '2010-03-01' :: TIMESTAMP - interval '1 day', '1 day') as ts;
+select time_bucket('0s', day) FROM conditions_dist GROUP BY day;
+ERROR: period must be greater than 0
Similarly there is no segfault for timescaledb_experimental.time_bucket_ng
I'm going to repeat the test on PG 13.4 and TS 2.5.0 as in the bug report and share the results. UPD: OK, I see @mfundul and @nikkhils already did it above.
@phemmer just to clarify, is there anything unusual about your setup? For instance, do you use TS with some other extensions, or maybe there are some triggers on the table? What is the CPU ISA - amd64, arm, ...?
Closing. I barely even recall this issue. Plus this seems to be related to multi-node, which is deprecated, and which I stopped trying to use a long time ago.