gpdb
gpdb copied to clipboard
Fix possible dxid=0 for extended queries after session reset
Normally MySessionState->latestCursorCommandId remains synchronized with gp_command_count. However, if the session is reset (e.g. due to Out Of Memory, errcode ERRCODE_GP_MEMPROT_KILL), gp_command_count will be reset to 0, but MySessionState->latestCursorCommandId remains unchanged.
The following extended queries might opportunely occur (++gp_command_count == MySessionState->latestCursorCommandId) So setupDtxTransaction() would be skipped.
It can trigger:
AssertImply(inCursor,
dtxContextInfo->distributedXid != InvalidDistributedTransactionId &&
gp_command_count == MySessionState->latestCursorCommandId);
Fix it by resetting also MySessionState->latestCursorCommandId while session reset.
Here are some reminders before you submit the pull request
- [ ] Add tests for the change
- [ ] Document changes
- [ ] Communicate in the mailing list if needed
- [ ] Pass
make installcheck
- [ ] Review a PR in return to support the community
It can be reproduced by a simple JDBC example:
test.java:
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;
public class test {
public void run() {
Connection conn = null;
Statement st = null;
String url = "jdbc:postgresql://127.0.0.1:5432/postgres";
try {
Class.forName("org.postgresql.Driver");
conn = DriverManager.getConnection(url, "gpadmin", "<password>");
st = conn.createStatement();
System.out.println("Running...");
// JDBC will send some SET commands automatically.
// Here we reset session so gp_command_count is 0 again.
try {
st.executeQuery("select palloc_mem(800::bigint<<30)");
} catch (Exception e) {
e.printStackTrace();
System.out.println("Continue... ");
}
// After this one, gp_command_count=1, MySessionState->latestCursorCommandId=1
st.executeQuery("select * from gp_dist_random('gp_id')");
// Reset session, so gp_command_count=0
try {
st.executeQuery("select palloc_mem(800::bigint<<30)");
} catch (Exception e) {
e.printStackTrace();
System.out.println("Continue... ");
}
// Trigger assertion
st.executeQuery("select * from gp_dist_random('gp_id')");
} catch (Exception e) {
e.printStackTrace();
}
}
public static void main(String[] args) {
test testQuery = new test();
testQuery.run();
}
}
Makefile:
all:
@javac test.java
@java -classpath .:postgresql-9.3-1102-jdbc41.jar test
palloc_mem
is UDF function:
/* SQL --------------------
CREATE FUNCTION public.palloc_mem(bigint) RETURNS void
AS 'MODULE_PATHNAME', 'palloc_mem'
LANGUAGE C STABLE;
------------------------- */
#include "postgres.h"
#include "miscadmin.h"
#include "utils/memutils.h"
PG_FUNCTION_INFO_V1(palloc_mem);
Datum
palloc_mem(PG_FUNCTION_ARGS)
{
int64 size = PG_GETARG_INT64(0);
int64 i, count = size / MaxAllocSize;
HOLD_INTERRUPTS(); // bypass runaway
for (i = 0; i < count; i++)
(void)palloc(MaxAllocSize);
(void)palloc(size % MaxAllocSize);
RESUME_INTERRUPTS();
PG_RETURN_VOID();
}
Run:
$ make
Running...
org.postgresql.util.PSQLException: ERROR: Out of memory
Detail: Resource group memory limit reached
at org.postgresql.core.v3.QueryExecutorImpl.receiveErrorResponse(QueryExecutorImpl.java:2198)
at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:1927)
at org.postgresql.core.v3.QueryExecutorImpl.execute(QueryExecutorImpl.java:255)
at org.postgresql.jdbc2.AbstractJdbc2Statement.execute(AbstractJdbc2Statement.java:561)
at org.postgresql.jdbc2.AbstractJdbc2Statement.executeWithFlags(AbstractJdbc2Statement.java:405)
at org.postgresql.jdbc2.AbstractJdbc2Statement.executeQuery(AbstractJdbc2Statement.java:285)
at test.run(test.java:23)
at test.main(test.java:50)
Continue...
org.postgresql.util.PSQLException: ERROR: Out of memory
Detail: Resource group memory limit reached
at org.postgresql.core.v3.QueryExecutorImpl.receiveErrorResponse(QueryExecutorImpl.java:2198)
at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:1927)
at org.postgresql.core.v3.QueryExecutorImpl.execute(QueryExecutorImpl.java:255)
at org.postgresql.jdbc2.AbstractJdbc2Statement.execute(AbstractJdbc2Statement.java:561)
at org.postgresql.jdbc2.AbstractJdbc2Statement.executeWithFlags(AbstractJdbc2Statement.java:405)
at org.postgresql.jdbc2.AbstractJdbc2Statement.executeQuery(AbstractJdbc2Statement.java:285)
at test.run(test.java:34)
at test.main(test.java:50)
Continue...
org.postgresql.util.PSQLException: FATAL: Unexpected internal error (cdbdtxcontextinfo.c:71)
Detail: AssertImply failed("!(!(inCursor) || (dtxContextInfo->distributedXid != ((DistributedTransactionId) 0) && gp_command_count == MySessionState->latestCursorCommandId))", File: "cdbdtxcontextinfo.c", Line: 71)
at org.postgresql.core.v3.QueryExecutorImpl.receiveErrorResponse(QueryExecutorImpl.java:2198)
at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:1927)
at org.postgresql.core.v3.QueryExecutorImpl.execute(QueryExecutorImpl.java:255)
at org.postgresql.jdbc2.AbstractJdbc2Statement.execute(AbstractJdbc2Statement.java:561)
at org.postgresql.jdbc2.AbstractJdbc2Statement.executeWithFlags(AbstractJdbc2Statement.java:405)
at org.postgresql.jdbc2.AbstractJdbc2Statement.executeQuery(AbstractJdbc2Statement.java:285)
at test.run(test.java:41)
at test.main(test.java:50)
We met this issue many times during JMeter tests on GP6:
$ cat gpdb-2022-08-26_000000.csv | grep AssertImply
2022-08-26 09:40:37.641232 CST,"dbtest","cloud",p558078,th994261120,"10.33.80.20","49397",2022-08-26 09:22:18 CST,0,con21951,cmd1,seg-1,,,,sx1,"FATAL","XX000","Unexpected internal error (cdbdtxcontextinfo.c:71)","AssertImply failed(""!(!(inCursor) || (dtxContextInfo->distributedXid != ((DistributedTransactionId) 0) && gp_command_count == MySessionState->latestCursorCommandId))"", File: ""cdbdtxcontextinfo.c"", Line: 71)",,,,,"/*
2022-08-26 10:39:27.561719 CST,"dbtest","cloud",p33173,th994261120,"10.33.80.20","62646",2022-08-26 10:33:01 CST,0,con1586,cmd1,seg-1,,,,sx1,"FATAL","XX000","Unexpected internal error (cdbdtxcontextinfo.c:71)","AssertImply failed(""!(!(inCursor) || (dtxContextInfo->distributedXid != ((DistributedTransactionId) 0) && gp_command_count == MySessionState->latestCursorCommandId))"", File: ""cdbdtxcontextinfo.c"", Line: 71)",,,,,"
2022-08-26 10:44:36.168985 CST,"dbtest","cloud",p177604,th994261120,"10.33.80.20","62867",2022-08-26 10:39:51 CST,0,con150,cmd1,seg-1,,,,sx1,"FATAL","XX000","Unexpected internal error (cdbdtxcontextinfo.c:71)","AssertImply failed(""!(!(inCursor) || (dtxContextInfo->distributedXid != ((DistributedTransactionId) 0) && gp_command_count == MySessionState->latestCursorCommandId))"", File: ""cdbdtxcontextinfo.c"", Line: 71)",,,,,"/*
2022-08-26 10:50:14.185465 CST,"dbtest","cloud",p268505,th994261120,"10.33.80.20","63011",2022-08-26 10:44:52 CST,0,con190,cmd2,seg-1,,,,sx1,"FATAL","XX000","Unexpected internal error (cdbdtxcontextinfo.c:71)","AssertImply failed(""!(!(inCursor) || (dtxContextInfo->distributedXid != ((DistributedTransactionId) 0) && gp_command_count == MySessionState->latestCursorCommandId))"", File: ""cdbdtxcontextinfo.c"", Line: 71)",,,,,"/*
JMeter tests
Very Interesting. Thank you for bug, RCA and fix. Please can you explain more on what kind of testing you are performing for GPDB using this and related details.
JMeter tests
Very Interesting. Thank you for bug, RCA and fix. Please can you explain more on what kind of testing you are performing for GPDB using this and related details.
To drain CPU&IO, we make TB-data partitioned table without indexes, and we use JMeter "Concurrency Thread Group" to run 16 concuurent tests persistently, each test randomly chooses a dynamically generated SQL (type can be Update/Delete/Insert/Select).
The assertion often happens for sessions: First an UPDATE sql reports "Out of memory - Resource group memory limit reached", then a SELECT sql trigger the assertion.
hi, @dh-cloud can we reproduce this issue without jdbc and use psql?
hi, @dh-cloud can we reproduce this issue without jdbc and use psql?
It requires extended queries.
we have some test for extended queries via /home/ashwin/workspace/gpdb/src/test/regress/twophase_pqexecparams.c
can something on similar lines help to repro and write automated test?