TiBigData
TiBigData copied to clipboard
trino-tidb-conn tikv-netty native memory leak
Netty uses more and more native memory over time, which I believe is a memory leak; it eventually caused the node to run out of memory (OOM), and the kernel killed the trino-server process.
Using pmap and jstack, I traced the abnormal thread to the following stack:
"grpc-nio-worker-ELG-1-14" #149 daemon prio=5 os_prio=0 cpu=20349.76ms elapsed=1507.21s tid=0x00007f9b158223a0 nid=0x9b0e runnable [0x00007f9b207f6000]
java.lang.Thread.State: RUNNABLE
at sun.nio.ch.SocketDispatcher.read0(java.base/Native Method)
at sun.nio.ch.SocketDispatcher.read(java.base/SocketDispatcher.java:47)
at sun.nio.ch.IOUtil.readIntoNativeBuffer(java.base/IOUtil.java:330)
at sun.nio.ch.IOUtil.read(java.base/IOUtil.java:284)
at sun.nio.ch.IOUtil.read(java.base/IOUtil.java:259)
at sun.nio.ch.SocketChannelImpl.read(java.base/SocketChannelImpl.java:417)
at org.tikv.shade.io.netty.buffer.PooledByteBuf.setBytes(PooledByteBuf.java:258)
at org.tikv.shade.io.netty.buffer.AbstractByteBuf.writeBytes(AbstractByteBuf.java:1132)
at org.tikv.shade.io.netty.channel.socket.nio.NioSocketChannel.doReadBytes(NioSocketChannel.java:395)
at org.tikv.shade.io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:151)
at org.tikv.shade.io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:722)
at org.tikv.shade.io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:658)
at org.tikv.shade.io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:584)
at org.tikv.shade.io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:496)
at org.tikv.shade.io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:995)
at org.tikv.shade.io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
at org.tikv.shade.io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
at java.lang.Thread.run(java.base/Thread.java:833)
Moreover, when the memory usage of the Trino node surges or fluctuates, tasks using the connector fail with the following error:
org.tikv.common.exception.TiClientInternalException: Error scanning data from region.
at org.tikv.common.operation.iterator.ScanIterator.cacheLoadFails(ScanIterator.java:114)
at org.tikv.common.operation.iterator.ConcreteScanIterator.hasNext(ConcreteScanIterator.java:110)
at io.tidb.bigdata.tidb.codec.MetaCodec.hashGetFields(MetaCodec.java:125)
at io.tidb.bigdata.tidb.catalog.CatalogTransaction.getTables(CatalogTransaction.java:94)
at io.tidb.bigdata.tidb.catalog.Catalog$CatalogCache.loadTables(Catalog.java:198)
at io.tidb.bigdata.tidb.catalog.Catalog$CatalogCache.getTable(Catalog.java:186)
at io.tidb.bigdata.tidb.catalog.Catalog.getTable(Catalog.java:111)
at io.tidb.bigdata.tidb.catalog.Catalog.getTable(Catalog.java:104)
at io.tidb.bigdata.tidb.ClientSession.getTable(ClientSession.java:175)
at io.tidb.bigdata.tidb.ClientSession.getTableMust(ClientSession.java:183)
at io.tidb.bigdata.tidb.RecordSetInternal.iterator(RecordSetInternal.java:120)
at io.tidb.bigdata.tidb.RecordSetInternal.cursor(RecordSetInternal.java:96)
at io.tidb.bigdata.trino.tidb.TiDBRecordSet.cursor(TiDBRecordSet.java:68)
at io.trino.spi.connector.RecordPageSource.<init>(RecordPageSource.java:37)
at io.trino.split.RecordPageSourceProvider.createPageSource(RecordPageSourceProvider.java:50)
at io.trino.split.PageSourceManager.createPageSource(PageSourceManager.java:61)
at io.trino.operator.ScanFilterAndProjectOperator$SplitToPages.process(ScanFilterAndProjectOperator.java:265)
at io.trino.operator.ScanFilterAndProjectOperator$SplitToPages.process(ScanFilterAndProjectOperator.java:193)
at io.trino.operator.WorkProcessorUtils$3.process(WorkProcessorUtils.java:359)
at io.trino.operator.WorkProcessorUtils$ProcessWorkProcessor.process(WorkProcessorUtils.java:412)
at io.trino.operator.WorkProcessorUtils$3.process(WorkProcessorUtils.java:346)
at io.trino.operator.WorkProcessorUtils$ProcessWorkProcessor.process(WorkProcessorUtils.java:412)
at io.trino.operator.WorkProcessorUtils$3.process(WorkProcessorUtils.java:346)
at io.trino.operator.WorkProcessorUtils$ProcessWorkProcessor.process(WorkProcessorUtils.java:412)
at io.trino.operator.WorkProcessorUtils.getNextState(WorkProcessorUtils.java:261)
at io.trino.operator.WorkProcessorUtils.lambda$processStateMonitor$2(WorkProcessorUtils.java:240)
at io.trino.operator.WorkProcessorUtils$ProcessWorkProcessor.process(WorkProcessorUtils.java:412)
at io.trino.operator.WorkProcessorUtils.getNextState(WorkProcessorUtils.java:261)
at io.trino.operator.WorkProcessorUtils.lambda$finishWhen$3(WorkProcessorUtils.java:255)
at io.trino.operator.WorkProcessorUtils$ProcessWorkProcessor.process(WorkProcessorUtils.java:412)
at io.trino.operator.WorkProcessorSourceOperatorAdapter.getOutput(WorkProcessorSourceOperatorAdapter.java:145)
at io.trino.operator.Driver.processInternal(Driver.java:395)
at io.trino.operator.Driver.lambda$process$8(Driver.java:298)
at io.trino.operator.Driver.tryWithLock(Driver.java:694)
at io.trino.operator.Driver.process(Driver.java:290)
at io.trino.operator.Driver.processForDuration(Driver.java:261)
at io.trino.execution.SqlTaskExecution$DriverSplitRunner.processFor(SqlTaskExecution.java:887)
at io.trino.execution.executor.timesharing.PrioritizedSplitRunner.process(PrioritizedSplitRunner.java:187)
at io.trino.execution.executor.timesharing.TimeSharingTaskExecutor$TaskRunner.run(TimeSharingTaskExecutor.java:565)
at io.trino.$gen.Trino_424____20231103_030247_2.run(Unknown Source)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: org.tikv.common.exception.TiClientInternalException: ScanResponse failed without a cause
at org.tikv.common.region.RegionStoreClient.handleScanResponse(RegionStoreClient.java:379)
at org.tikv.common.region.RegionStoreClient.scan(RegionStoreClient.java:369)
at org.tikv.common.region.RegionStoreClient.scan(RegionStoreClient.java:417)
at org.tikv.common.operation.iterator.ConcreteScanIterator.loadCurrentRegionToCache(ConcreteScanIterator.java:79)
at org.tikv.common.operation.iterator.ScanIterator.cacheLoadFails(ScanIterator.java:81)
... 42 more