Memory leak while reading JSON from S3 into memory
Hi, I have a memory issue while using duckdb-node 0.9.2 on Node.js 20.10.0-bullseye-slim.
If I open the DuckDB 0.9.2 CLI for Linux and keep running the following query in sequence (10+ times), the memory footprint stays fine at around 75 MB, and each query takes about 3 seconds.
install httpfs;
install json;
load httpfs;
load json;
SET threads TO 4;
SET s3_region = 'us-east-1';
SET s3_url_style = 'path';
SET s3_use_ssl = false;
SET s3_endpoint = 'minio-group-svc.vortus-files.svc.cluster.local:9000';
SET s3_access_key_id = '5dRFa59i2YPPNWedAMHa';
SET s3_secret_access_key = '123';
SELECT DISTINCT _id, dtProcessamento, input.xLongitude, input.yLatitude, input.dtEventoGerado, input.ignicaoLigada, input.velocidade, input.odometro, input.horimetro, aggregations.rastreavel.identificador, aggregations.localizacao.cidade, aggregations.localizacao.reverseGeocode, aggregations.ponto.cliente.nome, aggregations.ponto.cliente['distancia-human'] AS 'distancia-human' FROM read_json_auto([ 's3://lake.photon.transmissao-cold/STOLEbJdEZLOl6hyAtASu/2023/01/data.json.gz', 's3://lake.photon.transmissao-cold/STOLEbJdEZLOl6hyAtASu/2023/02/data.json.gz', 's3://lake.photon.transmissao-cold/STOLEbJdEZLOl6hyAtASu/2023/03/data.json.gz', 's3://lake.photon.transmissao-cold/STOLEbJdEZLOl6hyAtASu/2023/04/data.json.gz' ], format = 'newline_delimited', compression = 'gzip', union_by_name = true, ignore_errors = true) ORDER BY input.dtEventoGerado asc;
But if I run the same thing inside Node.js, the RSS memory blows up:
const duckdb = require('duckdb');
const express = require('express');

const app = express();

app.post('/leak-sync-memory', (req, res, next) => {
  // A fresh in-memory database is created and closed for every request.
  const leakDb = new duckdb.Database(':memory:');
  try {
    const sequence2 = `
      install httpfs;
      install json;
      load httpfs;
      load json;
      SET threads TO 4;
      SET s3_region = 'us-east-1';
      SET s3_url_style = 'path';
      SET s3_use_ssl = false;
      SET s3_endpoint = 'minio-group-svc.vortus-files.svc.cluster.local:9000';
      SET s3_access_key_id = '5dRFa59i2YPPNWedAMHa';
      SET s3_secret_access_key = '123';
      SELECT DISTINCT _id, dtProcessamento, input.xLongitude, input.yLatitude, input.dtEventoGerado, input.ignicaoLigada, input.velocidade, input.odometro, input.horimetro, aggregations.rastreavel.identificador, aggregations.localizacao.cidade, aggregations.localizacao.reverseGeocode, aggregations.ponto.cliente.nome, aggregations.ponto.cliente['distancia-human'] AS 'distancia-human' FROM read_json_auto([ 's3://lake.photon.transmissao-cold/STOLEbJdEZLOl6hyAtASu/2023/01/data.json.gz', 's3://lake.photon.transmissao-cold/STOLEbJdEZLOl6hyAtASu/2023/02/data.json.gz', 's3://lake.photon.transmissao-cold/STOLEbJdEZLOl6hyAtASu/2023/03/data.json.gz', 's3://lake.photon.transmissao-cold/STOLEbJdEZLOl6hyAtASu/2023/04/data.json.gz' ], format = 'newline_delimited', compression = 'gzip', union_by_name = true, ignore_errors = true) ORDER BY input.dtEventoGerado asc;
    `;
    leakDb.all(sequence2, (err, dbRes) => {
      if (err) {
        console.log(err);
      }
      leakDb.close();
      const memory = process.memoryUsage();
      console.log(`rss: ${memory.rss / (1024 * 1024)} heapTotal: ${memory.heapTotal / (1024 * 1024)} heapUsed: ${memory.heapUsed / (1024 * 1024)}`);
      console.log('Response completed');
      res.send('leak-sync');
    });
  } catch (e) {
    leakDb.close();
    console.log(e);
  }
});
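Memory usage reported after each of the consecutive requests: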
rss: 214.73046875 heapTotal: 66.7578125 heapUsed: 32.91553497314453
Response completed
rss: 379.1953125 heapTotal: 66.5078125 heapUsed: 32.48948669433594
Response completed
rss: 439.70703125 heapTotal: 66.2578125 heapUsed: 32.414947509765625
Response completed
rss: 478.2265625 heapTotal: 66.2578125 heapUsed: 32.281333923339844
Response completed
rss: 531.59765625 heapTotal: 66.0078125 heapUsed: 32.391151428222656
Response completed
rss: 592.43359375 heapTotal: 66.0078125 heapUsed: 32.41764831542969
Response completed
rss: 647.1328125 heapTotal: 66.0078125 heapUsed: 32.56422424316406
Response completed
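For what it's worth, the next variation I plan to test is keeping one shared in-memory database for the whole process and running only the SELECT per request, to see whether the RSS growth is tied to creating and closing a Database on every request. This is only a rough sketch (the /shared-db-memory route name and port 3000 are placeholders, and I haven't confirmed it avoids the leak):

const duckdb = require('duckdb');
const express = require('express');

const app = express();

// One shared in-memory database for the whole process, instead of a new one per request.
const sharedDb = new duckdb.Database(':memory:');

// Extension installs/loads and S3 settings only need to run once, at startup.
const setupSql = `
  install httpfs;
  install json;
  load httpfs;
  load json;
  SET threads TO 4;
  SET s3_region = 'us-east-1';
  SET s3_url_style = 'path';
  SET s3_use_ssl = false;
  SET s3_endpoint = 'minio-group-svc.vortus-files.svc.cluster.local:9000';
  SET s3_access_key_id = '5dRFa59i2YPPNWedAMHa';
  SET s3_secret_access_key = '123';
`;

// The same SELECT as in sequence2 above.
const selectSql = `SELECT DISTINCT _id, dtProcessamento, input.xLongitude, input.yLatitude, input.dtEventoGerado, input.ignicaoLigada, input.velocidade, input.odometro, input.horimetro, aggregations.rastreavel.identificador, aggregations.localizacao.cidade, aggregations.localizacao.reverseGeocode, aggregations.ponto.cliente.nome, aggregations.ponto.cliente['distancia-human'] AS 'distancia-human' FROM read_json_auto([ 's3://lake.photon.transmissao-cold/STOLEbJdEZLOl6hyAtASu/2023/01/data.json.gz', 's3://lake.photon.transmissao-cold/STOLEbJdEZLOl6hyAtASu/2023/02/data.json.gz', 's3://lake.photon.transmissao-cold/STOLEbJdEZLOl6hyAtASu/2023/03/data.json.gz', 's3://lake.photon.transmissao-cold/STOLEbJdEZLOl6hyAtASu/2023/04/data.json.gz' ], format = 'newline_delimited', compression = 'gzip', union_by_name = true, ignore_errors = true) ORDER BY input.dtEventoGerado asc;`;

app.post('/shared-db-memory', (req, res) => {
  // Query the shared database; no Database is created or closed here.
  sharedDb.all(selectSql, (err, rows) => {
    if (err) {
      console.log(err);
    }
    const memory = process.memoryUsage();
    console.log(`rss: ${memory.rss / (1024 * 1024)} heapTotal: ${memory.heapTotal / (1024 * 1024)} heapUsed: ${memory.heapUsed / (1024 * 1024)}`);
    res.send('shared-db');
  });
});

// Only start accepting requests once the extensions and S3 settings are in place.
sharedDb.all(setupSql, (err) => {
  if (err) {
    console.log(err);
    return;
  }
  app.listen(3000);
});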
If you need more data to trace, let me know.
Thanks, Danilo Recchia
Same here ... but not just with S3
@rrcomtech @drecchia did you guys end up solving this? I'm running into a nasty memory leak with Node.js and a larger Parquet dataset.
Ended up solving it. I think it's a leak in a GeoJSON function I'm using from the spatial extension. More details here: https://github.com/duckdb/duckdb_spatial/issues/371
Memory usage stabilizes after climbing past 1 GB, which isn't ideal since my JSON files are just over 50 KB, but it's still manageable.
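In case it helps, one thing that might bound the footprint is DuckDB's memory_limit setting, although it only caps DuckDB's buffer manager and may not change anything for the extension-side allocations discussed above. A minimal sketch, where the '512MB' value is just an example:

const duckdb = require('duckdb');

const db = new duckdb.Database(':memory:');

// Cap DuckDB's buffer manager and thread count; pick values that fit your container's limits.
db.all(`SET memory_limit = '512MB'; SET threads TO 2;`, (err) => {
  if (err) {
    console.log(err);
  }
});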