js-libp2p
js-libp2p copied to clipboard
[switch] ensure all connections are closed on switch stop
I'm using why-is-node-running to inspect what's causing node to keep running after ipfs.stop() is called. I think the "latency-monitor" one is a red herring as these timeouts are unref'd:
There are 32 handle(s) keeping the process running
# Timeout
/Users/alan/Desktop/test/node_modules/latency-monitor/dist/LatencyMonitor.js:163 - this._emitIntervalID = setInterval(function () {
/Users/alan/Desktop/test/node_modules/latency-monitor/dist/LatencyMonitor.js:141 - that._startTimers();
/Users/alan/Desktop/test/node_modules/libp2p-connection-manager/src/index.js:53 - this._latencyMonitor = new LatencyMonitor({
/Users/alan/Desktop/test/node_modules/libp2p/src/index.js:157 - this.connectionManager.start()
# TCPWRAP
/Users/alan/Desktop/test/node_modules/libp2p-tcp/src/index.js:30 - const rawSocket = net.connect(cOpts)
/Users/alan/Desktop/test/node_modules/libp2p-switch/src/limit-dialer/queue.js:79 - const conn = transport.dial(addr, (err) => {
/Users/alan/Desktop/test/node_modules/async/timeout.js:86 - fn.apply(null, args);
/Users/alan/Desktop/test/node_modules/async/internal/initialParams.js:11 - fn.call(this, args, callback);
/Users/alan/Desktop/test/node_modules/libp2p-switch/src/limit-dialer/queue.js:86 - }, this.dialTimeout)(callback)
/Users/alan/Desktop/test/node_modules/libp2p-switch/src/limit-dialer/queue.js:41 - this._dialWithTimeout(transport, addr, (err, conn) => {
# TCPWRAP
/Users/alan/Desktop/test/node_modules/libp2p-tcp/src/index.js:30 - const rawSocket = net.connect(cOpts)
/Users/alan/Desktop/test/node_modules/libp2p-switch/src/limit-dialer/queue.js:79 - const conn = transport.dial(addr, (err) => {
/Users/alan/Desktop/test/node_modules/async/timeout.js:86 - fn.apply(null, args);
/Users/alan/Desktop/test/node_modules/async/internal/initialParams.js:11 - fn.call(this, args, callback);
/Users/alan/Desktop/test/node_modules/libp2p-switch/src/limit-dialer/queue.js:86 - }, this.dialTimeout)(callback)
/Users/alan/Desktop/test/node_modules/libp2p-switch/src/limit-dialer/queue.js:41 - this._dialWithTimeout(transport, addr, (err, conn) => {
# TCPWRAP
/Users/alan/Desktop/test/node_modules/libp2p-tcp/src/index.js:30 - const rawSocket = net.connect(cOpts)
/Users/alan/Desktop/test/node_modules/libp2p-switch/src/limit-dialer/queue.js:79 - const conn = transport.dial(addr, (err) => {
/Users/alan/Desktop/test/node_modules/async/timeout.js:86 - fn.apply(null, args);
/Users/alan/Desktop/test/node_modules/async/internal/initialParams.js:11 - fn.call(this, args, callback);
/Users/alan/Desktop/test/node_modules/libp2p-switch/src/limit-dialer/queue.js:86 - }, this.dialTimeout)(callback)
/Users/alan/Desktop/test/node_modules/libp2p-switch/src/limit-dialer/queue.js:41 - this._dialWithTimeout(transport, addr, (err, conn) => {
# TCPWRAP
/Users/alan/Desktop/test/node_modules/libp2p-tcp/src/index.js:30 - const rawSocket = net.connect(cOpts)
/Users/alan/Desktop/test/node_modules/libp2p-switch/src/limit-dialer/queue.js:79 - const conn = transport.dial(addr, (err) => {
/Users/alan/Desktop/test/node_modules/async/timeout.js:86 - fn.apply(null, args);
/Users/alan/Desktop/test/node_modules/async/internal/initialParams.js:11 - fn.call(this, args, callback);
/Users/alan/Desktop/test/node_modules/libp2p-switch/src/limit-dialer/queue.js:86 - }, this.dialTimeout)(callback)
/Users/alan/Desktop/test/node_modules/libp2p-switch/src/limit-dialer/queue.js:41 - this._dialWithTimeout(transport, addr, (err, conn) => {
# TLSWRAP
(unknown stack trace)
# HTTPPARSER
(unknown stack trace)
# TLSWRAP
(unknown stack trace)
# TLSWRAP
(unknown stack trace)
# HTTPPARSER
(unknown stack trace)
# TLSWRAP
(unknown stack trace)
# HTTPPARSER
(unknown stack trace)
# TLSWRAP
(unknown stack trace)
# HTTPPARSER
(unknown stack trace)
# TLSWRAP
(unknown stack trace)
# HTTPPARSER
(unknown stack trace)
# TLSWRAP
(unknown stack trace)
# HTTPPARSER
(unknown stack trace)
# TLSWRAP
(unknown stack trace)
# HTTPPARSER
(unknown stack trace)
# TLSWRAP
(unknown stack trace)
# TLSWRAP
(unknown stack trace)
# HTTPPARSER
(unknown stack trace)
# HTTPPARSER
(unknown stack trace)
# TLSWRAP
(unknown stack trace)
# TLSWRAP
(unknown stack trace)
# TLSWRAP
(unknown stack trace)
# HTTPPARSER
(unknown stack trace)
# HTTPPARSER
(unknown stack trace)
# HTTPPARSER
(unknown stack trace)
# Timeout
/Users/alan/Desktop/test/ls-bug.js:33 - setTimeout(() => log(), 10000)
# Timeout
/Users/alan/Desktop/test/node_modules/latency-monitor/dist/LatencyMonitor.js:268 - this._checkLatencyID = setTimeout(function () {
/Users/alan/Desktop/test/node_modules/latency-monitor/dist/LatencyMonitor.js:257 - that._checkLatency(); // Start again ASAP
/Users/alan/Desktop/test/node_modules/latency-monitor/dist/LatencyMonitor.js:281 - cb();
Example script:
const IPFS = require('ipfs')
const Path = require('path')
const Os = require('os')
const log = require('why-is-node-running')
const ipfs = new IPFS({ repo: Path.join(Os.tmpdir(), `${Date.now()}`) })
ipfs.on('ready', async () => {
const files = Array.from([1, 2, 3], i => ({
path: `test-file-${Date.now()}-${i}`,
content: Buffer.from(`data ${Date.now()} ${i}`)
}))
const res = await ipfs.files.add(files, { wrapWithDirectory: true })
console.log('Added files:')
console.log(res)
const { hash } = res[res.length - 1]
const ipfsPath = `/ipfs/${hash}`
console.log(`Copying from ${ipfsPath}`)
const mfsPath = `/test-dir-${Date.now()}`
console.log(`Copying to ${mfsPath}`)
await ipfs.files.cp(ipfsPath, mfsPath)
console.log(`Directory listing of ${mfsPath}:`)
console.log(await ipfs.files.ls(mfsPath))
await ipfs.stop()
setTimeout(() => log(), 10000)
})
There are timeouts in latency-monitor, but all refs are removed on shutdown. There are private methods on the latency-monitor for turning off the timers, ideally we'd be able to shut those down, but it shouldn't be an issue for this.
The TCPWRAP logs are due to open dial attempts. I think we need to make sure Switch is cancelling dials as best it can when shutting down. I didn't see these locally since I didn't have active dials, but it should get addressed regardless. The new state machine updates may help us with that.
TLSWRAP and HTTPPARSER appear to be ipfs related, as best I can tell right now. I've got the libp2p internals for the example above replicated without ipfs, except for the data exchange, and I am not seeing either of those logs.
@alanshaw it looks like the TLSWRAP and HTTPPARSER logs go away when preloading is disabled on ipfs.
Preloading requests are tracked and canceled when the node is stopped https://github.com/ipfs/js-ipfs/blob/master/src/core/preload.js#L76-L81. Unless something has changed in a new Node.js version I don't think these are problematic as that code hasn't changed now for 3 months.
This is a very recent change. I noticed because all the examples here used to exit when done but now don't :(
Okay, I can replicate that now with the read+write iterator example. It looks like if the node is running long enough and attempts to stop there may be several queued dial calls, even if switch is stopped. I'm looking into making the queue abortable.
After look at this more there are a few things we're going to need to make sure are happening for graceful shutdowns, several of which aren't currently happening.
TODO List
- [ ] 1. Shutdown all listeners to prevent incoming dials. Right now shutting down some listeners causes all existing connections to close ungracefully, which will throw errors. Related to https://github.com/libp2p/js-libp2p-switch/issues/251.
- [ ] 2. Clear dialing queues so no new dials will happen. libp2p/js-libp2p#432
- [ ] 3. Close all open and dialing connection. We can currently close open connections fairly easily, especially muxed connections. We need to add abort logic to the transports so that we can abort new connections.
- [ ] 4. Prevent dials when the switch is starting/stopping/stopped. https://github.com/libp2p/js-libp2p-switch/issues/293 Right now the dialer doesn't care what state the switch is in; it needs to. The latest state machine update should make this easy.
This has marginally improved with 0.41.3. Closing out in progress dials is the major issue left to resolve.
@jacobheun can you post the links to the PRs that are solving the issues you listed in https://github.com/libp2p/js-libp2p-switch/issues/281#issuecomment-432624680?
I added issues that were missing to complete this work and updated the todo list in the comment, https://github.com/libp2p/js-libp2p-switch/issues/281#issuecomment-447384312. There are several issues that need to be completed for in progress/queued dials, so it's still a significant amount of work left.
The marginal improvement was the result of a fixed bug where not all connections were being properly closed, https://github.com/libp2p/js-libp2p-switch/pull/291. They are now being tracked better, and are also closing the underlying duplex stream, which they weren't previously doing.
Closing due to staleness