kubo
[Bug Report] MFS files rm is hanging and not resolving
Checklist
- [x] This is a bug report, not a question. Ask questions on discuss.ipfs.tech.
- [x] I have searched on the issue tracker for my bug.
- [x] I am running the latest kubo version or have an issue updating.
Installation method
dist.ipfs.tech or ipfs-update
Version
Kubo version: 0.35.0
Repo version: 16
System version: amd64/linux
Golang version: go1.24.3
Config
{
"API": {
"HTTPHeaders": {}
},
"Addresses": {
"API": "/ip4/127.0.0.1/tcp/50019",
"Announce": [],
"AppendAnnounce": [],
"Gateway": "/ip4/127.0.0.1/tcp/6473",
"NoAnnounce": [
"/ip4/10.0.0.0/ipcidr/8",
"/ip4/100.64.0.0/ipcidr/10",
"/ip4/169.254.0.0/ipcidr/16",
"/ip4/172.16.0.0/ipcidr/12",
"/ip4/192.0.0.0/ipcidr/24",
"/ip4/192.0.2.0/ipcidr/24",
"/ip4/192.168.0.0/ipcidr/16",
"/ip4/198.18.0.0/ipcidr/15",
"/ip4/198.51.100.0/ipcidr/24",
"/ip4/203.0.113.0/ipcidr/24",
"/ip4/240.0.0.0/ipcidr/4",
"/ip6/100::/ipcidr/64",
"/ip6/2001:2::/ipcidr/48",
"/ip6/2001:db8::/ipcidr/32",
"/ip6/fc00::/ipcidr/7",
"/ip6/fe80::/ipcidr/10"
],
"Swarm": [
"/ip4/0.0.0.0/tcp/4001",
"/ip6/::/tcp/4001",
"/ip4/0.0.0.0/udp/4001/webrtc-direct",
"/ip4/0.0.0.0/udp/4001/quic-v1",
"/ip4/0.0.0.0/udp/4001/quic-v1/webtransport",
"/ip6/::/udp/4001/webrtc-direct",
"/ip6/::/udp/4001/quic-v1",
"/ip6/::/udp/4001/quic-v1/webtransport"
]
},
"AutoNAT": {},
"AutoTLS": {
"Enabled": true
},
"Bitswap": {},
"Bootstrap": [
"/dnsaddr/bootstrap.libp2p.io/p2p/QmbLHAnMoJPWSCR5Zhtx6BHJX9KiKNN6tpvbUcqanj75Nb",
"/dnsaddr/bootstrap.libp2p.io/p2p/QmcZf59bWwK5XFi76CZX8cbJ4BhTzzA3gU1ZjYZcYW3dwt",
"/dnsaddr/va1.bootstrap.libp2p.io/p2p/12D3KooWKnDdG3iXw9eTFijk3EWSunZcFi54Zka4wmtqtt6rPxc8",
"/ip4/104.131.131.82/tcp/4001/p2p/QmaCpDMGvV2BGHeYERUEnRQAwe3N8SzbUtfsmvsqQLuvuJ",
"/ip4/104.131.131.82/udp/4001/quic-v1/p2p/QmaCpDMGvV2BGHeYERUEnRQAwe3N8SzbUtfsmvsqQLuvuJ",
"/dnsaddr/bootstrap.libp2p.io/p2p/QmNnooDu7bfjPFoTZYxMNLWUQJyrVwtbZg5gBMjTezGAJN",
"/dnsaddr/bootstrap.libp2p.io/p2p/QmQCU2EcMqAqQPR2i9bChDtGNJchTbq5TbXJJ16u19uLTa"
],
"DNS": {
"Resolvers": {}
},
"Datastore": {
"BlockKeyCacheSize": null,
"BloomFilterSize": 0,
"GCPeriod": "1h",
"HashOnRead": false,
"Spec": {
"mounts": [
{
"child": {
"path": "blocks",
"shardFunc": "/repo/flatfs/shard/v1/next-to-last/2",
"sync": false,
"type": "flatfs"
},
"mountpoint": "/blocks",
"prefix": "flatfs.datastore",
"type": "measure"
},
{
"child": {
"compression": "none",
"path": "datastore",
"type": "levelds"
},
"mountpoint": "/",
"prefix": "leveldb.datastore",
"type": "measure"
}
],
"type": "mount"
},
"StorageGCWatermark": 90,
"StorageMax": "10GB"
},
"Discovery": {
"MDNS": {
"Enabled": false
}
},
"Experimental": {
"FilestoreEnabled": false,
"Libp2pStreamMounting": false,
"OptimisticProvide": false,
"OptimisticProvideJobsPoolSize": 0,
"P2pHttpProxy": false,
"StrategicProviding": false,
"UrlstoreEnabled": false
},
"Gateway": {
"DeserializedResponses": null,
"DisableHTMLErrors": null,
"ExposeRoutingAPI": null,
"HTTPHeaders": {},
"NoDNSLink": false,
"NoFetch": false,
"PublicGateways": null,
"RootRedirect": ""
},
"HTTPRetrieval": {},
"Import": {
"BatchMaxNodes": null,
"BatchMaxSize": null,
"CidVersion": null,
"HashFunction": null,
"UnixFSChunker": null,
"UnixFSDirectoryMaxLinks": null,
"UnixFSFileMaxLinks": null,
"UnixFSHAMTDirectoryMaxFanout": null,
"UnixFSHAMTDirectorySizeThreshold": null,
"UnixFSRawLeaves": null
},
"Internal": {},
"Ipns": {
"RecordLifetime": "",
"RepublishPeriod": "",
"ResolveCacheSize": 128
},
"Migration": {
"DownloadSources": [],
"Keep": ""
},
"Mounts": {
"FuseAllowOther": false,
"IPFS": "/ipfs",
"IPNS": "/ipns",
"MFS": ""
},
"Peering": {
"Peers": null
},
"Pinning": {
"RemoteServices": {}
},
"Plugins": {
"Plugins": null
},
"Provider": {
"Strategy": ""
},
"Pubsub": {
"DisableSigning": false,
"Router": ""
},
"Reprovider": {},
"Routing": {
"Methods": {
"find-peers": {
"RouterName": "HttpRouterNotSupported"
},
"find-providers": {
"RouterName": "HttpRoutersParallel"
},
"get-ipns": {
"RouterName": "HttpRouterNotSupported"
},
"provide": {
"RouterName": "HttpRoutersParallel"
},
"put-ipns": {
"RouterName": "HttpRouterNotSupported"
}
},
"Routers": {
"HttpRouter1": {
"Parameters": {
"Endpoint": "http://127.0.0.1:19575"
},
"Type": "http"
},
"HttpRouter2": {
"Parameters": {
"Endpoint": "http://127.0.0.1:19576"
},
"Type": "http"
},
"HttpRouter3": {
"Parameters": {
"Endpoint": "http://127.0.0.1:19577"
},
"Type": "http"
},
"HttpRouter4": {
"Parameters": {
"Endpoint": "http://127.0.0.1:19578"
},
"Type": "http"
},
"HttpRouterNotSupported": {
"Parameters": {
"Endpoint": "http://kubohttprouternotsupported"
},
"Type": "http"
},
"HttpRoutersParallel": {
"Parameters": {
"Routers": [
{
"IgnoreErrors": true,
"RouterName": "HttpRouter1",
"Timeout": "10s"
},
{
"IgnoreErrors": true,
"RouterName": "HttpRouter2",
"Timeout": "10s"
},
{
"IgnoreErrors": true,
"RouterName": "HttpRouter3",
"Timeout": "10s"
},
{
"IgnoreErrors": true,
"RouterName": "HttpRouter4",
"Timeout": "10s"
}
]
},
"Type": "parallel"
}
},
"Type": "custom"
},
"Swarm": {
"AddrFilters": [
"/ip4/10.0.0.0/ipcidr/8",
"/ip4/100.64.0.0/ipcidr/10",
"/ip4/169.254.0.0/ipcidr/16",
"/ip4/172.16.0.0/ipcidr/12",
"/ip4/192.0.0.0/ipcidr/24",
"/ip4/192.0.2.0/ipcidr/24",
"/ip4/192.168.0.0/ipcidr/16",
"/ip4/198.18.0.0/ipcidr/15",
"/ip4/198.51.100.0/ipcidr/24",
"/ip4/203.0.113.0/ipcidr/24",
"/ip4/240.0.0.0/ipcidr/4",
"/ip6/100::/ipcidr/64",
"/ip6/2001:2::/ipcidr/48",
"/ip6/2001:db8::/ipcidr/32",
"/ip6/fc00::/ipcidr/7",
"/ip6/fe80::/ipcidr/10"
],
"ConnMgr": {},
"DisableBandwidthMetrics": false,
"DisableNatPortMap": true,
"RelayClient": {},
"RelayService": {},
"ResourceMgr": {},
"Transports": {
"Multiplexers": {},
"Network": {},
"Security": {}
}
},
"Version": {}
}
Description
Running ipfs files rm --force /censorship-watch.eth/postUpdates/2592000/QmbKjC3gJmaAifM9xEPUBQ7uP4Yd6FNbb8uCbkjvMG96iV/update seems to hang the process and never resolve. The same thing happens via the JSON RPC API. Passing --force or --recursive doesn't seem to change anything.
Meanwhile, ipfs files rm --force /censorship-watch.eth resolves instantly.
I think what may be causing the timeout is not using --flush=true with any of the files operations.
What we're doing right now is ipfs files rm --flush=false and ipfs files cp --flush=false, and then, once all operations conclude, we do ipfs files flush /.
Since there are multiple processes accessing the same kubo node and doing these operations in parallel, that may have caused the unbounded growth of the directories' caches, especially when we're updating a huge number of MFS paths (1000+).
I think one solution is running a batch of 10 MFS operations and then flushing; that way the cache shouldn't grow indefinitely. Another option is to do every MFS operation with flush=true; that may fix it, but I think it will slow down the propagation of MFS updates. I will test and update this issue.
Doing an automatic flush when the cache reaches some size limit seems helpful, but it will actually cause problems for people not expecting this behavior. Really, there needs to be more frequent flushing done explicitly.
Triage notes:
- Add warning in the helptext to inform user that they must flush after files operations
I've opened https://github.com/ipfs/kubo/pull/10848 to improve --help with a clear warning that users should do a manual ipfs files flush if they decide to use --flush=false.
Interestingly, after a closer look, it turns out ipfs files rm in Kubo 0.35 ignores --flush=false and always flushes the parent of removed items.
This explains why the command is "hanging" for you – likely, it triggered an expensive flush. We will either fix that, or error – TBD, see
@Rinse12 independently from #10848, what you want to do is update your logic to keep cache sizes in check (see the sketch after this list):
- periodically do a manual ipfs files flush (every X seconds or Y operations)
- when you are done with removing items for a specific directory /path/to/dir/*, flush it with ipfs files flush /path/to/dir (or just do the rm operation with --flush=true)
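A minimal shell sketch of the per-directory variant, assuming a plain bash environment; the directory name and entry list are placeholders, not taken from the issue:
```bash
#!/usr/bin/env bash
set -euo pipefail

DIR="/path/to/dir"          # placeholder MFS directory
ENTRIES=("a" "b" "c")       # placeholder entries to remove

# Remove a batch of entries with --flush=false...
for name in "${ENTRIES[@]}"; do
  ipfs files rm --flush=false "$DIR/$name"
done

# ...then a single flush for the whole directory keeps the in-memory cache bounded.
ipfs files flush "$DIR"
```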
Even files rm --flush=true seems to cause hanging after a while; it doesn't seem to change anything in my case.
Also interestingly, ipfs files write has no flush option in kubo-rpc-client, which makes me assume it flushes on every write? It seems to hang after a while as well, just like ipfs files rm.
I also tried a different strategy of splitting the paths to be removed into chunks of 3, and after each chunk we call ipfs files flush /, but it still hangs after a while.
How many MFS operations do you perform before flushing?
I call ipfs.files.rm() with an array of three paths, then I flush. Repeat this until all paths are removed.
That doesn't sound like it would cause enough cache growth to freeze things for very long while syncing?
It shouldn't. I also tried flush=true with every rm operation and got the same result. I think MFS is just bugged out.
Could running ipfs files rm, ipfs files flush and ipfs repo gc in sequence cause this?
Can you provide a minimal set of operations, starting from an empty repository, that reproduces this so that I can quickly try it?
MFS is lazy-loaded. Are you sure your full MFS tree is local or retrievable when you are doing operations, and that it is not hanging while trying to retrieve an existing node from somewhere?
Can you provide a minimal set of operations, starting from an empty repository, that reproduces this so that I can quickly try it?
Will do soon.
Are you sure your full MFS tree is local or retrievable
Does that matter if we're doing ipfs files rm? I assume if it's not local it will throw immediately, no?
Does that matter if we're doing ipfs files rm? I assume if it's not local it will throw immediately, no?
Throw what?
I can't say whether it matters. One test: run with ipfs --offline daemon and do your operations. If it errors because it tries to fetch something that is not there, you can check whether it's a CID that was known to the DAG or a new CID (resulting from the rm operation).
Also, ipfs refs -r allows you to fetch everything or verify that things are local.
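A sketch of how one might run those checks, reusing the MFS paths from the report; the shutdown/restart sequence and the sleep are illustrative assumptions, not prescribed steps:
```bash
# Resolve the MFS path to a CID, then walk all refs recursively;
# this either fetches everything or confirms it is reachable.
CID=$(ipfs files stat --hash /censorship-watch.eth)
ipfs refs -r "$CID" > /dev/null && echo "all blocks reachable"

# Offline test: restart the daemon offline, then retry the hanging command.
# If a block is missing locally, this should error with a CID instead of hanging.
ipfs shutdown
ipfs daemon --offline &
sleep 5
ipfs files rm --force /censorship-watch.eth/postUpdates/2592000/QmbKjC3gJmaAifM9xEPUBQ7uP4Yd6FNbb8uCbkjvMG96iV/update
```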
I meant throw an error if the resolved path is not locally in the node.
I was able to reproduce the timeout within our repo plebbit-js, but I couldn't reproduce it in a basic script. I do have the logs of the kubo node where it timed out; if you search for /files/rm starting from the bottom, you will see the call hanging because it's looking for providers, which is odd to me because the file should exist locally, no?
https://limewire.com/d/6aLwu#Lzb3fEjfFP
Should we call ipfs refs -r <path> before any files rm to make sure the path is there before removing? What would be the best fix for this?
Should we call ipfs refs -r <path> before any files rm to make sure the path is there before removing?
You should not normally have to do that, and if that were necessary, it would be something we would need to fix. The intent here is to see if doing that first changes the behavior you are seeing, which would give us information about what is happening.
Did you get a chance to look at the logs? Kubo seems to look for providers right after the files/rm call; is it supposed to do that?
Looking into this to see if it is a provider-related issue.
Should we call ipfs refs -r <path> before any files rm to make sure the path is there before removing? What would be the best fix for this?
Well, does that fix it for you or also hangs? Try it out.
MFS is lazy-loaded, and depending on the operations on the folder, it may be that flushing needs to fetch CIDs that were not fetched before. If those are not retrievable, it will hang. I gave you a way to easily verify this: ipfs --offline daemon, and you will get an error instead of a hang, hopefully with the CID. If you have a way to reproduce your hang, restart the daemon in offline mode just before your hanging command and try. If you get a CID, you can possibly try to figure out whether it's known to you (somewhere in /censorship-watch.eth) or not.
I read the code around dir.Unlink and I didn't find a problem. It's very old code. My gut feeling is that you have a retrieval problem, not an MFS one, but if I'm wrong, I'm still unsure how to reproduce this (I tried without luck).
Well, does that fix it for you or also hangs? Try it out.
Not yet, I haven't tried daemon --offline either. It takes a lot of time to reproduce this MFS timeout bug on my end, and I'm a bit occupied with other things atm. I will try those out later.
In the meantime, I have tried a new fix which seems to have been working for the last couple of days. Our codebase originally did files/rm and then files/cp, to make sure the path is not occupied before copying.
I changed it so that we do files/cp first, and if the path already exists we do files/rm, then files/cp. I haven't seen the timeout error for a few days now.
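A hedged sketch of that ordering, using the CLI for illustration (the actual code uses kubo-rpc-client; the CID and destination path below are placeholders):
```bash
SRC_CID="QmExampleCid"        # placeholder: CID to copy into MFS
DST="/some/mfs/destination"   # placeholder MFS path

# Try the copy first; only if the destination already exists,
# remove it and copy again.
if ! ipfs files cp "/ipfs/$SRC_CID" "$DST" 2>/dev/null; then
  ipfs files rm -r --force "$DST"
  ipfs files cp "/ipfs/$SRC_CID" "$DST"
fi
```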
Triage notes:
- we can't figure this out without a bash script that reproduces the issue
- @Rinse12 if you have time, a PoC repro would be very helpful
- we also need to get some user-feedback on https://github.com/ipfs/kubo/pull/10848
- @Rinse12 do you think you are able to test that PR and see if it makes a difference?
Oops, seems like we needed more information for this issue, please comment with more details or this issue will be closed in 7 days.
This issue was closed because it is missing author input.
I was able to reproduce it with this self-contained script. If it doesn't reproduce any timeout errors, try running it a couple more times. It automatically takes care of logging kubo as well:
https://github.com/Rinse12/mfs-timeout-repro
Triage note: we will try to reproduce with kubo v0.37.0
Thank you for taking the time to create that repro @Rinse12, and for being patient with us. The repro is very useful.
@Rinse12 until this is fixed, your immediate workaround is what was suggested already: periodically do a manual ipfs files flush to ensure the cache does not grow too big:
# Batch operations with periodic flushes
i=0
for batch in operations; do
  ipfs files write/rm/etc... --flush=false
  if [ $((i % 100)) -eq 0 ]; then
    ipfs files flush /path # flush specific path or entire mfs -- whatever makes sense
  fi
  i=$((i + 1))
done
ipfs files flush /
Context for others reading this
The provided reproduction script effectively triggers the bug by doing the following (a scaled-down sketch follows this list):
- Creating 500 directories with 5000 files each (2.5M entries)
- Running operations in parallel batches of 100
- Stressing with concurrent DHT/IPNS operations
- Attempting removal of deeply nested paths
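For readers who don't want to open the repro repository, here is a rough, scaled-down approximation of that stress pattern (numbers reduced and paths invented here; the real script lives in the linked repo):
```bash
# Build a large unflushed MFS tree, then remove a nested path.
for d in $(seq 1 50); do
  for f in $(seq 1 100); do
    echo "data-$d-$f" | ipfs files write --create --parents --flush=false \
      "/stress/dir-$d/file-$f"
  done
done

# Removing a path while the cache holds many unflushed changes
# is where the hang was observed.
ipfs files rm -r --force /stress/dir-25
```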
What happens?
The bug appears to be in the HAMT directory removal path in boxo:
- HAMT Traversal Issue: When removing a deeply nested path, the code needs to traverse through HAMT shards. The RemoveChild operation in HAMTDirectory (boxo/ipld/unixfs/io/directory.go:667) calls shard.Take(), which then traverses the HAMT tree recursively through swapValue() and getValue().
- Cache Loading Problem: During removal, the HAMT implementation loads child nodes from disk via loadChild() (boxo/ipld/unixfs/hamt/hamt.go:886). With many unflushed operations and large directory structures, this creates a cascading effect where:
  - each level needs to load its children
  - the DAG service fetches nodes that may not be persisted
  - without flushing, the in-memory cache grows unbounded
- Lock Contention: The MFS layer uses locks (boxo/mfs/dir.go) that can cause deadlocks when:
  - multiple goroutines try to access the same directory hierarchy
  - parent directories need to update while children are being removed
Why does it hang?
The timeout occurs because:
- The HAMT traversal for deeply nested paths becomes exponentially expensive
- Without periodic flushing, the DAG nodes accumulate in memory
- The recursive nature of getValue() and swapValue() creates deep call stacks
- Maybe a deadlock somewhere? (needs more analysis)
How to fix it?
Unsure. My initial idea is to implement a blunt maximum cache size / automatic flushing threshold to remove this footgun of unbounded growth. We could also optimize the HAMT removal code to do a shallow rm (without full traversal), but I'm not sure that is worth it if we have autoflush.
Is there a better way? @ipfs/kubo-maintainers
I feel we should fix this before shipping 0.38, as it seems related to the provide system refactor: if we add extra work and provide on MFS flush (https://github.com/ipfs/boxo/pull/995), it makes this problem even worse.
Triage notes:
- see if flushing fixes the repro
- decide whether we enforce a flush every N operations (and error?), vs. autoflush (could be too expensive to implement), vs. providing users with something more low level (https://github.com/ipfs/kubo/issues/4782)
Context: https://github.com/ipfs/boxo/pull/751
Also: https://github.com/ipfs/kubo/issues/8694#issuecomment-2551052645
Unbounded in-memory cache growth + HAMT folders + some operation triggering flush may look like a deadlock but maybe it is just writing many blocks to disk.
Triage:
- @lidel is working on bounding cache growth. Might be included in the next RC.
Protections added in:
- https://github.com/ipfs/kubo/pull/10971
  - error when flush=false is not supported
  - autoflush when threshold is hit (user can override to raise / disable)
The revised approach is to error once the limit is reached, rather than silently fixing it up:
- https://github.com/ipfs/kubo/pull/10985
The intention is to encourage users to do self-service and execute intentional flushes at a time that makes sense for their use case / data, rather than us trying to guess what the right threshold is.
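A hedged client-side sketch of that self-service pattern, assuming the new behavior surfaces as a command error rather than a hang once the limit is hit (the path below is a placeholder, and the exact error text is not shown here):
```bash
TARGET="/some/mfs/path"   # placeholder MFS path
# If the unflushed-change limit is hit and the command errors,
# flush explicitly at a point the client controls, then retry once.
if ! ipfs files rm --flush=false "$TARGET"; then
  ipfs files flush /
  ipfs files rm --flush=false "$TARGET"
fi
```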