nats-streaming-server
nats-streaming-server copied to clipboard
100% CPU usage after restarting one of the cluster nodes
Our cluster consists of three nodes:
- 32xXeon(R) CPU E7-8860 v4 @ 2.20GHz, 128 GB RAM, SSD, VMware VM.
- nats-streaming-server version 0.19.0, nats-server: v2.1.9
Configuration file:
port: 4222
http_port: 8222
write_deadline: "15s"
debug: false
trace: false
logtime: true
logfile_size_limit: 10MB
log_file: "/storage/nats-streaming-server/logs/nats.log"
authorization {
timeout: 2
users: [
{ user: user1, password: 1 }
...
{ user: nss_cluster, password: clusterpass }
]
}
cluster: {
listen: 0.0.0.0:6222
authorization: {
timeout: 2
user: nats_route
password: natsroutepass
}
routes: ["nats-route://nats_route:[email protected]:6222", "nats-route://nats_route:[email protected]:6222"]
}
streaming: {
id: "stan-eas"
sd: false
sv: false
store: "file"
dir: "/storage/nats-streaming-server/data"
hb_interval: "30s"
hb_timeout: "10s"
hb_fail_count: 10
username: nss_cluster
password: clusterpass
store_limits: {
max_channels: 0
max_msgs: 0
max_bytes: 0MB
max_age: "144h"
max_subs: 0
max_inactivity: "144h"
channels: {
"LocalAction": { max_age: "144h" }
"PME.Control": { max_age: "144h" }
"PME.Rules": { max_age: "8760h", max_msgs: 5 }
"PerfCounter.Metrics": { max_age: "4h" }
"SQLExporter.Metrics": { max_age: "4h" }
"WatchDog.Metrics": { max_age: "4h" }
"PerfCounter.Log": { max_age: "4h" }
"SQLExporter.Log": { max_age: "4h" }
"WatchDog.Log": { max_age: "4h" }
}
}
file: {
compact_min_size: 100MB
sync_on_flush: true
file_descriptors_limit: 100
parallel_recovery: 4
}
cluster: {
node_id: "a"
peers: ["b", "c"]
log_path: "/storage/nats-streaming-server/nsslog"
log_cache_size: 1024
log_snapshots: 10
trailing_logs: 256
raft_logging: false
}
}
Number of clients: ~ 40 000. The average consumption of CPU resources: 400/3200 (from "top"). If I restart one of the nodes, the utilization on all nodes jumps to 3200.
Log:
[64189] 2020/12/22 14:12:58.305625 [WRN] 10.121.75.67:52353 - cid:3100306 - Readloop processing time: 3.453755114s
[64189] 2020/12/22 14:12:58.305672 [WRN] 10.52.214.213:55416 - cid:3100375 - Readloop processing time: 3.607654096s
[64189] 2020/12/22 14:12:58.305736 [WRN] 10.28.100.161:56442 - cid:3100352 - Readloop processing time: 3.657804437s
[64189] 2020/12/22 14:12:58.305799 [WRN] 10.42.118.194:50449 - cid:3100231 - Readloop processing time: 3.519814355s
[64189] 2020/12/22 14:12:58.305843 [WRN] 10.52.214.213:55420 - cid:3100472 - Readloop processing time: 3.552920741s
[64189] 2020/12/22 14:12:58.305903 [WRN] 10.125.220.213:52240 - cid:3099976 - Readloop processing time: 3.991156933s
[64189] 2020/12/22 14:12:58.305970 [WRN] 10.25.81.205:49452 - cid:3099913 - Readloop processing time: 6.234881178s
[64189] 2020/12/22 14:12:58.305991 [WRN] 10.126.77.154:58490 - cid:3099999 - Readloop processing time: 3.49807845s
[64189] 2020/12/22 14:12:58.306033 [WRN] 10.32.7.145:63423 - cid:3100051 - Readloop processing time: 3.728901556s
[64189] 2020/12/22 14:12:58.306076 [WRN] 10.118.201.117:55069 - cid:3100431 - Readloop processing time: 3.650559115s
[64189] 2020/12/22 14:12:58.306110 [WRN] 10.121.75.67:53058 - cid:3100164 - Readloop processing time: 3.618370455s
[64189] 2020/12/22 14:12:58.306163 [WRN] 10.84.89.227:51199 - cid:3100208 - Readloop processing time: 3.640992134s
[64189] 2020/12/22 14:12:58.306206 [WRN] 10.66.110.225:58100 - cid:3100222 - Readloop processing time: 3.704752827s
[64189] 2020/12/22 14:12:58.306256 [WRN] 10.121.75.67:53157 - cid:3100299 - Readloop processing time: 3.487310526s
[64189] 2020/12/22 14:12:58.306304 [WRN] 10.96.8.16:56316 - cid:3100012 - Readloop processing time: 3.882298912s
[64189] 2020/12/22 14:12:58.306336 [WRN] 10.113.81.66:52301 - cid:3100159 - Readloop processing time: 3.729128858s
[64189] 2020/12/22 14:12:58.306374 [WRN] 10.42.93.98:54259 - cid:3100947 - Readloop processing time: 2.452962671s
[64189] 2020/12/22 14:12:58.306382 [WRN] 10.39.210.213:49808 - cid:3100371 - Readloop processing time: 3.542212148s
[64189] 2020/12/22 14:12:58.306416 [WRN] 10.118.88.4:52712 - cid:3100117 - Readloop processing time: 3.520391606s
[64189] 2020/12/22 14:12:58.306490 [WRN] 10.28.97.66:50986 - cid:3100471 - Readloop processing time: 3.542449103s
[64189] 2020/12/22 14:12:58.306587 [WRN] 10.93.217.5:51259 - cid:3100391 - Readloop processing time: 3.552975942s
[64189] 2020/12/22 14:12:58.306633 [WRN] 10.121.75.67:52246 - cid:3100373 - Readloop processing time: 3.608532296s
[64189] 2020/12/22 14:12:58.306677 [WRN] 10.121.75.67:53091 - cid:3100210 - Readloop processing time: 3.586355502s