[BUG] NCCL TCPStore remote address is incorrect
What happened?
https://lmsys.org/blog/2025-07-20-k2-large-scale-ep/?linkId=100000374601795#3%EF%B8%8F%E2%83%A3-our-solution-ome--sglang-pd-disaggregation--large-scale-expert-parallelism
When I followed the blog to run Kimi-K2 PD disaggregation on 128×H200 GPUs, I found that after applying the ClusterServingRuntime all pods were Running, but after a while SGLang failed:
[rank87]:[W1104 01:29:53.261611977 ProcessGroupNCCL.cpp:1783] [PG ID 0 PG GUID 0 Rank 87] Failed to check the "should dump" flag on TCPStore, (maybe TCPStore server has shut down too early), with error: Failed to recv, got 0 bytes. Connection was likely closed. Did the remote server shutdown or crash?
[rank82]:[W1104 01:29:53.257494110 TCPStore.cpp:125] [c10d] recvValue failed on SocketImpl(fd=116, addr=[lws-kimi-k2-instruct-decoder-0-10.lws-kimi-k2-instruct-decoder.ome.svc.cluster.local]:44516, remote=[10-24-8-26.kimi-k2-instruct-decoder.ome.svc.cluster.local]:5757): Failed to recv, got 0 bytes. Connection was likely closed. Did the remote server shutdown or crash?
Why is the remote address remote=[10-24-8-26.kimi-k2-instruct-decoder.ome.svc.cluster.local]:5757? It should be [lws-kimi-k2-instruct-decoder-0.lws-kimi-k2-instruct-decoder.ome.svc.cluster.local]:5757.
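For what it's worth, the dashed name has the shape of Kubernetes' pod-IP-based DNS record (10-24-8-26 corresponds to 10.24.8.26) under the kimi-k2-instruct-decoder Service, so it may just be how that IP reverse-resolves on a hostNetwork pod. A minimal check from inside one decoder pod (pod name taken from the listing below; swap getent for nslookup if the image lacks it):

kubectl exec -n ome lws-kimi-k2-instruct-decoder-0-1 -- sh -c '
  # Forward: what IP does the expected LWS leader name resolve to?
  getent hosts lws-kimi-k2-instruct-decoder-0.lws-kimi-k2-instruct-decoder.ome.svc.cluster.local
  # Reverse: what name does the IP from the error message resolve back to?
  getent hosts 10.24.8.26
'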
How can we reproduce it (as minimally and precisely as possible)?
- download Kimi-K2-0905
- build the OME image from GitHub source code (2025-11-04); OME version: v0.1.3-79-g93e8c92-dirty
- follow the blog to install OME and LWS
- SGLang version: 0.4.10 or 0.5.3
- then run the Kimi-K2 PD inference service
Anything else we need to know?
root@pod1-gpu-017:/llm/src/ome# kubectl get pod -n ome
NAME                                       READY   STATUS        RESTARTS      AGE
kimi-k2-instruct-router-65d868fd9c-whkm2   1/1     Running       0             16m
lws-kimi-k2-instruct-decoder-0             1/1     Running       0             14m
lws-kimi-k2-instruct-decoder-0-0           1/1     Running       0             14s
lws-kimi-k2-instruct-decoder-0-1           1/1     Running       0             14m
lws-kimi-k2-instruct-decoder-0-10          1/1     Running       0             14m
lws-kimi-k2-instruct-decoder-0-2           1/1     Running       0             14m
lws-kimi-k2-instruct-decoder-0-3           1/1     Running       0             14m
lws-kimi-k2-instruct-decoder-0-4           1/1     Running       0             14m
lws-kimi-k2-instruct-decoder-0-5           1/1     Running       0             14m
lws-kimi-k2-instruct-decoder-0-6           1/1     Running       0             14m
lws-kimi-k2-instruct-decoder-0-7           1/1     Running       0             14m
lws-kimi-k2-instruct-decoder-0-8           1/1     Running       0             14m
lws-kimi-k2-instruct-decoder-0-9           1/1     Running       0             14m
lws-kimi-k2-instruct-engine-0              1/1     Running       0             16m
lws-kimi-k2-instruct-engine-0-0            1/1     Terminating   1 (12s ago)   30s
lws-kimi-k2-instruct-engine-0-1            1/1     Running       0             16m
lws-kimi-k2-instruct-engine-0-2            1/1     Running       0             16m
ome-controller-manager-5c9f8f9fb6-mpp8l    1/1     Running       0             107m
ome-controller-manager-5c9f8f9fb6-xj4ph    1/1     Running       0             107m
ome-controller-manager-5c9f8f9fb6-z4tqn    1/1     Running       0             107m
ome-model-agent-daemonset-4gg7q            1/1     Running       0             149m
ome-model-agent-daemonset-579nk            1/1     Running       0             149m
ome-model-agent-daemonset-95gh4            1/1     Running       0             149m
ome-model-agent-daemonset-fzwsw            1/1     Running       0             149m
ome-model-agent-daemonset-g5z6h            1/1     Running       0             149m
ome-model-agent-daemonset-jtrj6            1/1     Running       0             149m
ome-model-agent-daemonset-k55xl            1/1     Running       0             149m
ome-model-agent-daemonset-l6hps            1/1     Running       0             149m
ome-model-agent-daemonset-lrvn8            1/1     Running       0             149m
ome-model-agent-daemonset-pdv5m            1/1     Running       0             149m
ome-model-agent-daemonset-pshw7            1/1     Running       0             149m
ome-model-agent-daemonset-sbffh            1/1     Running       0             149m
ome-model-agent-daemonset-sm8v4            1/1     Running       0             149m
ome-model-agent-daemonset-tx2sz            1/1     Running       0             149m
ome-model-agent-daemonset-vrgsx            1/1     Running       0             149m
ome-model-agent-daemonset-vrkj9            1/1     Running       0             149m
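The engine leader lws-kimi-k2-instruct-engine-0-0 is the pod that keeps restarting, so (assuming its crash is what shuts the TCPStore server down early) the logs of its previous run are probably the most useful thing to capture:

# logs of the current and the crashed previous run of the engine leader
kubectl logs -n ome lws-kimi-k2-instruct-engine-0-0
kubectl logs -n ome lws-kimi-k2-instruct-engine-0-0 --previous
# restart/kill events for the same pod
kubectl describe pod -n ome lws-kimi-k2-instruct-engine-0-0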
root@pod1-gpu-017:/llm/src/ome# kubectl get service -n ome -o wide
NAME                             TYPE        CLUSTER-IP        EXTERNAL-IP   PORT(S)     AGE    SELECTOR
kimi-k2-instruct                 ClusterIP   200.197.15.80     <none>        80/TCP      24m    component=router,ome.io/inferenceservice=kimi-k2-instruct
kimi-k2-instruct-decoder         ClusterIP   200.194.203.109   <none>        30000/TCP   24m    ray.io/node-type=head
kimi-k2-instruct-engine          ClusterIP   200.194.36.188    <none>        30000/TCP   24m    ray.io/node-type=head
kimi-k2-instruct-router          ClusterIP   200.194.26.254    <none>        8080/TCP    24m    app=kimi-k2-instruct-router
lws-kimi-k2-instruct-decoder     ClusterIP   None              <none>        <none>      24m    leaderworkerset.sigs.k8s.io/name=lws-kimi-k2-instruct-decoder
lws-kimi-k2-instruct-engine      ClusterIP   None              <none>        <none>      24m    leaderworkerset.sigs.k8s.io/name=lws-kimi-k2-instruct-engine
ome-controller-manager-service   ClusterIP   200.202.196.81    <none>        8443/TCP    157m   control-plane=ome-controller-manager,controller-tools.k8s.io=1.0
ome-webhook-server-service       ClusterIP   200.196.230.159   <none>        443/TCP     157m   control-plane=ome-controller-manager
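Both lws-* Services are headless (CLUSTER-IP None), which is what gives each pod its per-pod DNS name. A quick way to dump the records the cluster DNS actually serves for the decoder group (the busybox image is just one convenient choice):

kubectl run -n ome dns-check --rm -it --restart=Never --image=busybox:1.36 -- \
  nslookup lws-kimi-k2-instruct-decoder.ome.svc.cluster.local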
My runtime config, ./config/runtimes/srt/kimi-k2-pd-rt.yaml:
apiVersion: ome.io/v1beta1
kind: ClusterServingRuntime
metadata:
  name: srt-kimi-k2-pd
spec:
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: kubernetes.io/hostname
                operator: In
                values:
                  - pod1-gpu-017
                  - pod1-gpu-018
                  - pod1-gpu-019
                  - pod1-gpu-020
                  - pod1-gpu-021
                  - pod1-gpu-022
                  - pod1-gpu-023
                  - pod1-gpu-024
                  - pod1-gpu-025
                  - pod1-gpu-026
                  - pod1-gpu-027
                  - pod1-gpu-028
                  - pod1-gpu-029
                  - pod1-gpu-030
                  - pod1-gpu-031
                  - pod1-gpu-032
  disabled: false
  modelSizeRange:
    min: 1T
    max: 1.5T
  supportedModelFormats:
    - modelFormat:
        name: safetensors
        version: "1.0.0"
      modelFramework:
        name: transformers
        version: "4.51.3"
      modelArchitecture: DeepseekV3ForCausalLM
      quantization: "fp8"
      autoSelect: true
      priority: 1
  protocolVersions:
    - openAI
  engineConfig:
    leader:
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
      dnsPolicy: ClusterFirstWithHostNet
      hostNetwork: true
      enableServiceLinks: false
      hostIPC: true
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        - name: devinf
          hostPath:
            path: /dev/infiniband
      runner:
        name: ome-container
        image: 10.24.10.61:20405/sglang:main-0707-deepep-mooncake
        ports:
          - containerPort: 30000
            protocol: TCP
        env:
          - name: MC_TE_METRIC
            value: "true"
          - name: PYTHONUNBUFFERED
            value: "1"
          - name: GLOO_SOCKET_IFNAME
            value: bond4
          - name: NCCL_SOCKET_IFNAME
            value: bond4
          - name: TORCH_CUDA_ARCH_LIST
            value: "9.0"
          - name: NODE_IP
            valueFrom:
              fieldRef:
                fieldPath: status.hostIP
        command:
          - python3
          - -m
          - sglang.launch_server
          - --port
          - "30000"
          - --host
          - "0.0.0.0"
          - --model-path
          - $(MODEL_PATH)
          - --disaggregation-ib-device
          - mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_9
          - --chunked-prefill-size
          - "524288"
          - --ep-dispatch-algorithm
          - dynamic
          - --eplb-algorithm
          - deepseek
          - --enable-dp-lm-head
          - --disable-cuda-graph
          - --enable-two-batch-overlap
          - --enable-dp-attention
          - --disable-shared-experts-fusion
          - --dp-size
          - $(PARALLELISM_SIZE)
          - --disable-radix-cache
          - --enable-deepep-moe
          - --deepep-mode
          - normal
          - --disaggregation-mode
          - prefill
          - --mem-fraction-static
          - "0.849"
          - --tp-size
          - $(PARALLELISM_SIZE)
          - --dist-init-addr
          - $(LWS_LEADER_ADDRESS):5757
          - --nnodes
          - $(LWS_GROUP_SIZE)
          - --node-rank
          - $(LWS_WORKER_INDEX)
          - --trust-remote-code
          - --moe-dense-tp-size
          - "1"
          - --decode-log-interval
          - "1"
          - --max-running-requests
          - "1024"
          - --max-total-tokens
          - "131072"
          - --enable-eplb
          - --ep-num-redundant-experts
          - $(PARALLELISM_SIZE)
        resources:
          requests:
            nvidia.com/gpu: 8
          limits:
            nvidia.com/gpu: 8
        volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          - mountPath: /dev/infiniband
            name: devinf
        securityContext:
          capabilities:
            add:
              - IPC_LOCK
              - CAP_SYS_ADMIN
          privileged: true
    worker:
      size: 3
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      dnsPolicy: ClusterFirstWithHostNet
      hostNetwork: true
      enableServiceLinks: false
      hostIPC: true
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        - name: devinf
          hostPath:
            path: /dev/infiniband
      runner:
        name: ome-container
        image: 10.24.10.61:20405/main-0707-deepep-mooncake
        ports:
          - containerPort: 30001
            protocol: TCP
        env:
          - name: MC_TE_METRIC
            value: "true"
          - name: PYTHONUNBUFFERED
            value: "1"
          - name: GLOO_SOCKET_IFNAME
            value: bond4
          - name: NCCL_SOCKET_IFNAME
            value: bond4
          - name: TORCH_CUDA_ARCH_LIST
            value: "9.0"
          - name: NODE_IP
            valueFrom:
              fieldRef:
                fieldPath: status.hostIP
        command:
          - python3
          - -m
          - sglang.launch_server
          - --model-path
          - $(MODEL_PATH)
          - --disaggregation-ib-device
          - mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_9
          - --chunked-prefill-size
          - "524288"
          - --ep-dispatch-algorithm
          - dynamic
          - --eplb-algorithm
          - deepseek
          - --enable-dp-lm-head
          - --disable-cuda-graph
          - --enable-two-batch-overlap
          - --enable-dp-attention
          - --disable-shared-experts-fusion
          - --dp-size
          - $(PARALLELISM_SIZE)
          - --disable-radix-cache
          - --enable-deepep-moe
          - --deepep-mode
          - normal
          - --disaggregation-mode
          - prefill
          - --mem-fraction-static
          - "0.849"
          - --tp-size
          - $(PARALLELISM_SIZE)
          - --dist-init-addr
          - $(LWS_LEADER_ADDRESS):5757
          - --nnodes
          - $(LWS_GROUP_SIZE)
          - --node-rank
          - $(LWS_WORKER_INDEX)
          - --trust-remote-code
          - --moe-dense-tp-size
          - "1"
          - --decode-log-interval
          - "1"
          - --max-running-requests
          - "1024"
          - --max-total-tokens
          - "131072"
          - --enable-eplb
          - --ep-num-redundant-experts
          - $(PARALLELISM_SIZE)
        resources:
          limits:
            nvidia.com/gpu: "8"
          requests:
            nvidia.com/gpu: "8"
        volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          - mountPath: /dev/infiniband
            name: devinf
        securityContext:
          capabilities:
            add:
              - IPC_LOCK
              - CAP_SYS_ADMIN
          privileged: true
  decoderConfig:
    leader:
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        - name: devinf
          hostPath:
            path: /dev/infiniband
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
      dnsPolicy: ClusterFirstWithHostNet
      hostNetwork: true
      enableServiceLinks: false
      hostIPC: true
      runner:
        name: ome-container
        ports:
          - containerPort: 30000
            protocol: TCP
        image: 10.24.10.61:20405/sglang:v0.4.10-deepseek3.1-0822-my-re_mooncake
        env:
          - name: MC_TE_METRIC
            value: "true"
          - name: PYTHONUNBUFFERED
            value: "1"
          - name: SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK
            value: "480"
          - name: GLOO_SOCKET_IFNAME
            value: bond4
          - name: NCCL_SOCKET_IFNAME
            value: bond4
          - name: TORCH_CUDA_ARCH_LIST
            value: "9.0"
          - name: NODE_IP
            valueFrom:
              fieldRef:
                fieldPath: status.hostIP
        command:
          - python3
          - -m
          - sglang.launch_server
          - --port
          - "30000"
          - --host
          - "0.0.0.0"
          - --model-path
          - $(MODEL_PATH)
          - --enable-dp-attention
          - --enable-dp-lm-head
          - --dp-size
          - $(PARALLELISM_SIZE)
          - --disable-radix-cache
          - --disable-shared-experts-fusion
          - --enable-deepep-moe
          - --deepep-mode
          - low_latency
          - --disaggregation-mode
          - decode
          - --enable-two-batch-overlap
          - --mem-fraction-static
          - "0.6"
          - --disaggregation-ib-device
          - mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_9
          - --cuda-graph-bs
          - "480"
          - --max-running-requests
          - "46080"
          - --tp-size
          - $(PARALLELISM_SIZE)
          - --dist-init-addr
          - $(LWS_LEADER_ADDRESS):5757
          - --nnodes
          - $(LWS_GROUP_SIZE)
          - --node-rank
          - $(LWS_WORKER_INDEX)
          - --decode-log-interval
          - "1"
          - --trust-remote-code
          - --moe-dense-tp-size
          - "1"
          - --ep-num-redundant-experts
          - $(PARALLELISM_SIZE)
          - --enable-eplb
        volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          - mountPath: /dev/infiniband
            name: devinf
        securityContext:
          capabilities:
            add:
              - IPC_LOCK
              - CAP_SYS_ADMIN
          privileged: true
        resources:
          requests:
            nvidia.com/gpu: 8
          limits:
            nvidia.com/gpu: 8
    worker:
      size: 11
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      dnsPolicy: ClusterFirstWithHostNet
      hostNetwork: true
      enableServiceLinks: false
      hostIPC: true
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        - name: devinf
          hostPath:
            path: /dev/infiniband
      runner:
        name: ome-container
        image: 10.24.10.61:20405/sglang:v0.4.10-deepseek3.1-0822-my-re_mooncake
        ports:
          - containerPort: 30001
            protocol: TCP
        env:
          - name: MC_TE_METRIC
            value: "true"
          - name: PYTHONUNBUFFERED
            value: "1"
          - name: SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK
            value: "480"
          - name: GLOO_SOCKET_IFNAME
            value: bond4
          - name: NCCL_SOCKET_IFNAME
            value: bond4
          - name: TORCH_CUDA_ARCH_LIST
            value: "9.0"
          - name: NODE_IP
            valueFrom:
              fieldRef:
                fieldPath: status.hostIP
        command:
          - python3
          - -m
          - sglang.launch_server
          - --model-path
          - $(MODEL_PATH)
          - --enable-dp-attention
          - --enable-dp-lm-head
          - --dp-size
          - $(PARALLELISM_SIZE)
          - --disable-radix-cache
          - --disable-shared-experts-fusion
          - --enable-deepep-moe
          - --deepep-mode
          - low_latency
          - --disaggregation-mode
          - decode
          - --enable-two-batch-overlap
          - --mem-fraction-static
          - "0.6"
          - --disaggregation-ib-device
          - mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_9
          - --cuda-graph-bs
          - "480"
          - --max-running-requests
          - "46080"
          - --tp-size
          - $(PARALLELISM_SIZE)
          - --dist-init-addr
          - $(LWS_LEADER_ADDRESS):5757
          - --nnodes
          - $(LWS_GROUP_SIZE)
          - --node-rank
          - $(LWS_WORKER_INDEX)
          - --decode-log-interval
          - "1"
          - --trust-remote-code
          - --moe-dense-tp-size
          - "1"
          - --ep-num-redundant-experts
          - $(PARALLELISM_SIZE)
          - --enable-eplb
        volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          - mountPath: /dev/infiniband
            name: devinf
        securityContext:
          capabilities:
            add:
              - IPC_LOCK
              - CAP_SYS_ADMIN
          privileged: true
        resources:
          limits:
            nvidia.com/gpu: "8"
          requests:
            nvidia.com/gpu: "8"
  routerConfig:
    runner:
      name: router
      image: 10.24.10.61:20405/sglang-router:v0.2.0-curl
      resources:
        limits:
          cpu: "10"
          memory: "20Gi"
      ports:
        - containerPort: 8080
          name: http
      command:
        - sh
        - -c
        - >
          python3 -m sglang_router.launch_router
          --host 0.0.0.0
          --port 8080
          --pd-disaggregation
          --policy random
          --service-discovery
          --service-discovery-namespace "${NAMESPACE}"
          --service-discovery-port 30000
          --prefill-selector component=engine leaderworkerset.sigs.k8s.io/worker-index=0 ome.io/inferenceservice=${INFERENCESERVICE_NAME}
          --decode-selector component=decoder leaderworkerset.sigs.k8s.io/worker-index=0 ome.io/inferenceservice=${INFERENCESERVICE_NAME}
          --max-payload-size 2147483648
          --worker-startup-timeout-secs 1200
      env:
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        - name: INFERENCESERVICE_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.labels['ome.io/inferenceservice']
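Every rank above gets --dist-init-addr $(LWS_LEADER_ADDRESS):5757, so whatever name LWS injects into LWS_LEADER_ADDRESS is the name the TCPStore server is created under. A quick way to confirm what the pods actually receive (variable names are the standard LWS-injected ones; pod name from the listing above):

kubectl exec -n ome lws-kimi-k2-instruct-decoder-0-1 -- sh -c \
  'echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS LWS_GROUP_SIZE=$LWS_GROUP_SIZE LWS_WORKER_INDEX=$LWS_WORKER_INDEX"'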
Environment
OME install commands
# OME
git clone https://github.com/sgl-project/ome
# Step 1: Install OME CRDs
helm upgrade --install ome-crd ./charts/ome-crd --namespace ome
# Step 2: Install OME core resources
helm upgrade --install ome ./charts/ome-resources --namespace ome
# lws
wget https://github.com/kubernetes-sigs/lws/releases/download/v0.6.3/manifests.yaml -O lws.yaml
kubectl apply --server-side -f lws.yaml
# Kimi K2 model
kubectl apply -f ./config/models/moonshotai/Kimi-K2-Instruct.yaml
# Kimi K2 PD serving runtime
kubectl apply -f ./config/runtimes/srt/kimi-k2-pd-rt.yaml
# deploy the inference service
kubectl apply -f ./config/samples/isvc/moonshotai/kimi-k2-pd.yaml
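After the applies, a short sanity pass before sending traffic (resource kinds per the OME and LWS CRDs; ClusterServingRuntime is cluster-scoped):

kubectl get clusterservingruntime srt-kimi-k2-pd
kubectl get inferenceservice -n ome
kubectl get leaderworkerset -n ome
kubectl get pod -n ome -o wide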
@pallasathena92, could you help me?