Can the SLO and QoS features be implemented on top of PD disaggregation?
🚀 Feature Description and Motivation
Can the SLO and QoS features be implemented on top of PD (prefill/decode) disaggregation?
Use Case
I have deployed a PD-disaggregated qwen2.5-7B model using the pool.yaml file below. The pool.yaml configuration is as follows:
apiVersion: orchestration.aibrix.ai/v1alpha1
kind: StormService
metadata:
name: pool-xpyd
spec:
replicas: 1
updateStrategy:
type: InPlaceUpdate
stateful: true
selector:
matchLabels:
app: pool-xpyd
template:
metadata:
labels:
app: pool-xpyd
spec:
roles:
- name: routing
replicas: 1
stateful: true
template:
spec:
containers:
- name: mini-lb
image: docker.1ms.run/aibrix/sglang-router:v0.1.6
command: [ "sh", "-c" ]
args:
- |
python3 -m sglang_router.launch_router \
--pd-disaggregation \
--policy random \
--service-discovery \
--service-discovery-port 30000 \
--prefill-selector storm-service-name=$STORM_SERVICE_NAME role-name=prefill \
--decode-selector storm-service-name=$STORM_SERVICE_NAME role-name=decode \
--service-discovery-namespace default
- name: prefill
replicas: 4
stateful: true
template:
metadata:
annotations:
k8s.volcengine.com/pod-networks: |
[
{
"cniConf":{
"name":"rdma"
}
}
]
labels:
model.aibrix.ai/name: qwen2.5-7B
model.aibrix.ai/port: "30000"
model.aibrix.ai/engine: sglang
spec:
nodeSelector:
type: H800
containers:
- name: prefill
image: docker.1ms.run/aibrix/sglang:v0.4.9.post3-cu126-nixl-v0.4.1
command: ["sh", "-c"]
args:
- |
python3 -m sglang.launch_server \
--model-path /models/Qwen2.5-7B-Instruct \
--served-model-name qwen2.5-7B \
--disaggregation-ib-device mlx5_4 \
--host 0.0.0.0 \
--port 30000 \
--disaggregation-mode prefill \
--disaggregation-transfer-backend=mooncake \
--trust-remote-code \
--watchdog-timeout 1000000 \
--dist-timeout 250 \
--mem-fraction-static 0.8 \
--log-level debug
env:
- name: GLOO_SOCKET_IFNAME
value: eth0
- name: NCCL_SOCKET_IFNAME
value: eth0
- name: NCCL_IB_HCA
value: mlx5_3
- name: NCCL_IB_DISABLE
value: "0"
- name: NCCL_IB_GID_INDEX
value: "7"
- name: NCCL_DEBUG
value: "INFO"
volumeMounts:
- name: model-vol
mountPath: /models
- mountPath: /dev/shm
name: shared-mem
resources:
limits:
nvidia.com/gpu: 1
rdma/rdma_shared_devices: "1"
securityContext:
capabilities:
add:
- IPC_LOCK
volumes:
- name: model-vol
hostPath:
path: /data/Qwen
type: Directory
- emptyDir:
medium: Memory
name: shared-mem
- name: decode
replicas: 3
stateful: true
template:
metadata:
annotations:
k8s.volcengine.com/pod-networks: |
[
{
"cniConf":{
"name":"rdma"
}
}
]
labels:
model.aibrix.ai/name: qwen2.5-7B
model.aibrix.ai/port: "30000"
model.aibrix.ai/engine: sglang
spec:
nodeSelector:
type: H20
containers:
- name: decode
image: docker.1ms.run/aibrix/sglang:v0.4.9.post3-cu126-nixl-v0.4.1
command: ["sh", "-c"]
args:
- |
python3 -m sglang.launch_server \
--model-path /models/Qwen2.5-7B-Instruct \
--served-model-name qwen2.5-7B \
--disaggregation-ib-device mlx5_3 \
--host 0.0.0.0 \
--port 30000 \
--disaggregation-mode decode \
--disaggregation-transfer-backend=mooncake \
--trust-remote-code \
--watchdog-timeout 1000000 \
--dist-timeout 600 \
--mem-fraction-static 0.8 \
--log-level debug
env:
- name: GLOO_SOCKET_IFNAME
value: eth0
- name: NCCL_SOCKET_IFNAME
value: eth0
- name: NCCL_IB_HCA
value: mlx5_4
- name: NCCL_IB_DISABLE
value: "0"
- name: NCCL_IB_GID_INDEX
value: "7"
- name: NCCL_DEBUG
value: "INFO"
volumeMounts:
- name: model-vol
mountPath: /models
- mountPath: /dev/shm
name: shared-mem
resources:
limits:
nvidia.com/gpu: 1
rdma/rdma_shared_devices: "1"
securityContext:
capabilities:
add:
- IPC_LOCK
volumes:
- name: model-vol
hostPath:
path: /data/Qwen
type: Directory
- emptyDir:
medium: Memory
name: shared-mem
Routing based on PD disaggregation works correctly, as shown below:
root@yf-mgr-01:~# curl -v http://10.6.2.203:80/v1/chat/completions -H "routing-strategy: pd" -H "Content-Type: application/json" -d '{
"model": "qwen2.5-7B",
"messages": [
{"role": "system", "content": "You are a helpful assistant."}
]
}'
* Trying 10.6.2.203:80...
* Connected to 10.6.2.203 (10.6.2.203) port 80 (#0)
> POST /v1/chat/completions HTTP/1.1
> Host: 10.6.2.203
> User-Agent: curl/7.81.0
> Accept: */*
> routing-strategy: pd
> Content-Type: application/json
> Content-Length: 124
>
* Mark bundle as not supporting multiuse
< HTTP/1.1 200 OK
< date: Tue, 02 Sep 2025 06:39:03 GMT
< server: uvicorn
< content-type: application/json
< x-went-into-req-headers: true
< target-pod: 10.233.99.34:30000
< request-id: 93a900e1-e55a-4fa2-a336-21156ab68140
< transfer-encoding: chunked
<
* Connection #0 to host 10.6.2.203 left intact
{"id":"60eaa863ca444c6aa1a3a32a0c7346ed","object":"chat.completion","created":1756795144,"model":"qwen2.5-7B","choices":[{"index":0,"message":{"role":"assistant","content":"Hello! How can I assist you today?","reasoning_content":null,"tool_calls":null},"logprobs":null,"finish_reason":"stop","matched_stop":151645}],"usage":{"prompt_tokens":14,"total_tokens":24,"completion_tokens":10,"prompt_tokens_details":null}}root@yf-mgr-01:~#
Access using the routing strategy based on SLO results in an error, as shown below:
root@yf-mgr-01:~# curl -v http://10.6.2.203:80/v1/chat/completions -H "model: qwen2.5-7B" -H "Content-Type: application/json" -H "Authorization: Bearer [api_key]" -H "routing-strategy: slo" -d '{ \
"model": "qwen2.5-7B", \
"messages": [{"role": "user", "content": "Say this is a test!"}], \
"temperature": 0.7 \
}'
[Note (review): the trailing `\` characters here are *inside* the single-quoted `-d` body, so the shell sends them literally and the posted JSON is invalid. This alone can produce the 400 "error processing request body" below — retry without the backslashes (as in the working `pd` example above) to separate this from the SLO-profile issue.]
* Trying 10.6.2.203:80...
* Connected to 10.6.2.203 (10.6.2.203) port 80 (#0)
> POST /v1/chat/completions HTTP/1.1
> Host: 10.6.2.203
> User-Agent: curl/7.81.0
> Accept: */*
> model: qwen2.5-7B
> Content-Type: application/json
> Authorization: Bearer [api_key]
> routing-strategy: slo
> Content-Length: 191
>
* Mark bundle as not supporting multiuse
< HTTP/1.1 400 Bad Request
< x-error-request-body-processing: true
< content-length: 64
< content-type: text/plain
< date: Tue, 02 Sep 2025 06:59:13 GMT
< connection: close
<
* Closing connection 0
{"error":{"code":400,"message":"error processing request body"}}root@yf-mgr-01:~#
Proposed Solution
No response
@zhangjyr @nwangfw please help take a look
@wangchuanfang SLO policy requires workload profiling. Can you enable the verbose log at the gateway plugin and see if the profile was properly loaded?
@zhangjyr The following is the log information:
I0903 02:43:53.949555 1 gateway.go:94] "processing request" requestID="c91c5936-aab5-42d8-aa7d-bdbde90ed668"
I0903 02:43:53.951605 1 pd_disaggregation.go:201] "start_prefill_request" request_id="c91c5936-aab5-42d8-aa7d-bdbde90ed668" llm_engine="sglang" prefill_url="http://10.233.69.129:30000/v1/chat/completions"
I0903 02:43:53.951651 1 pd_disaggregation.go:105] "P/D" prefill_pod="pool-xpyd-roleset-pbj67-prefill-d6fbcc9b7-0" decode_pod="pool-xpyd-roleset-pbj67-decode-59bf665cc4-2"
I0903 02:43:53.951703 1 gateway_req_body.go:91] "request start" requestID="c91c5936-aab5-42d8-aa7d-bdbde90ed668" requestPath="/v1/chat/completions" model="qwen2.5-7B" stream=false routingAlgorithm="pd" targetPodIP="10.233.67.160:30000" routingDuration="1.988986ms"
I0903 02:43:53.999794 1 pd_disaggregation.go:212] "prefill_request_complete" request_id="c91c5936-aab5-42d8-aa7d-bdbde90ed668"
I0903 02:43:54.176771 1 gateway_rsp_body.go:189] request end, requestID: c91c5936-aab5-42d8-aa7d-bdbde90ed668 - targetPod: 10.233.67.160:30000
I0903 02:44:42.479493 1 gateway.go:94] "processing request" requestID="2c1ecf04-db43-47f9-a366-8c7d1490dbe4"
W0903 02:44:42.480758 1 slo_queue.go:147] profile not available for aibrix:profile_qwen2.5-7B_
W0903 02:44:42.480785 1 slo_queue.go:155] SLOQueue found no profile available for model qwen2.5-7B, fallback to FIFO queue
W0903 02:44:42.480855 1 slo_queue.go:147] profile not available for aibrix:profile_qwen2.5-7B_
W0903 02:44:42.480868 1 slo_queue.go:155] SLOQueue found no profile available for model qwen2.5-7B, fallback to FIFO queue
I0903 02:44:42.480976 1 gateway_req_body.go:91] "request start" requestID="2c1ecf04-db43-47f9-a366-8c7d1490dbe4" requestPath="/v1/chat/completions" model="qwen2.5-7B" stream=false routingAlgorithm="slo" targetPodIP="10.233.67.161:30000" routingDuration="1.221188ms"
I0903 02:44:42.495600 1 gateway_rsp_body.go:189] request end, requestID: 2c1ecf04-db43-47f9-a366-8c7d1490dbe4 - targetPod: 10.233.67.161:30000
@zhangjyr Below is the profile configuration information.
redis-cli -h localhost -p 6379
localhost:6379> get aibrix:profile_qwen25-7b_qwen25-7b
"{\"gpu\": \"qwen25-7b\", \"cost\": 1.0, \"tputs\": [[64.50872929534701, 63.33758148240239], [61.81315005207643, 62.046895910680945], [2.16418543755822, 2.148603705410359], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0]], \"indexes\": [[4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096], [4, 8]], \"created\": 1756971832.781721, \"e2e\": [[0.1001912517962046, 0.1065298440377228], [0.11949868050403893, 0.12766180806793273], [0.14729135640664026, 0.1464343694240946], [0.26758284060284493, NaN], [0.5096334402100183, NaN], [0.9892026041657664, NaN], [1.9360088507342152, NaN], [3.8460376032209025, NaN], [7.886668739218731, NaN], [16.685072624199094, NaN], [41.33628484920831, NaN]], \"ttft\": [[0.026110959409270437, 6.24852254986763e-07], [6.912229582667351e-07, 6.047985516488552e-07], [1.0057189501821994e-06, 1.0709104250476818e-06], [1.0683690197765826e-06, NaN], [1.0420591570436954e-06, NaN], [1.1244672350585461e-06, NaN], [1.2449733912944793e-06, NaN], [1.186735462397337e-06, NaN], [1.3435515575110913e-06, NaN], [1.2417742982506752e-06, NaN], [1.328669022768736e-06, NaN]], \"slos\": {\"percentile\": 50, \"e2e\": 0.1464343694240946, \"ttft\": 1.0709104250476818e-06}}"
@wangchuanfang Sorry for the late response. The model name is case-sensitive, and the profile key must match it exactly — for your model the key should be: aibrix:profile_qwen2.5-7B_qwen25-7b (your stored key uses qwen25-7b as the model name instead).