High CPU usage from rdtsc during LLM inference
top reports 100% single-core CPU usage when inferring LLMs both with exllamav2 and llama.cpp.
Digging with perf, it seems this load is coming from libhsa-runtime64 — specifically, from a single rdtsc instruction called from hsa_amd_image_get_info_max_dim.
Samples: 79K of event 'cycles:P', 4000 Hz, Event count (approx.): 53226866542 lost: 0/0 drop: 0/0
Overhead Shared Object Symbol
69.64% libhsa-runtime64.so [.] 0x0000000000050f45
2.36% libhsa-runtime64.so [.] 0x0000000000050f3d
1.67% libhsa-runtime64.so [.] 0x0000000000051027
1.20% libhsa-runtime64.so [.] 0x0000000000051056
1.17% libhsa-runtime64.so [.] 0x0000000000051053
0.83% libhsa-runtime64.so [.] 0x0000000000050f6e
0.83% libhsa-runtime64.so [.] 0x0000000000050f23
0.82% libhsa-runtime64.so [.] 0x0000000000051020
0.80% libhsa-runtime64.so [.] 0x0000000000050f58
0.77% libhsa-runtime64.so [.] 0x0000000000050f1d
0.75% libhsa-runtime64.so [.] 0x0000000000051050
0.59% libamdhip64.so [.] 0x00000000002662ed
Example with exllamav2:
Samples: 134K of event 'cycles:P', 4000 Hz, Event count (approx.): 70352266841
0000000000050f45 /home/jerome/project/.venv/lib/python3.12/site-packages/torch/lib/libhsa-runtime64.so [Percent: local period]
Percent│
│
│
│ Disassembly of section .text:
│
│ 0000000000050f45 <hsa_amd_image_get_info_max_dim@@ROCR_1+0x865>:
90.32 │ rdtsc
0.55 │ shl $0x20,%rdx
0.55 │ or %rdx,%rax
│ ↓ js b3
0.48 │ pxor %xmm0,%xmm0
1.06 │ cvtsi2sd %rax,%xmm0
0.53 │ 18: lea _amdgpu_r_debug@@ROCR_1+0x13e2b0,%rax
0.63 │ mulsd (%rax),%xmm0
0.57 │ subsd 0x8(%rsp),%xmm0
0.96 │ ucomisd (%rsp),%xmm0
0.55 │ → ja 1db
0.56 │ cmp $0x1,%r14d
│ ↓ je db
│ movsd hsa_amd_portable_close_dmabuf@@ROCR_1+0x86b40,%xmm4
│ ucomisd %xmm0,%xmm4
│ → jbe 13b
│ lea typeinfo for __cxxabiv1::__forced_unwind@@ROCR_1+0x2941,%rax
│ cmpb $0x0,(%rax)
│ → je 50eff <hsa_amd_image_get_info_max_dim@@ROCR_1+0x81f>
│ xor %ecx,%ecx
│ mov $0xea60,%eax
│ mov $0x2,%ebx
│ mwaitx %eax,%ecx,%ebx
│ lea 0x8(%rsi),%rax
│ mov %ecx,%edx
│ monitorx %rax,%ecx,%edx
│ mov 0x0(%r13),%eax
│ test %eax,%eax
│ → jne 50f0b <hsa_amd_image_get_info_max_dim@@ROCR_1+0x82b>
│ nop
│ xor %ebx,%ebx
│ mov 0x10(%rsp),%rax
│ lock subl $0x1,(%rax)
│ mov %rbp,%rdi
│ → call hsa_amd_image_get_info_max_dim@@ROCR_1+0x1f470
│ add $0x38,%rsp
│ mov %rbx,%rax
│ pop %rbx
│ pop %rbp
│ pop %r12
│ pop %r13
│ pop %r14
│ pop %r15
│ ← ret
│ nop
│ b3: mov %rax,%rdx
│ pxor %xmm0,%xmm0
│ shr %rdx
│ and $0x1,%eax
│ or %rax,%rdx
│ cvtsi2sd %rdx,%xmm0
│ addsd %xmm0,%xmm0
│ ↑ jmp 18
│ cs nopw 0x0(%rax,%rax,1)
1.13 │ db: lea typeinfo for __cxxabiv1::__forced_unwind@@ROCR_1+0x2941,%rax
2.11 │ cmpb $0x0,(%rax)
│ → je 50eff <hsa_amd_image_get_info_max_dim@@ROCR_1+0x81f>
│ xor %ecx,%ecx
│ mov %ecx,%eax
│ mov %ecx,%ebx
│ mwaitx %eax,%ecx,%ebx
│ lea 0x8(%rsi),%rax
│ mov %ecx,%edx
│ monitorx %rax,%ecx,%edx
│ .byte 0xe9
│ .byte 0xb8
│ .byte 0xfe
Example from llama.cpp:
Samples: 30K of event 'cycles:P', 4000 Hz, Event count (approx.): 24997506195
0000000000062464 /opt/rocm/lib/libhsa-runtime64.so.1.11.0 [Percent: local period]
Percent│
│
│
│ Disassembly of section .text:
│
│ 0000000000062464 <hsa_amd_image_get_info_max_dim@@ROCR_1+0xb44>:
91.63 │ rdtsc
0.65 │ shl $0x20,%rdx
0.63 │ or %rdx,%rax
│ ↓ js bc
0.82 │ pxor %xmm0,%xmm0
1.51 │ cvtsi2sd %rax,%xmm0
0.70 │ 18: mulsd (%r15),%xmm0
0.43 │ subsd 0x8(%rsp),%xmm0
1.00 │ comisd (%rsp),%xmm0
0.61 │ → ja 1cc
0.49 │ cmp $0x1,%r13d
│ ↓ je f4
│ movsd hsa_amd_portable_close_dmabuf@@ROCR_1+0x81c60,%xmm4
│ comisd %xmm0,%xmm4
│ → jbe 12c
│ lea hsa_amd_portable_close_dmabuf@@ROCR_1+0xe8d39,%rax
│ cmpb $0x0,(%rax)
│ → je 62428 <hsa_amd_image_get_info_max_dim@@ROCR_1+0xb08>
│ xor %ecx,%ecx
│ mov $0xea60,%eax
│ mov $0x2,%ebx
│ mwaitx %eax,%ecx,%ebx
│ lea 0x8(%rsi),%rax
│ mov %ecx,%edx
│ monitorx %rax,%ecx,%edx
│ mov 0x20(%rbp),%eax
│ test %eax,%eax
│ → jne 62433 <hsa_amd_image_get_info_max_dim@@ROCR_1+0xb13>
│ nop
│ 84: xor %ebx,%ebx
0.10 │ lock subl $0x1,0x10(%rbp)
│ mov %rbp,%rdi
0.01 │ addr32 call 81d20 <hsa_amd_image_get_info_max_dim@@ROCR_1+0x20400>
│ mov 0x28(%rsp),%rax
│ sub %fs:0x28,%rax
│ → jne 1d5
│ add $0x38,%rsp
│ mov %rbx,%rax
│ pop %rbx
│ pop %rbp
│ pop %r12
│ pop %r13
│ pop %r14
│ pop %r15
│ ← ret
│ xchg %ax,%ax
│ bc: mov %rax,%rdx
│ and $0x1,%eax
│ pxor %xmm0,%xmm0
│ shr %rdx
│ or %rax,%rdx
│ cvtsi2sd %rdx,%xmm0
│ addsd %xmm0,%xmm0
│ ↑ jmp 18
│ xchg %ax,%ax
│ cmp $0x3,%r12d
│ ↑ jne 84
│ cmp %rbx,%r14
│ setle %al
│ → jmp 6245c <hsa_amd_image_get_info_max_dim@@ROCR_1+0xb3c>
│ nop
0.49 │ f4: lea hsa_amd_portable_close_dmabuf@@ROCR_1+0xe8d39,%rax
0.93 │ cmpb $0x0,(%rax)
│ .byte 0xf
│ .byte 0x84
I don't need to do anything specific to reproduce. Here's an example command:
llama.cpp/build/bin/main --in-prefix "[INST] " --in-suffix " [/INST]" -t 1 -tb 1 -c 4096 -ngl 99 -m mixtral-8x7b-instruct-v0.1.Q3_K_M.gguf -n 1000 -p "Elaborate on what is quantum mechanics, and its impact on our world."
It looks as if a very tight loop is being timed. I briefly looked for the line of code where this is happening, but with optimizations and some debug symbols missing, I couldn't find it. I could try recompiling ROCm if that could help.
OS: ArchLinux kernel: 6.7.0-rc5 ROCm: 5.7.1 mesa: 23.3.2 GPU: 7900xtx (gfx1100) CPU: Ryzen 9 5900X pytorch: torch==2.3.0.dev20240101+rocm5.7 llama.cpp: 39d8bc71edcb8b6f99d46fa4216af7a15232e218 exllamav2: 45d0ddd402eefc4f40fd3479df7f74702f241a22
@jdecourval Apologies for the lack of response. Can you please check whether your issue still exists with the latest ROCm 6.2? If not, please close the ticket. Thanks!