ROCR-Runtime icon indicating copy to clipboard operation
ROCR-Runtime copied to clipboard

High CPU usage from rdtsc during LLM inference

Open jdecourval opened this issue 2 years ago • 1 comments

top reports 100% single-core CPU usage when inferring LLMs both with exllamav2 and llama.cpp.

Digging with perf, it seems this loads is coming from libhsa-runtime64, specifically, from a single rdtsc instruction called from hsa_amd_image_get_info_max_dim.

Samples: 79K of event 'cycles:P', 4000 Hz, Event count (approx.): 53226866542 lost: 0/0 drop: 0/0
Overhead  Shared Object                                  Symbol
  69.64%  libhsa-runtime64.so                            [.] 0x0000000000050f45
   2.36%  libhsa-runtime64.so                            [.] 0x0000000000050f3d
   1.67%  libhsa-runtime64.so                            [.] 0x0000000000051027
   1.20%  libhsa-runtime64.so                            [.] 0x0000000000051056
   1.17%  libhsa-runtime64.so                            [.] 0x0000000000051053
   0.83%  libhsa-runtime64.so                            [.] 0x0000000000050f6e
   0.83%  libhsa-runtime64.so                            [.] 0x0000000000050f23
   0.82%  libhsa-runtime64.so                            [.] 0x0000000000051020
   0.80%  libhsa-runtime64.so                            [.] 0x0000000000050f58
   0.77%  libhsa-runtime64.so                            [.] 0x0000000000050f1d
   0.75%  libhsa-runtime64.so                            [.] 0x0000000000051050
   0.59%  libamdhip64.so                                 [.] 0x00000000002662ed

Example with exllamav2:

Samples: 134K of event 'cycles:P', 4000 Hz, Event count (approx.): 70352266841
0000000000050f45  /home/jerome/project/.venv/lib/python3.12/site-packages/torch/lib/libhsa-runtime64.so [Percent: local period]
Percent│
       │
       │
       │     Disassembly of section .text:
       │
       │     0000000000050f45 <hsa_amd_image_get_info_max_dim@@ROCR_1+0x865>:
 90.32 │       rdtsc
  0.55 │       shl      $0x20,%rdx
  0.55 │       or       %rdx,%rax
       │     ↓ js       b3
  0.48 │       pxor     %xmm0,%xmm0
  1.06 │       cvtsi2sd %rax,%xmm0
  0.53 │ 18:   lea      _amdgpu_r_debug@@ROCR_1+0x13e2b0,%rax
  0.63 │       mulsd    (%rax),%xmm0
  0.57 │       subsd    0x8(%rsp),%xmm0
  0.96 │       ucomisd  (%rsp),%xmm0
  0.55 │     → ja       1db
  0.56 │       cmp      $0x1,%r14d
       │     ↓ je       db
       │       movsd    hsa_amd_portable_close_dmabuf@@ROCR_1+0x86b40,%xmm4
       │       ucomisd  %xmm0,%xmm4
       │     → jbe      13b
       │       lea      typeinfo for __cxxabiv1::__forced_unwind@@ROCR_1+0x2941,%rax
       │       cmpb     $0x0,(%rax)
       │     → je       50eff <hsa_amd_image_get_info_max_dim@@ROCR_1+0x81f>
       │       xor      %ecx,%ecx
       │       mov      $0xea60,%eax
       │       mov      $0x2,%ebx
       │       mwaitx   %eax,%ecx,%ebx
       │       lea      0x8(%rsi),%rax
       │       mov      %ecx,%edx
       │       monitorx %rax,%ecx,%edx
       │       mov      0x0(%r13),%eax
       │       test     %eax,%eax
       │     → jne      50f0b <hsa_amd_image_get_info_max_dim@@ROCR_1+0x82b>
       │       nop
       │       xor      %ebx,%ebx
       │       mov      0x10(%rsp),%rax
       │       lock     subl     $0x1,(%rax)
       │       mov      %rbp,%rdi
       │     → call     hsa_amd_image_get_info_max_dim@@ROCR_1+0x1f470
       │       add      $0x38,%rsp
       │       mov      %rbx,%rax
       │       pop      %rbx
       │       pop      %rbp
       │       pop      %r12
       │       pop      %r13
       │       pop      %r14
       │       pop      %r15
       │     ← ret
       │       nop
       │ b3:   mov      %rax,%rdx
       │       pxor     %xmm0,%xmm0
       │       shr      %rdx
       │       and      $0x1,%eax
       │       or       %rax,%rdx
       │       cvtsi2sd %rdx,%xmm0
       │       addsd    %xmm0,%xmm0
       │     ↑ jmp      18
       │       cs       nopw 0x0(%rax,%rax,1)
  1.13 │ db:   lea      typeinfo for __cxxabiv1::__forced_unwind@@ROCR_1+0x2941,%rax
  2.11 │       cmpb     $0x0,(%rax)
       │     → je       50eff <hsa_amd_image_get_info_max_dim@@ROCR_1+0x81f>
       │       xor      %ecx,%ecx
       │       mov      %ecx,%eax
       │       mov      %ecx,%ebx
       │       mwaitx   %eax,%ecx,%ebx
       │       lea      0x8(%rsi),%rax
       │       mov      %ecx,%edx
       │       monitorx %rax,%ecx,%edx
       │       .byte    0xe9
       │       .byte    0xb8
       │       .byte    0xfe

Example from llama.cpp:

Samples: 30K of event 'cycles:P', 4000 Hz, Event count (approx.): 24997506195
0000000000062464  /opt/rocm/lib/libhsa-runtime64.so.1.11.0 [Percent: local period]
Percent│
       │
       │
       │     Disassembly of section .text:
       │
       │     0000000000062464 <hsa_amd_image_get_info_max_dim@@ROCR_1+0xb44>:
 91.63 │       rdtsc
  0.65 │       shl      $0x20,%rdx
  0.63 │       or       %rdx,%rax
       │     ↓ js       bc
  0.82 │       pxor     %xmm0,%xmm0
  1.51 │       cvtsi2sd %rax,%xmm0
  0.70 │ 18:   mulsd    (%r15),%xmm0
  0.43 │       subsd    0x8(%rsp),%xmm0
  1.00 │       comisd   (%rsp),%xmm0
  0.61 │     → ja       1cc
  0.49 │       cmp      $0x1,%r13d
       │     ↓ je       f4
       │       movsd    hsa_amd_portable_close_dmabuf@@ROCR_1+0x81c60,%xmm4
       │       comisd   %xmm0,%xmm4
       │     → jbe      12c
       │       lea      hsa_amd_portable_close_dmabuf@@ROCR_1+0xe8d39,%rax
       │       cmpb     $0x0,(%rax)
       │     → je       62428 <hsa_amd_image_get_info_max_dim@@ROCR_1+0xb08>
       │       xor      %ecx,%ecx
       │       mov      $0xea60,%eax
       │       mov      $0x2,%ebx
       │       mwaitx   %eax,%ecx,%ebx
       │       lea      0x8(%rsi),%rax
       │       mov      %ecx,%edx
       │       monitorx %rax,%ecx,%edx
       │       mov      0x20(%rbp),%eax
       │       test     %eax,%eax
       │     → jne      62433 <hsa_amd_image_get_info_max_dim@@ROCR_1+0xb13>
       │       nop
       │ 84:   xor      %ebx,%ebx
  0.10 │       lock     subl     $0x1,0x10(%rbp)
       │       mov      %rbp,%rdi
  0.01 │       addr32   call 81d20 <hsa_amd_image_get_info_max_dim@@ROCR_1+0x20400>
       │       mov      0x28(%rsp),%rax
       │       sub      %fs:0x28,%rax
       │     → jne      1d5
       │       add      $0x38,%rsp
       │       mov      %rbx,%rax
       │       pop      %rbx
       │       pop      %rbp
       │       pop      %r12
       │       pop      %r13
       │       pop      %r14
       │       pop      %r15
       │     ← ret
       │       xchg     %ax,%ax
       │ bc:   mov      %rax,%rdx
       │       and      $0x1,%eax
       │       pxor     %xmm0,%xmm0
       │       shr      %rdx
       │       or       %rax,%rdx
       │       cvtsi2sd %rdx,%xmm0
       │       addsd    %xmm0,%xmm0
       │     ↑ jmp      18
       │       xchg     %ax,%ax
       │       cmp      $0x3,%r12d
       │     ↑ jne      84
       │       cmp      %rbx,%r14
       │       setle    %al
       │     → jmp      6245c <hsa_amd_image_get_info_max_dim@@ROCR_1+0xb3c>
       │       nop
  0.49 │ f4:   lea      hsa_amd_portable_close_dmabuf@@ROCR_1+0xe8d39,%rax
  0.93 │       cmpb     $0x0,(%rax)
       │       .byte    0xf
       │       .byte    0x84

I don't need to do anything specific to reproduce. Here's an example command:

llama.cpp/build/bin/main --in-prefix "[INST] " --in-suffix " [/INST]" -t 1 -tb 1 -c 4096 -ngl 99 -m mixtral-8x7b-instruct-v0.1.Q3_K_M.gguf -n 1000 -p "Elaborate on what is quantum mechanics, and its impact on our world."

It looks as if a very tight loop is being timed. I briefly looked for the line of code where this is happening, but with optimizations and some debug symbols missing, I couldn't find it. I could try recompiling ROCm if that could help.

OS: ArchLinux kernel: 6.7.0-rc5 ROCm: 5.7.1 mesa: 23.3.2 GPU: 7900xtx (gfx1100) CPU: Ryzen 9 5900X pytorch: torch==2.3.0.dev20240101+rocm5.7 llama.cpp: 39d8bc71edcb8b6f99d46fa4216af7a15232e218 exllamav2: 45d0ddd402eefc4f40fd3479df7f74702f241a22

jdecourval avatar Jan 04 '24 19:01 jdecourval

@jdecourval Apologies for the lack of response. Can you please check if your issue still exist with the latest ROCm 6.2? If not, please close the ticket. Thanks!

ppanchad-amd avatar Aug 16 '24 16:08 ppanchad-amd