High CPU usage from rdtsc during LLM inference
top reports 100% single-core CPU usage when inferring LLMs both with exllamav2 and llama.cpp.
Digging with perf, it seems this load is coming from libhsa-runtime64 — specifically, from a single rdtsc instruction called from hsa_amd_image_get_info_max_dim.
Samples: 79K of event 'cycles:P', 4000 Hz, Event count (approx.): 53226866542 lost: 0/0 drop: 0/0
Overhead Shared Object Symbol
69.64% libhsa-runtime64.so [.] 0x0000000000050f45
2.36% libhsa-runtime64.so [.] 0x0000000000050f3d
1.67% libhsa-runtime64.so [.] 0x0000000000051027
1.20% libhsa-runtime64.so [.] 0x0000000000051056
1.17% libhsa-runtime64.so [.] 0x0000000000051053
0.83% libhsa-runtime64.so [.] 0x0000000000050f6e
0.83% libhsa-runtime64.so [.] 0x0000000000050f23
0.82% libhsa-runtime64.so [.] 0x0000000000051020
0.80% libhsa-runtime64.so [.] 0x0000000000050f58
0.77% libhsa-runtime64.so [.] 0x0000000000050f1d
0.75% libhsa-runtime64.so [.] 0x0000000000051050
0.59% libamdhip64.so [.] 0x00000000002662ed
Example with exllamav2:
Samples: 134K of event 'cycles:P', 4000 Hz, Event count (approx.): 70352266841
0000000000050f45 /home/jerome/project/.venv/lib/python3.12/site-packages/torch/lib/libhsa-runtime64.so [Percent: local period]
Percent│
│
│
│ Disassembly of section .text:
│
│ 0000000000050f45 <hsa_amd_image_get_info_max_dim@@ROCR_1+0x865>:
90.32 │ rdtsc
0.55 │ shl $0x20,%rdx
0.55 │ or %rdx,%rax
│ ↓ js b3
0.48 │ pxor %xmm0,%xmm0
1.06 │ cvtsi2sd %rax,%xmm0
0.53 │ 18: lea _amdgpu_r_debug@@ROCR_1+0x13e2b0,%rax
0.63 │ mulsd (%rax),%xmm0
0.57 │ subsd 0x8(%rsp),%xmm0
0.96 │ ucomisd (%rsp),%xmm0
0.55 │ → ja 1db
0.56 │ cmp $0x1,%r14d
│ ↓ je db
│ movsd hsa_amd_portable_close_dmabuf@@ROCR_1+0x86b40,%xmm4
│ ucomisd %xmm0,%xmm4
│ → jbe 13b
│ lea typeinfo for __cxxabiv1::__forced_unwind@@ROCR_1+0x2941,%rax
│ cmpb $0x0,(%rax)
│ → je 50eff <hsa_amd_image_get_info_max_dim@@ROCR_1+0x81f>
│ xor %ecx,%ecx
│ mov $0xea60,%eax
│ mov $0x2,%ebx
│ mwaitx %eax,%ecx,%ebx
│ lea 0x8(%rsi),%rax
│ mov %ecx,%edx
│ monitorx %rax,%ecx,%edx
│ mov 0x0(%r13),%eax
│ test %eax,%eax
│ → jne 50f0b <hsa_amd_image_get_info_max_dim@@ROCR_1+0x82b>
│ nop
│ xor %ebx,%ebx
│ mov 0x10(%rsp),%rax
│ lock subl $0x1,(%rax)
│ mov %rbp,%rdi
│ → call hsa_amd_image_get_info_max_dim@@ROCR_1+0x1f470
│ add $0x38,%rsp
│ mov %rbx,%rax
│ pop %rbx
│ pop %rbp
│ pop %r12
│ pop %r13
│ pop %r14
│ pop %r15
│ ← ret
│ nop
│ b3: mov %rax,%rdx
│ pxor %xmm0,%xmm0
│ shr %rdx
│ and $0x1,%eax
│ or %rax,%rdx
│ cvtsi2sd %rdx,%xmm0
│ addsd %xmm0,%xmm0
│ ↑ jmp 18
│ cs nopw 0x0(%rax,%rax,1)
1.13 │ db: lea typeinfo for __cxxabiv1::__forced_unwind@@ROCR_1+0x2941,%rax
2.11 │ cmpb $0x0,(%rax)
│ → je 50eff <hsa_amd_image_get_info_max_dim@@ROCR_1+0x81f>
│ xor %ecx,%ecx
│ mov %ecx,%eax
│ mov %ecx,%ebx
│ mwaitx %eax,%ecx,%ebx
│ lea 0x8(%rsi),%rax
│ mov %ecx,%edx
│ monitorx %rax,%ecx,%edx
│ .byte 0xe9
│ .byte 0xb8
│ .byte 0xfe
Example from llama.cpp:
Samples: 30K of event 'cycles:P', 4000 Hz, Event count (approx.): 24997506195
0000000000062464 /opt/rocm/lib/libhsa-runtime64.so.1.11.0 [Percent: local period]
Percent│
│
│
│ Disassembly of section .text:
│
│ 0000000000062464 <hsa_amd_image_get_info_max_dim@@ROCR_1+0xb44>:
91.63 │ rdtsc
0.65 │ shl $0x20,%rdx
0.63 │ or %rdx,%rax
│ ↓ js bc
0.82 │ pxor %xmm0,%xmm0
1.51 │ cvtsi2sd %rax,%xmm0
0.70 │ 18: mulsd (%r15),%xmm0
0.43 │ subsd 0x8(%rsp),%xmm0
1.00 │ comisd (%rsp),%xmm0
0.61 │ → ja 1cc
0.49 │ cmp $0x1,%r13d
│ ↓ je f4
│ movsd hsa_amd_portable_close_dmabuf@@ROCR_1+0x81c60,%xmm4
│ comisd %xmm0,%xmm4
│ → jbe 12c
│ lea hsa_amd_portable_close_dmabuf@@ROCR_1+0xe8d39,%rax
│ cmpb $0x0,(%rax)
│ → je 62428 <hsa_amd_image_get_info_max_dim@@ROCR_1+0xb08>
│ xor %ecx,%ecx
│ mov $0xea60,%eax
│ mov $0x2,%ebx
│ mwaitx %eax,%ecx,%ebx
│ lea 0x8(%rsi),%rax
│ mov %ecx,%edx
│ monitorx %rax,%ecx,%edx
│ mov 0x20(%rbp),%eax
│ test %eax,%eax
│ → jne 62433 <hsa_amd_image_get_info_max_dim@@ROCR_1+0xb13>
│ nop
│ 84: xor %ebx,%ebx
0.10 │ lock subl $0x1,0x10(%rbp)
│ mov %rbp,%rdi
0.01 │ addr32 call 81d20 <hsa_amd_image_get_info_max_dim@@ROCR_1+0x20400>
│ mov 0x28(%rsp),%rax
│ sub %fs:0x28,%rax
│ → jne 1d5
│ add $0x38,%rsp
│ mov %rbx,%rax
│ pop %rbx
│ pop %rbp
│ pop %r12
│ pop %r13
│ pop %r14
│ pop %r15
│ ← ret
│ xchg %ax,%ax
│ bc: mov %rax,%rdx
│ and $0x1,%eax
│ pxor %xmm0,%xmm0
│ shr %rdx
│ or %rax,%rdx
│ cvtsi2sd %rdx,%xmm0
│ addsd %xmm0,%xmm0
│ ↑ jmp 18
│ xchg %ax,%ax
│ cmp $0x3,%r12d
│ ↑ jne 84
│ cmp %rbx,%r14
│ setle %al
│ → jmp 6245c <hsa_amd_image_get_info_max_dim@@ROCR_1+0xb3c>
│ nop
0.49 │ f4: lea hsa_amd_portable_close_dmabuf@@ROCR_1+0xe8d39,%rax
0.93 │ cmpb $0x0,(%rax)
│ .byte 0xf
│ .byte 0x84
I don't need to do anything specific to reproduce. Here's an example command:
llama.cpp/build/bin/main --in-prefix "[INST] " --in-suffix " [/INST]" -t 1 -tb 1 -c 4096 -ngl 99 -m mixtral-8x7b-instruct-v0.1.Q3_K_M.gguf -n 1000 -p "Elaborate on what is quantum mechanics, and its impact on our world."
It looks as if a very tight loop is being timed. I briefly looked for the line of code where this is happening, but with optimizations and some debug symbols missing, I couldn't find it. I could try recompiling ROCm if that could help.
OS: ArchLinux kernel: 6.7.0-rc5 ROCm: 5.7.1 mesa: 23.3.2 GPU: 7900xtx (gfx1100) CPU: Ryzen 9 5900X pytorch: torch==2.3.0.dev20240101+rocm5.7 llama.cpp: 39d8bc71edcb8b6f99d46fa4216af7a15232e218 exllamav2: 45d0ddd402eefc4f40fd3479df7f74702f241a22
@jdecourval Apologies for the lack of response. Can you please check whether your issue still exists with the latest ROCm 6.2? If not, please close the ticket. Thanks!