llama.cpp vulkan: improve im2col

vulkan: improve im2col

Open daniandtheweb opened this issue 2 weeks ago • 20 comments

This PR supersedes #11778. Here's the performance numbers on my Radeon RX 5700XT (RADV).

Vulkan:

Master:
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[32,32,256,1],ne_kernel=[3,3,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):              13104 runs -    95.82 us/run -    10244 kB/run -  101.96 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[64,64,256,1],ne_kernel=[3,3,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):               1640 runs -   639.72 us/run -    40964 kB/run -   61.08 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[256,256,256,1],ne_kernel=[3,3,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                       52 runs - 79006.56 us/run -   655364 kB/run -    7.93 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[32,32,2560,1],ne_kernel=[3,3,2560,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                     1312 runs -   900.50 us/run -   102445 kB/run -  108.53 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[64,64,2560,1],ne_kernel=[3,3,2560,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                       82 runs - 20310.68 us/run -   409645 kB/run -   19.26 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[32,32,256,1],ne_kernel=[5,5,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):               4278 runs -   236.25 us/run -    23536 kB/run -   95.01 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[64,64,256,1],ne_kernel=[5,5,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                335 runs -  3518.08 us/run -   100208 kB/run -   27.17 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[256,256,256,1],ne_kernel=[5,5,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                       20 runs - 313087.20 us/run -  1678448 kB/run -    5.12 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[32,32,2560,1],ne_kernel=[5,5,2560,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                      572 runs -  2263.98 us/run -   235365 kB/run -   99.18 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[64,64,2560,1],ne_kernel=[5,5,2560,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                       34 runs - 71241.88 us/run -  1002085 kB/run -   13.43 GB/s

PR:
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[32,32,256,1],ne_kernel=[3,3,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):              19656 runs -    58.61 us/run -    10244 kB/run -  166.70 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[64,64,256,1],ne_kernel=[3,3,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):               1640 runs -   664.67 us/run -    40964 kB/run -   58.78 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[256,256,256,1],ne_kernel=[3,3,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                       52 runs - 79993.31 us/run -   655364 kB/run -    7.83 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[32,32,2560,1],ne_kernel=[3,3,2560,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                     1968 runs -   602.13 us/run -   102445 kB/run -  162.31 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[64,64,2560,1],ne_kernel=[3,3,2560,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                       82 runs - 17203.68 us/run -   409645 kB/run -   22.74 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[32,32,256,1],ne_kernel=[5,5,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):               5704 runs -   227.95 us/run -    23536 kB/run -   98.47 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[64,64,256,1],ne_kernel=[5,5,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                670 runs -  2895.43 us/run -   100208 kB/run -   33.01 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[256,256,256,1],ne_kernel=[5,5,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                       20 runs - 315784.75 us/run -  1678448 kB/run -    5.08 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[32,32,2560,1],ne_kernel=[5,5,2560,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                      572 runs -  2218.08 us/run -   235365 kB/run -  101.23 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[64,64,2560,1],ne_kernel=[5,5,2560,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                       34 runs - 72422.56 us/run -  1002085 kB/run -   13.21 GB/s

HIP:

  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[32,32,256,1],ne_kernel=[3,3,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):               3276 runs -   923.04 us/run -    10244 kB/run -   10.58 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[64,64,256,1],ne_kernel=[3,3,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                820 runs -  3359.14 us/run -    40964 kB/run -   11.63 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[256,256,256,1],ne_kernel=[3,3,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                       52 runs - 58759.37 us/run -   655364 kB/run -   10.66 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[32,32,2560,1],ne_kernel=[3,3,2560,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                      328 runs -  9271.95 us/run -   102445 kB/run -   10.54 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[64,64,2560,1],ne_kernel=[3,3,2560,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                       82 runs - 79109.07 us/run -   409645 kB/run -    4.94 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[32,32,256,1],ne_kernel=[5,5,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):               1426 runs -  1077.72 us/run -    23536 kB/run -   20.83 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[64,64,256,1],ne_kernel=[5,5,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                335 runs -  9038.40 us/run -   100208 kB/run -   10.57 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[256,256,256,1],ne_kernel=[5,5,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                       20 runs - 160059.45 us/run -  1678448 kB/run -   10.02 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[32,32,2560,1],ne_kernel=[5,5,2560,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                      143 runs - 12785.92 us/run -   235365 kB/run -   17.56 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[64,64,2560,1],ne_kernel=[5,5,2560,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                       34 runs - 120635.09 us/run -  1002085 kB/run -    7.93 GB/s

I'm also including the benchmark results when using RADV_PERFTEST=cswave32. It's interesting to note that despite this variable hurting this specific operation's performance it actually improves the speed in stable-diffusion.cpp (sd 1.5 512x512: without PR 1.38 it/s, with PR 1.45 it/s, with PR + cswave32 1.55 it/s).

Vulkan cswave32:

Master:
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[32,32,256,1],ne_kernel=[3,3,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):              13104 runs -    94.26 us/run -    10244 kB/run -  103.65 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[64,64,256,1],ne_kernel=[3,3,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                820 runs -  1426.19 us/run -    40964 kB/run -   27.40 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[256,256,256,1],ne_kernel=[3,3,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                       52 runs - 122464.00 us/run -   655364 kB/run -    5.11 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[32,32,2560,1],ne_kernel=[3,3,2560,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                      984 runs -  1240.23 us/run -   102445 kB/run -   78.80 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[64,64,2560,1],ne_kernel=[3,3,2560,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                       82 runs - 67429.80 us/run -   409645 kB/run -    5.80 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[32,32,256,1],ne_kernel=[5,5,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):               4278 runs -   325.81 us/run -    23536 kB/run -   68.89 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[64,64,256,1],ne_kernel=[5,5,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                335 runs -  7415.25 us/run -   100208 kB/run -   12.89 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[256,256,256,1],ne_kernel=[5,5,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                       20 runs - 254549.75 us/run -  1678448 kB/run -    6.30 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[32,32,2560,1],ne_kernel=[5,5,2560,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                      286 runs -  4823.12 us/run -   235365 kB/run -   46.55 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[64,64,2560,1],ne_kernel=[5,5,2560,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                       34 runs - 358562.76 us/run -  1002085 kB/run -    2.67 GB/s

PR:
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[32,32,256,1],ne_kernel=[3,3,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):              13104 runs -    81.44 us/run -    10244 kB/run -  119.98 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[64,64,256,1],ne_kernel=[3,3,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                820 runs -  1439.59 us/run -    40964 kB/run -   27.14 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[256,256,256,1],ne_kernel=[3,3,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                       52 runs - 135311.06 us/run -   655364 kB/run -    4.63 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[32,32,2560,1],ne_kernel=[3,3,2560,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                      984 runs -  1191.29 us/run -   102445 kB/run -   82.04 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[64,64,2560,1],ne_kernel=[3,3,2560,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                       82 runs - 62476.68 us/run -   409645 kB/run -    6.26 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[32,32,256,1],ne_kernel=[5,5,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):               4278 runs -   315.84 us/run -    23536 kB/run -   71.07 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[64,64,256,1],ne_kernel=[5,5,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                335 runs -  7497.88 us/run -   100208 kB/run -   12.75 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[256,256,256,1],ne_kernel=[5,5,256,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                       20 runs - 254943.60 us/run -  1678448 kB/run -    6.29 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[32,32,2560,1],ne_kernel=[5,5,2560,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                      286 runs -  4767.25 us/run -   235365 kB/run -   47.10 GB/s
  IM2COL(type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[64,64,2560,1],ne_kernel=[5,5,2560,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1):                       34 runs - 358791.47 us/run -  1002085 kB/run -    2.67 GB/s

Feb 12 '25 13:02 daniandtheweb

llama.cpp llama.cpp copied to clipboard

vulkan: improve im2col

llama.cpp
llama.cpp copied to clipboard