kissfft icon indicating copy to clipboard operation
kissfft copied to clipboard

Neon

Open fbarchard opened this issue 3 years ago • 2 comments

kissfft on ARMv7 with fixed point is slow. A neon version would improve performance quite a bit.

  1. There aren't enough registers, so the stack is used.
  2. shifts and rounding add a lot of overhead

The float version has neither of those issues. The main loop of bfly4 is 68 instructions for float vs 150 for 16 bit fixed point.

  1. NEON could also process more than one value at a time: kiss_fft_cpx holds 2 values, and the bfly loops process more than one complex value in places, e.g. C_MUL(scratch[0],Fout1 , *tw1 ); C_MUL(scratch[1],Fout2 , *tw2 );

fbarchard avatar Jun 16 '22 22:06 fbarchard

idk 🤷🏻

j4m3s-101 avatar Jun 28 '22 17:06 j4m3s-101

benchmark of an end to end application, where kiss fft is about 60% of the profile:

32-bit Cortex-A53: Original 53.3 us; "4 loads" 45.8 us; "neon" 41.9 us

"4 loads" is this change: https://github.com/mborgerding/kissfft/issues/79. "neon" replaces the complex operators in _kiss_fft_guts so that each uses a single NEON instruction on the whole complex structure.

68a69,99

#if defined(KISS_FFT_USE_NEON) && (FIXED_POINT==16)
/* C_MUL(res, a, b): 16-bit fixed-point complex multiply, res = a * b.
 * The 32-bit products are narrowed back to 16 bits with a rounding,
 * saturating right shift by 15. Uses q1-q3 (d2-d7) as scratch. */
#define C_MUL( res, a,b) asm volatile( \
    "vmull.s16 q2, %1, %2 \n"    /* a.r * b.r, a.i * b.i */ \
    "vrev32.16 d6, %2 \n"        /* b.i, b.r */ \
    "vmull.s16 q1, %1, d6 \n"    /* a.r * b.i, a.i * b.r */ \
    "vneg.s32 d6, d4 \n"         /* -a.r * b.r, -a.i * b.i */ \
    "vmov s9, s13 \n"            /* a.r * b.r, -a.i * b.i */ \
    "vpadd.s32 d4, d4, d2 \n"    /* a.r * b.r - a.i * b.i, a.r * b.i + a.i * b.r */ \
    "vqrshrn.s32 %0, q2, #15 \n" /* 32 -> 16 bit */ \
    : "=w"(res): "w"(a), "w"(b): "q1", "q2", "q3")

/* DIVSCALAR(x, k): x = x / k, computed as a multiply by the fixed-point
 * reciprocal SAMP_MAX/k followed by a rounding, saturating shift right
 * by 15. x is updated in place; q1 is used as scratch. */
#define DIVSCALAR(x,k) asm volatile( \
    "vdup.16 d2, %1 \n"          /* 1 / k */ \
    "vmull.s16 q1, %0, d2 \n"    /* x * 1 / k */ \
    "vqrshrn.s32 %0, q1, #15 \n" /* 32 -> 16 bit */ \
    : "+w"(x): "r"(SAMP_MAX/(k)): "q1")

/* C_FIXDIV(c, div): divide the complex value c by div in place.
 * div == 4 is a rounding shift right by 2; any other divisor multiplies
 * by the fixed-point reciprocal SAMP_MAX/div (q1 used as scratch).
 * Wrapped in do { } while (0) so the if/else cannot capture an else
 * belonging to the caller. */
#define C_FIXDIV(c,div) do { \
    if ((div) == 4) asm volatile( "vrshr.s16 %0, %0, #2\n" : "+w"(c)); /* div 4 = shr 2 */ \
    else asm volatile( \
    "vdup.16 d2, %1 \n"          /* 1 / div */ \
    "vmull.s16 q1, %0, d2 \n"    /* c * 1 / div */ \
    "vqrshrn.s32 %0, q1, #15 \n" /* 32 -> 16 bit */ \
    : "+w"(c): "r"(SAMP_MAX/(div)): "q1"); } while (0)

#define C_MULBYSCALAR(x,s) asm volatile(
"vdup.16 d2, %1 \n" /* s /
"vmull.s16 q1, %0, d2 \n" /
x * s /
"vqrshrn.s32 %0, q1, #15 \n" /
32 -> 16 bit */
: "+w"(x): "r"(s): "q1") #else 82a114,115 #endif

99a133,138

#if defined(KISS_FFT_USE_NEON) && (FIXED_POINT==16) #define C_ADD( res, a,b) asm volatile( "vaddq.i16 %0, %1, %2": "=w"(res): "w"(a), "w"(b)) #define C_SUB( res, a,b) asm volatile( "vsubq.i16 %0, %1, %2": "=w"(res): "w"(a), "w"(b)) #define C_ADDTO( res , a) asm volatile( "vaddq.i16 %0, %0, %1": "+w"(res): "w"(a)) #define C_SUBFROM( res , a) asm volatile( "vsubq.i16 %0, %0, %1": "+w"(res): "w"(a)) #else //defined(KISS_FFT_USE_NEON) && (FIXED_POINT==16) 124a164 #endif // !(defined(KISS_FFT_USE_NEON) && (FIXED_POINT==16))

fbarchard avatar Jun 29 '22 23:06 fbarchard