kissfft
kissfft copied to clipboard
Neon
kissfft on ARMv7 with fixed point is slow. A neon version would improve performance quite a bit.
- there aren't enough registers, so the stack is used
- shifts and rounding add a lot of overhead
The float version has neither of those issues. The main loop of bfly4 is 68 instructions for float vs 150 for 16 bit fixed point.
- NEON could also process more than one value at a time: kiss_fft_cpx holds 2 values, and the bfly loops process more than one value in places, for example: C_MUL(scratch[0],Fout1 , *tw1 ); C_MUL(scratch[1],Fout2 , *tw2 );
idk 🤷🏻
benchmark of an end to end application, where kiss fft is about 60% of the profile:
32-bit Cortex-A53: original 53.3 µs, "4 loads" 45.8 µs, "neon" 41.9 µs
"4 loads" is this change: https://github.com/mborgerding/kissfft/issues/79. "neon" replaces the complex operators in _kiss_fft_guts so that each uses a single NEON instruction on the whole complex structure.
68a69,99
#if defined(KISS_FFT_USE_NEON) && (FIXED_POINT==16) #define C_MUL( res, a,b) asm volatile(
"vmull.s16 q2, %1, %2 \n" /* a.r * b.r, a.i * b.i /
"vrev32.16 d6, %2 \n" / b.i, b.r /
"vmull.s16 q1, %1, d6 \n" / a.r * b.i, a.i * b.r /
"vneg.s32 d6, d4 \n" / -a.r * b.r, -a.i * b.i /
"vmov s9, s13 \n" / a.r * b.r, -a.i * b.i /
"vpadd.s32 d4, d4, d2 \n" / a.r * b.r - a.i * b.i, a.r * b.i + a.i * b.r /
"vqrshrn.s32 %0, q2, #15 \n" / 32 -> 16 bit */
: "=w"(res): "w"(a), "w"(b): "q1", "q2", "q3")#define DIVSCALAR(x,k) asm volatile(
"vdup.16 d2, %1 \n" /* 1 / k /
"vmull.s16 q1, %0, d2 \n" / x * 1 / k /
"vqrshrn.s32 %0, q1, #15 \n" / 32 -> 16 bit */
: "+w"(x): "r"(SAMP_MAX/k): "q1")#define C_FIXDIV(c,div)
if (div == 4) asm volatile( "vrshr.s16 %0, %0, #2\n" : "+w"(c)); /* div 4 = shr 2 /
else asm volatile(
"vdup.16 d2, %1 \n" / 1 / div /
"vmull.s16 q1, %0, d2 \n" / c * 1 / div /
"vqrshrn.s32 %0, q1, #15 \n" / 32 -> 16 bit */
: "+w"(c): "r"(SAMP_MAX/div): "q1")#define C_MULBYSCALAR(x,s) asm volatile(
"vdup.16 d2, %1 \n" /* s /
"vmull.s16 q1, %0, d2 \n" / x * s /
"vqrshrn.s32 %0, q1, #15 \n" / 32 -> 16 bit */
: "+w"(x): "r"(s): "q1") #else 82a114,115 #endif
99a133,138
#if defined(KISS_FFT_USE_NEON) && (FIXED_POINT==16) #define C_ADD( res, a,b) asm volatile( "vaddq.i16 %0, %1, %2": "=w"(res): "w"(a), "w"(b)) #define C_SUB( res, a,b) asm volatile( "vsubq.i16 %0, %1, %2": "=w"(res): "w"(a), "w"(b)) #define C_ADDTO( res , a) asm volatile( "vaddq.i16 %0, %0, %1": "+w"(res): "w"(a)) #define C_SUBFROM( res , a) asm volatile( "vsubq.i16 %0, %0, %1": "+w"(res): "w"(a)) #else //defined(KISS_FFT_USE_NEON) && (FIXED_POINT==16) 124a164 #endif // !(defined(KISS_FFT_USE_NEON) && (FIXED_POINT==16))