csFastFloat
csFastFloat copied to clipboard
Port SIMD code to ARM NEON
Initially, the SIMD code supports only x64 processors, but it should be simple to add support for ARM NEON as well.
A possible approach for parsing 8 UTF-16 digits (assumes little-endian byte order). Note: I know very little about ARM or NEON.
// Sketch: parse 8 consecutive UTF-16 code units at src as an 8-digit decimal
// number using ARM NEON. This is a fragment: the enclosing function and the
// path taken when some code unit is not a digit are not shown here.
//
// Load 8 x 16-bit code units into one 128-bit vector.
uint16x8_t x = vld1q_u16(src);
// Subtract '0' (0x0030) from every lane. The subtraction is modulo 2^16, so
// code units below '0' wrap around to large values and fail the range test.
x = vsubq_u16(x, vdupq_n_u16(0x0030));
// Lanes greater than 9 compare to 0xFFFF; the saturating narrow turns each
// such lane into 0xFF, giving an 8-byte mask that is zero only for digits.
uint8x8_t m = vqmovn_u16(vcgtq_u16(x, vdupq_n_u16(0x0009)));
// Read the whole 8-byte mask as a single 64-bit value and test it at once.
if(vget_lane_u64(vreinterpret_u64_u8(m), 0) == 0) { // all 8 chars are digits
// Multipliers for each 32-bit lane, where a lane holds two adjacent digits as
// lo + (hi << 16) (lo is the earlier digit; relies on little-endian layout).
//   lane * (100 + (1000 << 16)) leaves lo*1000 + hi*100 in the upper 16 bits
//   lane * (1   + (10   << 16)) leaves lo*10   + hi     in the upper 16 bits
// The low-half product (at most 9*100) never carries into the upper half, and
// the hi * (k << 16) term shifted past bit 31 is discarded by the 32-bit multiply.
// NOTE(review): 0x0000A0001 is written with nine hex digits; its value is
// still 0x000A0001, matching the "1 + (10 << 16)" comment.
static const uint32_t mul1[4] = {
0x03E80064, 0x0000A0001, // 100 + (1000 << 16), 1 + (10 << 16)
0x03E80064, 0x0000A0001
};
// Reinterpret the 8 digit lanes as 4 digit pairs and apply the multipliers.
uint32x4_t v = vmulq_u32(vreinterpretq_u32_u16(x), vld1q_u32(mul1));
// Keep the upper 16 bits of each product (four partial sums), then add
// adjacent pairs with widening: two 32-bit lanes, each a 4-digit value.
uint32x2_t t = vpaddl_u16(vshrn_n_u32(v, 16));
// a = value of the first four digits, b = value of the last four digits.
uint32_t a = vget_lane_u32(t, 0);
uint32_t b = vget_lane_u32(t, 1);
// Combine the two halves into the full 8-digit value (at most 99999999).
return (a * 10000) + b;
}