Auto-vectorize for any CPU and platform (x86_64, aarch64, ...) instead of only AVX2
Here you have a quick (and probably not fully correct) proof of concept showing that using pure intrinsics is a waste of time with current compilers and that auto-vectorization works nicely.
For the last 15 years, on a large project, I was step by step removing intrinsics and replacing them with generic vector code, not introducing new ones.
All we need is to prepare special types (the uint64x4_t declared in the example is suitable for x86_64, but for aarch64 I would declare, with operators, struct uint64x4_t { uint64x2_t low; uint64x2_t high; };).
And the compiler does the magic. Of course, some amendments are needed to be compatible not only with clang but also with gcc and msvc, but they are trivial.
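For illustration, one way such a type and the helpers used in the example below (repeat_byte32, is_zero, countr_zero32) could be declared on top of the GCC/Clang vector extension; this is only a sketch of the idea, not the exact definitions from my proof of concept:

#include <bit>
#include <cstdint>

// 256-bit vector of four uint64_t lanes; &, |, ^, ~, + and so on work element-wise
using uint64x4_t = std::uint64_t __attribute__((vector_size(32)));

// broadcast one byte value into every byte of the 256-bit vector
constexpr uint64x4_t repeat_byte32(std::uint64_t b) noexcept
{
    const std::uint64_t r = b * 0x0101010101010101ull;
    return uint64x4_t{r, r, r, r};
}

// true when no lane has any bit set
inline bool is_zero(uint64x4_t v) noexcept
{
    return (v[0] | v[1] | v[2] | v[3]) == 0;
}

// bit index of the first set bit across the whole 256-bit value
inline int countr_zero32(uint64x4_t v) noexcept
{
    for (int i = 0; i < 4; ++i)
        if (v[i] != 0)
            return i * 64 + std::countr_zero(v[i]);
    return 256;
}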
For example, this code:
void generic(char const* __restrict__ c, char const* __restrict__ e, size_t n, char* __restrict__ data)
{
    if (n > 31) {
        for (const auto end_m7 = e - 31; c < end_m7;) {
            uint64x4_t swar;
            std::memcpy(data, c, 32);   // copy the 32 bytes straight to the output
            std::memcpy(&swar, c, 32);  // and load the same 32 bytes into the vector
            constexpr auto lo7_mask {repeat_byte32(0b01111111)};
            const auto lo7 { swar & lo7_mask };
            const auto quote { (lo7 ^ repeat_byte32('"')) + lo7_mask };
            const auto backslash {(lo7 ^ repeat_byte32('\\')) + lo7_mask};
            const auto less_32 {(swar & repeat_byte32(0b01100000)) + lo7_mask};
            auto next = ~((quote & backslash & less_32) | swar);
            next &= repeat_byte32(0b10000000);  // high bit marks every byte that needs escaping
            if (is_zero(next)) {                // nothing to escape in this block
                data += 32;
                c += 32;
                continue;
            }
            const auto length = (countr_zero32(next) >> 3);  // offset of the first byte to escape
            ...
You get proper vectorization with the best instruction set possible for the target. For znver5:
generic(char const*, char const*, unsigned long, char*):
cmp rdx, 32
jb .LBB0_10
lea rax, [rsi - 31]
cmp rdi, rax
jae .LBB0_10
vpbroadcastq ymm0, qword ptr [rip + .LCPI0_0]
vpbroadcastq ymm1, qword ptr [rip + .LCPI0_1]
vpbroadcastq ymm2, qword ptr [rip + .LCPI0_2]
vpbroadcastq ymm3, qword ptr [rip + .LCPI0_3]
vpbroadcastq ymm4, qword ptr [rip + .LCPI0_4]
lea rdx, [rip + char_escape_table]
jmp .LBB0_3
.LBB0_4:
add rcx, 32
add rdi, 32
cmp rdi, rax
jae .LBB0_10
.LBB0_3:
vmovups ymm5, ymmword ptr [rdi]
vmovdqu ymm6, ymmword ptr [rdi]
vmovups ymmword ptr [rcx], ymm5
vpand ymm5, ymm6, ymm0
vpand ymm8, ymm6, ymm3
vpxor ymm7, ymm5, ymm1
vpxor ymm5, ymm5, ymm2
vpaddq ymm8, ymm8, ymm0
vpaddq ymm5, ymm5, ymm0
vpaddq ymm7, ymm7, ymm0
vpand ymm5, ymm8, ymm5
vpternlogq ymm5, ymm6, ymm7, 236
vptest ymm5, ymm4
jb .LBB0_4
vpandn ymm5, ymm5, ymm4
vmovq r8, xmm5
tzcnt r8, r8
cmp r8, 63
And for penryn:
generic(char const*, char const*, unsigned long, char*):
cmp rdx, 32
jb .LBB0_10
lea rax, [rsi - 31]
cmp rdi, rax
jae .LBB0_10
movdqa xmm0, xmmword ptr [rip + .LCPI0_0]
movdqa xmm1, xmmword ptr [rip + .LCPI0_1]
movdqa xmm2, xmmword ptr [rip + .LCPI0_2]
movdqa xmm3, xmmword ptr [rip + .LCPI0_3]
movdqa xmm4, xmmword ptr [rip + .LCPI0_4]
lea rdx, [rip + char_escape_table]
jmp .LBB0_3
.LBB0_4:
add rcx, 32
add rdi, 32
cmp rdi, rax
jae .LBB0_10
.LBB0_3:
mov r8, qword ptr [rdi + 24]
mov qword ptr [rcx + 24], r8
mov r8, qword ptr [rdi + 16]
mov qword ptr [rcx + 16], r8
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi + 8]
mov qword ptr [rcx + 8], r9
mov qword ptr [rcx], r8
movdqu xmm7, xmmword ptr [rdi]
movdqu xmm6, xmmword ptr [rdi + 16]
movdqa xmm9, xmm7
pand xmm9, xmm0
movdqa xmm10, xmm6
pand xmm10, xmm0
movdqa xmm11, xmm10
pxor xmm11, xmm1
movdqa xmm12, xmm9
pxor xmm12, xmm1
paddq xmm12, xmm0
paddq xmm11, xmm0
pxor xmm9, xmm2
etc. ... see the godbolt link for details.
Thanks for sharing this approach, it seems like an excellent improvement for more generic code. I will want to carefully examine performance impacts before implementing it, but thanks so much!
It simply can not be slower than using pure integral types: as long as the whole calculation sequence uses proper vector types, all calculations will be done in the SIMD unit. SIMD, especially on Arm, may be slower only in cases where a lot of data is constantly transferred between the SIMD unit and memory, because this must be done through the general-purpose registers (memory <> general-purpose registers <> SIMD registers), and that has large latency.
Yes, it is on Arm that I often see SWAR (SIMD within a register) being faster than SIMD intrinsics.
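(By SWAR here I mean doing the same byte checks in one general-purpose 64-bit register; a rough sketch of the idea, not code from this PR:)

#include <cstdint>
#include <cstring>

// broadcast one byte value across all 8 bytes of a 64-bit word
constexpr std::uint64_t repeat_byte8(std::uint64_t b) noexcept
{
    return b * 0x0101010101010101ull;
}

// same escape test as the vector example above, 8 bytes at a time
inline std::uint64_t escape_mask8(char const* p) noexcept
{
    std::uint64_t swar;
    std::memcpy(&swar, p, 8);
    constexpr auto lo7_mask = repeat_byte8(0b01111111);
    const auto lo7 = swar & lo7_mask;
    const auto quote = (lo7 ^ repeat_byte8('"')) + lo7_mask;
    const auto backslash = (lo7 ^ repeat_byte8('\\')) + lo7_mask;
    const auto less_32 = (swar & repeat_byte8(0b01100000)) + lo7_mask;
    // high bit set in every byte that is '"', '\\' or a control character
    return ~((quote & backslash & less_32) | swar) & repeat_byte8(0b10000000);
}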
Moved comment from the PR, as it belongs here:
In my production code I was using an approach that hides explicit intrinsic calls behind force-inlined functions written in generic, x86_64 and aarch64 variants. So in the global code everything was generic and not infected with any explicit intrinsic calls, which is more readable, and the explicit wrappers over the intrinsic sets were testable on their own. So in the case of BMI2 I would not use the intrinsics directly, but create 3 wrapper functions [bmi2, aarch64 neon, generic c++ code].
examples:
template<int... args, typename vector_type>
constexpr vector_type shuffle_vector(vector_type a, vector_type b) noexcept
{
#if defined(__clang__)
    return __builtin_shufflevector(a, b, args...);
#else
    using element_type = typename std::remove_reference<typename std::remove_cv<decltype(a[0])>::type>::type;
    return __builtin_shuffle(a, b, vector_type{static_cast<element_type>(args)...});
#endif
}
or
#if defined(__ARM_NEON)
#include <arm_neon.h>
[[nodiscard, gnu::always_inline, gnu::const]]
inline float64x2_t max_pd(float64x2_t a, float64x2_t b) noexcept
{
    return vmaxq_f64(a, b);  // element-wise max, the NEON counterpart of _mm_max_pd
}
#elif defined(__SSE2__)
#include <immintrin.h>
using float64x2_t = __m128d;
[[nodiscard, gnu::always_inline, gnu::const]]
inline float64x2_t max_pd(float64x2_t a, float64x2_t b) noexcept
{
    return _mm_max_pd(a, b);
}
#else
using float64x2_t = double __attribute__((vector_size(16)));
[[nodiscard, gnu::always_inline, gnu::const]]
inline float64x2_t max_pd(float64x2_t a, float64x2_t b) noexcept
{
    return a > b ? a : b;  // element-wise max via the GCC/Clang vector extension
}
#endif
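And for the BMI2 case mentioned above, the wrappers could look roughly like this (only a sketch of the idea with a bmi2 and a generic variant; extract_bits is a placeholder name and the real production versions differ):

#include <cstdint>
#if defined(__BMI2__)
#include <immintrin.h>
[[nodiscard, gnu::always_inline, gnu::const]]
inline std::uint64_t extract_bits(std::uint64_t value, std::uint64_t mask) noexcept
{
    return _pext_u64(value, mask);  // hardware parallel bit extract
}
#else
[[nodiscard, gnu::always_inline, gnu::const]]
inline std::uint64_t extract_bits(std::uint64_t value, std::uint64_t mask) noexcept
{
    // portable bit-by-bit parallel extract, same contract as _pext_u64
    std::uint64_t result = 0;
    for (std::uint64_t bit = 1; mask != 0; mask &= mask - 1, bit <<= 1)
        if (value & mask & -mask)  // test the lowest still-set mask bit
            result |= bit;
    return result;
}
#endif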
and that way I was able to avoid using direct intrinsics.
So I wanted to point out that a normal production code function should always look like generic code, even when it uses some intrinsics via wrappers. That way you avoid the ugly part you currently have with the AVX2 #ifdef code block.