simde icon indicating copy to clipboard operation
simde copied to clipboard

Merge implementations from "missing SSE implementations" to NEON

Open nemequ opened this issue 2 years ago • 8 comments

http://www.alfredklomp.com/programming/sse-intrinsics/ has a great list of implementations of "missing" SSE instructions.

Unlike SSE, NEON isn't missing a lot of this functionality, so we should steal that code and use it to implement parts of the NEON API. For example:

  • _mm_cmple_epu8 → vcleq_u8 (see 5906cc923b5ba8044e49d9cadb66373550e58758)
  • _mm_cmpge_epu8 → vcgeq_u8
  • _mm_cmpgt_epu8 → vcgtq_u8
  • _mm_min_epu16 → vminq_u16
  • _mm_absdiff_epu8 → vabdq_u8
  • _mm_bswap_epi16 → vrev16q_u16/vrev16q_s16

We can also use the same techniques for a bunch of other functions which that page doesn't explicitly include (e.g., vcleq_u16/vcleq_u32/vcleq_u64 can all use the same technique as _mm_cmple_epu8, though 16/32-bit versions require SSE4.1 and 64-bit requires AVX-512VL).

Many of the same implementations could also be used in WASM (wasm_u8x16_le, wasm_u8x16_ge, wasm_u8x16_gt, wasm_u16x8_min, etc.).

There are also a few functions which are present in later versions of SSE, but can be emulated with earlier versions. We should make sure our implementations of SSE also have these versions, too.

As an example, 5906cc923b5ba8044e49d9cadb66373550e58758 implements vcleq_u* using the code from _mm_cmple_epu8.

nemequ avatar Jul 10 '21 21:07 nemequ

/* _mm_min_epu16 on plain SSE2: the saturating difference max(a - b, 0) is
 * zero exactly where a <= b, so subtracting it from a yields the unsigned
 * per-lane minimum. */
__m128i sse2_min_epu16 (__m128i a, __m128i b) {
	__m128i excess = _mm_subs_epu16(a, b);
	return _mm_sub_epi16(a, excess);
}

/* _mm_max_epu16 on plain SSE2: adding the saturating difference
 * max(a - b, 0) to b gives b when a <= b and a otherwise. */
__m128i sse2_max_epu16 (__m128i a, __m128i b) {
	__m128i excess = _mm_subs_epu16(a, b);
	return _mm_add_epi16(excess, b);
}

These are missed optimizations in gcc, but clang has them. AFAIK, both compilers automatically use a blend for the 32-bit versions.

aqrit avatar Jul 16 '21 19:07 aqrit

Here is another one:

/* Saturating signed 32-bit subtract using only SSE2.  Computes a - b per
 * lane, clamped to [INT32_MIN, INT32_MAX].
 * Fix: the original declared the parameters as `(a, b)` with no types
 * (implicit int), which does not compile as C11. */
__m128i sse2_subs_epi32(__m128i a, __m128i b) {
	__m128i t;

	/* t = the value we would saturate to: INT32_MAX when b <= a,
	 * INT32_MIN when b > a. */
	t = _mm_xor_si128(_mm_set1_epi32(0x7FFFFFFF), _mm_cmpgt_epi32(b, a));
	a = _mm_sub_epi32(a, b); /* wrapping difference */

	/* Overflow happened iff the wrapped difference's sign disagrees with
	 * the saturation value's sign; blend: _mm_blendv_ps(a, t, a ^ t). */
	t = _mm_xor_si128(t, a);
	return _mm_xor_si128(a, _mm_and_si128(t, _mm_srai_epi32(t, 31)));
}

(edit) more:

/* |x - y| for signed 16-bit lanes: subtract the lane-wise minimum from the
 * lane-wise maximum. */
__m128i sse2_absdiff_epi16 (__m128i x, __m128i y) {
	__m128i hi = _mm_max_epi16(x, y);
	__m128i lo = _mm_min_epi16(x, y);
	return _mm_sub_epi16(hi, lo);
}

/* |a - b| for signed 8-bit lanes (result read as unsigned).  m is all-ones
 * exactly where b > a; (d + m) ^ m then conditionally negates the wrapped
 * difference d. */
__m128i sse2_absdiff_epi8 (__m128i a, __m128i b) {
	__m128i m = _mm_cmpgt_epi8(b, a);
	__m128i d = _mm_sub_epi8(a, b);
	return _mm_xor_si128(_mm_add_epi8(d, m), m);
}

aqrit avatar Jul 16 '21 19:07 aqrit

Nice, thanks. Those would be great for vminq_u16, vmaxq_u16, and vqsubq_s32 ☺.

There are tons of these floating around the internet, and I'd like to try to get as many as possible merged into SIMDe. Sometimes they are for missing functions, sometimes for emulating a newer instruction using an older extension (like the min/max functions you mentioned). Both are very useful to us.

nemequ avatar Jul 16 '21 20:07 nemequ

Better lowering:

/* Unsigned 16-bit x > y without SSE4: the saturating difference is nonzero
 * exactly in the x > y lanes; adding its wrapped negation saturates those
 * lanes to all-ones. */
__m128i sse2_cmpgt_epu16 (__m128i x, __m128i y) {
	__m128i d = _mm_subs_epu16(x, y);
	__m128i neg = _mm_sub_epi16(_mm_setzero_si128(), d);
	return _mm_adds_epu16(d, neg);
}

aqrit avatar Jul 21 '21 22:07 aqrit

__m128i sse2_cmpgt_epu32(__m128i x, __m128i y) {
    /* The signed compare is wrong exactly in the lanes where the sign bits
     * differ; flip the mask there to get the unsigned ordering. */
    __m128i signed_gt = _mm_cmpgt_epi32(x, y);
    __m128i sign_diff = _mm_srai_epi32(_mm_xor_si128(x, y), 31);
    return _mm_xor_si128(signed_gt, sign_diff);
}

Might help with min_u32.

aqrit avatar Jul 24 '21 13:07 aqrit

collection so far:

/* Unsigned 8-bit a > b: subs_epu8 leaves a nonzero lane exactly where
 * a > b; adding its wrapped negation saturates that lane to 0xFF. */
__m128i sse2_cmpgt_epu8 (__m128i a, __m128i b) {
    __m128i d = _mm_subs_epu8(a, b);
    __m128i neg = _mm_sub_epi8(_mm_setzero_si128(), d);
    return _mm_adds_epu8(d, neg);
}
/* a < b (unsigned 8-bit) == b > a; this is sse2_cmpgt_epu8 inlined with the
 * operands swapped. */
__m128i sse2_cmplt_epu8 (__m128i a, __m128i b) {
    __m128i d = _mm_subs_epu8(b, a);
    return _mm_adds_epu8(d, _mm_sub_epi8(_mm_setzero_si128(), d));
}
/* a >= b (unsigned 8-bit): b is the lane-wise minimum exactly when b <= a. */
__m128i sse2_cmpge_epu8 (__m128i a, __m128i b) {
    return _mm_cmpeq_epi8(b, _mm_min_epu8(b, a));
}
/* a <= b (unsigned 8-bit): b is the lane-wise maximum exactly when a <= b. */
__m128i sse2_cmple_epu8 (__m128i a, __m128i b) {
    return _mm_cmpeq_epi8(b, _mm_max_epu8(b, a));
}


/* Unsigned 16-bit a > b with SSE2 only: the nonzero saturating difference
 * marks the a > b lanes; adding its wrapped negation saturates them to
 * 0xFFFF. */
__m128i sse2_cmpgt_epu16 (__m128i a, __m128i b) {
    __m128i d = _mm_subs_epu16(a, b);
    __m128i neg = _mm_sub_epi16(_mm_setzero_si128(), d);
    return _mm_adds_epu16(d, neg);
}
/* a < b (unsigned 16-bit) == b > a; sse2_cmpgt_epu16 inlined with the
 * operands swapped. */
__m128i sse2_cmplt_epu16 (__m128i a, __m128i b) {
    __m128i d = _mm_subs_epu16(b, a);
    return _mm_adds_epu16(d, _mm_sub_epi16(_mm_setzero_si128(), d));
}
/* a >= b (unsigned 16-bit): the saturating difference b - a is zero exactly
 * when b <= a. */
__m128i sse2_cmpge_epu16 (__m128i a, __m128i b) {
    return _mm_cmpeq_epi16(_mm_setzero_si128(), _mm_subs_epu16(b, a));
}
/* a <= b (unsigned 16-bit): the saturating difference a - b is zero exactly
 * when a <= b. */
__m128i sse2_cmple_epu16 (__m128i a, __m128i b) {
    return _mm_cmpeq_epi16(_mm_setzero_si128(), _mm_subs_epu16(a, b));
}


/* Unsigned 32-bit a > b: correct the signed compare in the lanes where the
 * two sign bits disagree. */
__m128i sse2_cmpgt_epu32(__m128i a, __m128i b) {
    __m128i sign_diff = _mm_srai_epi32(_mm_xor_si128(a, b), 31);
    return _mm_xor_si128(sign_diff, _mm_cmpgt_epi32(a, b));
}
/* a < b (unsigned 32-bit) == b > a; sse2_cmpgt_epu32 inlined with the
 * operands swapped. */
__m128i sse2_cmplt_epu32(__m128i a, __m128i b) {
    return _mm_xor_si128(_mm_cmpgt_epi32(b, a),
                         _mm_srai_epi32(_mm_xor_si128(b, a), 31));
}
/* a >= b (unsigned 32-bit) == !(b > a).  Fills in the `??` placeholder:
 * bias both operands by 0x80000000 so the signed compare orders them as
 * unsigned, then invert the resulting b > a mask. */
__m128i sse2_cmpge_epu32(__m128i a, __m128i b) {
    __m128i bias = _mm_set1_epi32((int)0x80000000);
    __m128i lt = _mm_cmpgt_epi32(_mm_xor_si128(b, bias), _mm_xor_si128(a, bias));
    return _mm_xor_si128(lt, _mm_set1_epi32(-1));
}
/* a <= b (unsigned 32-bit) == !(a > b).  Fills in the `??` placeholder:
 * bias both operands by 0x80000000 so the signed compare orders them as
 * unsigned, then invert the resulting a > b mask. */
__m128i sse2_cmple_epu32(__m128i a, __m128i b) {
    __m128i bias = _mm_set1_epi32((int)0x80000000);
    __m128i gt = _mm_cmpgt_epi32(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
    return _mm_xor_si128(gt, _mm_set1_epi32(-1));
}


/* SSE2 fallback for _mm_min_epi8 (SSE4.1): signed byte-wise minimum.
 * Fills in the `??` placeholder with a compare-and-blend. */
__m128i sse2_min_epi8 (__m128i a, __m128i b) {
    __m128i m = _mm_cmpgt_epi8(b, a); /* all-ones where a < b */
    return _mm_or_si128(_mm_and_si128(a, m), _mm_andnot_si128(m, b));
}
/* Byte-wise unsigned minimum is native in SSE2 (min is commutative). */
__m128i sse2_min_epu8 (__m128i a, __m128i b) {
    return _mm_min_epu8(b, a);
}
/* Signed 16-bit minimum is native in SSE2 (min is commutative). */
__m128i sse2_min_epi16 (__m128i a, __m128i b) {
    return _mm_min_epi16(b, a);
}
/* _mm_min_epu16 without SSE4.1: subtract the saturating excess of a over b. */
__m128i sse2_min_epu16 (__m128i a, __m128i b) {
    __m128i excess = _mm_subs_epu16(a, b); /* zero where a <= b */
    return _mm_sub_epi16(a, excess);
}
/* Signed 32-bit minimum via mask blend: take a where a < b, else b. */
__m128i sse2_min_epi32 (__m128i a, __m128i b) {
    __m128i take_a = _mm_cmpgt_epi32(b, a);
    return _mm_or_si128(_mm_andnot_si128(take_a, b), _mm_and_si128(take_a, a));
}
/* Unsigned 32-bit minimum: build the unsigned b > a mask (signed compare
 * corrected where the sign bits differ — sse2_cmpgt_epu32 inlined), then
 * blend. */
__m128i sse2_min_epu32 (__m128i a, __m128i b) {
    __m128i take_a = _mm_xor_si128(_mm_cmpgt_epi32(b, a),
                                   _mm_srai_epi32(_mm_xor_si128(b, a), 31));
    return _mm_or_si128(_mm_and_si128(a, take_a), _mm_andnot_si128(take_a, b));
}


/* SSE2 fallback for _mm_max_epi8 (SSE4.1): signed byte-wise maximum.
 * Fills in the `??` placeholder with a compare-and-blend. */
__m128i sse2_max_epi8 (__m128i a, __m128i b) {
    __m128i m = _mm_cmpgt_epi8(a, b); /* all-ones where a > b */
    return _mm_or_si128(_mm_and_si128(a, m), _mm_andnot_si128(m, b));
}
/* Byte-wise unsigned maximum is native in SSE2 (max is commutative). */
__m128i sse2_max_epu8 (__m128i a, __m128i b) {
    return _mm_max_epu8(b, a);
}
/* Signed 16-bit maximum is native in SSE2 (max is commutative). */
__m128i sse2_max_epi16 (__m128i a, __m128i b) {
    return _mm_max_epi16(b, a);
}
/* _mm_max_epu16 without SSE4.1: b plus the saturating excess of a over b. */
__m128i sse2_max_epu16 (__m128i a, __m128i b) {
    __m128i excess = _mm_subs_epu16(a, b); /* zero where a <= b */
    return _mm_add_epi16(excess, b);
}
/* Signed 32-bit maximum via mask blend: take a where a > b, else b. */
__m128i sse2_max_epi32 (__m128i a, __m128i b) {
    __m128i take_a = _mm_cmpgt_epi32(a, b);
    return _mm_or_si128(_mm_andnot_si128(take_a, b), _mm_and_si128(take_a, a));
}
/* Unsigned 32-bit maximum: build the unsigned a > b mask (signed compare
 * corrected where the sign bits differ — sse2_cmpgt_epu32 inlined), then
 * blend. */
__m128i sse2_max_epu32 (__m128i a, __m128i b) {
    __m128i take_a = _mm_xor_si128(_mm_cmpgt_epi32(a, b),
                                   _mm_srai_epi32(_mm_xor_si128(a, b), 31));
    return _mm_or_si128(_mm_and_si128(a, take_a), _mm_andnot_si128(take_a, b));
}


/* |a - b| for signed 8-bit lanes (result read as unsigned): m marks the
 * b > a lanes; (d + m) ^ m conditionally negates the wrapped difference. */
__m128i sse2_absdiff_epi8 (__m128i a, __m128i b) {
    __m128i m = _mm_cmpgt_epi8(b, a);
    __m128i d = _mm_sub_epi8(a, b);
    return _mm_xor_si128(_mm_add_epi8(d, m), m);
}
/* |x - y| for signed 16-bit lanes: lane-wise max minus lane-wise min. */
__m128i sse2_absdiff_epi16 (__m128i x, __m128i y) {
    __m128i hi = _mm_max_epi16(x, y);
    __m128i lo = _mm_min_epi16(x, y);
    return _mm_sub_epi16(hi, lo);
}
/* |a - b| for signed 32-bit lanes: conditionally negate the wrapped
 * difference via (d + m) ^ m, where m is the b > a mask. */
__m128i sse2_absdiff_epi32 (__m128i a, __m128i b) {
    __m128i m = _mm_cmpgt_epi32(b, a);
    __m128i d = _mm_sub_epi32(a, b);
    return _mm_xor_si128(_mm_add_epi32(d, m), m);
}


/* |x - y| unsigned 8-bit: of the two saturating differences, one is always
 * zero, so their OR is the absolute difference. */
__m128i sse2_absdiff_epu8 (__m128i x, __m128i y) {
    __m128i d0 = _mm_subs_epu8(x, y);
    __m128i d1 = _mm_subs_epu8(y, x);
    return _mm_or_si128(d1, d0);
}
/* |x - y| unsigned 16-bit: OR of the two saturating differences (one of
 * them is always zero). */
__m128i sse2_absdiff_epu16 (__m128i x, __m128i y) {
    __m128i d0 = _mm_subs_epu16(x, y);
    __m128i d1 = _mm_subs_epu16(y, x);
    return _mm_or_si128(d1, d0);
}
/* |a - b| unsigned 32-bit.  Fills in the `??` placeholder: build the
 * unsigned b > a mask by biasing both operands before the signed compare,
 * then conditionally negate the wrapped difference via (d + m) ^ m. */
__m128i sse2_absdiff_epu32 (__m128i a, __m128i b) {
    __m128i bias = _mm_set1_epi32((int)0x80000000);
    __m128i m = _mm_cmpgt_epi32(_mm_xor_si128(b, bias), _mm_xor_si128(a, bias));
    a = _mm_sub_epi32(a, b);
    a = _mm_add_epi32(a, m);
    return _mm_xor_si128(a, m);
}


/* Saturating unsigned 32-bit subtract: zero out the lanes where a <= b.
 * The a > b mask is sse2_cmpgt_epu32 inlined (signed compare corrected
 * where the sign bits differ). */
__m128i sse2_subs_epu32 (__m128i a, __m128i b) {
    __m128i gt = _mm_xor_si128(_mm_cmpgt_epi32(a, b),
                               _mm_srai_epi32(_mm_xor_si128(a, b), 31));
    return _mm_and_si128(gt, _mm_sub_epi32(a, b));
}
/* Saturating signed 32-bit subtract (SSE2 only), clamped to
 * [INT32_MIN, INT32_MAX].  Fix: the original declared the parameters as
 * `(a, b)` with no types (implicit int), which does not compile as C11. */
__m128i sse2_subs_epi32(__m128i a, __m128i b) {
    __m128i t;
    /* t = saturation value: INT32_MAX when b <= a, INT32_MIN when b > a. */
    t = _mm_xor_si128(_mm_set1_epi32(0x7FFFFFFF), _mm_cmpgt_epi32(b, a));
    a = _mm_sub_epi32(a, b); /* wrapping difference */
    /* Overflow iff the difference's sign disagrees with t's sign; blend. */
    t = _mm_xor_si128(t, a);
    return _mm_xor_si128(a, _mm_and_si128(t, _mm_srai_epi32(t, 31)));
}


/* Saturating unsigned 32-bit add: force the lanes that wrapped around
 * (sum < b) to all-ones.  The b > sum mask is sse2_cmpgt_epu32 inlined. */
__m128i sse2_adds_epu32 (__m128i a, __m128i b) {
    __m128i sum = _mm_add_epi32(a, b);
    __m128i wrapped = _mm_xor_si128(_mm_cmpgt_epi32(b, sum),
                                    _mm_srai_epi32(_mm_xor_si128(b, sum), 31));
    return _mm_or_si128(sum, wrapped);
}
/* Saturating signed 32-bit add (SSE2 only).  Fills in the empty body; the
 * approach follows https://stackoverflow.com/q/29498824. */
__m128i sse2_adds_epi32 (__m128i a, __m128i b) {
    __m128i sum = _mm_add_epi32(a, b);
    /* Overflow iff a and b share a sign that the wrapped sum does not. */
    __m128i ovf = _mm_andnot_si128(_mm_xor_si128(a, b), _mm_xor_si128(a, sum));
    /* Saturation value follows a's sign: INT32_MAX for a >= 0, else INT32_MIN. */
    __m128i sat = _mm_xor_si128(_mm_set1_epi32(0x7FFFFFFF), _mm_srai_epi32(a, 31));
    __m128i m = _mm_srai_epi32(ovf, 31); /* all-ones in overflowed lanes */
    return _mm_or_si128(_mm_and_si128(m, sat), _mm_andnot_si128(m, sum));
}

Avoids generating/loading constants which may not be desirable.

cc @aklomp

aqrit avatar Jul 24 '21 17:07 aqrit

return _mm_xor_si128(_mm_cmpgt_epi32(x, y),
                     _mm_srai_epi32(_mm_xor_si128(x, y), 31));

https://godbolt.org/z/T73MbPEnh

I agree, the throughput isn't quite as good, but the latency on that mov is painful, plus the memory to store the data…

I'll go through your last comment soon, but I think I've got most of them in place (though not merged yet). Thanks for putting them together ☺

nemequ avatar Jul 24 '21 18:07 nemequ

FWIW, my "missing SSE intrinsics" project is now canonically hosted at https://github.com/aklomp/missing-sse-intrinsics.

aklomp avatar May 20 '22 21:05 aklomp