simdtutor
simdtutor copied to clipboard
小彭老师,我想请教一下:下面这段代码在 -O2 优化下,为什么 AVX2 版本的性能反而比 SSE2 版本差呢?(x86,GCC 11.2.1)
// SSE2 version
/// @brief Tests whether two buffers of `n` elements hold identical contents.
///
/// When the compiler targets SSE2 (`__SSE2__` is predefined) the bulk of the
/// input is compared one 16-byte vector at a time using unaligned loads; the
/// remaining elements are compared one by one with `operator!=`.
///
/// @tparam Char Element type. The vector path compares raw bytes, so it is only
///              taken when `sizeof(Char)` evenly divides 16; otherwise every
///              element goes through the scalar loop.
///              NOTE(review): for element types where byte equality and
///              `operator!=` can disagree (e.g. floating point), the vector and
///              scalar paths may give different answers — confirm callers only
///              use integral/character types.
/// @param a Pointer to the first buffer; at least `n` elements must be readable.
/// @param b Pointer to the second buffer; at least `n` elements must be readable.
/// @param n Number of elements (not bytes) to compare.
/// @return `true` if all `n` elements match (trivially `true` for `n == 0`),
///         `false` as soon as a mismatch is found.
template <typename Char>
inline bool bytescompare(const Char* a, const Char* b, size_t n)
{
    size_t offset = 0;

#ifdef __SSE2__
    constexpr size_t kVectorBytes = 16;
    // Elements consumed per vector. The loop must advance by elements, not by
    // bytes, otherwise wide element types skip most of the data.
    constexpr size_t kStep = kVectorBytes / sizeof(Char);

    // Constant condition, folded at compile time; it also keeps kStep == 0
    // (element wider than a vector) away from the loop below.
    if (kVectorBytes % sizeof(Char) == 0)
    {
        for (; n - offset >= kStep; offset += kStep)
        {
            const __m128i lhs = _mm_loadu_si128(reinterpret_cast<const __m128i*>(a + offset));
            const __m128i rhs = _mm_loadu_si128(reinterpret_cast<const __m128i*>(b + offset));
            // One mask bit per byte lane; all 16 bits set means all bytes equal.
            const int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(lhs, rhs));
            if (mask != 0xFFFF)
                return false;
        }
    }
#endif

    // Scalar tail (or the whole input when the vector path is unavailable).
    for (; offset < n; ++offset)
    {
        if (a[offset] != b[offset])
            return false;
    }
    return true;
}
// AVX2 version
/// @brief Tests whether two buffers of `n` elements hold identical contents.
///
/// When the compiler targets AVX2 (`__AVX2__` is predefined) the bulk of the
/// input is compared one 32-byte vector at a time using unaligned loads; the
/// remaining elements are compared one by one with `operator!=`.
///
/// @tparam Char Element type. The vector path compares raw bytes, so it is only
///              taken when `sizeof(Char)` evenly divides 32; otherwise every
///              element goes through the scalar loop.
///              NOTE(review): for element types where byte equality and
///              `operator!=` can disagree (e.g. floating point), the vector and
///              scalar paths may give different answers — confirm callers only
///              use integral/character types.
/// @param a Pointer to the first buffer; at least `n` elements must be readable.
/// @param b Pointer to the second buffer; at least `n` elements must be readable.
/// @param n Number of elements (not bytes) to compare.
/// @return `true` if all `n` elements match (trivially `true` for `n == 0`),
///         `false` as soon as a mismatch is found.
template <typename Char>
inline bool bytescompare_avx(const Char* a, const Char* b, size_t n)
{
    size_t offset = 0;

#ifdef __AVX2__
    constexpr size_t kVectorBytes = 32;
    // Elements consumed per vector. The loop must advance by elements, not by
    // bytes, otherwise wide element types skip most of the data.
    constexpr size_t kStep = kVectorBytes / sizeof(Char);

    // Constant condition, folded at compile time; it also keeps kStep == 0
    // (element wider than a vector) away from the loop below.
    if (kVectorBytes % sizeof(Char) == 0)
    {
        for (; n - offset >= kStep; offset += kStep)
        {
            const __m256i lhs = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + offset));
            const __m256i rhs = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(b + offset));
            // One mask bit per byte lane. The intrinsic returns a signed int, so
            // compare as unsigned to avoid a mixed-sign comparison.
            const unsigned mask =
                static_cast<unsigned>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(lhs, rhs)));
            if (mask != 0xFFFFFFFFu)
                return false;
        }
    }
#endif

    // Scalar tail (or the whole input when the vector path is unavailable).
    for (; offset < n; ++offset)
    {
        if (a[offset] != b[offset])
            return false;
    }
    return true;
}
其中测试用例如下:
void test_for_bytescompare()
{
std::srand(static_cast