simdtutor icon indicating copy to clipboard operation
simdtutor copied to clipboard

小彭老师,我想请教一下下面这段代码为什么在-O2的情况下,avx2的版本比sse2的版本性能要差呢?(x86 gcc version 11.2.1)

Open AJ-mider opened this issue 1 year ago • 0 comments

// SSE2 version.
//
// Returns true when the first `n` elements of `a` and `b` compare equal,
// false at the first mismatch. `n` is a count of `Char` elements; n == 0
// yields true.
//
// When the compiler defines __SSE2__, whole 128-bit vectors are compared
// first and the remaining elements are handled by the scalar loop. The vector
// path compares object representations byte by byte, which agrees with `!=`
// for integral character types.
template<typename Char>
inline bool bytescompare(const Char* a, const Char* b, size_t n)
{
    size_t offset = 0;
#ifdef __SSE2__
    // A 128-bit load covers kVecBytes / sizeof(Char) elements, so the loop
    // must advance by that many elements (not by 16) to avoid skipping data
    // when Char is wider than one byte. Types that do not tile a vector
    // exactly fall through to the scalar loop.
    constexpr size_t kVecBytes = 16;
    constexpr bool kUseSimd = sizeof(Char) <= kVecBytes && kVecBytes % sizeof(Char) == 0;
    constexpr size_t kLanes = kUseSimd ? kVecBytes / sizeof(Char) : 1;
    const size_t offset_end = kUseSimd ? n / kLanes * kLanes : 0;

    for (; offset < offset_end; offset += kLanes)
    {
        // Unaligned loads: callers give no alignment guarantee.
        const __m128i vec_1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset));
        const __m128i vec_2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset));
        const __m128i compare_result = _mm_cmpeq_epi8(vec_1, vec_2);
        // movemask collects the top bit of each of the 16 byte lanes;
        // 0xFFFF means every byte matched.
        const int mask = _mm_movemask_epi8(compare_result);
        if (mask != 0xFFFF) return false;
    }
#endif
    // Scalar tail (or the whole range when the vector path is unavailable).
    for (; offset < n; ++offset)
    {
        if (a[offset] != b[offset]) return false;
    }
    return true;
}

// AVX2 version.
//
// Returns true when the first `n` elements of `a` and `b` compare equal,
// false at the first mismatch. `n` is a count of `Char` elements; n == 0
// yields true.
//
// When the compiler defines __AVX2__ (e.g. built with -mavx2), whole 256-bit
// vectors are compared first and the remaining elements are handled by the
// scalar loop. The vector path compares object representations byte by byte,
// which agrees with `!=` for integral character types.
template<typename Char>
inline bool bytescompare_avx(const Char* a, const Char* b, size_t n)
{
    size_t offset = 0;
#ifdef __AVX2__
    // A 256-bit load covers kVecBytes / sizeof(Char) elements, so the loop
    // must advance by that many elements (not by 32) to avoid skipping data
    // when Char is wider than one byte. Types that do not tile a vector
    // exactly fall through to the scalar loop.
    constexpr size_t kVecBytes = 32;
    constexpr bool kUseSimd = sizeof(Char) <= kVecBytes && kVecBytes % sizeof(Char) == 0;
    constexpr size_t kLanes = kUseSimd ? kVecBytes / sizeof(Char) : 1;
    const size_t offset_end = kUseSimd ? n / kLanes * kLanes : 0;

    for (; offset < offset_end; offset += kLanes)
    {
        // Unaligned loads: callers give no alignment guarantee.
        const __m256i vec_1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(a + offset));
        const __m256i vec_2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(b + offset));
        const __m256i compare_result = _mm256_cmpeq_epi8(vec_1, vec_2);
        // movemask collects the top bit of each of the 32 byte lanes into an
        // int; all 32 bits set is -1. Comparing against -1 avoids the
        // signed/unsigned mix of `mask != 0xFFFFFFFF`.
        const int mask = _mm256_movemask_epi8(compare_result);
        if (mask != -1) return false;
    }
#endif
    // Scalar tail (or the whole range when the vector path is unavailable).
    for (; offset < n; ++offset)
    {
        if (a[offset] != b[offset]) return false;
    }
    return true;
}

其中测试用例如下: void test_for_bytescompare() { std::srand(static_cast(std::time(nullptr))); // 初始化随机数生成器 const size_t arraySize = 320000; // 设置数组大小 // 生成两个随机字节数组 char array1[arraySize]; char array2[arraySize]; generateRandomByteArray(array1, arraySize); std::copy(array1, array1 + arraySize, array2); auto start = chrono::high_resolution_clock::now(); bool res = NBSimdBooster::bytescompare_avx(array1, array2, arraySize); auto end = chrono::high_resolution_clock::now(); std::chrono::duration elapsed_seconds = end - start; cout << "time: " << elapsed_seconds.count() << endl; cout << res << endl; } 其中 编译命令:g++ -O2 test.cpp -o pj1 -mavx2 sse版本耗时:1.4581e-05s avx版本耗时:2.7021e-05s

AJ-mider avatar Dec 11 '23 10:12 AJ-mider