sse2neon icon indicating copy to clipboard operation
sse2neon copied to clipboard

gcc sanitizer fails on _mm_loadu_si128

Open romange opened this issue 1 year ago • 6 comments

sse2neon implements _mm_loadu_si128 like this:

__m128i _mm_loadu_si128(const __m128i *p)
{
    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
}

which involves casting to int32_t. This, in turn, enforces 4-bytes alignment that the original pointer may not have. As a result, the sanitizer crashes the program in debug mode.

  1. does vld1q_s32 require alignment? if yes, then this seem to contradict semantics of _mm_loadu_si128.
  2. why not just use memcpy that is optimized away by a compiler to the optimal vectorized instruction like this https://godbolt.org/z/84hePd61d ?

romange avatar Apr 02 '23 12:04 romange

Hi @romange ,

does vld1q_s32 require alignment? if yes, then this seem to contradict semantics of _mm_loadu_si128.

According to the document, vld1q_s32 may generate LD1 {Vt.4S},[Xn]. What's more, GCC (to my experiment, Armv7-A) will generate VLD1.dt {Dd},[Rn]. Though NEON does support unaligned data access for NEON data, NEON can accept alignment hint for faster implementation with [<Rn>:<align>] register syntax (see Alignment in NEON document), which the disassemblies of Armv8-A and Armv7-A do not use alignment at all (pasted in References section). Hence, I think vld1q_s32 does not requirement alignment.

why not just use memcpy that is optimized away by a compiler to the optimal vectorized instruction like this https://godbolt.org/z/84hePd61d ?

Though using memcpy will let the vld1q_s32 conversion relies on C library, I think it can be a possible implementation.

References

The disassemblies are originated from this function:

// Originated from tests/impl.cpp
result_t test_mm_loadu_si128(const SSE2NEONTestImpl &impl, uint32_t iter) 
{
    const int32_t *_a = (const int32_t *) impl.mTestIntPointer1;
    __m128i c = _mm_loadu_si128((const __m128i *) _a);
    return VALIDATE_INT32_M128(c, _a);
}

Armv7-A

00046528 <_ZN8SSE2NEON19test_mm_loadu_si128ERKNS_16SSE2NEONTestImplEj>:
   46528:   b580        push    {r7, lr}
   4652a:   b08e        sub sp, #56 ; 0x38 
   4652c:   af00        add r7, sp, #0 
   4652e:   6078        str r0, [r7, #4]
   46530:   6039        str r1, [r7, #0]
   46532:   687b        ldr r3, [r7, #4]
   46534:   68db        ldr r3, [r3, #12]
   46536:   60fb        str r3, [r7, #12]
   46538:   68fb        ldr r3, [r7, #12]
   4653a:   613b        str r3, [r7, #16]
   4653c:   693b        ldr r3, [r7, #16]
   4653e:   617b        str r3, [r7, #20]
   46540:   697b        ldr r3, [r7, #20]
   46542:   f963 0a8f   vld1.32 {d16-d17}, [r3]
   46546:   bf00        nop
   46548:   edc7 0b0a   vstr    d16, [r7, #40]  ; 0x28 
   4654c:   edc7 1b0c   vstr    d17, [r7, #48]  ; 0x30 
   46550:   edd7 0b0a   vldr    d16, [r7, #40]  ; 0x28 
   46554:   edd7 1b0c   vldr    d17, [r7, #48]  ; 0x30 
   46558:   bf00        nop
   4655a:   edc7 0b06   vstr    d16, [r7, #24]
   4655e:   edc7 1b08   vstr    d17, [r7, #32]
   46562:   68fb        ldr r3, [r7, #12]
   46564:   6818        ldr r0, [r3, #0]
   46566:   68fb        ldr r3, [r7, #12]
   46568:   3304        adds    r3, #4 
   4656a:   6819        ldr r1, [r3, #0]
   4656c:   68fb        ldr r3, [r7, #12]
   4656e:   3308        adds    r3, #8 
   46570:   681a        ldr r2, [r3, #0]
   46572:   68fb        ldr r3, [r7, #12]
   46574:   330c        adds    r3, #12
   46576:   681b        ldr r3, [r3, #0]
   46578:   ed97 0b06   vldr    d0, [r7, #24]
   4657c:   ed97 1b08   vldr    d1, [r7, #32]
   46580:   f7d1 f8e0   bl  17744 <_ZN8SSE2NEON13validateInt32E17__simd128_int64_tiiii>
   46584:   4603        mov r3, r0 
   46586:   4618        mov r0, r3
   46588:   3738        adds    r7, #56 ; 0x38
   4658a:   46bd        mov sp, r7
   4658c:   bd80        pop {r7, pc}

Armv8-A

0000000000423b90 <_ZN8SSE2NEON19test_mm_loadu_si128ERKNS_16SSE2NEONTestImplEj>:
  423b90:   a9ba7bfd    stp x29, x30, [sp, #-96]! 
  423b94:   910003fd    mov x29, sp
  423b98:   f9000fe0    str x0, [sp, #24]
  423b9c:   b90017e1    str w1, [sp, #20]
  423ba0:   f9400fe0    ldr x0, [sp, #24]
  423ba4:   f9400c00    ldr x0, [x0, #24]
  423ba8:   f90017e0    str x0, [sp, #40]
  423bac:   f94017e0    ldr x0, [sp, #40]
  423bb0:   f9001be0    str x0, [sp, #48]
  423bb4:   f9401be0    ldr x0, [sp, #48]
  423bb8:   f9001fe0    str x0, [sp, #56]
  423bbc:   f9401fe0    ldr x0, [sp, #56]
  423bc0:   3dc00000    ldr q0, [x0]
  423bc4:   d503201f    nop    
  423bc8:   3d8017e0    str q0, [sp, #80]
  423bcc:   3dc017e0    ldr q0, [sp, #80]
  423bd0:   d503201f    nop    
  423bd4:   3d8013e0    str q0, [sp, #64]
  423bd8:   f94017e0    ldr x0, [sp, #40]
  423bdc:   b9400004    ldr w4, [x0]
  423be0:   f94017e0    ldr x0, [sp, #40]
  423be4:   91001000    add x0, x0, #0x4
  423be8:   b9400001    ldr w1, [x0]
  423bec:   f94017e0    ldr x0, [sp, #40]
  423bf0:   91002000    add x0, x0, #0x8
  423bf4:   b9400002    ldr w2, [x0]
  423bf8:   f94017e0    ldr x0, [sp, #40]
  423bfc:   91003000    add x0, x0, #0xc
  423c00:   b9400000    ldr w0, [x0]
  423c04:   2a0003e3    mov w3, w0 
  423c08:   2a0403e0    mov w0, w4 
  423c0c:   3dc013e0    ldr q0, [sp, #64]
  423c10:   97ff83a2    bl  404a98 <_ZN8SSE2NEON13validateInt32E11__Int64x2_tiiii>
  423c14:   a8c67bfd    ldp x29, x30, [sp], #96
  423c18:   d65f03c0    ret

Cuda-Chen avatar Apr 07 '23 08:04 Cuda-Chen

I made a simple test here It shows that using memcpy doesn't run faster. I run the following test with the current make check command.

The implementation

FORCE_INLINE __m128i old_mm_loadu_si128(const __m128i *p)
{
    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
}
FORCE_INLINE __m128i new_mm_loadu_si128(const __m128i *p)
{
    int64x2_t res;
    // res = vreinterpretq_s64_s32(vld1q_s32((const int32_t *) ptr));
    memcpy(&res, (const int64_t *) p, sizeof(res));
    return vreinterpretq_m128i_s64(res);
}

The test function

result_t test_mm_loadu_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
{
    const int32_t *_a = (const int32_t *) impl.mTestIntPointer1;
    const int test_times = 100000;

    clock_t t;
    double time_taken = 0;
    t = clock();
    for (int i = 0; i < test_times; i++) {
        __m128i c = old_mm_loadu_si128((const __m128i *) _a+(i%8));
    }
    time_taken = ((double)t)/CLOCKS_PER_SEC;
    printf("NEON implementation: %f\n", time_taken);

    t = clock();
    for (int i = 0; i < test_times; i++) {
        __m128i c = new_mm_loadu_si128((const __m128i *) _a+(i%8));
    }
    time_taken = ((double)t)/CLOCKS_PER_SEC;
    printf("memcpy implementation: %f\n", time_taken);;
    return TEST_FAIL;
}

The result:

NEON implementation: 0.443778
memcpy implementation: 0.444066

If there any thing I could do to improve the test please point it out.

howjmay avatar Apr 13 '23 00:04 howjmay

Why not use vld1q_u8?

aqrit avatar Dec 02 '23 18:12 aqrit

Hi @aqrit,

Based on my experiment, the vld1q_u8 runs slightly slower.

Test Code

FORCE_INLINE __m128i old_mm_loadu_si128(const __m128i *p)
{
    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); 
}

FORCE_INLINE __m128i new_mm_loadu_si128(const __m128i *p)
{
    return vreinterpretq_m128i_u8(vld1q_u8((const uint8_t *) p)); 
}

Test Function (thanks to @howjmay's code)

result_t test_mm_loadu_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
{
    const int32_t *_a = (const int32_t *) impl.mTestIntPointer1;
    const int test_times = 100000;

    clock_t t;
    double time_taken = 0;
    t = clock();
    for (int i = 0; i < test_times; i++) {
        __m128i c = old_mm_loadu_si128((const __m128i *) _a+(i%8));
    }
    time_taken = ((double)t)/CLOCKS_PER_SEC;
    printf("NEON implementation: %f\n", time_taken);

    t = clock();
    for (int i = 0; i < test_times; i++) {
        __m128i c = new_mm_loadu_si128((const __m128i *) _a+(i%8));
    }
    time_taken = ((double)t)/CLOCKS_PER_SEC;
    printf("NEON vld1q_u8 implementation: %f\n", time_taken);;
    return TEST_FAIL;
}

Test Results

Armv8-A

NEON implementation: 7.452971                                         
NEON vld1q_u8 implementation: 7.456146   

Armv7-A test result

NEON implementation: 8.484269                                         
NEON vld1q_u8 implementation: 8.487463    

Armv8-A (32-bit)

NEON implementation: 10.073829                                        
NEON vld1q_u8 implementation: 10.078687

Cuda-Chen avatar Dec 03 '23 14:12 Cuda-Chen

should be fixed by #632 ?

aqrit avatar May 21 '24 20:05 aqrit

should be fixed by #632 ?

Hi @aqrit ,

Though I am busy for work these weeks, I will arrange myself a time for testing #632.

Cuda-Chen avatar May 27 '24 15:05 Cuda-Chen

Hi @aqrit ,

Seems that it is fixed by #632, I think that we may close this issue.

Cuda-Chen avatar May 31 '24 12:05 Cuda-Chen

Close as confirmed

jserv avatar May 31 '24 12:05 jserv