
SIGBUS illegal alignment at gf256.cpp gf256_add_mem line 790

Open zerojiu opened this issue 5 years ago • 3 comments

Hi, I am trying to run this program on the Android platform.

extern "C" void gf256_add_mem(void * GF256_RESTRICT vx,
                              const void * GF256_RESTRICT vy, int bytes)
{
    GF256_M128 * GF256_RESTRICT x16 = reinterpret_cast<GF256_M128 *>(vx);
    const GF256_M128 * GF256_RESTRICT y16 = reinterpret_cast<const GF256_M128 *>(vy);

#if defined(GF256_TARGET_MOBILE)
# if defined(GF256_TRY_NEON)
    // Handle multiples of 64 bytes
    if (CpuHasNeon)
    {
        while (bytes >= 64)
        {
            GF256_M128 x0 = vld1q_u8((uint8_t*) x16);
            GF256_M128 x1 = vld1q_u8((uint8_t*)(x16 + 1) );
            GF256_M128 x2 = vld1q_u8((uint8_t*)(x16 + 2) );
            GF256_M128 x3 = vld1q_u8((uint8_t*)(x16 + 3) );
            GF256_M128 y0 = vld1q_u8((uint8_t*)y16);
            GF256_M128 y1 = vld1q_u8((uint8_t*)(y16 + 1));
            GF256_M128 y2 = vld1q_u8((uint8_t*)(y16 + 2));
            GF256_M128 y3 = vld1q_u8((uint8_t*)(y16 + 3));

            vst1q_u8((uint8_t*)x16,     veorq_u8(x0, y0));
            vst1q_u8((uint8_t*)(x16 + 1), veorq_u8(x1, y1));
            vst1q_u8((uint8_t*)(x16 + 2), veorq_u8(x2, y2));
            vst1q_u8((uint8_t*)(x16 + 3), veorq_u8(x3, y3));

            bytes -= 64, x16 += 4, y16 += 4;
        }

        // Handle multiples of 16 bytes
        while (bytes >= 16)
        {
            GF256_M128 x0 = vld1q_u8((uint8_t*)x16);
            GF256_M128 y0 = vld1q_u8((uint8_t*)y16);

            vst1q_u8((uint8_t*)x16, veorq_u8(x0, y0));

            bytes -= 16, ++x16, ++y16;
        }
    }
    else
# endif // GF256_TRY_NEON
    {
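        // Handle multiples of 8 bytes without NEON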
        uint64_t * GF256_RESTRICT x8 = reinterpret_cast<uint64_t *>(x16);
        const uint64_t * GF256_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y16);

        const unsigned count = (unsigned)bytes / 8;
        for (unsigned ii = 0; ii < count; ++ii)
            x8[ii] ^= y8[ii];

        x16 = reinterpret_cast<GF256_M128 *>(x8 + count);
        y16 = reinterpret_cast<const GF256_M128 *>(y8 + count);

        bytes -= (count * 8);
    }
#else // GF256_TARGET_MOBILE
# if defined(GF256_TRY_AVX2)
    if (CpuHasAVX2)
    {
        GF256_M256 * GF256_RESTRICT x32 = reinterpret_cast<GF256_M256 *>(x16);
        const GF256_M256 * GF256_RESTRICT y32 = reinterpret_cast<const GF256_M256 *>(y16);

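        // Handle multiples of 128 bytes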
        while (bytes >= 128)
        {
            GF256_M256 x0 = _mm256_loadu_si256(x32);
            GF256_M256 y0 = _mm256_loadu_si256(y32);
            x0 = _mm256_xor_si256(x0, y0);
            GF256_M256 x1 = _mm256_loadu_si256(x32 + 1);
            GF256_M256 y1 = _mm256_loadu_si256(y32 + 1);
            x1 = _mm256_xor_si256(x1, y1);
            GF256_M256 x2 = _mm256_loadu_si256(x32 + 2);
            GF256_M256 y2 = _mm256_loadu_si256(y32 + 2);
            x2 = _mm256_xor_si256(x2, y2);
            GF256_M256 x3 = _mm256_loadu_si256(x32 + 3);
            GF256_M256 y3 = _mm256_loadu_si256(y32 + 3);
            x3 = _mm256_xor_si256(x3, y3);

            _mm256_storeu_si256(x32, x0);
            _mm256_storeu_si256(x32 + 1, x1);
            _mm256_storeu_si256(x32 + 2, x2);
            _mm256_storeu_si256(x32 + 3, x3);

            bytes -= 128, x32 += 4, y32 += 4;
        }

        // Handle multiples of 32 bytes
        while (bytes >= 32)
        {
            // x[i] = x[i] xor y[i]
            _mm256_storeu_si256(x32,
                _mm256_xor_si256(
                    _mm256_loadu_si256(x32),
                    _mm256_loadu_si256(y32)));

            bytes -= 32, ++x32, ++y32;
        }

        x16 = reinterpret_cast<GF256_M128 *>(x32);
        y16 = reinterpret_cast<const GF256_M128 *>(y32);
    }
    else
# endif // GF256_TRY_AVX2
    {
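        // Handle multiples of 64 bytes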
        while (bytes >= 64)
        {
            GF256_M128 x0 = _mm_loadu_si128(x16);
            GF256_M128 y0 = _mm_loadu_si128(y16);
            x0 = _mm_xor_si128(x0, y0);
            GF256_M128 x1 = _mm_loadu_si128(x16 + 1);
            GF256_M128 y1 = _mm_loadu_si128(y16 + 1);
            x1 = _mm_xor_si128(x1, y1);
            GF256_M128 x2 = _mm_loadu_si128(x16 + 2);
            GF256_M128 y2 = _mm_loadu_si128(y16 + 2);
            x2 = _mm_xor_si128(x2, y2);
            GF256_M128 x3 = _mm_loadu_si128(x16 + 3);
            GF256_M128 y3 = _mm_loadu_si128(y16 + 3);
            x3 = _mm_xor_si128(x3, y3);

            _mm_storeu_si128(x16, x0);
            _mm_storeu_si128(x16 + 1, x1);
            _mm_storeu_si128(x16 + 2, x2);
            _mm_storeu_si128(x16 + 3, x3);

            bytes -= 64, x16 += 4, y16 += 4;
        }
    }
#endif // GF256_TARGET_MOBILE

#if !defined(GF256_TARGET_MOBILE)
    // Handle multiples of 16 bytes
    while (bytes >= 16)
    {
        // x[i] = x[i] xor y[i]
        _mm_storeu_si128(x16,
            _mm_xor_si128(
                _mm_loadu_si128(x16),
                _mm_loadu_si128(y16)));

        bytes -= 16, ++x16, ++y16;
    }
#endif

    uint8_t * GF256_RESTRICT x1 = reinterpret_cast<uint8_t *>(x16);
    const uint8_t * GF256_RESTRICT y1 = reinterpret_cast<const uint8_t *>(y16);

    // Handle a block of 8 bytes
    const int eight = bytes & 8;
    if (eight)
    {
        uint64_t * GF256_RESTRICT x8 = reinterpret_cast<uint64_t *>(x1);
        const uint64_t * GF256_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y1);
        *x8 ^= *y8;
    }

    // Handle a block of 4 bytes
    const int four = bytes & 4;
    if (four)
    {
        uint32_t * GF256_RESTRICT x4 = reinterpret_cast<uint32_t *>(x1 + eight);
        const uint32_t * GF256_RESTRICT y4 = reinterpret_cast<const uint32_t *>(y1 + eight);
        *x4 ^= *y4;
    }

    // Handle final bytes
    const int offset = eight + four;
    switch (bytes & 3)
    {
    case 3: x1[offset + 2] ^= y1[offset + 2];
    case 2: x1[offset + 1] ^= y1[offset + 1];
    case 1: x1[offset] ^= y1[offset];
    default:
        break;
    }
}

We have defined GF256_TARGET_MOBILE and GF256_TRY_NEON, and CpuHasNeon is true.

    // Handle a block of 8 bytes
    const int eight = bytes & 8;
    if (eight)
    {
        uint64_t * GF256_RESTRICT x8 = reinterpret_cast<uint64_t *>(x1);
        const uint64_t * GF256_RESTRICT y8 = reinterpret_cast<const uint64_t *>(y1);
        *x8 ^= *y8;
    }

*x8 ^= *y8 causes a SIGBUS illegal-alignment error. Are there some compile arguments that must be set? Thanks very much.
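For reference, a minimal sketch (not code from cm256 itself) of how that 8-byte XOR can be written with memcpy so the compiler does not assume 8-byte alignment:

#include <cstdint>
#include <cstring>

// Sketch: XOR an 8-byte tail block without requiring 8-byte alignment.
// A direct 64-bit load/store of a misaligned address can raise SIGBUS on
// ARM; memcpy lets the compiler emit accesses that are legal there.
static void xor8_unaligned(uint8_t* x, const uint8_t* y)
{
    uint64_t a, b;
    std::memcpy(&a, x, sizeof(a));
    std::memcpy(&b, y, sizeof(b));
    a ^= b;
    std::memcpy(x, &a, sizeof(a));
}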

zerojiu · Oct 31 '18 08:10

I found that if I replace the eight-byte operation with two four-byte operations, it works.
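For illustration, that replacement looks roughly like this (a sketch of the workaround, not a patch from cm256; it still assumes the buffers are at least 4-byte aligned):

#include <cstdint>

// Sketch of the workaround above: replace the single uint64_t XOR in the
// 8-byte tail handler with two uint32_t XORs, which only need 4-byte
// alignment.
static void xor8_as_two_words(uint8_t* x, const uint8_t* y)
{
    uint32_t* x4 = reinterpret_cast<uint32_t*>(x);
    const uint32_t* y4 = reinterpret_cast<const uint32_t*>(y);
    x4[0] ^= y4[0];
    x4[1] ^= y4[1];
}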

zerojiu · Oct 31 '18 09:10

Just ran into this as well.

You can work around this by using cm256_encode_block instead of cm256_encode. cm256_encode advances the recovery block pointer by BlockBytes (see https://github.com/catid/cm256/blob/master/cm256.cpp#L215), so BlockBytes would have to be a multiple of the std::uint64_t alignment requirement to guarantee that the first byte of each recovery block falls on an aligned address. For example, if BlockBytes is 3 but your alignment requirement is 4, your second block starts at a misaligned address even if the first block is aligned.

We decided to create a byte buffer per recovery block and call cm256_encode_block directly:

  // Set up params and recovery_block_size.

  // Create recovery count buffers, each of which will be aligned.
  std::vector<std::vector<std::uint8_t>> recovery_blocks(params.RecoveryCount);

  for (int block_index = 0; block_index < params.RecoveryCount; ++block_index) {
    recovery_blocks[block_index].resize(recovery_block_size); // Set correct size of recovery buffer.

    cm256_encode_block(params, blocks.data(), params.OriginalCount + block_index, recovery_blocks[block_index].data());
  }

You will likely want to do manual parameter validation as well, since cm256_encode_block doesn't do this for you. Hope this helps :)
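For example, a rough sketch of that validation, assuming the params struct is cm256_encoder_params with the OriginalCount, RecoveryCount and BlockBytes fields used above (the exact checks inside cm256_encode may differ):

#include "cm256.h"

// Sketch of manual parameter validation before calling cm256_encode_block.
// The 256 limit follows from the GF(256) design: at most 256 blocks total.
static bool params_look_valid(const cm256_encoder_params& params)
{
    if (params.OriginalCount <= 0 || params.RecoveryCount < 0)
        return false; // counts must be sane
    if (params.OriginalCount + params.RecoveryCount > 256)
        return false; // GF(256) block limit
    if (params.BlockBytes <= 0)
        return false; // need non-empty blocks
    return true;
}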

RNabel · Feb 28 '19 15:02

Yes, on the ARM platform the start of the buffers needs to be aligned to 16 bytes, so the low 4 bits of the pointer should be zero. On Intel you don't have that restriction.
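A minimal sketch of one way to get such buffers on Android/POSIX (an illustration, not something cm256 provides): allocate each block with posix_memalign so the low 4 bits of the returned pointer are zero.

#include <stdlib.h>
#include <stdint.h>

// Sketch: allocate a 16-byte-aligned buffer on POSIX/Android.
// posix_memalign guarantees the low 4 bits of the pointer are clear,
// which satisfies the ARM/NEON requirement above. Release with free().
static uint8_t* alloc_aligned16(size_t bytes)
{
    void* p = nullptr;
    if (posix_memalign(&p, 16, bytes) != 0)
        return nullptr; // allocation failed
    return static_cast<uint8_t*>(p);
}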


catid · Feb 28 '19 18:02