highway icon indicating copy to clipboard operation
highway copied to clipboard

Add support for TwoTablesLookupBytes

Open johnplatts opened this issue 2 years ago • 2 comments

Some SIMD instruction sets such as NEON, Altivec, and AVX-512VBMI have 2-table lookup instructions.

On targets that have 2-table lookup instructions (such as NEON, Altivec, and AVX-512VBMI), TwoTablesLookupBytes is more efficient than performing two TableLookupBytes operations, computing a mask, and combining the results with IfThenElse.

Here is how TwoTablesLookupBytes can be implemented for full 128-bit vectors for the tables on x86 targets with SSSE3 support:

// Two-table byte lookup for x86 with full 128-bit tables. Each index byte of
// `from` picks one byte out of `bytes1` or `bytes2`: the low four bits give
// the position within a table and bit 4 chooses the table (clear = bytes1,
// set = bytes2).
// NOTE(review): index bytes are presumably expected to lie in [0, 32); the
// VBMI path and the shuffle-based path may treat higher bits differently -
// confirm against callers.
template <typename T, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TwoTablesLookupBytes(const Vec128<T> bytes1,
                                            const Vec128<T> bytes2,
                                            const Vec128<TI, NI> from) {
#if HWY_TARGET <= HWY_AVX3_DL
  // A single two-source byte permute does the whole job on this target.
  return Vec128<TI, NI>{
      _mm_permutex2var_epi8(bytes1.raw, from.raw, bytes2.raw)};
#else
  const DFromV<decltype(from)> d_idx;
  const Repartition<int8_t, decltype(d_idx)> d_i8;
  const Repartition<uint16_t, decltype(d_idx)> d_u16;
  using VI8 = VFromD<decltype(d_i8)>;

  // A left shift by 3 moves bit 4 of every index byte into that byte's sign
  // bit. The shift is done on 16-bit lanes; bits that spill from the low byte
  // only reach bits 0..2 of the high byte, so the sign bits stay correct.
  auto pick_second_vec = BitCast(d_i8, ShiftLeft<3>(BitCast(d_u16, from)));
#if HWY_TARGET == HWY_SSSE3
  // Presumably the SSSE3 IfThenElse needs every bit of a selected byte set,
  // hence the sign bit is replicated across the byte - TODO confirm.
  pick_second_vec = BroadcastSignBit(pick_second_vec);
#endif
  const auto pick_second = MaskFromVec(pick_second_vec);

  // Lookup into the first table, computed for every lane.
  const VI8 from_first{_mm_shuffle_epi8(bytes1.raw, from.raw)};

#if HWY_TARGET == HWY_AVX3
  // Masked shuffle: lanes whose mask bit is set take the lookup into bytes2,
  // all other lanes keep the bytes1 lookup passed as the source operand.
  return Vec128<TI, NI>{_mm_mask_shuffle_epi8(from_first.raw, pick_second.raw,
                                              bytes2.raw, from.raw)};
#else
  // Look up the second table as well, then blend the two results per byte.
  const VI8 from_second{_mm_shuffle_epi8(bytes2.raw, from.raw)};
  return BitCast(d_idx, IfThenElse(pick_second, from_second, from_first));
#endif
#endif
}

Here is how TwoTablesLookupBytes can be implemented for full 128-bit vectors for the tables on ARM targets with NEON support:

// Two-table byte lookup for Arm NEON with full 128-bit tables and full
// 128-bit indices. `bytes1` supplies table bytes 0..15 and `bytes2` supplies
// table bytes 16..31; every byte of `from` is an index into that 32-byte
// table.
template <typename T, typename TI>
HWY_API Vec128<TI> TwoTablesLookupBytes(const Vec128<T> bytes1,
                                        const Vec128<T> bytes2,
                                        const Vec128<TI> from) {
  const Full128<TI> d_out;
  const Repartition<uint8_t, decltype(d_out)> d_u8;
  // All three inputs are reinterpreted as raw byte vectors up front.
  const uint8x16_t lo_table = BitCast(d_u8, bytes1).raw;
  const uint8x16_t hi_table = BitCast(d_u8, bytes2).raw;
  const uint8x16_t indices = BitCast(d_u8, from).raw;
#if HWY_ARCH_ARM_A64
  // AArch64 can look up 16 indices in a pair of 16-byte tables at once.
  uint8x16x2_t tables;
  tables.val[0] = lo_table;
  tables.val[1] = hi_table;
  return BitCast(d_out, Vec128<uint8_t>(vqtbl2q_u8(tables, indices)));
#else
  // Armv7 only offers 8-byte tables: split both inputs into four 8-byte
  // tables and look up each half of the index vector separately.
  uint8x8x4_t tables;
  tables.val[0] = vget_low_u8(lo_table);
  tables.val[1] = vget_high_u8(lo_table);
  tables.val[2] = vget_low_u8(hi_table);
  tables.val[3] = vget_high_u8(hi_table);
  const uint8x8_t result_lo = vtbl4_u8(tables, vget_low_u8(indices));
  const uint8x8_t result_hi = vtbl4_u8(tables, vget_high_u8(indices));
  return BitCast(d_out, Vec128<uint8_t>(vcombine_u8(result_lo, result_hi)));
#endif
}

// Two-table byte lookup for Arm NEON with full 128-bit tables and an index
// vector of at most 64 bits. `bytes1` supplies table bytes 0..15 and `bytes2`
// supplies table bytes 16..31; every byte of `from` is an index into that
// 32-byte table. The result has the same type and lane count as `from`.
template <typename T, typename TI, size_t NI, HWY_IF_LE64(TI, NI)>
HWY_API Vec128<TI, NI> TwoTablesLookupBytes(const Vec128<T> bytes1,
                                            const Vec128<T> bytes2,
                                            const Vec128<TI, NI> from) {
  const Full128<T> dt;
  const Repartition<uint8_t, decltype(dt)> dt8;
  const DFromV<decltype(from)> di;
  const Repartition<uint8_t, decltype(di)> di8;
#if HWY_ARCH_ARM_A64
  uint8x16x2_t table;
  table.val[0] = BitCast(dt8, bytes1).raw;
  table.val[1] = BitCast(dt8, bytes2).raw;
  // vqtbl2_u8 takes 8 index bytes and yields an 8-byte result, so the result
  // is wrapped in the byte vector type of the (<= 64-bit) index descriptor
  // and cast back with `di`, the descriptor of `from`.
  return BitCast(
      di, VFromD<decltype(di8)>(vqtbl2_u8(table, BitCast(di8, from).raw)));
#else
  // Armv7 only offers 8-byte tables: split both inputs into four of them.
  uint8x16_t table0 = BitCast(dt8, bytes1).raw;
  uint8x16_t table1 = BitCast(dt8, bytes2).raw;
  uint8x8x4_t table;
  table.val[0] = vget_low_u8(table0);
  table.val[1] = vget_high_u8(table0);
  table.val[2] = vget_low_u8(table1);
  table.val[3] = vget_high_u8(table1);
  uint8x8_t idx = BitCast(di8, from).raw;
  return BitCast(di, VFromD<decltype(di8)>(vtbl4_u8(table, idx)));
#endif
}

Here is how TwoTablesLookupBytes can be implemented for full 128-bit vectors for the tables on PowerPC targets with Altivec/VSX support:

// Two-table byte lookup for PowerPC (AltiVec/VSX) with full 128-bit tables.
// vec_perm selects every result byte from the 32 bytes formed by its first two
// operands, using the corresponding byte of the third operand as the index, so
// a single permute implements the whole operation.
template <typename T, typename TI, size_t NI>
HWY_API Vec128<TI, NI> TwoTablesLookupBytes(const Vec128<T> bytes1,
                                            const Vec128<T> bytes2,
                                            const Vec128<TI, NI> from) {
  // Raw type that all three operands are reinterpreted as for the permute.
  using RawU8 = __vector unsigned char;

  const DFromV<decltype(from)> d_idx;
  const Repartition<int8_t, decltype(d_idx)> d_i8;

  const VFromD<decltype(d_i8)> permuted{
      vec_perm((RawU8)bytes1.raw, (RawU8)bytes2.raw, (RawU8)from.raw)};
  // Reinterpret the permuted bytes as the lane type of the index vector.
  return BitCast(d_idx, permuted);
}

Here is how TwoTablesLookupBytes can be implemented for 64-bit or smaller vectors for the tables for targets that have fixed-size vector types such as SSSE3, AVX, NEON, and AltiVec:

// Two-table byte lookup for tables of at most 64 bits each on targets with
// fixed-size vectors. The two tables are concatenated into one vector of twice
// the size, which is then handed to the single-table TableLookupBytes.
//
// `bytes1` forms the lower half of the combined table and `bytes2` the upper
// half, so smaller indices select from `bytes1`, matching the overloads for
// full 128-bit tables. The result has the type and lane count of `from`,
// which is what TableLookupBytes returns.
template <typename T, size_t N, typename TI, size_t NI, HWY_IF_LE64(T, N)>
HWY_API Vec128<TI, NI> TwoTablesLookupBytes(const Vec128<T, N> bytes1,
                                            const Vec128<T, N> bytes2,
                                            const Vec128<TI, NI> from) {
  const Twice<DFromV<decltype(bytes1)>> d2;
  // Combine takes the UPPER half first and the lower half second, hence
  // bytes2 is passed before bytes1.
  return TableLookupBytes(Combine(d2, bytes2, bytes1), from);
}

johnplatts avatar Sep 14 '22 11:09 johnplatts

Nice, thank you @johnplatts for sharing the idea. Will be happy to add these soon.

Would you like to have a quick chat via video call to exchange notes?

jan-wassenberg avatar Sep 14 '22 14:09 jan-wassenberg

A quick follow-up: TableLookupBytes has the quirk of staying within 128-bit blocks. But the AVX-512 operations here support full permutes across all vector lanes, same as SVE and RVV. Should we adopt those semantics here and call it TwoTablesLookupLanes instead?

jan-wassenberg avatar Sep 15 '22 11:09 jan-wassenberg

> A quick follow-up: TableLookupBytes has the quirk of staying within 128-bit blocks. But the AVX-512 operations here support full permutes across all vector lanes, same as SVE and RVV. Should we adopt those semantics here and call it TwoTablesLookupLanes instead?

Yes

johnplatts avatar Feb 25 '23 20:02 johnplatts

Thanks, added to wishlist :) Would also welcome a patch for this.

jan-wassenberg avatar Feb 27 '23 11:02 jan-wassenberg

Thanks @johnplatts for implementing this :D

jan-wassenberg avatar Apr 19 '23 07:04 jan-wassenberg