Feature: Low overhead case insensitive find
Describe what you are looking for
The sz_tolower function requires copying to another buffer. Within find and find_byte routines a lowercasing step can be done quickly in a few extra cycles. I'm happy to add the feature, just gauging interest here.
For avx512
SZ_INTERNAL __m512i sz_lower_avx512(__m512i in)
{
__m512i A = _mm512_set1_epi8('A');
__m512i Z = _mm512_set1_epi8('Z');
__m512i to_lower = _mm512_set1_epi8('a' - 'A');
__mmask64 ge_A = _mm512_cmpge_epi8_mask(in, A);
__mmask64 le_Z = _mm512_cmple_epi8_mask(in, Z);
__mmask64 is_upper = _kand_mask64(ge_A, le_Z);
return _mm512_mask_add_epi8(in, is_upper, in, to_lower);
}
SZ_PUBLIC sz_cptr_t sz_find_byte_case_insensitive_avx512(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n)
{
__mmask64 mask;
sz_u512_vec_t h_vec, n_vec;
/// PATCH!!!
n_vec.zmm = _mm512_set1_epi8(sz_u8_tolower(n[0]));
/// PATCH!!!
while (h_length >= 64) {
/// PATCH!!!
h_vec.zmm = sz_lower_avx512(_mm512_loadu_si512(h));
/// PATCH!!!
mask = _mm512_cmpeq_epi8_mask(h_vec.zmm, n_vec.zmm);
if (mask)
return h + sz_u64_ctz(mask);
h += 64, h_length -= 64;
}
if (h_length) {
mask = _sz_u64_mask_until(h_length);
/// PATCH!!!
h_vec.zmm = sz_lower_avx512(_mm512_maskz_loadu_epi8(mask, h));
/// PATCH!!!
// Reuse the same `mask` variable to find the bit that doesn't match
mask = _mm512_mask_cmpeq_epu8_mask(mask, h_vec.zmm, n_vec.zmm);
if (mask)
return h + sz_u64_ctz(mask);
}
return SZ_NULL_CHAR;
}
Can you contribute to the implementation?
- [X] I can contribute
Is your feature request specific to a certain interface?
It applies to everything
Contact Details
Is there an existing issue for this?
- [X] I have searched the existing issues
Code of Conduct
- [X] I agree to follow this project's Code of Conduct
I've considered this before, and it raises questions about the character set encodings. The suggested lowering function limits the applicability to ASCII content. There is probably a better way to future-proof the API for UTF8 as well. Feel free to open a PR, and I'll modify/integrate it down the road when we get to it.
The internal function can branch to a fallback if non-ascii characters are present. The public call signature is the same as find / find_byte.
SZ_INTERNAL __m512i sz_lower_avx512(__m512i in)
{
if (sz_is_ascii_avx512(in)) {
__m512i A = _mm512_set1_epi8('A');
__m512i Z = _mm512_set1_epi8('Z');
__m512i to_lower = _mm512_set1_epi8('a' - 'A');
__mmask64 ge_A = _mm512_cmpge_epi8_mask(in, A);
__mmask64 le_Z = _mm512_cmple_epi8_mask(in, Z);
__mmask64 is_upper = _kand_mask64(ge_A, le_Z);
return _mm512_mask_add_epi8(in, is_upper, in, to_lower);
} else {
// UTF-8 fallback
}
}