SWAR acceleration for UTF8 Hamming Distance

Open ashvardanian opened this issue 1 year ago • 0 comments

The upcoming implementation contains only a serial variant. Once the demand for a more efficient implementation grows, one can add a SWAR accelerated implementation.

Draft

SZ_PUBLIC sz_size_t sz_hamming_distance_utf8( //
    sz_cptr_t a, sz_size_t a_length,          //
    sz_cptr_t b, sz_size_t b_length,          //
    sz_size_t bound) {

    sz_size_t const min_length = sz_min_of_two(a_length, b_length);
    sz_size_t const max_length = sz_max_of_two(a_length, b_length);
    sz_cptr_t const a_min_end = a + min_length;
    sz_size_t distance = 0;
    bound = bound == 0 ? max_length : bound;

#if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN
    if (min_length >= SZ_SWAR_THRESHOLD) {
        sz_u64_vec_t a_vec, b_vec;
        sz_u64_vec_t a_2byte_vec, b_2byte_vec;
        sz_u64_vec_t prefixes_2byte_vec, match_vec, prefix_resolution_mask_vec;
        prefixes_2byte_vec.u64 = 0xC080C080C080C080ull;

        // Walk through both strings using SWAR and counting the number of differing characters.
        // At every point check if the next 8 bytes of content contain characters of equal length.
        for (; a + 8 <= a_min_end && distance < bound; a += 8, b += 8) {
            a_vec = sz_u64_load(a);
            b_vec = sz_u64_load(b);

            // Check if the prefixes contain up to 8x codepoints of length 1.
            // UTF8 1-byte codepoints look like: 0xxxxxxx.
            sz_size_t a_1byte_chars = sz_u64_ctz(a_vec.u64 & 0x8080808080808080ull) / 8;
            sz_size_t b_1byte_chars = sz_u64_ctz(b_vec.u64 & 0x8080808080808080ull) / 8;
            sz_size_t max_1byte_chars = sz_max_of_two(a_1byte_chars, b_1byte_chars);
            sz_size_t min_1byte_chars = sz_min_of_two(a_1byte_chars, b_1byte_chars);

            // Check if at least one of the strings starts with a 1-byte character.
            if (max_1byte_chars) {
                // One of the strings doesn't start with 1-byte characters.
                if (!a_1byte_chars && !b_1byte_chars) {
                    a += min_1byte_chars, b += min_1byte_chars;
                    continue;
                }
                // Both strings start with `min_1byte_chars`-many 1-byte characters.
                // Compare them using SWAR and skip.
                else {
                    match_vec = _sz_u64_each_byte_equal(a_vec, b_vec);
                    prefix_resolution_mask_vec = ...;
                    sz_size_t matching_prefix_length = sz_u64_popcount((~match_vec.u64) & 0x8080808080808080ull);
                    ...
                }
            }

            // Check if the prefixes contain up to 4x codepoints of length 2.
            // UTF8 2-byte codepoints look like: 110xxxxx 10xxxxxx.
            // Those bits will be matches with a 0xE0C0 repeated mask, matching the 0xC080 pattern.
            a_2byte_vec.u64 = a_vec.u64 & 0xE0C0E0C0E0C0E0C0ull;
            b_2byte_vec.u64 = b_vec.u64 & 0xE0C0E0C0E0C0E0C0ull;
            sz_size_t a_2byte_chars = sz_u64_ctz(_sz_u64_each_2byte_equal(a_2byte_vec, prefixes_2byte_vec).u64) / 16;
            sz_size_t b_2byte_chars = sz_u64_ctz(_sz_u64_each_2byte_equal(b_2byte_vec, prefixes_2byte_vec).u64) / 16;
            
        }
    }
#endif

    sz_rune_t a_rune, b_rune;
    sz_rune_length_t a_rune_length, b_rune_length;
    for (; a != a_min_end && distance < bound; a += a_rune_length, b += b_rune_length) {
        _sz_extract_utf8_rune(a, &a_rune_length, &a_rune);
        _sz_extract_utf8_rune(b, &b_rune_length, &b_rune);
        distance += (a_rune != b_rune);
    }

    return sz_min_of_two(distance, bound);
}

Feb 18 '24 03:02 ashvardanian