confusables icon indicating copy to clipboard operation
confusables copied to clipboard

The normalize function doesn't return 'm' for 'rn', 'r' followed by 'n'

Open lazydog2 opened this issue 1 year ago • 2 comments

When the normalize function is applied to 'rn' ('r' followed by 'n'), the returned list does not include 'm', even though applying confusable_characters to 'm' includes 'rn' in its list. Presumably normalize only handles single characters, e.g.:

>>> normalize('rn')
['rn']  # should return ['m', 'rn']
>>> confusable_characters('m')
['๐‘š', 'ฮผ', 'แ—ฐ', 'แธพ', '๐˜”', 'โฒ˜', '๐“‚', 'โ„ณ', '๐‘ด', '๐“œ', 'ะผ', '๐šณ', '๐š–', '๐™ผ', '๐œง', 'แŽท', '๐‘€', '๐˜ฎ', '๐•ž', '๐“ถ', '๐‘œ€', '๏ฝ', '๐—บ', '๐ž›', '๐—†', 'แ›–', '๐›ญ', 'โ…ฏ', 'M', 'rn', 'ยต', '๐™ข', 'แนƒ', '๐ฆ', '๊ญ‘', 'แน‚', '๐•ธ', 'โฒ™', '๐’Ž', 'แธฟ', 'ฯป', '๊“Ÿ', '๏ผญ', 'm', '๊ฎ‡', 'โ…ฟ', '๐Šฐ', '๐”', 'ฯบ', 'ฮœ', '๐ก', 'แน€', '๐–ฌ', '๐Œ', '๐Œ‘', '๐”ช', '๐™ˆ', '๐•„', 'แน', '๐‘ฃฃ', 'ะœ', '๐–’', '๐— ']

lazydog2 avatar Oct 12 '23 22:10 lazydog2

This version of normalize seems to handle multi-character confusables and closely matches the behavior of the existing normalize function:

import string
from copy import copy

def normalize(string, prioritize_chars=False, prioritized_char_set=string.ascii_lowercase):
    """Lazily yield normalized forms of ``string``, allowing multi-character confusables.

    This is a generator: nothing runs until it is iterated, and results are
    produced by ``next_string`` from the entry stored under ``cache[string]``.

    Args:
        string: Text to normalize. Inside the body this name shadows the
            ``string`` module; the default of ``prioritized_char_set`` is
            evaluated at definition time, so it still refers to the module.
        prioritize_chars: When true, a candidate replacement is kept only if
            every one of its characters is in ``prioritized_char_set``.
        prioritized_char_set: Characters accepted when ``prioritize_chars`` is
            true. Defaults to ``string.ascii_lowercase``.

    Yields:
        Whatever ``next_string(cache[string])`` yields.

    NOTE(review): ``is_ascii``, ``NON_NORMAL_ASCII_CHARS``, ``CONFUSABLE_MAP``
    and ``product`` are not defined or imported in this snippet; presumably
    they come from the surrounding confusables module (``product`` presumably
    being ``itertools.product``) -- confirm before using it stand-alone.

    NOTE(review): cache keys are built from lowercased map keys, but the
    lookups ``cache[completed_string]`` and ``cache[string]`` use slices of the
    un-lowercased input, and neither lookup is guarded. It looks like
    mixed-case input, an empty string, or a prefix with no cache entry would
    raise ``KeyError`` -- confirm.
    """
    # Maps a prefix of the input to the list of alternatives collected for it.
    cache = {}
    # Seed the cache: for every CONFUSABLE_MAP key that is a case-insensitive
    # prefix of the input, collect the lowercased values whose characters all
    # pass is_ascii, are not in NON_NORMAL_ASCII_CHARS and (optionally) belong
    # to prioritized_char_set. Keys differing only by case are merged under
    # their lowercased form by the extend/update expression below.
    # NOTE(review): the trailing filter compares ``key`` with the whole
    # ``value`` collection rather than with an element of it -- confirm intent.
    for (k, v) in {key:set([value2.lower() for value2 in value if all(is_ascii(char) and char not in NON_NORMAL_ASCII_CHARS and (not prioritize_chars or char in prioritized_char_set) for char in value2) and (len(key) == 1 or key != value2)]) for (key,value) in CONFUSABLE_MAP.items() if string.lower().startswith(key.lower()) and (len(key) == 1 or key != value)}.items():
        cache[k.lower()].extend(v) if k.lower() in cache else cache.update({k.lower():list(v)})
    # Walk every split point: the first x characters are already covered by a
    # cache entry, the remainder still has to be matched.
    for x in range(1, len(string)):
        completed_string = string[0:x]
        remaining_string = string[x:]
        # Lowercased map key -> set of accepted replacements, for every key
        # that is a case-insensitive prefix of the remaining text.
        matching_confusables = {}
        for (k, v) in {key:set([value2.lower() for value2 in value if all(is_ascii(char) and char not in NON_NORMAL_ASCII_CHARS and (not prioritize_chars or char in prioritized_char_set) for char in value2) and (len(key) == 1 or key != value2)]) for (key,value) in CONFUSABLE_MAP.items() if remaining_string.lower().startswith(key.lower())}.items():
            matching_confusables[k.lower()].update(v) if k.lower() in matching_confusables else matching_confusables.update({k.lower():set(v)})
        for (k, v) in matching_confusables.items():
            # The product object is stored unexpanded in a one-element list;
            # it pairs each alternative for the completed prefix with each
            # replacement for the key that follows it.
            normal_forms = [product(cache[completed_string], v)]
            cache_key = f'{completed_string}{k}'
            cache[cache_key].extend(normal_forms) if cache_key in cache else cache.update({cache_key:normal_forms})
        # Drop the entry for this prefix; product objects created above still
        # hold a reference to the list it contained.
        del cache[completed_string]
    
    # Expand the nested structure stored for the full input into strings.
    for temp in next_string(cache[string]):
        yield temp

def next_string(node):
    """Flatten a nested structure of string fragments into complete strings.

    ``node`` is handled according to its shape:

    * a tuple whose first item is a ``str``: the first two items are
      formatted back to back and yielded as one string;
    * a tuple whose first item is a ``product`` object: a copy of that object
      is expanded recursively and the tuple's second item is appended to
      every resulting string;
    * anything else: it is iterated, and each element is expanded by the same
      rules (non-tuple elements are copied before being iterated).

    ``product`` objects are only ever walked through ``copy`` -- presumably so
    that an object shared between several tuples can be expanded repeatedly.

    Yields:
        str: one assembled string at a time, in iteration order.
    """

    def headed_by(candidate, head_type):
        # True for a tuple whose first item is an instance of ``head_type``.
        return isinstance(candidate, tuple) and isinstance(candidate[0], head_type)

    def expand_pair(pair):
        # ``pair`` is a tuple headed by either a str or a product object.
        if isinstance(pair[0], str):
            yield f'{pair[0]}{pair[1]}'
            return
        # The second item is read lazily, once per expanded prefix.
        for prefix in next_string(copy(pair[0])):
            yield f'{prefix}{pair[1]}'

    if headed_by(node, (str, product)):
        yield from expand_pair(node)
        return

    for element in node:
        if headed_by(element, (str, product)):
            yield from expand_pair(element)
            continue
        # Any other element is treated as a container of further nodes.
        for child in copy(element):
            if headed_by(child, str):
                yield f'{child[0]}{child[1]}'
            else:
                yield from next_string(copy(child))

lazydog2 avatar Oct 16 '23 00:10 lazydog2

To clarify, is_confusable('rn', 'm') does work as expected (it returns True). It's only normalize that doesn't.

drothlis avatar Feb 27 '24 11:02 drothlis