confusables
confusables copied to clipboard
The normalize function doesn't return 'm' for 'rn', 'r' followed by 'n'
When normalize is called on 'rn' ('r' followed by 'n'), the returned list does not include 'm', even though confusable_characters('m') does include 'rn'. Presumably normalize only considers single characters. For example:
>>> normalize('rn')
['rn'] # should return ['m', 'rn']
>>> confusable_characters('m')
['๐', 'ฮผ', 'แฐ', 'แธพ', '๐', 'โฒ', '๐', 'โณ', '๐ด', '๐', 'ะผ', '๐ณ', '๐', '๐ผ', '๐ง', 'แท', '๐', '๐ฎ', '๐', '๐ถ', '๐', '๏ฝ', '๐บ', '๐', '๐', 'แ', '๐ญ', 'โ
ฏ', 'M', 'rn', 'ยต', '๐ข', 'แน', '๐ฆ', '๊ญ', 'แน', '๐ธ', 'โฒ', '๐', 'แธฟ', 'ฯป', '๊', '๏ผญ', 'm', '๊ฎ', 'โ
ฟ', '๐ฐ', '๐', 'ฯบ', 'ฮ', '๐ก', 'แน', '๐ฌ', '๐', '๐', '๐ช', '๐', '๐', 'แน', '๐ฃฃ', 'ะ', '๐', '๐ ']
This version of normalize seems to handle multi-character confusables and closely matches the behavior of the existing normalize function:
import string
from copy import copy
from itertools import product
def _usable_forms(key, candidates, prioritize_chars, prioritized_char_set):
    """Return the lowercased members of *candidates* usable as normal forms of *key*.

    A candidate is kept when every one of its characters passes ``is_ascii``,
    is not in ``NON_NORMAL_ASCII_CHARS`` and, when *prioritize_chars* is true,
    is a member of *prioritized_char_set*.  A multi-character key is never
    mapped to itself.
    """
    def is_normal(char):
        return (
            is_ascii(char)
            and char not in NON_NORMAL_ASCII_CHARS
            and (not prioritize_chars or char in prioritized_char_set)
        )

    return {
        candidate.lower()
        for candidate in candidates
        if all(is_normal(char) for char in candidate)
        and (len(key) == 1 or key != candidate)
    }


def normalize(string, prioritize_chars=False, prioritized_char_set=string.ascii_lowercase):
    """Yield the normal forms of *string*, honouring multi-character confusables.

    The input is segmented into consecutive pieces that each match a key of
    ``CONFUSABLE_MAP`` (case-insensitively, keys of any length).  Every piece
    is replaced by each of its usable forms, and every combination over every
    segmentation is yielded.

    Args:
        string: Text to normalize.  Matching is done on its lowercased form.
        prioritize_chars: When true, keep only forms made up entirely of
            characters from *prioritized_char_set*.
        prioritized_char_set: Characters allowed when *prioritize_chars* is
            true.

    Yields:
        str: One normal form at a time.  The same form may be yielded more
        than once when several segmentations lead to it.  Nothing is yielded
        when no chain of ``CONFUSABLE_MAP`` keys covers the whole input
        (including the empty string).
    """
    # The parameter shadows the ``string`` module inside this body; the default
    # value above was evaluated at definition time and still uses the module.
    # Lowercase once so that cache keys (built from lowercased map keys) and
    # the slices used to look them up always agree.
    text = string.lower()

    # Usable forms for every map key that is a prefix of the whole input.
    seeds = {}
    for key, value in CONFUSABLE_MAP.items():
        # NOTE(review): ``key != value`` compares the key with the whole value
        # collection, not with one candidate; if values are lists this is
        # always true -- confirm the intent.  Kept as in the original.
        if text.startswith(key.lower()) and (len(key) == 1 or key != value):
            seeds.setdefault(key.lower(), set()).update(
                _usable_forms(key, value, prioritize_chars, prioritized_char_set)
            )

    # cache[prefix] lists lazy products of (earlier forms, suffix) pairs that
    # spell out ``prefix``.  Seeds are stored as (form, '') pairs so that every
    # element has the same shape and the expansion step never has to
    # special-case a bare string.  Sorting keeps the output order stable.
    cache = {
        prefix: [product(sorted(forms), ('',))]
        for prefix, forms in seeds.items()
    }

    for split in range(1, len(text)):
        completed, remaining = text[:split], text[split:]
        # Entries for this prefix are no longer needed once extended.  A
        # missing entry means no segmentation ends at this position, so there
        # is nothing to extend from here.
        reachable = cache.pop(completed, None)
        if reachable is None:
            continue

        # Usable forms for every map key that continues from this position.
        matches = {}
        for key, value in CONFUSABLE_MAP.items():
            if remaining.startswith(key.lower()):
                matches.setdefault(key.lower(), set()).update(
                    _usable_forms(key, value, prioritize_chars, prioritized_char_set)
                )

        for step, forms in matches.items():
            # ``product`` snapshots ``reachable`` when it is created.
            cache.setdefault(f'{completed}{step}', []).append(
                product(reachable, sorted(forms))
            )

    if text in cache:
        yield from next_string(cache[text])
def next_string(node):
    """Yield every string encoded by a nested structure of prefix/suffix pairs.

    Args:
        node: One of
            * a ``str``, yielded as is;
            * a ``(head, suffix)`` tuple whose head is a ``str`` or an
              ``itertools.product``: every string encoded by the head is
              yielded with ``suffix`` appended;
            * an ``itertools.product`` or any other iterable: every element
              is expanded in turn.

    Yields:
        str: The expanded strings, in iteration order.
    """
    if isinstance(node, str):
        # Base case.  Without it a bare string would be iterated character by
        # character, and a one-character string iterates to itself, so the
        # recursion would never terminate.
        yield node
        return

    if isinstance(node, tuple) and isinstance(node[0], (str, product)):
        suffix = node[1]
        for prefix in next_string(node[0]):
            yield f'{prefix}{suffix}'
        return

    if isinstance(node, product):
        # The same product object can be reachable through several parent
        # tuples, so iterate a copy and leave the shared object unconsumed.
        # NOTE(review): copying itertools objects is reported as deprecated in
        # newer Python releases -- confirm against the supported versions.
        node = copy(node)

    for child in node:
        yield from next_string(child)
To clarify, is_confusable('rn', 'm') does work as expected (it returns True). Only normalize fails to handle this case.