lingua-py
lingua-py copied to clipboard
Use Unicode to detect missing languages.
For languages that have no underlying model (or whose data isn't accessible), could you add a simple Unicode range check to support them?
import unicodedata
from collections import Counter
# Script lookup table: label -> (first, last) code point, both bounds inclusive.
# Every label maps to exactly one contiguous range.
UNICODE_RANGES = {
    # Southeast Asian scripts
    "Thai": (0x0E00, 0x0E7F),
    "Lao": (0x0E80, 0x0EFF),
    "Khmer": (0x1780, 0x17FF),
    # Greek and Cyrillic
    "Greek": (0x0370, 0x03FF),
    "Cyrillic": (0x0400, 0x04FF),
    # Right-to-left scripts
    "Hebrew": (0x0590, 0x05FF),
    "Arabic": (0x0600, 0x06FF),
    # Indic scripts
    "Devanagari (Hindi, Marathi, Nepali)": (0x0900, 0x097F),
    "Bengali": (0x0980, 0x09FF),
    "Tamil": (0x0B80, 0x0BFF),
    # Caucasus
    "Georgian": (0x10A0, 0x10FF),
    "Armenian": (0x0530, 0x058F),
    # East Asian scripts
    "Hangul (Korean)": (0xAC00, 0xD7AF),
    "Hiragana (Japanese)": (0x3040, 0x309F),
    "Katakana (Japanese)": (0x30A0, 0x30FF),
    # NOTE: ideographs in this range also occur in Japanese text, so the
    # label is a best guess rather than a guarantee.
    "Chinese (Simplified/Traditional)": (0x4E00, 0x9FFF),
    # Ethiopic
    "Ethiopic (Amharic, Tigrinya)": (0x1200, 0x137F),
}
def detect_language(text, ranges=None):
    """Return the label of the script that dominates *text*.

    Each character is matched against the (start, end) code-point ranges in
    *ranges*; the label whose range captured the most characters wins.
    Characters that fall in no range (ASCII letters, digits, spaces,
    punctuation, ...) are ignored.

    Args:
        text: The string to inspect. ``None`` and ``""`` are treated as
            containing no recognisable characters.
        ranges: Optional mapping of label -> ``(start, end)`` inclusive
            code-point bounds. Defaults to the module-level
            ``UNICODE_RANGES`` table.

    Returns:
        The winning label, or ``"Unknown or Latin-based language"`` when no
        character matched any range. On a tie, the label whose first matching
        character appears earliest in *text* is returned.
    """
    if ranges is None:
        ranges = UNICODE_RANGES

    script_counts = Counter()
    for char in text or "":
        code_point = ord(char)
        for script, (start, end) in ranges.items():
            if start <= code_point <= end:
                script_counts[script] += 1
                break  # a character is credited to the first matching range only

    if not script_counts:
        return "Unknown or Latin-based language"

    # max() keeps the first maximal key, and the counter preserves
    # first-seen order, which gives the tie-break described above.
    return max(script_counts, key=script_counts.get)
# Example usage: print the detected label for one sample per script.
# The trailing comment on each sample is the label it is expected to produce.
for _sample in (
    "ສະບາຍດີ",  # Lao
    "こんにちは",  # Hiragana (Japanese)
    "안녕하세요",  # Hangul (Korean)
    "Привет",  # Cyrillic
    "مرحبا",  # Arabic
    "Γειά σου",  # Greek
):
    print(detect_language(_sample))