lingua-py icon indicating copy to clipboard operation
lingua-py copied to clipboard

Use Unicode to detect missing languages.

Open KevinColemanInc opened this issue 7 months ago • 1 comment

For languages that have no underlying model (or whose training data isn't accessible), could you add a simple Unicode range check so that they are still supported?

import unicodedata

# Lookup table: human-readable script/language label -> (first, last) code
# point of the Unicode block used to recognise it. Each row below is
# (label, first code point, last code point); the resulting dict keeps the
# rows in the order they are listed here.
UNICODE_RANGES = {
    label: (first, last)
    for label, first, last in (
        ("Thai", 0x0E00, 0x0E7F),
        ("Lao", 0x0E80, 0x0EFF),
        ("Khmer", 0x1780, 0x17FF),
        ("Greek", 0x0370, 0x03FF),
        ("Cyrillic", 0x0400, 0x04FF),
        ("Hebrew", 0x0590, 0x05FF),
        ("Arabic", 0x0600, 0x06FF),
        ("Devanagari (Hindi, Marathi, Nepali)", 0x0900, 0x097F),
        ("Bengali", 0x0980, 0x09FF),
        ("Tamil", 0x0B80, 0x0BFF),
        ("Georgian", 0x10A0, 0x10FF),
        ("Armenian", 0x0530, 0x058F),
        ("Hangul (Korean)", 0xAC00, 0xD7AF),
        ("Hiragana (Japanese)", 0x3040, 0x309F),
        ("Katakana (Japanese)", 0x30A0, 0x30FF),
        ("Chinese (Simplified/Traditional)", 0x4E00, 0x9FFF),
        ("Ethiopic (Amharic, Tigrinya)", 0x1200, 0x137F),
    )
}

def detect_language(text):
    """Return the label of the script that occurs most often in *text*.

    Every character is looked up in ``UNICODE_RANGES`` (bounds are treated
    as inclusive) and a per-label tally is kept. Characters that fall in
    none of the ranges are ignored.

    Args:
        text: The string to classify.

    Returns:
        The ``UNICODE_RANGES`` key with the highest character count, or the
        string ``"Unknown or Latin-based language"`` when no character
        matched any range.
    """
    tally = {}

    for code_point in map(ord, text):
        # First (and, since we stop there, only) label whose range holds
        # this code point; None when the character is outside every range.
        label = next(
            (
                name
                for name, (low, high) in UNICODE_RANGES.items()
                if low <= code_point <= high
            ),
            None,
        )
        if label is not None:
            tally[label] = tally.get(label, 0) + 1

    if tally:
        # On equal counts max() keeps the first candidate it meets, i.e. the
        # label that entered the tally earliest.
        return max(tally, key=lambda name: tally[name])

    return "Unknown or Latin-based language"

# Example usage: print the detected label for one sample per script.
for sample in (
    "ສະບາຍດີ",  # Lao
    "こんにちは",  # Hiragana (Japanese)
    "안녕하세요",  # Hangul (Korean)
    "Привет",  # Cyrillic
    "مرحبا",  # Arabic
    "Γειά σου",  # Greek
):
    print(detect_language(sample))

KevinColemanInc avatar Mar 10 '25 07:03 KevinColemanInc