german-nouns
german-nouns copied to clipboard
Add a non-exact match strategy
In our testing we found more missing words than we can realistically add to Wiktionary (see also https://github.com/gambolputty/german-nouns/issues/8). So we have now implemented the following strategy, which at least allows us to detect the genus. Would this be interesting to add to your package?
# Primary suffix table for guessing a noun's genus, keyed by genus label
# ("n" = neuter, "f" = feminine, "m" = masculine). This table is consulted
# before the secondary one. Key order and list order are kept as-is because
# lookups walk the table in this order.
primary_german_genus_endings = {
    "n": [
        "chen", "ett", "eau", "lein", "icht", "il", "ium",
        "it", "ma", "ment", "tel", "tum", "um",
    ],
    "f": [
        "in", "a", "ade", "age", "anz", "elle", "ette", "ere", "enz",
        "ei", "ine", "isse", "itis", "ive", "ie", "heit", "keit", "ik",
        "sion", "se", "sis", "tät", "ung", "ur", "schaft",
    ],
    "m": [
        "ant", "ast", "ich", "ist", "ig", "ling", "or",
        "us", "ismus", "är", "eur", "iker", "ps",
    ],
}
# Secondary suffix table: weaker indicators that are only tried after the
# primary table has produced no match.
secondary_german_genus_endings = {
    # Roughly three out of four nouns ending in -nis or -sal are neuter.
    "n": ["nis", "sal"],
    # The overwhelming majority of -ion nouns are feminine, although
    # exceptions exist (e.g. the masculine "Postillion").
    "f": ["ion"],
    # More than half of the nouns ending in -er, -en or -el are masculine.
    "m": ["er", "en", "el"],
}
def determine_genus_from_ending(word, german_genus_endings):
    """Guess the grammatical gender of ``word`` from its suffix.

    Args:
        word: The noun to classify.
        german_genus_endings: Mapping of a genus label (e.g. ``"n"``,
            ``"f"``, ``"m"``) to an iterable of suffixes for that genus.

    Returns:
        ``{"genus": <label>}`` for the longest suffix ``word`` ends with,
        or ``None`` when no suffix matches.
    """
    best_genus = None
    best_length = -1
    for genus, endings in german_genus_endings.items():
        for ending in endings:
            # The most specific (longest) suffix wins. Taking the first hit
            # in table order would let a short suffix shadow a longer one,
            # e.g. "it" (n) matching "Freiheit" before "heit" (f), or
            # "ur" (f) matching "Friseur" before "eur" (m). The strict ">"
            # keeps the earlier table entry when two lengths tie.
            if len(ending) > best_length and word.endswith(ending):
                best_genus = genus
                best_length = len(ending)
    if best_length < 0:
        return None
    return {"genus": best_genus}
def german_noun_lookup(word):
    """Look up ``word`` in ``german_nouns`` and ensure the entry has a genus.

    Only the first entry returned for ``word`` is considered. If that entry
    has no ``"genus"`` key, the genus is taken from, in order: the entry's
    ``"genus 1"`` key, the special case for words ending in "leute", the
    primary suffix table, and finally the secondary suffix table.

    Args:
        word: The noun to look up.

    Returns:
        A shallow copy of the first entry with a ``"genus"`` key set, or
        ``None`` when ``word`` has no entry or no genus can be determined.
    """
    entries = german_nouns[word]
    if not entries:
        return None

    # Work on a copy so that adding "genus" never writes into the object
    # handed back by german_nouns (which may be its shared data - to be
    # confirmed against the package).
    entry = dict(entries[0])
    if "genus" in entry:
        return entry

    if "genus 1" in entry:
        entry["genus"] = entry["genus 1"]
        return entry

    if word.lower().endswith("leute"):
        entry["genus"] = "f"
        return entry

    # Try the primary suffix table first, then the weaker secondary one.
    for endings in (primary_german_genus_endings, secondary_german_genus_endings):
        guess = determine_genus_from_ending(word, endings)
        if guess is not None and "genus" in guess:
            entry["genus"] = guess["genus"]
            return entry

    return None
def german_noun_analysis(word, genus_only=False):
    """Analyse ``word``, falling back to compound splitting and suffix rules.

    The steps are tried in this order:

    1. Exact lookup via ``german_noun_lookup``.
    2. With ``genus_only``: the primary suffix table.
    3. Compound splitting: drop leading characters one at a time (at least
       two dropped, at least three kept) and look up the capitalised rest.
    4. With ``genus_only``: the secondary suffix table.

    Args:
        word: The noun to analyse.
        genus_only: When true, the caller only needs the genus; suffix
            heuristics are enabled and the flexion forms of a compound
            match are not rewritten.

    Returns:
        For an exact or compound match, the entry dict. A compound match is
        a copy of the base noun's entry with ``"Lemma"`` set to ``word``
        and, unless ``genus_only``, each flexion form lower-cased and
        prefixed with the stripped part of ``word``. For a suffix match,
        ``{"genus": <label>}``. ``None`` if nothing matches.
    """
    result = german_noun_lookup(word)
    if result is not None:
        return result

    if genus_only:
        result = determine_genus_from_ending(word, primary_german_genus_endings)
        if result is not None:
            return result

    # Start after the first 2 letters and keep at least 3 letters of the
    # word as the candidate base noun.
    for i in range(2, len(word) - 2):
        partial_word = word[i:]
        # avoid cases like 'Ende' at the end of 'Arbeitgebende'
        if partial_word == "ende":
            break
        match = german_noun_lookup(partial_word.capitalize())
        if match is None:
            continue

        # Build a new dict instead of editing the looked-up entry in place:
        # overwriting "Lemma" and the flexion forms on a shared entry would
        # corrupt later lookups of the base noun itself.
        result = dict(match)
        result["Lemma"] = word
        if not genus_only and "flexion" in match:
            word_prefix = word[:i]
            result["flexion"] = {
                case: word_prefix + form.lower()
                for case, form in match["flexion"].items()
            }
        return result

    if genus_only:
        # The primary table was already tried above and found nothing, so
        # the last resort is the weaker secondary table.
        return determine_genus_from_ending(word, secondary_german_genus_endings)
    return None