Snippet: concatenating various Hunspell dictionaries (.dic word lists)
I.e. when you want to make a big dictionary for all of Latin, from the data in git://anongit.freedesktop.org/libreoffice/dictionaries:
"""Concatenate every Latin-script Hunspell dictionary into one word list.

Walks a checkout of git://anongit.freedesktop.org/libreoffice/dictionaries
(expected at ../dictionaries), keeps only dictionaries whose TRY characters
are Latin or script-neutral, strips each .dic file's leading entry-count
line, and writes the merged word list to all.dic.
"""
from pathlib import Path

from fontTools.unicodedata import script

# Scripts accepted as "Latin": Common, Unknown, Inherited, Latin.
ACCEPTED_SCRIPTS = {"Zyyy", "Zzzz", "Zinh", "Latn"}


def _aff_field(aff, key):
    """Return the first value of directive `key` in raw .aff bytes, or None.

    Operates on bytes because some .aff files (e.g. Hungarian) declare one
    encoding and then switch to UTF-8 mid-file, so the whole file cannot be
    decoded up front.
    """
    for line in aff.splitlines():
        if line.startswith(key):
            # split() with no argument handles tabs AND runs of spaces,
            # unlike split(b" ") which yields empty fields on "SET  UTF-8".
            fields = line.split()
            if len(fields) > 1:
                return fields[1]
    return None


dic_data = []
for p in Path("../dictionaries").glob("**/*.aff"):
    aff = p.read_bytes()

    encoding = _aff_field(aff, b"SET")
    if encoding is None:
        print("Can't find encoding for", p, ", assuming utf-8")
        encoding = "utf-8"
    else:
        encoding = encoding.decode("ascii")
    print("Reading", p, "with encoding", encoding)

    try_chars = _aff_field(aff, b"TRY")
    if try_chars is None:
        print("Can't find TRY for", p)
        continue
    try_chars = try_chars.decode(encoding)
    # Skip any dictionary whose alphabet contains a non-Latin character.
    if any(script(c) not in ACCEPTED_SCRIPTS for c in try_chars):
        print("Not Latin, skipping", p)
        continue

    dic = p.with_suffix(".dic").read_text(encoding=encoding).splitlines()
    del dic[0]  # Remove the "number of entries" header line.
    dic_data.extend(dic)

# Explicit encoding so the output is UTF-8 regardless of platform locale.
Path("all.dic").write_text("\n".join(dic_data), encoding="utf-8")
Note: some .aff files like Hungarian start with some ISO encoding and then switch to UTF-8, so you'll have to read the files in as bytes.
Oh thank you!
@madig Any chance you can somehow integrate this into the CLI? If not, I'll do it. Just thought to ask. :)
Uhm, eventually, but I'm currently busy with other stuff...
I incorporated some of this into the code now: https://github.com/behdad/halfkern/blob/ed6d2df80ffbcb3b05b6d6a5c0be0f04d5a122b2/ngrams.py#L75