Zeno
Zeno copied to clipboard
Ebook extractor
We need to handle as many ebook formats as possible, ideally all of them.
EPUB ebooks are standard ZIP archives containing HTML/XHTML files.
The following Python function works in production. The same approach could be implemented as a Zeno EPUB extractor.
def extract_epub_outlinks(epub_data: bytes) -> list[str]:
    """Extract outlinks from an EPUB, i.e. a zip archive containing HTML.

    Every archive member whose name ends in ``.htm``, ``.html`` or ``.xhtml``
    (compared case-insensitively) is decoded as text and passed to
    ``extract_text_outlinks``; the results are merged and de-duplicated.

    Extraction is best-effort and never raises: if the archive cannot be
    opened an error is logged and an empty list is returned, and if a single
    member cannot be read or processed that member is logged and skipped
    while the remaining members are still scanned.

    Args:
        epub_data: Raw bytes of the EPUB file.

    Returns:
        The unique outlinks found, in no particular order.
    """
    html_suffixes = (".htm", ".html", ".xhtml")
    outlinks: set[str] = set()
    try:
        with zipfile.ZipFile(io.BytesIO(epub_data)) as zf:
            for filename in zf.namelist():
                # Lower-case first so "INDEX.HTML" and friends are not missed.
                if not filename.lower().endswith(html_suffixes):
                    continue
                try:
                    with zf.open(filename) as file:
                        # Undecodable bytes become U+FFFD instead of raising,
                        # so the rest of the document is still scanned.
                        content = file.read().decode('utf-8', errors='replace')
                    outlinks.update(extract_text_outlinks(content))
                except Exception:
                    # One broken member must not discard the whole book.
                    logging.error(
                        'Could not parse epub member %s.', filename,
                        exc_info=True,
                    )
    except Exception:
        # Not a zip archive, truncated data, etc.
        logging.error('Could not parse epub.', exc_info=True)
    return list(outlinks)