LinguaCafe
LinguaCafe copied to clipboard
Ebook import sometimes messes up chapter order.
There should be an option to either keep the current default order or order chapters by the EPUB spine (the reading order declared in the book), as the code below does.
Someone sent me this code, I will look at it later.
def loadBook(file):
    """Extract the plain text of an EPUB with its documents in spine order.

    Items are sorted by the position of their id in ``book.spine`` instead of
    being read in the order ``book.get_items()`` yields them. Items whose id
    is not listed in the spine keep their relative order and are placed after
    all spine items.

    Args:
        file: Value passed straight to ``epub.read_epub`` (presumably the path
            of the .epub file -- confirm against the caller).

    Returns:
        The concatenated text content of every ``ebooklib.ITEM_DOCUMENT`` item.
    """
    # <rp> and <rt> tags are used to add pronunciation over words; kill_tags
    # removes those tags together with their content.
    cleaner = lxml.html.clean.Cleaner(
        allow_tags=[''],
        remove_unknown_tags=False,
        kill_tags=['rp', 'rt'],
        page_structure=False,
    )
    book = epub.read_epub(file)

    # Map every spine idref to its index so items can be ordered by it.
    spine_keys = {idref: position for position, (idref, _) in enumerate(book.spine)}
    # sorted() is stable, so items missing from the spine (key = inf) stay in
    # their original relative order at the end.
    sorted_items = sorted(
        book.get_items(),
        key=lambda item: spine_keys.get(item.id, float('inf')),
    )

    pages = []
    for item in sorted_items:
        if item.get_type() != ebooklib.ITEM_DOCUMENT:
            continue
        cleaned = cleaner.clean_html(item.get_content()).decode('utf-8')
        # Needed to remove the extra <div> created by the cleaner: re-parse
        # the cleaned markup and keep only its text.
        pages.append(lxml.html.fromstring(cleaned).text_content())

    # Join once instead of growing a string with += inside the loop, and hand
    # the text back to the caller (the pasted snippet never returned it).
    return ''.join(pages)