pdfdir
pdfdir copied to clipboard
实际页码可能因为pdf中没包含空白页而需要中途校正,另附简化后无gui脚本
有很多书的新章节是从奇数页开始的,上一章的最后一页可能是空白页,但扫描后的pdf没有包含这些空白页,这就造成ocr目录中的页码即使加上一个固定偏移量仍然会有错位,而且错位会越来越大,只有写入bookmark后尝试点击才会发现是否有错位。解决方式也简单:在出现错位的目录行加上一个校正数字,之后只需在校正值发生变化的地方再写上新的数字。这样不用去目录文件中更新很多页码,目录文件里的页码还能与书中实际目录的页码对应。我简化成脚本,并在脚本中增加了该功能。
#!/bin/python
import os
import re
import sys
from collections import defaultdict
from pypdf import PdfWriter, PdfReader, PageObject
from pypdf.generic import Destination
class Pdf(object):
    """Wrap one PDF file for reading its outline and writing a bookmarked copy.

    The source file is only read; ``save_pdf`` writes the result to a sibling
    file whose name carries a ``_new`` suffix before the extension.
    """

    def __init__(self, path):
        """Load the PDF at ``path`` in non-strict mode and index its pages.

        Args:
            path: Filesystem path of the PDF to read.
        """
        self.path = path
        # Pass the path itself instead of an ``open()`` handle: the handle
        # was never closed, whereas pypdf manages the file on its own when
        # it is given a path (it reads the content into memory).
        self.reader = PdfReader(path, strict=False)
        # Maps the object id of each page to that page's ``page_number``.
        self.pages_num = self._get_pages_num(self.reader.pages)
        self._writer = None

    @property
    def _new_path(self):
        """Return the output path: ``<name>_new<ext>`` next to the source."""
        stem, extension = os.path.splitext(self.path)
        return stem + '_new' + extension

    @property
    def writer(self):
        """Return the writer holding a copy of the source, created on first use."""
        # Compare against None explicitly so that an already-built writer is
        # never rebuilt (which would discard bookmarks added so far).
        if self._writer is None:
            new_writer = PdfWriter()
            self.copy_reader_to_writer(self.reader, new_writer)
            # Remove any outline entry from the document root so that only
            # the bookmarks added through this class remain.
            new_writer._root_object.pop("/Outlines", None)
            self._writer = new_writer
        return self._writer

    @staticmethod
    def copy_reader_to_writer(reader, writer):
        """Copy the content of ``reader`` into ``writer``, degrading on failure.

        Three strategies are tried in order: a plain ``append`` without the
        outline, an ``append`` that additionally excludes the "/Annots" and
        "/B" fields, and finally ``append_pages_from_reader``. An exception
        raised by the last strategy propagates to the caller.
        """
        try:
            writer.append(reader, import_outline=False)
            return
        except Exception as e:
            print("Copy pdf failed, {}, try to exclude /Annots and /B".format(e))
        try:
            writer.append(reader, import_outline=False, excluded_fields=["/Annots", "/B"])
        except Exception as e:
            print("Copy pdf failed again, {}, try to use append_pages_from_reader".format(e))
            writer.append_pages_from_reader(reader)

    @staticmethod
    def _get_pages_num(pages):
        """Build a ``{page object id: page_number}`` mapping for ``pages``.

        Entries that are not ``PageObject`` instances, or whose attributes
        cannot be read, are reported on stdout and left out of the mapping.
        """
        id_to_number = {}
        for page in pages:
            try:
                if isinstance(page, PageObject):
                    id_to_number[page.indirect_ref.idnum] = page.page_number
                else:
                    print("Unknown page type {} for {}".format(type(page), page.page_number))
            except Exception as e:
                print(e)
        return id_to_number

    def _outlines_to_bookmarks(self, outlines, current_level=0):
        """Flatten a nested outline into tab-indented ``title<TAB>page`` lines.

        Args:
            outlines: Iterable of ``Destination`` items and nested lists of
                the same shape; a nested list holds the children one level
                deeper.
            current_level: Nesting depth of ``outlines``; it is the number of
                leading tab characters put in front of each title.

        Returns:
            A list of strings, one per destination that could be resolved.
            Destinations that fail to resolve are reported on stdout and
            skipped.
        """
        lines = []
        for node in outlines:
            if isinstance(node, Destination):
                try:
                    target = node.page
                    idnum = target if isinstance(target, int) else target.idnum
                    title = "\t" * current_level + node.title.strip()
                    # The emitted page is the stored page number plus one.
                    page_num = self.pages_num[idnum] + 1
                    lines.append("{title}\t{page_num}".format(title=title, page_num=page_num))
                except Exception as e:
                    print(e)
            elif isinstance(node, list):
                lines += self._outlines_to_bookmarks(node, current_level + 1)
            else:
                print("Unknown outline type: {} in {}".format(type(node), node))
        return lines

    def exist_bookmarks(self):
        """Return the outline of the source PDF as a list of TOC lines."""
        return self._outlines_to_bookmarks(self.reader.outline)

    def add_bookmark(self, title, pagenum, parent=None):
        """Add one outline item to the writer and return the created item.

        Args:
            title: Text of the bookmark.
            pagenum: Page index handed to ``add_outline_item`` unchanged.
            parent: Item returned by an earlier call, or None for a
                top-level bookmark.
        """
        return self.writer.add_outline_item(title, pagenum, parent=parent)

    def save_pdf(self):
        """Write the bookmarked copy to ``_new_path`` and return that path.

        A file already present at the target path is removed first.
        """
        target = self._new_path
        if os.path.exists(target):
            os.remove(target)
        with open(target, 'wb') as out:
            self.writer.write(out)
        return target
def _add_bookmark(pdf, index_dict):
    """Create one bookmark on ``pdf`` for every entry of ``index_dict``.

    Args:
        pdf: Object exposing ``writer.pages`` and
            ``add_bookmark(title, pagenum, parent)``.
        index_dict: Mapping of integer key -> entry dict. Each entry may hold
            'title' (default ''), 'pagenum' (1-based, default 1) and
            'parent' (key of an entry that sorts earlier).

    Returns:
        None.
    """
    if not index_dict:
        return None
    last_page_index = len(pdf.writer.pages) - 1
    created = {}
    # Walk the keys that actually exist, in ascending order, so a mapping
    # with gaps in its keys no longer fails with KeyError.
    for key in sorted(index_dict):
        entry = index_dict[key]
        # Convert the 1-based page to a 0-based index and keep it inside the
        # document: too-large pages go to the last page, and pages below 1
        # go to the first page instead of becoming a negative index.
        page_index = max(0, min(entry.get('pagenum', 1) - 1, last_page_index))
        created[key] = pdf.add_bookmark(
            entry.get('title', ''),
            page_index,
            created.get(entry.get('parent')),
        )
def add_bookmark(path, index_dict):
    """Write the entries of ``index_dict`` as bookmarks into a copy of a PDF.

    Args:
        path: Path of the source PDF.
        index_dict: Bookmark entries in the form accepted by ``_add_bookmark``.

    Returns:
        The value returned by ``Pdf.save_pdf`` (the path of the written copy).
    """
    document = Pdf(path)
    _add_bookmark(document, index_dict)
    saved_path = document.save_pdf()
    return saved_path
def get_bookmarks(path):
    """Return the existing bookmarks of the PDF at ``path`` as TOC lines.

    An empty ``path`` yields an empty list. Any error raised while reading
    the PDF is reported on stdout and also yields an empty list.
    """
    bookmarks = []
    if path:
        try:
            bookmarks = Pdf(path).exist_bookmarks()
        except Exception as e:
            print("Read pdf %s failed! %s" % (path, e))
            bookmarks = []
    return bookmarks
def toc_reader(path, gap):
    """Parse a tab-separated TOC file into the dict used to write bookmarks.

    Each non-blank line has the form::

        <tabs><title>\\t<page>[\\t<+N or -N>]

    The number of leading tabs is the nesting depth. The optional signed
    fourth field sets a page correction that stays in effect for every
    following line until another line replaces it. The resulting page of an
    entry is ``page + gap + correction``.

    Args:
        path: Path of the TOC text file (read as UTF-8, a BOM is tolerated).
        gap: Constant page offset added to every entry; an int or a string
            accepted by ``int()``.

    Returns:
        A dict keyed by consecutive integers starting at 0, in file order.
        Each value is ``{'title': str, 'pagenum': int}`` plus, for indented
        entries, ``'parent'``: the key of the most recent entry one level up.

    Raises:
        ValueError: If ``gap`` is not an integer, a line does not match the
            expected format, the resulting page is lower than the previous
            entry's page, or an indented line has no entry one level up.
    """
    pattern = re.compile(r'^(\t*)([^\t]+)\t(-?\d+)(?:\t([-+]\d+))?$')
    offset = int(gap)  # converted once instead of on every line
    tocdict = {}
    levels = defaultdict(list)
    lastpagenum = 0
    fix = 0
    # 'utf-8-sig' reads plain UTF-8 and also drops a leading BOM, so the
    # result no longer depends on the platform's default encoding.
    with open(path, 'r', encoding='utf-8-sig') as toc:
        for line, item in enumerate(toc):
            if not item.strip():
                # Blank lines (typically at the end of the file) carry no entry.
                continue
            content = pattern.search(item)
            if content is None:
                raise ValueError(f"line {line}:{item} line ill-formatted")
            indent, title, pagenum, fixpagenum = content.group(1, 2, 3, 4)
            if fixpagenum:
                fix = int(fixpagenum)
            pagenum = int(pagenum) + offset + fix
            if pagenum < lastpagenum:
                raise ValueError(f"line {line}:{item} pagenum wrong")
            # Keys stay consecutive even when blank lines were skipped; the
            # bookmark writer addresses parents by these keys.
            index = len(tocdict)
            entry = {'title': title, 'pagenum': pagenum}
            depth = len(indent)
            if depth > 0:
                candidates = levels[depth - 1]
                if not candidates:
                    raise ValueError(f"line {line}:{item} has no parent entry")
                entry['parent'] = candidates[-1]
            tocdict[index] = entry
            levels[depth].append(index)
            lastpagenum = pagenum
    return tocdict
if __name__ == '__main__':
    # Command line entry point with two modes:
    #   <script> PDF               print the existing outline as TOC lines
    #   <script> PDF TOCFILE GAP   write TOCFILE as the outline of a new PDF,
    #                              adding GAP to every page number
    args = len(sys.argv)
    if args == 2:
        file = sys.argv[1]
        for item in get_bookmarks(file):
            print(item)
    elif args == 4:
        file, toc, gap = sys.argv[1:]
        tocdict = toc_reader(toc, gap)
        add_bookmark(file, tocdict)
    else:
        # Any other argument count used to do nothing at all; report how to
        # call the script and leave with a non-zero exit status instead.
        sys.exit(
            "usage: {0} PDF\n"
            "       {0} PDF TOCFILE GAP".format(sys.argv[0])
        )
根据不是很新的api更新的,要求页码不能递减(校正后的页码必须大于或等于上一条的页码)。用法:`pdfbookmark.py xxx.pdf` 输出toc;`pdfbookmark.py xxx.pdf tocfile 10` 写入toc(10为加到每个页码上的偏移量),生成新pdf。我自己觉得用脚本更方便,不敢藏私,借用了api才很好用。
感谢提供优化代码