pdfdir
pdfdir copied to clipboard
这个有点overkill了,我简化成一个脚本
我用之前的版本时,如果要建立下级目录,就需要想出精巧的正则表达式,这就有点 overkill 了;我想直接用 tab 来指示层级。不知道现在的版本是不是仍然会忽略行首的 tab。
#!/bin/python
import os
import re
import sys
from collections import defaultdict
from pypdf import PdfWriter, PdfReader
class Pdf(object):
    """A PDF loaded into a writer so that a new outline can be attached.

    The source file itself is never modified: ``save_pdf`` writes the result
    to a sibling file whose name carries a ``_new`` suffix.
    """

    def __init__(self, path):
        """Load the PDF at ``path`` and discard the writer's "/Outlines" entry.

        Args:
            path: Path of the source PDF file.
        """
        self.path = path
        # Hand the path itself to PdfReader instead of an ``open()`` handle:
        # the previous code opened the file and never closed it.
        reader = PdfReader(path, strict=False)
        self.writer = PdfWriter()
        self.writer.append(reader)
        # Remove the "/Outlines" key from the writer's root object (no-op
        # when the key is absent) so bookmarks added later start from scratch.
        self.writer._root_object.pop("/Outlines", None)

    @property
    def _new_path(self):
        """Return the output path: ``<name>_new<ext>`` next to the source."""
        name, ext = os.path.splitext(self.path)
        return name + '_new' + ext

    def add_bookmark(self, title, pagenum, parent=None):
        """Add one outline item and return what the writer returns for it.

        Args:
            title: Text of the bookmark.
            pagenum: Page number forwarded unchanged to
                ``PdfWriter.add_outline_item``.
            parent: Value previously returned by this method for the parent
                bookmark, or ``None`` for a top-level bookmark.
        """
        return self.writer.add_outline_item(title, pagenum, parent=parent)

    def save_pdf(self):
        """Write the document to ``_new_path`` and return that path.

        An existing file at the output path is removed first.
        """
        target = self._new_path
        if os.path.exists(target):
            os.remove(target)
        with open(target, 'wb') as out:
            self.writer.write(out)
        return target
def _add_bookmark(pdf, index_dict):
    """Create one bookmark per entry of ``index_dict`` on ``pdf``.

    Entries are visited by key from 0 up to the largest key, so every key in
    that range must be present. Each entry is a dict with ``'title'`` and
    ``'pagenum'`` and, optionally, ``'parent'`` holding the key of an entry
    visited earlier.

    Args:
        pdf: Object exposing ``add_bookmark(title, pagenum, parent)``.
        index_dict: Mapping of entry key -> entry dict; may be empty.

    Returns:
        None.
    """
    if not index_dict:
        return None
    created = {}  # entry key -> object returned by pdf.add_bookmark
    last_key = max(index_dict)
    for key in range(last_key + 1):
        entry = index_dict[key]
        title = entry['title']
        # The stored page number is shifted down by one before being passed
        # on (presumably 1-based TOC pages -> 0-based PDF pages; confirm).
        page_index = entry['pagenum'] - 1
        # Entries without a 'parent' look up None, which yields None, i.e.
        # a top-level bookmark.
        parent_item = created.get(entry.get('parent'))
        created[key] = pdf.add_bookmark(title, page_index, parent_item)
def add_bookmark(path, index_dict):
    """Attach the bookmarks in ``index_dict`` to the PDF at ``path``.

    Args:
        path: Path of the source PDF file.
        index_dict: Mapping of entry key -> entry dict, as returned by
            ``toc_reader``.

    Returns:
        The path of the newly written PDF, as returned by ``Pdf.save_pdf``.
    """
    document = Pdf(path)
    _add_bookmark(document, index_dict)
    saved_path = document.save_pdf()
    return saved_path
def toc_reader(path, gap):
    """Parse a tab-indented table-of-contents file into bookmark entries.

    Every non-blank line must look like ``<tabs><title><TAB><page>``: the
    number of leading tabs is the nesting depth, the title may not contain a
    tab, and the page is a run of digits. Blank lines are skipped.

    Args:
        path: Path of the TOC text file.
        gap: Offset added to every page number; anything ``int()`` accepts.

    Returns:
        A dict keyed by consecutive integers starting at 0 (one per entry, in
        file order). Each value is ``{'title': str, 'pagenum': int}`` plus a
        ``'parent'`` key (the key of the enclosing entry) for indented lines.

    Raises:
        AssertionError: If a line is ill-formatted, if page numbers decrease,
            or if a line is indented deeper than one level below the entry
            before it.
        ValueError: If ``gap`` cannot be converted to an integer.
    """
    pattern = re.compile(r'^(\t*)([^\t]+)\t(\d+)$')
    offset = int(gap)
    tocdict = {}
    # open_parents[d] is the key of the most recent entry at depth d on the
    # branch currently being read; deeper levels are dropped whenever a
    # shallower line starts a new branch, so stale parents are never reused.
    open_parents = []
    lastpagenum = 0
    with open(path, 'r') as toc:
        for line, item in enumerate(toc):
            if not item.strip():
                # Tolerate empty lines, e.g. a trailing newline at the end.
                continue
            content = pattern.search(item)
            # Raised explicitly (not via ``assert``) so that the checks still
            # run under ``python -O``.
            if content is None:
                raise AssertionError(f"line {line}:{item} line ill-formatted")
            indent, title, pagenum = content.group(1, 2, 3)
            depth = len(indent)
            pagenum = int(pagenum) + offset
            if pagenum < lastpagenum:
                raise AssertionError(f"line {line}:{item} pagenum wrong")
            if depth > len(open_parents):
                raise AssertionError(
                    f"line {line}:{item} indent skips a level")
            # Keys count accepted entries, so they stay consecutive even
            # when blank lines were skipped.
            key = len(tocdict)
            entry = {'title': title, 'pagenum': pagenum}
            if depth > 0:
                entry['parent'] = open_parents[depth - 1]
            tocdict[key] = entry
            del open_parents[depth:]
            open_parents.append(key)
            lastpagenum = pagenum
    return tocdict
if __name__ == '__main__':
    # Command line: <script> <pdf file> <toc file> <gap>
    # ``gap`` is the offset toc_reader adds to every page number in the TOC.
    if len(sys.argv) != 4:
        # Exit with a readable usage line instead of a tuple-unpacking
        # ValueError traceback when the argument count is wrong.
        sys.exit(f"usage: {sys.argv[0]} <pdf file> <toc file> <gap>")
    file, toc, gap = sys.argv[1:]
    index_dict = toc_reader(toc, gap)
    add_bookmark(file, index_dict)
使用 `python pdfbookmark.py xxx.pdf toc 10` 来运行它:其中 toc 是目录文件,10 是加到每个页码上的偏移量;toc 文件用 tab 缩进来分级,每行的格式为“标题<tab>页码”。
新版本已经支持用空格分层了,不过这个脚本写得挺好的。