PaddleOCR
PaddleOCR copied to clipboard
quickstart.md的2.2.3 版面分析 第二段程序无法正确运行
问题描述 / Problem Description
使用 BML Codelab 运行 quickstart.md 中"2.2.3 版面分析"的第二段示例程序时无法正确运行:调用 save_structure_res 时抛出 NameError: name 'cell' is not defined,详见下方完整报错。
运行环境 / Runtime Environment
使用 https://aistudio.baidu.com/ 提供的 GPU 测试环境;项目框架:PaddlePaddle 2.6.1;通过套件功能下载 PaddleOCR v2.6.0。
- OS:
- Paddle: PaddlePaddle 2.6.1(GPU)
- PaddleOCR: 源码 v2.6.0(通过套件功能下载);pip 安装的 paddleocr 包版本未注明
复现代码 / Reproduction Code
# !git clone https://github.com/PaddlePaddle/PaddleOCR.git
!git clone https://github.com/PaddlePaddle/PaddleOCR.git
!python3 -m pip install --upgrade pip
!python3 -m pip install "paddlepaddle-gpu" -i https://mirror.baidu.com/pypi/simple
!python3 -m pip install -r /home/aistudio/PaddleOCR-2.6.0/ppstructure/recovery/requirements.txt
!wget https://paddleocr.bj.bcebos.com/whl/pdf2docx-0.0.0-py3-none-any.whl
!pip3 install pdf2docx-0.0.0-py3-none-any.whl
!pip install premailer
!pip install openpyxl
from paddleocr import PPStructure,save_structure_res
from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx
table_engine = PPStructure(recovery=True)
save_folder = './output'
img_path = '/home/aistudio/PaddleOCR-2.6.0/ppstructure/docs/table/1.png'
img = cv2.imread(img_path)
result = table_engine(img)
save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0])
for line in result:
line.pop('img')
print(line)
h, w, _ = img.shape
res = sorted_layout_boxes(result, w)
convert_info_docx(img, res, save_folder, os.path.basename(img_path).split('.')[0])
完整报错 / Complete Error Message
[2024/07/18 23:42:51] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=True, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/home/aistudio/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/home/aistudio/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text
[2024/07/18 23:42:58] ppocr DEBUG: dt_boxes num : 223, elapsed : 0.23255515098571777
[2024/07/18 23:43:00] ppocr DEBUG: rec_res num : 223, elapsed : 1.5538420677185059
[2024/07/18 23:43:01] ppocr DEBUG: dt_boxes num : 80, elapse : 0.0471186637878418
[2024/07/18 23:43:01] ppocr DEBUG: rec_res num : 80, elapse : 0.4826929569244385
[2024/07/18 23:43:03] ppocr DEBUG: dt_boxes num : 110, elapse : 0.059357404708862305
[2024/07/18 23:43:04] ppocr DEBUG: rec_res num : 110, elapse : 0.6696820259094238
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[15], line 16
14 img = cv2.imread(img_path)
15 result = table_engine(img)
---> 16 save_structure_res(result, save_folder, os.path.basename(img_path).split('.')[0])
18 for line in result:
19 line.pop('img')
File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddleocr/ppstructure/predict_system.py:280, in save_structure_res(res, save_folder, img_name, img_idx)
272 if (
273 region["type"].lower() == "table"
274 and len(region["res"]) > 0
275 and "html" in region["res"]
276 ):
277 excel_path = os.path.join(
278 excel_save_folder, "{}_{}.xlsx".format(region["bbox"], img_idx)
279 )
--> 280 to_excel(region["res"]["html"], excel_path)
281 elif region["type"].lower() == "figure":
282 img_path = os.path.join(
283 excel_save_folder, "{}_{}.jpg".format(region["bbox"], img_idx)
284 )
File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddleocr/ppstructure/table/predict_table.py:153, in to_excel(html_table, excel_path)
150 def to_excel(html_table, excel_path):
151 from tablepyxl import tablepyxl
--> 153 tablepyxl.document_to_xl(html_table, excel_path)
File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddleocr/ppstructure/table/tablepyxl/tablepyxl.py:118, in document_to_xl(doc, filename, base_url)
113 def document_to_xl(doc, filename, base_url=None):
114 """
115 Takes a string representation of an html document and writes one sheet for
116 every table in the document. The workbook is written out to a file called filename
117 """
--> 118 wb = document_to_workbook(doc, base_url=base_url)
119 wb.save(filename)
File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddleocr/ppstructure/table/tablepyxl/tablepyxl.py:105, in document_to_workbook(doc, wb, base_url)
100 wb.remove(wb.active)
102 inline_styles_doc = Premailer(
103 doc, base_url=base_url, remove_classes=False
104 ).transform()
--> 105 tables = get_Tables(inline_styles_doc)
107 for table in tables:
108 table_to_sheet(table, wb)
File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddleocr/ppstructure/table/tablepyxl/tablepyxl.py:23, in get_Tables(doc)
21 for comment in comments:
22 comment.drop_tag()
---> 23 return [Table(table) for table in tree.xpath("//table")]
File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddleocr/ppstructure/table/tablepyxl/tablepyxl.py:23, in <listcomp>(.0)
21 for comment in comments:
22 comment.drop_tag()
---> 23 return [Table(table) for table in tree.xpath("//table")]
File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddleocr/ppstructure/table/tablepyxl/style.py:234, in Table.__init__(self, table)
231 super(Table, self).__init__(table)
232 table_head = table.find("thead")
233 self.head = (
--> 234 TableHead(table_head, parent=self) if table_head is not None else None
235 )
236 table_body = table.find("tbody")
237 self.body = TableBody(
238 table_body if table_body is not None else table, parent=self
239 )
File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddleocr/ppstructure/table/tablepyxl/style.py:249, in TableHead.__init__(self, head, parent)
247 def __init__(self, head, parent=None):
248 super(TableHead, self).__init__(head, parent=parent)
--> 249 self.rows = [TableRow(tr, parent=self) for tr in head.findall("tr")]
File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddleocr/ppstructure/table/tablepyxl/style.py:249, in <listcomp>(.0)
247 def __init__(self, head, parent=None):
248 super(TableHead, self).__init__(head, parent=parent)
--> 249 self.rows = [TableRow(tr, parent=self) for tr in head.findall("tr")]
File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddleocr/ppstructure/table/tablepyxl/style.py:269, in TableRow.__init__(self, tr, parent)
267 def __init__(self, tr, parent=None):
268 super(TableRow, self).__init__(tr, parent=parent)
--> 269 self.cells = [
270 TableCell(cell, parent=self) for cell in tr.findall("th") + tr.findall("td")
271 ]
File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddleocr/ppstructure/table/tablepyxl/style.py:270, in <listcomp>(.0)
267 def __init__(self, tr, parent=None):
268 super(TableRow, self).__init__(tr, parent=parent)
269 self.cells = [
--> 270 TableCell(cell, parent=self) for cell in tr.findall("th") + tr.findall("td")
271 ]
File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddleocr/ppstructure/table/tablepyxl/style.py:312, in TableCell.__init__(self, cell, parent)
310 super(TableCell, self).__init__(cell, parent=parent)
311 self.value = element_to_string(cell)
--> 312 self.number_format = self.get_number_format()
File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddleocr/ppstructure/table/tablepyxl/style.py:337, in TableCell.get_number_format(self)
335 if "TYPE_DATE" in self.element.get("class", "").split():
336 return FORMAT_DATE_MMDDYYYY
--> 337 if self.data_type() == cell.TYPE_NUMERIC:
338 try:
339 int(self.value)
File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddleocr/ppstructure/table/tablepyxl/style.py:326, in TableCell.data_type(self)
324 else:
325 cell_type = "TYPE_STRING"
--> 326 return getattr(cell, cell_type)
NameError: name 'cell' is not defined
可能解决方案 / Possible solutions
None
附件 / Appendix
None