camelot
camelot copied to clipboard
ZeroDivisionError: float division by zero
ZeroDivisionError encountered while extracting tables using camelot.
tables = camelot.read_pdf('page_39.pdf', pages='1', flavour='stream')
---------------------------------------------------------------------------
ZeroDivisionError Traceback (most recent call last)
Input In [244], in <cell line: 1>()
----> 1 tables = camelot.read_pdf(arabic_english, pages='1', flavour='stream')
File ~/.venv/lib/python3.8/site-packages/camelot/io.py:113, in read_pdf(filepath, pages, password, flavor, suppress_stdout, layout_kwargs, **kwargs)
111 p = PDFHandler(filepath, pages=pages, password=password)
112 kwargs = remove_extra(kwargs, flavor=flavor)
--> 113 tables = p.parse(
114 flavor=flavor,
115 suppress_stdout=suppress_stdout,
116 layout_kwargs=layout_kwargs,
117 **kwargs
118 )
119 return tables
File ~/.venv/lib/python3.8/site-packages/camelot/handlers.py:176, in PDFHandler.parse(self, flavor, suppress_stdout, layout_kwargs, **kwargs)
174 parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
175 for p in pages:
--> 176 t = parser.extract_tables(
177 p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
178 )
179 tables.extend(t)
180 return TableList(sorted(tables))
File ~/.venv/lib/python3.8/site-packages/camelot/parsers/lattice.py:430, in Lattice.extract_tables(self, filename, suppress_stdout, layout_kwargs)
426 # sort tables based on y-coord
427 for table_idx, tk in enumerate(
428 sorted(self.table_bbox.keys(), key=lambda x: x[1], reverse=True)
429 ):
--> 430 cols, rows, v_s, h_s = self._generate_columns_and_rows(table_idx, tk)
431 table = self._generate_table(table_idx, cols, rows, v_s=v_s, h_s=h_s)
432 table._bbox = tk
File ~/.venv/lib/python3.8/site-packages/camelot/parsers/lattice.py:322, in Lattice._generate_columns_and_rows(self, table_idx, tk)
318 t_bbox = {}
319 v_s, h_s = segments_in_bbox(
320 tk, self.vertical_segments, self.horizontal_segments
321 )
--> 322 t_bbox["horizontal"] = text_in_bbox(tk, self.horizontal_text)
323 t_bbox["vertical"] = text_in_bbox(tk, self.vertical_text)
325 t_bbox["horizontal"].sort(key=lambda x: (-x.y0, x.x0))
File ~/.venv/lib/python3.8/site-packages/camelot/utils.py:376, in text_in_bbox(bbox, text)
373 continue
374 if bbox_intersect(ba, bb):
375 # if the intersection is larger than 80% of ba's size, we keep the longest
--> 376 if (bbox_intersection_area(ba, bb) / bbox_area(ba)) > 0.8:
377 if bbox_longer(bb, ba):
378 rest.discard(ba)
ZeroDivisionError: float division by zero
Steps to reproduce the bug
- Download the PDF file. https://file.io/lVEXzype0AGL
- Import camelot and issue the following expression:
tables = camelot.read_pdf(file_path, pages='1', flavour='stream')
Expected behavior
Expected to parse the page and extract tables without any error.
Code
import camelot
tables = camelot.read_pdf('page_39.pdf', pages='1', flavour='stream')
Environment
- OS: GNU/Linux
- Python version: 3.8.10
- Numpy version: 1.22.3
- OpenCV version: 4.5.5.64
- Ghostscript version: 0.7
- Camelot version: 0.10.1
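To avoid duplicated text, `utils.text_in_bbox` discards overlapping bounding boxes using an 80% threshold: the area of the intersection of the two boxes divided by the area of the first box in the pair. In your sample PDF, one text box has a bounding box with area 0, so that ratio divides by zero. Handling text boxes with zero area in `utils.text_in_bbox` (for example, filtering them out before the overlap check) avoids the error. Note also that the traceback goes through the Lattice parser even though `flavour='stream'` was passed; the keyword in the `read_pdf` signature is spelled `flavor`, so the misspelled argument apparently was not applied.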
def text_in_bbox(bbox, text):
    """Return the text objects lying inside a bounding box, without overlaps.

    A text object is selected when the center of its own box falls within
    ``bbox`` expanded by 2 units on every side. Among the selected objects,
    one is dropped when it intersects another object that covers more than
    80% of its area (or when it has no area at all) and ``bbox_longer``
    reports that other object as the longer of the two.

    Parameters
    ----------
    bbox : tuple
        Tuple (x1, y1, x2, y2) representing a bounding box where
        (x1, y1) -> lb and (x2, y2) -> rt in the PDF coordinate
        space.
    text : List of PDFMiner text objects.

    Returns
    -------
    t_bbox : list
        List of PDFMiner text objects that lie inside table, discarding
        the overlapping ones. Objects keep the order they had in ``text``.

    """
    lb = (bbox[0], bbox[1])
    rt = (bbox[2], bbox[3])
    t_bbox = [
        t
        for t in text
        if lb[0] - 2 <= (t.x0 + t.x1) / 2.0 <= rt[0] + 2
        and lb[1] - 2 <= (t.y0 + t.y1) / 2.0 <= rt[1] + 2
    ]

    # Avoid duplicate text by discarding overlapping boxes. A dict serves as
    # an insertion-ordered set so the result order does not depend on hashing.
    rest = dict.fromkeys(t_bbox)
    for ba in t_bbox:
        # Loop-invariant for the inner loop, so compute it once per box.
        ba_area = bbox_area(ba)
        for bb in list(rest):
            if ba == bb:
                continue
            if not bbox_intersect(ba, bb):
                continue
            # If the intersection is larger than 80% of ba's size, we keep the
            # longest. A zero-area ba cannot be expressed as a ratio (it would
            # raise ZeroDivisionError), so it is treated as fully covered
            # instead of being silently removed from the selection up front.
            covered = (
                ba_area == 0
                or bbox_intersection_area(ba, bb) / ba_area > 0.8
            )
            if covered and bbox_longer(bb, ba):
                rest.pop(ba, None)
                # ba is gone; the remaining comparisons cannot change that.
                break
    return list(rest)