Elimination of Document Headers and Footers
Requested Feature
I am working on a document extraction problem, and Docling keeps treating some headers and footers as sections because of their formatting. Would it be possible to provide a set of known headers or footers so that they are automatically excluded from the extracted text?
Alternatives
I am currently removing them manually, but it seems that Docling should at least be able to detect potential headers and footers, as most official documents follow a specific structure. In my case, I am identifying headers and footers in a separate process using an external library.
Hello, I had the same problem, and this code worked for me as part of the preprocessing step.
from difflib import SequenceMatcher

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBoxHorizontal
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
def similar(a, b):
    # Return a 0..1 similarity ratio between two strings.
    return SequenceMatcher(None, a, b).ratio()
def get_headers_and_footers(path):
    # Open the PDF and set up pdfminer's layout analysis.
    sorted_footer_units = []
    sorted_header_units = []
    headers_footers = []
    fp = open(path, 'rb')
    parser = PDFParser(fp)
    rsrcmgr = PDFResourceManager()  # share one resource manager between aggregator and interpreter
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams(line_overlap=0.5, line_margin=0.5,
                                                          char_margin=0.5, detect_vertical=False))
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    page_nr = 0
    for page in PDFPage.create_pages(PDFDocument(parser)):
        page_nr += 1
        p_height = page.mediabox[3]
        interpreter.process_page(page)
        layout = device.get_result()
        units = []
        for element in layout:
            if isinstance(element, LTTextBoxHorizontal):
                paragraph = element.get_text()
                if not paragraph.isspace():
                    units.append({'page': page_nr, 'para': paragraph,
                                  'x0': element.bbox[0], 'y0': element.bbox[1]})
        if not units:
            continue
        units_by_y0 = sorted(units, key=lambda d: d['y0'])
        footer_area_units = []
        header_area_units = []
        # The list is sorted by ascending y0, so the last element is the topmost
        # text box (header candidate) and the first the bottommost (footer candidate).
        headers = [units_by_y0[-1]]
        footers = [units_by_y0[0]]
        smallest = units_by_y0[0]['y0']
        largest = units_by_y0[-1]['y0']
        # Units within ~3 points of the bottommost/topmost line are treated as part
        # of the same footer/header line; everything else is binned into the header
        # or footer half of the page for the cross-page scan below.
        for el in units_by_y0:
            if 0 <= el['y0'] - smallest < 3:
                if el['para'] != units_by_y0[0]['para']:
                    footers.append(el)
                continue
            if 0 <= largest - el['y0'] < 3:
                if el['para'] != units_by_y0[-1]['para']:
                    headers.append(el)
                continue
            if el['y0'] >= p_height / 2:
                header_area_units.append(el)
            else:
                footer_area_units.append(el)
        header_area_units = sorted(header_area_units, key=lambda d: d['y0'], reverse=True)
        sorted_footer_units.append(footer_area_units)
        sorted_header_units.append(header_area_units)
        # Join multi-column header/footer lines left to right, using an unlikely
        # sentinel so the pieces can be split apart again later.
        headers = sorted(headers, key=lambda d: d['x0'])
        footers = sorted(footers, key=lambda d: d['x0'])
        header = '!!??!!'.join(el['para'] for el in headers)
        footer = '!!??!!'.join(el['para'] for el in footers)
        headers_footers.append({'page': page_nr,
                                'header': " ".join(header.split()),
                                'footer': " ".join(footer.split())})
    # ------------------------------------------------------
    # Scan the footer-area units column by column (closest to the page edge
    # first). A paragraph that repeats near-identically on almost every page
    # is appended to that page's footer string.
    counter_in_loop_hf = 0
    while counter_in_loop_hf < len(sorted_footer_units) and sorted_footer_units:
        units_with_same_index = []
        i_break = False
        for el in sorted_footer_units:
            if counter_in_loop_hf < len(el):  # length check avoids the old infinite loop
                units_with_same_index.append(el[counter_in_loop_hf])
        if not units_with_same_index:  # no page has a unit at this depth
            break
        for unitt in units_with_same_index:
            similar_counter = 0
            for rest in units_with_same_index:
                if similar(unitt['para'], rest['para']) > 0.8:
                    similar_counter += 1
            if similar_counter > (page_nr - 5):
                a = " ".join(unitt['para'].split())
                for el in headers_footers:
                    if el['page'] == unitt['page']:
                        el['footer'] = el['footer'] + '!!??!!' + a
            else:
                i_break = True
        if i_break:
            break
        counter_in_loop_hf += 1
    # ------------------------------------------------------
    # The same scan for the header-area units.
    counter_in_loop_hf = 0
    while counter_in_loop_hf < len(sorted_header_units) and sorted_header_units:
        units_with_same_index = []
        i_break = False
        for el in sorted_header_units:
            if counter_in_loop_hf < len(el):  # length check avoids the old infinite loop
                units_with_same_index.append(el[counter_in_loop_hf])
        if not units_with_same_index:  # no page has a unit at this depth
            break
        for unitt in units_with_same_index:
            similar_counter = 0
            for rest in units_with_same_index:
                if similar(unitt['para'], rest['para']) > 0.8:
                    similar_counter += 1
            if similar_counter > (page_nr - 5):
                a = " ".join(unitt['para'].split())
                for el in headers_footers:
                    if el['page'] == unitt['page']:
                        el['header'] = el['header'] + '!!??!!' + a
            else:
                i_break = True
        if i_break:
            break
        counter_in_loop_hf += 1
    # ------------------------------------------------------
    # Keep a page's header/footer only if a similar one occurs on (almost)
    # every other page.
    page_headers = []
    page_footers = []
    for el in headers_footers:
        counter_f = 0
        counter_h = 0
        for rest in headers_footers:
            if similar(el['footer'], rest['footer']) > 0.7:
                counter_f += 1
        for rest in headers_footers:
            if similar(el['header'], rest['header']) > 0.7:
                counter_h += 1
        if counter_f >= len(headers_footers) - 3:
            page_footers.append({'page': el['page'],
                                 'content': el['footer'].split(sep='!!??!!')})
        if counter_h >= len(headers_footers) - 3:
            page_headers.append({'page': el['page'],
                                 'content': el['header'].split(sep='!!??!!')})
    # Collapse the per-page results into unique header/footer strings.
    unique_headers = set()
    unique_footers = set()
    for header_entry in page_headers:
        for content in header_entry['content']:
            if content and content.strip():  # keep only non-empty fragments
                unique_headers.add(content.strip())
    for footer_entry in page_footers:
        for content in footer_entry['content']:
            if content and content.strip():  # keep only non-empty fragments
                unique_footers.add(content.strip())
    fp.close()
    return {
        'headers': list(unique_headers),
        'footers': list(unique_footers),
    }
def clean_headers_footers(md: str, headers_footers: dict) -> str:
    # Remove detected header/footer sections from a markdown export.
    if not md or not headers_footers:
        return md
    lines = md.split('\n')
    cleaned_lines = []
    i = 0
    while i < len(lines):
        # Grow a look-ahead window line by line and compare it against the
        # known headers/footers after each added line.
        section = []
        look_ahead = 20  # inspect at most 20 lines ahead
        for j in range(i, min(i + look_ahead, len(lines))):
            section.append(lines[j])
            # Ignore markdown image lines (starting with '!') when building
            # the comparison text.
            section_text = ' '.join(line.strip() for line in section
                                    if not line.strip().startswith('!'))
            section_text = ' '.join(section_text.split())
            is_header_footer = False
            # Check headers
            for header in headers_footers.get('headers', []):
                header_cleaned = ' '.join(header.split())
                if similar(header_cleaned, section_text) > 0.5:  # permissive threshold
                    is_header_footer = True
                    break
            # Check footers
            if not is_header_footer:
                for footer in headers_footers.get('footers', []):
                    footer_cleaned = ' '.join(footer.split())
                    if similar(footer_cleaned, section_text) > 0.5:  # permissive threshold
                        is_header_footer = True
                        break
            if is_header_footer:
                # Skip every line of the matched section.
                i = j + 1
                break
        else:
            # No match found in the window: keep the current line.
            cleaned_lines.append(lines[i])
            i += 1
    return '\n'.join(cleaned_lines)
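Usage is then a matter of detecting the headers/footers first and cleaning Docling's markdown export afterwards. A minimal sketch: the file name is a placeholder, and the DocumentConverter / export_to_markdown calls are Docling's standard API; everything else is just the two functions above.

from docling.document_converter import DocumentConverter

pdf_file = "report.pdf"  # placeholder path

# Step 1: detect repeated headers/footers with pdfminer.
hf = get_headers_and_footers(pdf_file)

# Step 2: convert with Docling and strip the detected lines from the export.
converter = DocumentConverter()
md = converter.convert(pdf_file).document.export_to_markdown()
cleaned_md = clean_headers_footers(md, hf)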
Hi, thank you so much. I had similar code; I think we got it from the same source. I will try to do something similar for OCR-based documents as well.
The original code had a problem with an infinite loop when no headers or footers were detected; the length checks above fix that.
I would also appreciate an easy way to remove pictures in PDF headers.
When DoclingDocument is exported to markdown I noticed that the picture occurring in the header on every page is included in the markdown export. In the documentation I see no easy way to filter out such header pictures.
I can easily identify these pictures - either as the first picture on every page or as all the pictures without captions - and put them in a list, but how do I remove these pictures from my DoclingDocument, so I can export it to markdown without these header pictures?
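The closest I can think of is a sketch like the following, assuming a recent docling-core where DoclingDocument exposes delete_items(node_items=...) and pictures expose caption_text(doc); I have not verified this.

# Sketch, not verified: drop the caption-less (header) pictures before export.
# delete_items and caption_text are assumed from recent docling-core versions.
header_pics = [pic for pic in doc.pictures if not pic.caption_text(doc)]
doc.delete_items(node_items=header_pics)
md = doc.export_to_markdown()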
Alternatively, I can find the positions of these header pictures from the bbox.t value, but how can I convert the PDF document to a DoclingDocument while ignoring everything above the header position, which I identify as follows?

converter = DocumentConverter()
result = converter.convert(pdf_file)
doc = result.document

# maximum value of 't' per page for pictures
max_t_per_page = {}
for pic in doc.pictures:
    page_no = pic.prov[0].page_no
    bbox = pic.prov[0].bbox
    if page_no not in max_t_per_page:
        max_t_per_page[page_no] = bbox.t
    else:
        max_t_per_page[page_no] = max(max_t_per_page[page_no], bbox.t)

# maximum 't' value for each page
for page_no, max_t in max_t_per_page.items():
    print(f"Page {page_no}: max(t) = {max_t}")

# minimum 't' value across all pages
min_max_t = min(max_t_per_page.values())
print(f"min(max(t)) across all pages = {min_max_t}")

Output:

Page 1: max(t) = 808.3127822875977
Page 2: max(t) = 808.4529838562012
Page 3: max(t) = 808.33154296875
Page 4: max(t) = 808.263298034668
Page 5: max(t) = 808.2783737182617
Page 6: max(t) = 808.195125579834
Page 7: max(t) = 808.241024017334
Page 8: max(t) = 808.295955657959
Page 9: max(t) = 808.2980880737305
Page 10: max(t) = 808.2382431030273
Page 11: max(t) = 808.3212394714355
Page 12: max(t) = 808.3475227355957
Page 13: max(t) = 808.3207511901855
Page 14: max(t) = 808.2054481506348
Page 15: max(t) = 808.2580680847168
min(max(t)) across all pages = 808.195125579834
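A post-filter on that coordinate might work as a stopgap; a sketch under the same assumptions (delete_items available, and t measured bottom-up so larger values sit higher on the page):

header_cutoff = 808.0  # hypothetical cut-off just below min(max(t)) found above

# Delete every item whose top edge lies in the header band, then export.
header_items = [item for item in list(doc.texts) + list(doc.pictures)
                if item.prov and item.prov[0].bbox.t >= header_cutoff]
doc.delete_items(node_items=header_items)
md = doc.export_to_markdown()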
For me, the text items never get labeled as page-header or page-footer, which makes their removal harder.
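A quick way to verify that is to tally the labels Docling assigned; a sketch assuming docling-core's DocItemLabel enum:

from collections import Counter
from docling_core.types.doc import DocItemLabel

# Zero PAGE_HEADER / PAGE_FOOTER counts mean the layout model never
# tagged any furniture in this document.
counts = Counter(item.label for item in doc.texts)
print(counts.get(DocItemLabel.PAGE_HEADER, 0),
      counts.get(DocItemLabel.PAGE_FOOTER, 0))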
I would also be interested in functionality related to this. It would make it more straightforward to use Docling to extract only the meaningful information from documents.