docling icon indicating copy to clipboard operation
docling copied to clipboard

Elimination of Document Header and Footers

Open TheMrguiller opened this issue 8 months ago • 6 comments

Requested Feature

I am working on a document extraction problem, and it seems that Docling keeps treating some headers and footers as sections due to their format. Is there any possibility of providing a set of known headers or footers to automatically exclude them from the extracted text?

Alternatives

I am currently removing them manually, but it seems that Docling should at least be able to detect potential headers and footers, as most official documents follow a specific structure. In my case, I am identifying headers and footers in a separate process using an external library.

TheMrguiller avatar Mar 31 '25 14:03 TheMrguiller

Hello, I had the same problem and this code worked for me, as a part of the preprocessing step.

import re
from difflib import SequenceMatcher
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal

def similar(a, b):
    """Return the similarity ratio between two strings (0.0–1.0).

    Thin wrapper around difflib.SequenceMatcher; 1.0 means identical,
    0.0 means no matching subsequence at all.
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
     

def _merge_repeating_units(per_page_units, headers_footers, field, page_nr):
    """Fold text boxes that repeat at the same rank on (almost) every page
    into the per-page header/footer strings.

    The original code duplicated this loop verbatim for headers and footers.

    per_page_units: one list of candidate text-box dicts per page, sorted by
        distance from the page edge (rank 0 = closest to the edge).
    headers_footers: per-page dicts with 'page', 'header' and 'footer' keys;
        matched text is appended in place to the *field* entry.
    field: 'header' or 'footer' — which entry to extend.
    page_nr: total number of pages seen; a box must repeat on more than
        page_nr - 5 pages (fuzzy-matched) to count as a header/footer.
    """
    rank = 0
    # NOTE(review): rank is bounded by the number of pages, not by the
    # longest per-page list — preserved from the original; confirm intent.
    while per_page_units and rank < len(per_page_units):
        candidates = [page[rank] for page in per_page_units if rank < len(page)]
        if not candidates:
            break
        stop = False
        for unit in candidates:
            repeats = sum(
                1 for other in candidates
                if similar(unit['para'], other['para']) > 0.8
            )
            if repeats > (page_nr - 5):
                cleaned = " ".join(unit['para'].split())
                for entry in headers_footers:
                    if entry['page'] == unit['page']:
                        entry[field] = str(entry[field] + '!!??!!' + cleaned)
            else:
                # A single non-repeating box at this rank ends the scan.
                stop = True
        if stop:
            break
        rank += 1


def get_headers_and_footers(path):
    """Detect the repeating header and footer text of a PDF.

    Opens *path* with pdfminer, collects the text boxes nearest the top and
    bottom edge of every page, then keeps only the text that repeats (by
    fuzzy similarity) across nearly all pages.

    Returns a dict of the form:
        {'headers': [unique header strings], 'footers': [unique footer strings]}
    """
    sorted_footer_units = []
    sorted_header_units = []
    headers_footers = []
    # Share ONE resource manager between aggregator and interpreter; the
    # original created two, defeating pdfminer's font/resource cache.
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(
        rsrcmgr,
        laparams=LAParams(line_overlap=0.5, line_margin=0.5,
                          char_margin=0.5, detect_vertical=False),
    )
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    page_nr = 0
    # 'with' fixes the file-handle leak of the original bare open().
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        for page in PDFPage.create_pages(PDFDocument(parser)):
            page_nr += 1
            p_height = page.mediabox[3]
            interpreter.process_page(page)
            layout = device.get_result()
            # One dict per non-blank horizontal text box on the page.
            units = [
                {'page': page_nr, 'para': element.get_text(),
                 'x0': element.bbox[0], 'y0': element.bbox[1]}
                for element in layout
                if isinstance(element, LTTextBoxHorizontal)
                and not element.get_text().isspace()
            ]
            if not units:
                continue
            by_y0 = sorted(units, key=lambda d: d['y0'])  # bottommost first
            smallest = by_y0[0]['y0']   # loop-invariant; hoisted out of the loop
            largest = by_y0[-1]['y0']
            # Topmost box (largest y0) seeds the header, bottommost the
            # footer; boxes within ~3 units of the same y join that line.
            headers = [by_y0[-1]]
            footers = [by_y0[0]]
            header_area_units = []
            footer_area_units = []
            for el in by_y0:
                if (el['y0'] - smallest) >= 0 and (int(el['y0']) - int(smallest)) < 3:
                    if el['para'] != by_y0[0]['para']:
                        footers.append(el)
                    continue
                if (largest - float(el['y0'])) >= 0 and (largest - float(el['y0'])) < 3:
                    if el['para'] != by_y0[-1]['para']:
                        headers.append(el)
                    continue
                # Remaining boxes become candidates by page half.
                if int(el['y0']) - p_height / 2 >= 0:
                    header_area_units.append(el)
                else:
                    footer_area_units.append(el)
            header_area_units.sort(key=lambda d: d['y0'], reverse=True)
            sorted_footer_units.append(footer_area_units)
            sorted_header_units.append(header_area_units)
            # Reading order: left to right.
            headers.sort(key=lambda d: d['x0'])
            footers.sort(key=lambda d: d['x0'])
            # '!!??!!' is an unlikely token used as an internal separator.
            header = '!!??!!'.join(el['para'] for el in headers)
            footer = '!!??!!'.join(el['para'] for el in footers)
            headers_footers.append({
                'page': page_nr,
                'header': " ".join(header.split()),
                'footer': " ".join(footer.split()),
            })

    # Identical scan for both edges (deduplicated into one helper).
    _merge_repeating_units(sorted_footer_units, headers_footers, 'footer', page_nr)
    _merge_repeating_units(sorted_header_units, headers_footers, 'header', page_nr)

    # Keep only per-page strings that are similar on nearly every page.
    page_headers = []
    page_footers = []
    n_pages = len(headers_footers)
    for el in headers_footers:
        counter_f = sum(
            1 for rest in headers_footers
            if similar(el['footer'], rest['footer']) > 0.7
        )
        counter_h = sum(
            1 for rest in headers_footers
            if similar(el['header'], rest['header']) > 0.7
        )
        if counter_f >= n_pages - 3:
            page_footers.append(
                {'page': el['page'], 'content': el['footer'].split(sep='!!??!!')})
        if counter_h >= n_pages - 3:
            page_headers.append(
                {'page': el['page'], 'content': el['header'].split(sep='!!??!!')})

    # Flatten to the unique, non-empty strings and return.
    unique_headers = {c.strip() for entry in page_headers
                      for c in entry['content'] if c and c.strip()}
    unique_footers = {c.strip() for entry in page_footers
                      for c in entry['content'] if c and c.strip()}
    return {
        'headers': list(unique_headers),
        'footers': list(unique_footers),
    }

def clean_headers_footers(md: str, headers_footers: dict) -> str:
    """Strip known header/footer text from a markdown export.

    From each position, grows a window of up to 20 lines; when the window's
    normalized text is more than 50% similar to any known header or footer,
    the entire window is dropped. Lines starting with '!' (markdown images)
    are excluded from the text being matched.
    """
    if not md or not headers_footers:
        return md

    # Normalize the known header/footer strings once, up front.
    header_patterns = [' '.join(h.split())
                       for h in headers_footers.get('headers', [])]
    footer_patterns = [' '.join(f.split())
                       for f in headers_footers.get('footers', [])]

    source = md.split('\n')
    total = len(source)
    kept = []
    pos = 0

    while pos < total:
        window = []
        matched_end = None
        for cursor in range(pos, min(pos + 20, total)):
            window.append(source[cursor])
            text = ' '.join(ln.strip() for ln in window
                            if not ln.strip().startswith('!'))
            text = ' '.join(text.split())
            # Threshold of 0.5 matches aggressively on purpose.
            if any(similar(p, text) > 0.5 for p in header_patterns) or \
               any(similar(p, text) > 0.5 for p in footer_patterns):
                matched_end = cursor
                break
        if matched_end is None:
            # No header/footer found in the look-ahead; keep this line.
            kept.append(source[pos])
            pos += 1
        else:
            # Skip every line of the matched window.
            pos = matched_end + 1

    return '\n'.join(kept)

magaton avatar Mar 31 '25 14:03 magaton

Hi, thank you so much. I had a similar code, I think we got it from the same source. I will try to do something similar for OCR-based documents as well.

TheMrguiller avatar Apr 01 '25 06:04 TheMrguiller

The original code had a problem with an infinite loop in case no headers and footers were detected

magaton avatar Apr 01 '25 08:04 magaton

I would also appreciate an easy way to remove pictures in PDF headers.

When DoclingDocument is exported to markdown I noticed that the picture occurring in the header on every page is included in the markdown export. In the documentation I see no easy way to filter out such header pictures.

  1. I can easily identify these pictures - either as the first picture on every page or as all the pictures without captions - and put them in a list, but how do I remove these pictures from my DoclingDocument, so I can export it to markdown without these header pictures?

  2. Alternatively, I can find the positions of these header pictures from the bbox.t value, but how can I convert the PDF document to a DoclingDocument while ignoring everything above the header position, which I identify below?

    # NOTE(review): snippet assumes docling's DocumentConverter is already
    # imported; the original comment omits the import lines.
    converter = DocumentConverter()
    result = converter.convert(pdf_file)
    doc = result.document
    
    # maximum value of 't' per page for pictures
    # (bbox.t is presumably the top coordinate of the picture — confirm
    # against docling's BoundingBox definition)
    max_t_per_page = {}
    for pic in doc.pictures:
        page_no = pic.prov[0].page_no
        bbox = pic.prov[0].bbox
    
        if page_no not in max_t_per_page:
            max_t_per_page[page_no] = bbox.t
        else:
            max_t_per_page[page_no] = max(max_t_per_page[page_no], bbox.t)
    
    # maximum 't' value for each page
    for page_no, max_t in max_t_per_page.items():
        print(f"Page {page_no}: max(t) = {max_t}")
    
    # minimum 't' value across all pages — a cut line below which every
    # page's topmost picture sits; everything above it would be the header
    min_max_t = min(max_t_per_page.values())
    print(f"min(max(t)) across all pages = {min_max_t}")
    
    Page 1: max(t) = 808.3127822875977
    Page 2: max(t) = 808.4529838562012
    Page 3: max(t) = 808.33154296875
    Page 4: max(t) = 808.263298034668
    Page 5: max(t) = 808.2783737182617
    Page 6: max(t) = 808.195125579834
    Page 7: max(t) = 808.241024017334
    Page 8: max(t) = 808.295955657959
    Page 9: max(t) = 808.2980880737305
    Page 10: max(t) = 808.2382431030273
    Page 11: max(t) = 808.3212394714355
    Page 12: max(t) = 808.3475227355957
    Page 13: max(t) = 808.3207511901855
    Page 14: max(t) = 808.2054481506348
    Page 15: max(t) = 808.2580680847168
    min(max(t)) across all pages = 808.195125579834
    

jtkarb avatar Apr 02 '25 10:04 jtkarb

For me, the text items never get labeled as header or footer, which makes their removal harder.

amadou-6e avatar Apr 10 '25 07:04 amadou-6e

Would also be interested in functionality related to this. It would make it more straightforward to use Docling to extract only the meaningful information from documents.

ckanaar avatar May 30 '25 10:05 ckanaar