antiword icon indicating copy to clipboard operation
antiword copied to clipboard

Antiword can't extract text from header of .doc file.

Open ramtalentrecruit opened this issue 2 years ago • 1 comments

Is there a way to extract content in .doc files?

ramtalentrecruit avatar Jan 23 '23 09:01 ramtalentrecruit

Yes, there is.

In my case, I used Python with some libraries that require Antiword to be installed on the machine. I hope it helps.

If you run this code in Docker, I would suggest working with in-memory temporary files. Good luck.

Note: I am using the Antiword installed via `apt install`; I am not sure whether that Antiword is the same as the one in this repo. I hope it is, since I am trying to install it in an Amazon Linux container and yum does not have Antiword.

import io

from app.logger import logger
import os
import tempfile
import pypdf
from docx import Document
import subprocess
import numpy as np
from pdf2image import convert_from_bytes
import cv2
from odf import text, teletype
from odf.opendocument import load

def extract_text_from_doc(file_path):
    """
    Run the external ``antiword`` program on a legacy Word file (.doc).

    Args:
        file_path: Path of the document; passed to ``antiword`` as its only
            argument.

    Returns:
        The standard output of ``antiword`` exactly as
        ``subprocess.check_output`` returns it. No text mode or encoding is
        requested, so this is ``bytes`` rather than ``str``.

    Raises:
        subprocess.CalledProcessError: If ``antiword`` exits with a non-zero
            status.
        FileNotFoundError: If the ``antiword`` executable cannot be found.
    """
    # Argument list (no shell), so the path is never shell-interpreted.
    command = ['antiword', file_path]
    antiword_output = subprocess.check_output(command)
    return antiword_output


def extract_text_from_docx(file_path):
    """
    Extract text from a Word file (.docx) by joining its paragraphs.

    Args:
        file_path: Passed to ``Document`` to open the file.

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``, joined
        with single spaces, or ``None`` if any exception is raised while
        opening or reading the file (the error is logged, not re-raised).
    """
    try:
        paragraphs = Document(file_path).paragraphs
        paragraph_texts = (item.text for item in paragraphs)
        return " ".join(paragraph_texts)
    except Exception as e:
        logger.error(f'Error reading Word file {file_path}, error {str(e)}')
        # Best-effort reader: failure is reported to the caller as None.
        return None


def extract_text_from_odt(file_path):
    """
    Extract text from an OpenDocument text file (.odt).

    Args:
        file_path: Passed to ``odf.opendocument.load`` to open the file.

    Returns:
        The extracted text of every ``text.P`` element in the document,
        joined with single spaces.
    """
    document = load(file_path)
    paragraphs = document.getElementsByType(text.P)
    return ' '.join(
        teletype.extractText(paragraph) for paragraph in paragraphs
    )


def extract_text_from_word(file_path):
    """
    Extract text from a document, choosing the reader by file extension.

    The extension check is case-insensitive, so ``REPORT.DOC`` is handled the
    same way as ``report.doc``.

    Supported extensions:
        * ``.doc``, ``.dot`` -> ``extract_text_from_doc``
        * ``.docx``, ``.dotx``, ``.docm``, ``.dotm`` ->
          ``extract_text_from_docx``
        * ``.odt`` -> ``extract_text_from_odt``
        * ``.txt`` -> read directly in text mode

    Args:
        file_path: Path of the file to read (``str`` or ``os.PathLike``).

    Returns:
        Whatever the selected reader returns; for ``.txt`` files, the full
        file content as ``str``.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    # Match on a lowercased copy only; the readers receive the original path
    # so case-sensitive file systems still find the file.
    lowered = os.fspath(file_path).lower()

    if lowered.endswith(('.doc', '.dot')):
        return extract_text_from_doc(file_path)
    if lowered.endswith(('.docx', '.dotx', '.docm', '.dotm')):
        return extract_text_from_docx(file_path)
    if lowered.endswith('.odt'):
        return extract_text_from_odt(file_path)
    if lowered.endswith('.txt'):
        # No explicit encoding: the platform default is used, as before.
        with open(file_path, 'r') as file:
            return file.read()

    logger.error(msg='Unsupported file format',
                 file_path=file_path)
    raise ValueError(f'Unsupported file format {file_path}')

jorgesisco avatar Jan 13 '24 16:01 jorgesisco