antiword
antiword copied to clipboard
Antiword can't extract text from the header of a .doc file.
Is there a way to extract content in .doc files?
Yes, there is.
In my case I used Python with some libraries that require Antiword to be installed on the machine. I hope it helps.
If you run this code in Docker, I would suggest working with in-memory temp files. Good luck.
Note: I am using Antiword installed via `apt install`. I am not sure whether that Antiword is the same as the one in this repo; I hope it is, since I am trying to install it in an Amazon Linux container and `yum` does not have Antiword.
import io
from app.logger import logger
import os
import tempfile
import pypdf
from docx import Document
import subprocess
import numpy as np
from pdf2image import convert_from_bytes
import cv2
from odf import text, teletype
from odf.opendocument import load
def extract_text_from_doc(file_path):
    """
    Extract text from a legacy Word file (.doc) by running ``antiword``.

    Args:
        file_path: Path of the .doc file, passed to ``antiword`` as its only
            argument.

    Returns:
        The standard output of ``antiword`` as ``bytes``; ``check_output`` is
        called without ``text``/``encoding``, so no decoding takes place here.

    Raises:
        subprocess.CalledProcessError: If ``antiword`` exits with a non-zero
            status.
        FileNotFoundError: If the ``antiword`` executable cannot be found.
    """
    # Argument list (no shell), so the path is never interpreted by a shell.
    command = ['antiword', file_path]
    raw_output = subprocess.check_output(command)
    return raw_output
def extract_text_from_docx(file_path):
    """
    Extract the paragraph text of a .docx-style file.

    Args:
        file_path: Path handed to ``Document`` to open the file.

    Returns:
        The text of every paragraph in ``doc.paragraphs`` joined with a single
        space, or ``None`` when anything goes wrong (the error is logged and
        not re-raised).
    """
    try:
        document = Document(file_path)
        paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
        return " ".join(paragraph_texts)
    except Exception as e:
        # Best-effort: report the failure and fall through to the None return.
        logger.error(f'Error reading Word file {file_path}, error {str(e)}')
    return None
def extract_text_from_odt(file_path):
    """
    Extract the paragraph text of an OpenDocument text file (.odt).

    Args:
        file_path: Path handed to ``odf.opendocument.load``.

    Returns:
        The extracted text of every ``text.P`` element in the document,
        joined with a single space.
    """
    document = load(file_path)
    paragraphs = document.getElementsByType(text.P)
    return ' '.join(teletype.extractText(paragraph) for paragraph in paragraphs)
def extract_text_from_word(file_path):
    """
    Extract text from a document, choosing the extractor by file extension.

    Supported extensions: ``.doc``/``.dot`` (antiword), ``.docx``/``.dotx``/
    ``.docm``/``.dotm`` (python-docx), ``.odt`` (odfpy) and ``.txt`` (read
    directly). The extension is matched case-insensitively, so ``REPORT.DOC``
    is handled the same way as ``report.doc``.

    Args:
        file_path: Path of the file to read, as a string.

    Returns:
        Whatever the selected extractor returns; for ``.txt`` files, the
        content as read by ``open(file_path, 'r')``.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    # Only the comparison uses the lower-cased copy; the original path is what
    # gets opened, so case-sensitive file systems still resolve the right file.
    lowered = file_path.lower()

    if lowered.endswith(('.doc', '.dot')):
        return extract_text_from_doc(file_path)
    if lowered.endswith(('.docx', '.dotx', '.docm', '.dotm')):
        return extract_text_from_docx(file_path)
    if lowered.endswith('.odt'):
        return extract_text_from_odt(file_path)
    if lowered.endswith('.txt'):
        with open(file_path, 'r') as file:
            return file.read()

    # NOTE(review): this keyword-style call assumes the project logger accepts
    # arbitrary keyword arguments (structured logging); a plain stdlib
    # logging.Logger would reject ``file_path=`` - confirm against app.logger.
    logger.error(msg=f'Unsupported file format',
                 file_path=file_path)
    raise ValueError(f'Unsupported file format {file_path}')