grobid_client_python
grobid_client_python copied to clipboard
Extract Images / use processFulltextAssetDocument service
Hi GROBID team,
I want to extract the images from pdfs together with the conversion of the PDF (to TEI-XML) using GROBID. Using the batch command everything works fine.
The GROBID client is also working as it should, but I am missing a way to extract the images. Is it possible to add the "processFulltextAssetDocument" service to the Python grobid-client so that images can be extracted as well? The documentation says: "There is a web service doing the same, returning everything in a big zip file, processFulltextAssetDocument, still usable but deprecated." When I use it as the service
service="processFulltextAssetDocument"
calling
"client.process(service, input, output)"
it fails with ERROR 406.
Thanks in advance!
I have never used it to extract images, but you can extract images using the coordinates and PyMuPDF:
For example, you can refer to my code:
import fitz
# Running per-type counters used to number the saved image files.
# The original snippet kept these on an object (``self.fig_num`` /
# ``self.table_num``) that a plain function has no access to; module-level
# state keeps the numbering continuous across calls so files from one call
# do not overwrite files from a previous one.
# NOTE(review): the original start value is not shown; 0 is assumed here.
_IMAGE_COUNTERS = {"figure": 0, "table": 0}


def _merge_boxes_by_page(coord):
    """Return one enclosing box ``(x1, y1, x2, y2)`` per page number."""
    merged = {}
    for entry in coord:
        if not entry.strip():
            # "a;b;".split(";") yields a trailing empty string; ignore it.
            continue
        fields = entry.split(",")
        page = int(fields[0])
        # Each entry is "page,x,y,width,height"; convert to corner form.
        x1, y1 = float(fields[1]), float(fields[2])
        x2, y2 = x1 + float(fields[3]), y1 + float(fields[4])
        if page in merged:
            # Grow the existing box so it also covers this one.
            px1, py1, px2, py2 = merged[page]
            merged[page] = (
                min(px1, x1),
                min(py1, y1),
                max(px2, x2),
                max(py2, y2),
            )
        else:
            merged[page] = (x1, y1, x2, y2)
    return merged


def get_image_from_corrd(pdf_path, coord, to_path, fig_type):
    """Render the region described by *coord* to PNG files, one per page.

    All boxes that fall on the same page are merged into a single enclosing
    rectangle, that rectangle is rendered with PyMuPDF, and the result is
    written to ``<to_path>/<fig_type>_<n>.png`` where ``n`` is a running
    counter for that ``fig_type``.

    Args:
        pdf_path: Path of the PDF to render from.
        coord: Iterable of ``"page,x,y,width,height"`` strings. Blank
            entries are skipped.
        to_path: Existing directory in which the PNG files are written.
        fig_type: ``"figure"`` or ``"table"``. Any other value saves nothing,
            as in the original snippet.

    Returns:
        List of the file paths that were written, in page-insertion order.

    Raises:
        ValueError: If a non-blank entry contains a non-numeric field.
        IndexError: If a non-blank entry has fewer than five fields.
    """
    # Local import: the surrounding snippet only imports ``fitz``.
    import os

    if fig_type not in _IMAGE_COUNTERS:
        # Unknown types never produced output; skip opening the PDF at all.
        return []

    # Parse before opening the document so a malformed entry cannot leak it.
    page_boxes = _merge_boxes_by_page(coord)

    to_paths = []
    doc = fitz.open(pdf_path)
    try:
        for page_number, (x1, y1, x2, y2) in page_boxes.items():
            # page_number is 1-based, fitz uses 0-based indexing.
            page = doc.load_page(page_number - 1)
            # Clip the page to the merged rectangle and rasterize it.
            pix = page.get_pixmap(clip=fitz.Rect(x1, y1, x2, y2))
            out_path = os.path.join(
                to_path, f"{fig_type}_{_IMAGE_COUNTERS[fig_type]}.png"
            )
            pix.save(out_path)
            to_paths.append(out_path)
            _IMAGE_COUNTERS[fig_type] += 1
    finally:
        # Always release the document, even if rendering or saving failed.
        doc.close()
    return to_paths
# Example invocation: `coords` is a ";"-separated string of boxes, so it is
# split into one "page,x,y,width,height" entry per box before being passed in.
# NOTE(review): presumably `coords` comes from a GROBID TEI `coords`
# attribute and `types` is "figure" or "table" - confirm against the caller.
get_image_from_corrd(
    pdf_path=pdf_path,
    coord=coords.split(";"),
    to_path=img_path,
    fig_type=types,
)