markitdown
markitdown copied to clipboard
feat: adding support for images inside docx
@microsoft-github-policy-service agree
@microsoft-github-policy-service agree
After adding the keep_data_uris flag, I'm just doing some post-processing with some vibe-coded utils, and it's working great. Would love to see this capability make it into master.
def _get_llm_description_from_base64(base64_str: str, extension: str, client: Any, model: str, prompt: Optional[str] = None) -> str:
"""Get LLM description for a base64-encoded image string."""
if prompt is None or prompt.strip() == "":
prompt = "Write a detailed caption for this image."
# Remove data URI prefix if present
if ',' in base64_str:
base64_str = base64_str.split(',', 1)[1]
# Create data URI
content_type, _ = mimetypes.guess_type("_dummy." + extension)
if content_type is None:
content_type = "image/jpeg"
data_uri = f"data:{content_type};base64,{base64_str}"
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {"url": data_uri},
},
],
}
]
response = client.chat.completions.create(model=model, messages=messages)
return response.choices[0].message.content.strip()
def replace_base64_images_with_descriptions(md_result, llm_client, llm_model, llm_prompt: Optional[str] = None, filename: Optional[str] = None):
    """Replace inline base64 images in ``md_result.text_content`` with LLM captions.

    Every Markdown image of the form
    ``![alt](data:image/<ext>;base64,<payload>)`` is rewritten as the
    reference-style image ``![<description>][<ref>]``, where ``<description>``
    comes from ``_get_llm_description_from_base64``.

    Args:
        md_result: Object with a writable ``text_content`` attribute holding
            the Markdown text. It is modified in place.
        llm_client: Client forwarded to ``_get_llm_description_from_base64``.
        llm_model: Model identifier forwarded to the same helper.
        llm_prompt: Optional prompt forwarded to the same helper.
        filename: Reference label used for every replaced image. When empty
            or ``None``, labels are generated as ``image1``, ``image2``, ...
            in order of appearance.

    Returns:
        The same ``md_result`` object that was passed in.
    """
    text_content = md_result.text_content
    if not text_content:
        # Nothing to scan; this also avoids handing None to re.sub.
        return md_result

    base64_pattern = r'!\[([^\]]*)\]\((data:image/([a-zA-Z0-9]+);base64,([^\)]+))\)'
    image_counter = 1

    def _repl(match):
        nonlocal image_counter
        extension = match.group(3)
        base64_str = match.group(4)
        description = _get_llm_description_from_base64(base64_str, extension, llm_client, llm_model, llm_prompt)
        # Use provided filename or generate a placeholder
        ref = filename if filename else f"image{image_counter}"
        image_counter += 1
        return f"![{description}][{ref}]"

    md_result.text_content = re.sub(base64_pattern, _repl, text_content)
    return md_result
Adding support #277
#277 adding support
#277