markitdown feat: adding support for images inside docx

Jan 10 '25 12:01 PedroMiolaSilva

@microsoft-github-policy-service agree

@microsoft-github-policy-service agree

Jan 10 '25 14:01 PedroMiolaSilva

After adding the keep_data_uris flag, I'm just doing some post-processing with some vibe-coded utils, and it's working great. Would love to see this capability make it into master.

def _get_llm_description_from_base64(
    base64_str: str,
    extension: str,
    client: Any,
    model: str,
    prompt: Optional[str] = None,
) -> str:
    """Ask a chat-completions client to caption a base64-encoded image.

    Args:
        base64_str: Base64 image payload. If it contains a comma, everything
            up to and including the first comma is treated as a data-URI
            prefix and dropped.
        extension: Image file extension (e.g. ``"png"``) used to guess the
            MIME type placed in the data URI sent to the model.
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        model: Model name forwarded to ``client.chat.completions.create``.
        prompt: Instruction sent with the image. ``None``, empty, or
            whitespace-only falls back to a default caption prompt.

    Returns:
        The first choice's message content with surrounding whitespace
        stripped, or an empty string when the message has no content.
    """
    if not prompt or not prompt.strip():
        prompt = "Write a detailed caption for this image."

    # Remove data URI prefix if present; base64 itself never contains a comma.
    _, separator, payload = base64_str.partition(",")
    if separator:
        base64_str = payload

    # Rebuild a data URI with a MIME type derived from the extension.
    content_type, _ = mimetypes.guess_type("_dummy." + extension)
    if content_type is None:
        # Unknown extension: fall back to JPEG.
        content_type = "image/jpeg"
    data_uri = f"data:{content_type};base64,{base64_str}"

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": data_uri},
                },
            ],
        }
    ]
    response = client.chat.completions.create(model=model, messages=messages)

    # The message content may be None; calling .strip() on it would raise
    # AttributeError, so treat a missing caption as empty.
    content = response.choices[0].message.content
    if content is None:
        return ""
    return content.strip()

def replace_base64_images_with_descriptions(md_result, llm_client, llm_model, llm_prompt: Optional[str] = None, filename: Optional[str] = None):
    """Replace inline base64 images in ``md_result.text_content`` with LLM captions.

    Every Markdown image of the form ``![alt](data:image/<ext>;base64,<data>)``
    is rewritten to ``![<caption>][<ref>]``. ``<caption>`` comes from
    ``_get_llm_description_from_base64``; ``<ref>`` is ``filename`` when it is
    truthy, otherwise ``image1``, ``image2``, ... in order of appearance.

    Note: the original alt text is discarded, and no ``[ref]: ...`` reference
    definition is appended to the document.

    Args:
        md_result: Object with a ``text_content`` string attribute; it is
            updated in place.
        llm_client: Client forwarded to the caption helper.
        llm_model: Model name forwarded to the caption helper.
        llm_prompt: Optional prompt forwarded to the caption helper.
        filename: Optional label used as the reference for every image.

    Returns:
        The same ``md_result`` object, with ``text_content`` rewritten.
    """
    base64_pattern = r'!\[([^\]]*)\]\((data:image/([a-zA-Z0-9]+);base64,([^\)]+))\)'
    image_counter = 1

    def _repl(match):
        nonlocal image_counter
        # Group 3 is the image subtype (used as the extension); group 4 is
        # the raw base64 payload.
        description = _get_llm_description_from_base64(
            match.group(4), match.group(3), llm_client, llm_model, llm_prompt
        )
        # Use provided filename or generate a numbered placeholder.
        ref = filename if filename else f"image{image_counter}"
        image_counter += 1
        return f"![{description}][{ref}]"

    md_result.text_content = re.sub(base64_pattern, _repl, md_result.text_content)
    return md_result

Apr 28 '25 03:04 joshjm

Adding support #277

May 18 '25 15:05 Sheikh1980

#277 adding support

May 18 '25 15:05 Sheikh1980

#277

May 18 '25 15:05 Sheikh1980