markitdown icon indicating copy to clipboard operation
markitdown copied to clipboard

document_intelligence is not working

Open ai-puppy opened this issue 11 months ago • 8 comments

from markitdown import MarkItDown

md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>")
result = md.convert("test.pdf")
print(result.text_content)

code is throwing this error

DefaultAzureCredential failed to retrieve a token from the included credentials.
Attempted credentials:
        EnvironmentCredential: EnvironmentCredential authentication unavailable. Environment variables are not fully configured.
Visit https://aka.ms/azsdk/python/identity/environmentcredential/troubleshoot to troubleshoot this issue.
        ManagedIdentityCredential: ManagedIdentityCredential authentication unavailable, no response from the IMDS endpoint.
        SharedTokenCacheCredential: SharedTokenCacheCredential authentication unavailable. No accounts were found in the cache.
        AzureCliCredential: Azure CLI not found on path
        AzurePowerShellCredential: PowerShell is not installed
        AzureDeveloperCliCredential: Azure Developer CLI could not be found. Please visit https://aka.ms/azure-dev for installation instructions and then,once installed, authenticate to your Azure account using 'azd auth login'.
To mitigate this issue, please refer to the troubleshooting guidelines here at https://aka.ms/azsdk/python/identity/defaultazurecredential/troubleshoot.
CropBox missing from /Page, defaulting to MediaBox

same set up with document_intelligence sdk directly is working?

# import libraries
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest

# set `<your-endpoint>` and `<your-key>` variables with the values from the Azure portal
endpoint = "<your-endpoint>"
key = "<your-key>"

# helper functions

def get_words(page, line):
    """Return the words of ``page`` that fall inside any span of ``line``.

    Words are kept in the order they appear in ``page.words``; membership is
    decided by ``_in_span`` against ``line.spans``.
    """
    return [word for word in page.words if _in_span(word, line.spans)]


def _in_span(word, spans):
    """Tell whether ``word`` lies entirely within at least one of ``spans``.

    A word is inside a span when its start offset is not before the span's
    start and its end (offset + length) is not past the span's end. Returns
    ``False`` when ``spans`` is empty.
    """
    return any(
        span.offset <= word.span.offset
        and word.span.offset + word.span.length <= span.offset + span.length
        for span in spans
    )


def analyze_layout():
    """Analyze a sample PDF with the "prebuilt-layout" model and print the findings.

    Builds a ``DocumentIntelligenceClient`` from the module-level ``endpoint``
    and ``key``, submits the sample document by URL, and prints: whether any
    style is flagged as handwritten, each page's dimensions, its lines (with
    their words and confidences), its selection marks, and finally every
    table with its cells and bounding regions.
    """
    # sample document
    sample_url = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/sample-layout.pdf"

    client = DocumentIntelligenceClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )
    request = AnalyzeDocumentRequest(url_source=sample_url)
    operation = client.begin_analyze_document("prebuilt-layout", request)

    result: AnalyzeResult = operation.result()

    # Styles may be absent altogether; that counts as "no handwriting".
    if not result.styles or not any(
        style.is_handwritten for style in result.styles
    ):
        print("Document does not contain handwritten content")
    else:
        print("Document contains handwritten content")

    for page in result.pages:
        print(f"----Analyzing layout from page #{page.page_number}----")
        print(
            f"Page has width: {page.width} and height: {page.height}, measured with unit: {page.unit}"
        )

        # `or []` skips the section when the page has no lines.
        for line_idx, line in enumerate(page.lines or []):
            words = get_words(page, line)
            print(
                f"...Line # {line_idx} has word count {len(words)} and text '{line.content}' "
                f"within bounding polygon '{line.polygon}'"
            )

            for word in words:
                print(
                    f"......Word '{word.content}' has a confidence of {word.confidence}"
                )

        for selection_mark in page.selection_marks or []:
            print(
                f"Selection mark is '{selection_mark.state}' within bounding polygon "
                f"'{selection_mark.polygon}' and has a confidence of {selection_mark.confidence}"
            )

    for table_idx, table in enumerate(result.tables or []):
        print(
            f"Table # {table_idx} has {table.row_count} rows and "
            f"{table.column_count} columns"
        )
        for region in table.bounding_regions or []:
            print(
                f"Table # {table_idx} location on page: {region.page_number} is {region.polygon}"
            )
        for cell in table.cells:
            print(
                f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'"
            )
            for region in cell.bounding_regions or []:
                print(
                    f"...content on page {region.page_number} is within bounding polygon '{region.polygon}'"
                )

    print("----------------------------------------")


# Run the layout analysis only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    analyze_layout()

Any idea why? I am pretty sure I have the endpoint URL and API key set up properly.

ai-puppy avatar Apr 07 '25 02:04 ai-puppy

Ahh yes, it's because the default uses managed identity. This PR should solve the problem I think: https://github.com/microsoft/markitdown/pull/1151

Try setting the "AZURE_API_KEY" environment variable.

Otherwise,

You can do:

md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>", docintel_credential=AzureKeyCredential("<your_key>"))

afourney avatar Apr 07 '25 16:04 afourney

Ahh yes, it's because the default uses managed identity. This PR should solve the problem I think: #1151

Try setting the "AZURE_API_KEY" environment variable.

Otherwise,

You can do:

md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>", docintel_credential=AzureKeyCredential("<your_key>"))

Thanks for the reply @afourney. I think I am using the latest code with your fix, and I tried your suggestion, but I still get the same error. It is kind of strange, because the way you use

document_intelligence_client = DocumentIntelligenceClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

seems pretty solid to me, I will try to attach a debugger and see why

ai-puppy avatar Apr 07 '25 20:04 ai-puppy

Thanks! The option is pretty new, so any help on this front is appreciated. It’s meant to work, and we'll shake out the bugs I'm sure.

afourney avatar Apr 07 '25 21:04 afourney

Hi, I am having the same issue. I cannot make it work, although for me it is not giving any error; it just extracts plain text from the PDF. Not sure why this happens, still debugging.

But I noticed that the PR is not included in the latest release (v0.1.1), so that may be the reason why it is not working. In the code below, docintel_credential is missing: v0.1.1 - packages/markitdown/src/markitdown/_markitdown.py

basshbj avatar Apr 08 '25 04:04 basshbj

@afourney A release would be great—docintel_credential isn’t working for me either.

Isydmr avatar Apr 08 '25 12:04 Isydmr

@afourney although this PR is merged, I guess it's not published. We are not seeing these changes in the latest version. Please check.

saibhaskerraju avatar Apr 11 '25 16:04 saibhaskerraju

@afourney

please publish this commit. we want to use this ASAP https://github.com/microsoft/markitdown/commit/9e067c42b647eaf14e842e70e47540b36c0c4a08

saibhaskerraju avatar Apr 11 '25 16:04 saibhaskerraju

@saibhaskerraju there have been other large changes (formula extraction for word documents) that I want a little more feedback on before I publish a new release (e.g., to PyPi). That said, I'll see if I can publish a pre-release today to make it available for easier distribution.

afourney avatar Apr 14 '25 16:04 afourney

FYI It worked when I changed it to v0.1.2 and did the following

from markitdown import MarkItDown
from azure.core.credentials import AzureKeyCredential
ALLOWED_EXTENSIONS = ['pdf']

md = MarkItDown(docintel_endpoint = "<document_intelligence_endpoint>", docintel_file_types = ALLOWED_EXTENSIONS, docintel_credential = AzureKeyCredential("<azure_api_key>"))

result = md.convert('test.pdf')

print(result.text_content)

kuroponzu avatar Jun 20 '25 00:06 kuroponzu

I have tried Document Intelligence, but there is no image description added in the final markdown file.

import os
from markitdown import MarkItDown
from dotenv import load_dotenv
load_dotenv()
from azure.core.credentials import AzureKeyCredential

AZURE_DOC_INT_ENDPOINT = os.getenv("AZURE_FORM_RECOGNIZER_ENDPOINT")
AZURE_DOC_INT_KEY = os.getenv("AZURE_FORM_RECOGNIZER_KEY")
os.environ["AZURE_API_KEY"] = AZURE_DOC_INT_KEY

def save_to_markdown(output_dir: str, filename: str, text: str):
    """Write ``text`` to ``output_dir/filename`` as UTF-8 and report the path.

    ``output_dir`` is created (including missing parents) if it does not
    exist. An existing file with the same name is overwritten. The final
    path is printed as ``Saved: <path>``.
    """
    os.makedirs(output_dir, exist_ok=True)
    destination = os.path.join(output_dir, filename)
    with open(destination, mode="w", encoding="utf-8") as handle:
        handle.write(text)
    print(f"Saved: {destination}")

# Input/output folders
input_folder = "sop"
output_folder = "markdowns2"

# Initialize MarkItDown
md = MarkItDown(docintel_endpoint = AZURE_DOC_INT_ENDPOINT, 
                docintel_file_types = ['pdf'],
                docintel_credential = AzureKeyCredential(AZURE_DOC_INT_KEY))

# Convert every PDF found directly inside input_folder
for file in os.listdir(input_folder):
    # Extension check is case-insensitive; anything that is not a PDF is skipped.
    if not file.lower().endswith(".pdf"):
        continue
    pdf_path = os.path.join(input_folder, file)
    print(f"🔄 Converting: {pdf_path}")
    result = md.convert(pdf_path)
    # Output keeps the PDF's base name, with the extension replaced by ".md".
    markdown_filename = os.path.splitext(file)[0] + ".md"
    save_to_markdown(output_folder, markdown_filename, result.markdown)

bhavyajoshi-mahindra avatar Jun 24 '25 17:06 bhavyajoshi-mahindra