document_intelligence is not working
# Minimal reproduction: convert a PDF with only the endpoint configured.
from markitdown import MarkItDown
# NOTE: only docintel_endpoint is passed here; no docintel_credential is supplied.
md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>")
result = md.convert("test.pdf")
print(result.text_content)
code is throwing this error
DefaultAzureCredential failed to retrieve a token from the included credentials.
Attempted credentials:
EnvironmentCredential: EnvironmentCredential authentication unavailable. Environment variables are not fully configured.
Visit https://aka.ms/azsdk/python/identity/environmentcredential/troubleshoot to troubleshoot this issue.
ManagedIdentityCredential: ManagedIdentityCredential authentication unavailable, no response from the IMDS endpoint.
SharedTokenCacheCredential: SharedTokenCacheCredential authentication unavailable. No accounts were found in the cache.
AzureCliCredential: Azure CLI not found on path
AzurePowerShellCredential: PowerShell is not installed
AzureDeveloperCliCredential: Azure Developer CLI could not be found. Please visit https://aka.ms/azure-dev for installation instructions and then,once installed, authenticate to your Azure account using 'azd auth login'.
To mitigate this issue, please refer to the troubleshooting guidelines here at https://aka.ms/azsdk/python/identity/defaultazurecredential/troubleshoot.
CropBox missing from /Page, defaulting to MediaBox
However, the same setup using the Document Intelligence SDK directly is working:
# import libraries
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
# Replace the `<your-endpoint>` and `<your-key>` placeholders with the values
# from the Azure portal. Both are read as module-level globals by
# analyze_layout(); avoid committing a real key to source control.
endpoint = "<your-endpoint>"
key = "<your-key>"
# helper functions
def get_words(page, line):
    """Return the words of *page* whose spans fall inside *line*'s spans.

    Args:
        page: Object exposing a ``words`` iterable; each word has a ``span``
            with ``offset`` and ``length``.
        line: Object exposing a ``spans`` iterable of ``offset``/``length``
            spans.

    Returns:
        A list of the matching word objects, in their original order.
    """
    # A missing ``page.words`` (None) is treated as "no words" instead of
    # raising a TypeError.
    return [word for word in (page.words or []) if _in_span(word, line.spans)]
def _in_span(word, spans):
    """Return True if *word*'s span lies entirely within any span in *spans*.

    Args:
        word: Object with a ``span`` exposing ``offset`` and ``length``.
        spans: Iterable of objects exposing ``offset`` and ``length``.

    Returns:
        True when at least one span fully contains the word's
        ``[offset, offset + length]`` range; False otherwise, including when
        *spans* is empty.
    """
    return any(
        word.span.offset >= span.offset
        and (word.span.offset + word.span.length) <= (span.offset + span.length)
        for span in spans
    )
def analyze_layout():
    """Analyze a sample PDF with the ``prebuilt-layout`` model and print the result.

    Builds a ``DocumentIntelligenceClient`` from the module-level ``endpoint``
    and ``key``, submits the sample document URL for analysis, waits for the
    poller to finish, and prints:

    * whether any style in the result is flagged as handwritten,
    * per page: dimensions, lines (with their words and confidences) and
      selection marks,
    * per table: size, bounding regions and cell contents.

    Returns:
        None. All output goes to stdout.
    """
    # sample document
    form_url = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/sample-layout.pdf"

    # Key-based authentication is passed explicitly to the client.
    document_intelligence_client = DocumentIntelligenceClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-layout", AnalyzeDocumentRequest(url_source=form_url)
    )
    # Blocks until the analysis operation has completed.
    result: AnalyzeResult = poller.result()

    # A generator lets any() stop at the first handwritten style instead of
    # materializing a full list first.
    if result.styles and any(style.is_handwritten for style in result.styles):
        print("Document contains handwritten content")
    else:
        print("Document does not contain handwritten content")

    for page in result.pages:
        print(f"----Analyzing layout from page #{page.page_number}----")
        print(
            f"Page has width: {page.width} and height: {page.height}, measured with unit: {page.unit}"
        )

        if page.lines:
            for line_idx, line in enumerate(page.lines):
                # Words are matched to the line through their span offsets.
                words = get_words(page, line)
                print(
                    f"...Line # {line_idx} has word count {len(words)} and text '{line.content}' "
                    f"within bounding polygon '{line.polygon}'"
                )

                for word in words:
                    print(
                        f"......Word '{word.content}' has a confidence of {word.confidence}"
                    )

        if page.selection_marks:
            for selection_mark in page.selection_marks:
                print(
                    f"Selection mark is '{selection_mark.state}' within bounding polygon "
                    f"'{selection_mark.polygon}' and has a confidence of {selection_mark.confidence}"
                )

    if result.tables:
        for table_idx, table in enumerate(result.tables):
            print(
                f"Table # {table_idx} has {table.row_count} rows and "
                f"{table.column_count} columns"
            )
            if table.bounding_regions:
                for region in table.bounding_regions:
                    print(
                        f"Table # {table_idx} location on page: {region.page_number} is {region.polygon}"
                    )
            for cell in table.cells:
                print(
                    f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'"
                )
                if cell.bounding_regions:
                    for region in cell.bounding_regions:
                        print(
                            f"...content on page {region.page_number} is within bounding polygon '{region.polygon}'"
                        )

    print("----------------------------------------")
# Run the sample only when the file is executed as a script, not on import.
if __name__ == "__main__":
    analyze_layout()
Any idea why? I am pretty sure I have the endpoint URL and API key set up properly.
Ahh yes, it's because the default uses managed identity. This PR should solve the problem I think: https://github.com/microsoft/markitdown/pull/1151
Try setting the "AZURE_API_KEY" environment variable.
Otherwise,
You can do:
md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>", docintel_credential=AzureKeyCredential("<your_key>"))
Ahh yes, it's because the default uses managed identity. This PR should solve the problem I think: #1151
Try setting the "AZURE_API_KEY" environment variable.
Otherwise,
You can do:
md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>", docintel_credential=AzureKeyCredential("<your_key>"))
Thanks for the reply @afourney. I think I am using the latest code with your fix, and I tried your suggestion, but I still get the same error. It is kind of strange, because the way you use
document_intelligence_client = DocumentIntelligenceClient(
endpoint=endpoint, credential=AzureKeyCredential(key)
)
seems pretty solid to me, I will try to attach a debugger and see why
Thanks! The option is pretty new, so any help on this front is appreciated. It’s meant to work, and we'll shake out the bugs I'm sure.
Hi, I am having the same issue. I cannot make it work, although for me it does not give any error; it just extracts plain text from the PDF. I am not sure why this happens and am still debugging.
But I noticed that the PR is not included in the latest release (v0.1.1), so that may be the reason why it is not working.
In the below code the docintel_credential is missing
v0.1.1 - packages/markitdown/src/markitdown/_markitdown.py
@afourney A release would be great—docintel_credential isn’t working for me either.
@afourney although this PR is merged, I guess it's not published. We are not seeing these changes in the latest version. Please check.
@afourney
please publish this commit. we want to use this ASAP https://github.com/microsoft/markitdown/commit/9e067c42b647eaf14e842e70e47540b36c0c4a08
@saibhaskerraju there have been other large changes (formula extraction for word documents) that I want a little more feedback on before I publish a new release (e.g., to PyPi). That said, I'll see if I can publish a pre-release today to make it available for easier distribution.
FYI It worked when I changed it to v0.1.2 and did the following
from markitdown import MarkItDown
from azure.core.credentials import AzureKeyCredential
# File types handed to MarkItDown via docintel_file_types.
ALLOWED_EXTENSIONS = ["pdf"]

# Pass the key credential explicitly alongside the endpoint.
md = MarkItDown(
    docintel_endpoint="<document_intelligence_endpoint>",
    docintel_file_types=ALLOWED_EXTENSIONS,
    docintel_credential=AzureKeyCredential("<azure_api_key>"),
)
result = md.convert("test.pdf")
print(result.text_content)
I have tried Document Intelligence, but no image descriptions are added to the final Markdown file.
import os
from markitdown import MarkItDown
from dotenv import load_dotenv
load_dotenv()
from azure.core.credentials import AzureKeyCredential
# Read the Document Intelligence settings from the process environment
# (load_dotenv() has already been called earlier in this script).
AZURE_DOC_INT_ENDPOINT = os.getenv("AZURE_FORM_RECOGNIZER_ENDPOINT")
AZURE_DOC_INT_KEY = os.getenv("AZURE_FORM_RECOGNIZER_KEY")

# Fail fast with a readable message: os.environ only accepts str values, so
# assigning an unset (None) key below would otherwise raise an opaque
# "str expected, not NoneType" TypeError.
_missing_settings = [
    name
    for name, value in (
        ("AZURE_FORM_RECOGNIZER_ENDPOINT", AZURE_DOC_INT_ENDPOINT),
        ("AZURE_FORM_RECOGNIZER_KEY", AZURE_DOC_INT_KEY),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

# Also expose the key under AZURE_API_KEY for code that reads that variable.
os.environ["AZURE_API_KEY"] = AZURE_DOC_INT_KEY
def save_to_markdown(output_dir: str, filename: str, text: str) -> None:
    """Write *text* to ``output_dir/filename`` as UTF-8 and report the path.

    Args:
        output_dir: Directory to write into; created (with parents) if it
            does not exist yet.
        filename: Name of the file to create inside *output_dir*.
        text: Content to write. An existing file is overwritten.

    Returns:
        None. Prints ``Saved: <path>`` after the file has been written.
    """
    os.makedirs(output_dir, exist_ok=True)
    markdown_path = os.path.join(output_dir, filename)
    with open(markdown_path, "w", encoding="utf-8") as f:
        f.write(text)
    print(f"Saved: {markdown_path}")
# Folder scanned for PDFs, and folder that receives the generated Markdown.
input_folder = "sop"
output_folder = "markdowns2"

# Build the converter with the endpoint and an explicit key credential.
md = MarkItDown(
    docintel_endpoint=AZURE_DOC_INT_ENDPOINT,
    docintel_file_types=["pdf"],
    docintel_credential=AzureKeyCredential(AZURE_DOC_INT_KEY),
)
# Convert every PDF in input_folder and save each result as <name>.md in
# output_folder.
for file in os.listdir(input_folder):
    # Case-insensitive extension check; anything that is not a PDF is skipped.
    if not file.lower().endswith(".pdf"):
        continue
    pdf_path = os.path.join(input_folder, file)
    print(f"🔄 Converting: {pdf_path}")
    result = md.convert(pdf_path)
    markdown_filename = os.path.splitext(file)[0] + ".md"
    save_to_markdown(output_folder, markdown_filename, result.markdown)