Azure Document Intelligence (formrecognizer) - 'InvalidContent' when passing pdf

Daniel
December 12, 2024
66 views
1 vote
2 Answers

I upload a pdf file to my streamlit application like this:

import streamlit as st

# Let the user pick a PDF in the Streamlit UI.
uploaded_file = st.file_uploader("Upload pdf file", type="pdf")
# Hand the upload to the Azure analysis helper shown further down.
# NOTE(review): that helper is declared as (uploaded_file, secrets); the
# secrets argument is not passed in this snippet.
result = analyze_general_document(uploaded_file)

I want to analyze this pdf using the Azure Document Intelligence python package like this:

from io import BytesIO
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient


def set_client(secrets: dict):
    """Build a ``DocumentAnalysisClient`` from a secrets mapping.

    The endpoint is read from ``secrets["AI_DOCS_BASE"]`` and the API key
    from ``secrets["AI_DOCS_KEY"]``; a missing entry raises ``KeyError``.
    """
    endpoint_url = secrets["AI_DOCS_BASE"]
    credential = AzureKeyCredential(secrets["AI_DOCS_KEY"])
    return DocumentAnalysisClient(endpoint=endpoint_url, credential=credential)


def analyze_general_document(uploaded_file, secrets: dict):
    """Start a ``prebuilt-document`` analysis of the uploaded file.

    Prints the upload's reported type and size, builds a client through
    ``set_client(secrets)`` and passes the upload object itself as the
    document to ``begin_analyze_document``.
    """
    print(f"File type: {uploaded_file.type}")
    print(f"File size: {uploaded_file.size} bytes")
    client = set_client(secrets)
    # poller = client.begin_analyze_document_from_url("prebuilt-document", formUrl)
    # NOTE(review): the upload is handed over as-is; presumably it is consumed
    # from its current stream position, so an earlier read elsewhere would
    # leave nothing to send -- confirm.
    poller = client.begin_analyze_document("prebuilt-document", document=uploaded_file)

I can successfully print the file type and file size as you can see in the terminal output:

File type: application/pdf
File size: 6928426 bytes

Also opening the file with PyMuPDF works fine as well.

However, the method begin_analyze_document throws the following exception:

Traceback (most recent call last):
  File "C:\Users\myuser\AppData\Local\miniconda3\envs\projectai\Lib\site-packages\streamlit\runtime\scriptrunner\exec_code.py", line 88, in exec_func_with_error_handling
    result = func()
             ^^^^^^
  File "C:\Users\myuser\AppData\Local\miniconda3\envs\projectai\Lib\site-packages\streamlit\runtime\scriptrunner\script_runner.py", line 579, in code_to_exec
    exec(code, module.__dict__)
  File "C:UsersmyuserDocumentsvisual-studio-codeprojectproject-ai-docswebappapp.py", line 79, in <module>
    main()
  File "C:UsersmyuserDocumentsvisual-studio-codeprojectproject-ai-docswebappapp.py", line 61, in main
    zip_content = process_pdf(uploaded_file, secrets)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:UsersmyuserDocumentsvisual-studio-codeprojectproject-ai-docswebappapp_backend.py", line 40, in process_pdf
    analyze_general_document(uploaded_file, secrets)
  File "C:UsersmyuserDocumentsvisual-studio-codeprojectproject-ai-docswebappaz_document_intelligence.py", line 18, in analyze_general_document
    poller = client.begin_analyze_document("prebuilt-document", document=uploaded_file)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\myuser\AppData\Local\miniconda3\envs\projectai\Lib\site-packages\azure\core\tracing\decorator.py", line 105, in wrapper_use_tracer
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\myuser\AppData\Local\miniconda3\envs\projectai\Lib\site-packages\azure\ai\formrecognizer\_document_analysis_client.py", line 129, in begin_analyze_document
    return _client_op_path.begin_analyze_document(  # type: ignore
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\myuser\AppData\Local\miniconda3\envs\projectai\Lib\site-packages\azure\core\tracing\decorator.py", line 105, in wrapper_use_tracer
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\myuser\AppData\Local\miniconda3\envs\projectai\Lib\site-packages\azure\ai\formrecognizer\_generated\v2023_07_31\operations\_document_models_operations.py", line 518, in begin_analyze_document
    raw_result = self._analyze_document_initial(  # type: ignore
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\myuser\AppData\Local\miniconda3\envs\projectai\Lib\site-packages\azure\ai\formrecognizer\_generated\v2023_07_31\operations\_document_models_operations.py", line 443, in _analyze_document_initial
    raise HttpResponseError(response=response)
azure.core.exceptions.HttpResponseError: (InvalidRequest) Invalid request.
Code: InvalidRequest
Message: Invalid request.
Inner error: {
    "code": "InvalidContent",
    "message": "The file is corrupted or format is unsupported. Refer to documentation for the list of supported formats."
}

Why is the pdf considered invalid?
I also tried wrapping it in a BytesIO object like this but it didn’t work either:

def analyze_general_document(uploaded_file, secrets: dict):
    """Start a ``prebuilt-document`` analysis from the upload's bytes.

    Prints the upload's reported type and size, reads the upload into
    memory, and passes the bytes wrapped in a ``BytesIO`` as the document
    to ``begin_analyze_document``.
    """
    print(f"File type: {uploaded_file.type}")
    print(f"File size: {uploaded_file.size} bytes")
    # Read the file as bytes
    # NOTE(review): read() starts at the current stream position and is not
    # preceded by seek(0) here; if the upload was already read elsewhere this
    # presumably yields empty bytes -- confirm.
    file_bytes = uploaded_file.read()
    client = set_client(secrets)
    # poller = client.begin_analyze_document_from_url("prebuilt-document", formUrl)
    poller = client.begin_analyze_document("prebuilt-document", document=BytesIO(file_bytes))

Answers

Chosen as BEST ANSWER

Venkatesan's answer definitely works, so I marked it as the accepted answer. In the meantime, I also found another approach that suits my use case better, since I already use PyMuPDF in the project. I open the uploaded PDF with PyMuPDF, pass the resulting document object to my Azure Document Intelligence function, and convert it to bytes there with PyMuPDF's `.write()` method. The code looks like this:

# ./app.py
import streamlit as st
import pymupdf
from az_intelligence import analyze_general_document

uploaded_file = st.file_uploader("Upload pdf file", type="pdf")
# st.file_uploader returns None until the user has chosen a file, so only run
# the analysis once an upload actually exists.
if uploaded_file is not None:
    # getvalue() returns the complete buffer regardless of the current stream
    # position, so an earlier read of the upload cannot truncate the PDF.
    with pymupdf.open(stream=uploaded_file.getvalue(), filetype="pdf") as pdf:  # Open as PDF
        # analyze_general_document is declared as (pdf, secrets), so the
        # secrets mapping has to be passed as well.
        # NOTE(review): assumes AI_DOCS_BASE / AI_DOCS_KEY are stored in
        # Streamlit's secrets -- swap in your own mapping if they live elsewhere.
        result = analyze_general_document(pdf, st.secrets)  # Pass in the pymupdf object

# ./az_intelligence.py
from io import BytesIO
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient


def set_client(secrets: dict):
    """Return a ``DocumentAnalysisClient`` configured from ``secrets``.

    Uses ``secrets["AI_DOCS_BASE"]`` as the service endpoint and
    ``secrets["AI_DOCS_KEY"]`` as the API key.
    """
    base_url, api_key = secrets["AI_DOCS_BASE"], secrets["AI_DOCS_KEY"]
    return DocumentAnalysisClient(
        endpoint=base_url,
        credential=AzureKeyCredential(api_key),
    )


def analyze_general_document(pdf, secrets: dict):
    """Analyze an open PyMuPDF document with the ``prebuilt-document`` model.

    Args:
        pdf: An open PyMuPDF document; it is serialized with ``pdf.write()``.
        secrets: Mapping passed to ``set_client`` to build the Azure client.

    Returns:
        The analysis result obtained from ``poller.result()``.
    """
    client = set_client(secrets)
    poller = client.begin_analyze_document("prebuilt-document", document=pdf.write())  # Convert pdf to bytes using .write()
    # Wait for the long-running operation and hand the result back; the caller
    # assigns this function's return value, so returning nothing would leave
    # it with None.
    return poller.result()

(Edit)

Azure Document Intelligence (formrecognizer) – ‘InvalidContent’ when passing pdf

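You can use the code below to analyze a PDF file uploaded through Streamlit with Azure Document Intelligence in Python. The important step is resetting the stream position with `seek(0)` before calling `read()`: if the upload has already been read elsewhere (for example by another PDF library), `read()` returns no further bytes and the service would receive empty content.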

Code:

import streamlit as st
from io import BytesIO
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
from PyPDF2 import PdfReader

def set_client(secrets: dict):
    """Create the Azure ``DocumentAnalysisClient`` from a secrets mapping.

    ``secrets`` must provide ``"AI_DOCS_BASE"`` (service endpoint) and
    ``"AI_DOCS_KEY"`` (API key).
    """
    service_endpoint = secrets["AI_DOCS_BASE"]
    api_key = secrets["AI_DOCS_KEY"]
    key_credential = AzureKeyCredential(api_key)
    analysis_client = DocumentAnalysisClient(endpoint=service_endpoint, credential=key_credential)
    return analysis_client

def validate_pdf(file):
    """Check that ``file`` can be parsed as a PDF with at least one page.

    Args:
        file: A seekable binary stream containing the PDF.

    Raises:
        ValueError: If the stream cannot be parsed or the PDF has no pages.
            The underlying error is attached as ``__cause__``.
    """
    try:
        file.seek(0)  # Reset pointer
        reader = PdfReader(file)
        page_count = len(reader.pages)
        if page_count == 0:
            raise ValueError("The PDF has no pages.")
    except Exception as e:
        # Chain the original error so the real parsing failure stays visible
        # in the traceback.
        raise ValueError(f"PDF validation failed: {e}") from e
    print(f"PDF is valid with {page_count} pages.")
    # Leave the stream at the start so a later read() sees the whole file.
    file.seek(0)

def check_pdf_metadata(file):
    """Reject encrypted PDFs and print the page count of unencrypted ones.

    Rewinds ``file`` to the start, parses it with ``PdfReader`` and raises
    ``ValueError`` when the reader reports the document as encrypted.
    """
    file.seek(0)  # start parsing from the beginning of the stream
    pdf_reader = PdfReader(file)
    if not pdf_reader.is_encrypted:
        page_total = len(pdf_reader.pages)
        print(f"PDF has {page_total} pages and is not encrypted.")
        return
    raise ValueError("Encrypted PDFs are not supported.")

def analyze_general_document(uploaded_file, secrets: dict):
    """Validate an uploaded PDF and analyze it with ``prebuilt-document``.

    Args:
        uploaded_file: Seekable binary stream holding the uploaded PDF.
        secrets: Mapping passed to ``set_client`` to build the Azure client.

    Returns:
        The analysis result obtained from ``poller.result()``.

    Raises:
        ValueError: Propagated from ``validate_pdf`` / ``check_pdf_metadata``.
        RuntimeError: If submitting the document or waiting for the result
            fails; the original error is attached as ``__cause__``.
    """
    validate_pdf(uploaded_file)
    check_pdf_metadata(uploaded_file)

    # The validation helpers parse the stream, so rewind before reading the
    # bytes that are sent to the service.
    uploaded_file.seek(0)  # Reset pointer
    file_bytes = uploaded_file.read()
    client = set_client(secrets)
    try:
        print("Sending file to Azure Document Intelligence...")
        poller = client.begin_analyze_document("prebuilt-document", document=BytesIO(file_bytes))
        result = poller.result()
    except Exception as e:
        # Chain the service error so its details survive the re-raise.
        raise RuntimeError(f"Azure Document Intelligence error: {e}") from e
    print("Analysis successful!")
    return result

# Streamlit Application
def main():
    """Render the Streamlit page: upload a PDF, analyze it, show the result.

    Nothing beyond the title and uploader is rendered until a file has been
    uploaded. Any exception raised during analysis or display is reported
    in the page through ``st.error``.
    """
    st.title("Azure Document Intelligence PDF Analyzer")
    uploaded_file = st.file_uploader("Upload PDF file", type="pdf")

    # Guard clause: nothing to do until the user has provided a file.
    if uploaded_file is None:
        return

    # Echo basic details of the upload.
    st.write(f"File Name: {uploaded_file.name}")
    st.write(f"File Type: {uploaded_file.type}")
    st.write(f"File Size: {uploaded_file.size} bytes")

    # Endpoint and key used to build the Azure client.
    secrets = dict(
        AI_DOCS_BASE="https://xxxxxxx.cognitiveservices.azure.com/",
        AI_DOCS_KEY="xxxxx",
    )

    try:
        analysis = analyze_general_document(uploaded_file, secrets)
        st.success("PDF analysis completed!")
        st.json(analysis.to_dict())  # show the result as JSON in the page
    except Exception as err:
        st.error(f"Error: {err}")

# Call main() only when this file is executed as the main module, not on import.
if __name__ == "__main__":
    main()

Output:

PS C:\Users\xxxx> streamlit run set.py

  You can now view your Streamlit app in your browser.

  Local URL: http://localhost:8xxx1
  Network URL: http://192.168.1.8:8xxx1

PDF is valid with 20 pages.
PDF has 20 pages and is not encrypted.
Sending file to Azure Document Intelligence...
Analysis successful!