skip to Main Content

I upload a pdf file to my streamlit application like this:

import streamlit as st

# Streamlit upload widget restricted to PDF files.
# NOTE(review): the widget presumably yields None until a file is chosen — confirm the call below is guarded.
uploaded_file = st.file_uploader("Upload pdf file", type="pdf")
# NOTE(review): analyze_general_document is declared below with a second `secrets` parameter
# that is not passed here — confirm against the real call site.
result = analyze_general_document(uploaded_file)

I want to analyze this PDF using the Azure Document Intelligence Python package like this:

from io import BytesIO
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient


def set_client(secrets: dict):
    """Create a DocumentAnalysisClient from the connection settings in *secrets*.

    Reads the service endpoint from ``secrets["AI_DOCS_BASE"]`` and the API key
    from ``secrets["AI_DOCS_KEY"]``; a missing key raises ``KeyError``.
    """
    service_url, api_key = secrets["AI_DOCS_BASE"], secrets["AI_DOCS_KEY"]
    return DocumentAnalysisClient(endpoint=service_url, credential=AzureKeyCredential(api_key))


def analyze_general_document(uploaded_file, secrets: dict):
    """Start a "prebuilt-document" analysis of the uploaded file.

    Prints the upload's ``type`` and ``size`` attributes, builds a client with
    ``set_client(secrets)`` and passes the upload object itself as the document.
    In the lines shown here the poller is only assigned; nothing is returned.
    """
    print(f"File type: {uploaded_file.type}")
    print(f"File size: {uploaded_file.size} bytes")
    client = set_client(secrets)
    # poller = client.begin_analyze_document_from_url("prebuilt-document", formUrl)
    # NOTE(review): if the upload stream was already read elsewhere (e.g. by PyMuPDF) its
    # position may be at the end — confirm whether uploaded_file.seek(0) is needed first.
    poller = client.begin_analyze_document("prebuilt-document", document=uploaded_file)

I can successfully print the file type and file size as you can see in the terminal output:

File type: application/pdf
File size: 6928426 bytes

Also opening the file with PyMuPDF works fine as well.

However, the method begin_analyze_document throws the following exception:

Traceback (most recent call last):
  File "C:\Users\myuser\AppData\Local\miniconda3\envs\projectai\Lib\site-packages\streamlit\runtime\scriptrunner\exec_code.py", line 88, in exec_func_with_error_handling
    result = func()
             ^^^^^^
  File "C:\Users\myuser\AppData\Local\miniconda3\envs\projectai\Lib\site-packages\streamlit\runtime\scriptrunner\script_runner.py", line 579, in code_to_exec
    exec(code, module.__dict__)
  File "C:\Users\myuser\Documents\visual-studio-code\project\project-ai-docs\webapp\app.py", line 79, in <module>
    main()
  File "C:\Users\myuser\Documents\visual-studio-code\project\project-ai-docs\webapp\app.py", line 61, in main
    zip_content = process_pdf(uploaded_file, secrets)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\myuser\Documents\visual-studio-code\project\project-ai-docs\webapp\app_backend.py", line 40, in process_pdf
    analyze_general_document(uploaded_file, secrets)
  File "C:\Users\myuser\Documents\visual-studio-code\project\project-ai-docs\webapp\az_document_intelligence.py", line 18, in analyze_general_document
    poller = client.begin_analyze_document("prebuilt-document", document=uploaded_file)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\myuser\AppData\Local\miniconda3\envs\projectai\Lib\site-packages\azure\core\tracing\decorator.py", line 105, in wrapper_use_tracer
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\myuser\AppData\Local\miniconda3\envs\projectai\Lib\site-packages\azure\ai\formrecognizer\_document_analysis_client.py", line 129, in begin_analyze_document
    return _client_op_path.begin_analyze_document(  # type: ignore
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\myuser\AppData\Local\miniconda3\envs\projectai\Lib\site-packages\azure\core\tracing\decorator.py", line 105, in wrapper_use_tracer
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\myuser\AppData\Local\miniconda3\envs\projectai\Lib\site-packages\azure\ai\formrecognizer\_generated\v2023_07_31\operations\_document_models_operations.py", line 518, in begin_analyze_document
    raw_result = self._analyze_document_initial(  # type: ignore
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\myuser\AppData\Local\miniconda3\envs\projectai\Lib\site-packages\azure\ai\formrecognizer\_generated\v2023_07_31\operations\_document_models_operations.py", line 443, in _analyze_document_initial
    raise HttpResponseError(response=response)
azure.core.exceptions.HttpResponseError: (InvalidRequest) Invalid request.
Code: InvalidRequest
Message: Invalid request.
Inner error: {
    "code": "InvalidContent",
    "message": "The file is corrupted or format is unsupported. Refer to documentation for the list of supported formats."
}

Why is the pdf considered invalid?
I also tried wrapping it in a BytesIO object like this but it didn’t work either:

def analyze_general_document(uploaded_file, secrets: dict):
    """Variant that reads the upload into memory and sends it wrapped in BytesIO.

    Prints the upload's ``type`` and ``size`` attributes, reads the stream into
    bytes, builds a client with ``set_client(secrets)`` and starts a
    "prebuilt-document" analysis. Nothing is returned in the lines shown here.
    """
    print(f"File type: {uploaded_file.type}")
    print(f"File size: {uploaded_file.size} bytes")
    # Read the file as bytes
    # NOTE(review): read() starts at the current stream position; if the upload was consumed
    # earlier this may yield empty bytes — confirm whether uploaded_file.seek(0) is needed first.
    file_bytes = uploaded_file.read()
    client = set_client(secrets)
    # poller = client.begin_analyze_document_from_url("prebuilt-document", formUrl)
    poller = client.begin_analyze_document("prebuilt-document", document=BytesIO(file_bytes))

2

Answers


  1. Chosen as BEST ANSWER

    Venkatesan's answer definitely works, so I marked it as the correct answer. In the meantime I also found another way that fits my use case better, since I already use PyMuPDF in the project anyway. Basically, I read the PDF file with PyMuPDF, pass it to my Azure Document Intelligence function, and convert it to bytes using PyMuPDF's .write() method. The code looks like this:

    # ./app.py
    import streamlit as st
    import pymupdf
    from az_intelligence import analyze_general_document
    
    uploaded_file = st.file_uploader("Upload pdf file", type="pdf")
    # st.file_uploader returns None until the user has picked a file, so skip the
    # analysis on those reruns instead of calling .read() on None.
    if uploaded_file is not None:
        with pymupdf.open(stream=uploaded_file.read(), filetype="pdf") as pdf:  # Open as PDF
            # analyze_general_document requires the secrets mapping as its second argument.
            # Here it is taken from Streamlit's st.secrets (must hold AI_DOCS_BASE and
            # AI_DOCS_KEY); substitute your own mapping if you load secrets differently.
            result = analyze_general_document(pdf, st.secrets)  # Pass in the pymupdf object
    
    
    # ./az_intelligence.py
    from io import BytesIO
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient
    
    
    def set_client(secrets: dict):
        """Return a DocumentAnalysisClient configured from *secrets*.

        ``secrets["AI_DOCS_BASE"]`` supplies the endpoint and
        ``secrets["AI_DOCS_KEY"]`` the key wrapped in an AzureKeyCredential.
        """
        # Look the two settings up in the same order as before: endpoint first, then key.
        endpoint, key = (secrets[name] for name in ("AI_DOCS_BASE", "AI_DOCS_KEY"))
        return DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key))
    
    
    def analyze_general_document(pdf, secrets: dict):
        """Analyze an open PyMuPDF document with the "prebuilt-document" model.

        Args:
            pdf: Open PyMuPDF document; it is serialized to bytes with ``pdf.write()``.
            secrets: Mapping passed to ``set_client`` (holds ``AI_DOCS_BASE`` and
                ``AI_DOCS_KEY``).

        Returns:
            The analysis result obtained from ``poller.result()``.
        """
        client = set_client(secrets)
        poller = client.begin_analyze_document("prebuilt-document", document=pdf.write())  # Convert pdf to bytes using .write()
        # Wait for the long-running operation and hand the result back; previously the
        # function fell off the end, so the caller's `result = ...` was always None.
        return poller.result()
    

  2. Azure Document Intelligence (formrecognizer) – ‘InvalidContent’ when passing pdf

    You can use the code below to analyze a PDF file uploaded through Streamlit with Azure Document Intelligence in Python:

    Code:

    import streamlit as st
    from io import BytesIO
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.formrecognizer import DocumentAnalysisClient
    from PyPDF2 import PdfReader
    
    def set_client(secrets: dict):
        """Build the Azure DocumentAnalysisClient described by *secrets*.

        Expects the endpoint under ``"AI_DOCS_BASE"`` and the API key under
        ``"AI_DOCS_KEY"``; a missing entry raises ``KeyError``.
        """
        base_url = secrets["AI_DOCS_BASE"]
        api_key = secrets["AI_DOCS_KEY"]
        credential = AzureKeyCredential(api_key)
        analysis_client = DocumentAnalysisClient(endpoint=base_url, credential=credential)
        return analysis_client
    
    def validate_pdf(file):
        """Check that *file* parses as a PDF with at least one page.

        Args:
            file: Seekable binary file-like object; it is rewound to offset 0
                before being handed to ``PdfReader``.

        Raises:
            ValueError: If seeking or parsing fails, or the PDF has no pages.
                The triggering exception is attached as ``__cause__``.
        """
        try:
            file.seek(0)  # Reset pointer
            reader = PdfReader(file)
            page_count = len(reader.pages)  # computed once, reused in the message
            if page_count == 0:
                raise ValueError("The PDF has no pages.")
            print(f"PDF is valid with {page_count} pages.")
        except Exception as e:
            # Chain explicitly so the original error and traceback stay attached
            # to the ValueError that callers see.
            raise ValueError(f"PDF validation failed: {e}") from e
    
    def check_pdf_metadata(file):
        """Reject encrypted PDFs and print the page count of *file*.

        Rewinds *file*, parses it with ``PdfReader`` and raises ``ValueError``
        when the reader reports the document as encrypted.
        """
        file.seek(0)  # Reset pointer
        pdf = PdfReader(file)
        if not pdf.is_encrypted:
            print(f"PDF has {len(pdf.pages)} pages and is not encrypted.")
            return
        raise ValueError("Encrypted PDFs are not supported.")
    
    def analyze_general_document(uploaded_file, secrets: dict):
        """Validate an uploaded PDF and analyze it with the "prebuilt-document" model.

        Args:
            uploaded_file: Seekable binary file-like object holding the PDF.
            secrets: Mapping passed to ``set_client`` (holds ``AI_DOCS_BASE`` and
                ``AI_DOCS_KEY``).

        Returns:
            The analysis result obtained from ``poller.result()``.

        Raises:
            ValueError: Propagated from ``validate_pdf`` / ``check_pdf_metadata``.
            RuntimeError: If the analysis call or waiting for its result fails;
                the underlying exception is attached as ``__cause__``.
        """
        validate_pdf(uploaded_file)
        check_pdf_metadata(uploaded_file)

        # The validators above read the stream, so rewind before taking the bytes.
        uploaded_file.seek(0)  # Reset pointer
        file_bytes = uploaded_file.read()
        client = set_client(secrets)
        try:
            print("Sending file to Azure Document Intelligence...")
            poller = client.begin_analyze_document("prebuilt-document", document=BytesIO(file_bytes))
            result = poller.result()
            print("Analysis successful!")
            return result
        except Exception as e:
            # Chain explicitly so the service error (e.g. HttpResponseError) is kept
            # as the cause of the RuntimeError.
            raise RuntimeError(f"Azure Document Intelligence error: {e}") from e
    
    # Streamlit Application
    def main():
        """Render the upload page and show the analysis of the chosen PDF.

        Shows name, type and size of the upload, runs
        ``analyze_general_document`` on it and displays either the result as
        JSON or the error message.
        """
        st.title("Azure Document Intelligence PDF Analyzer")
        pdf_upload = st.file_uploader("Upload PDF file", type="pdf")

        # Nothing to do until a file has been uploaded.
        if pdf_upload is None:
            return

        # Display file details
        st.write(f"File Name: {pdf_upload.name}")
        st.write(f"File Type: {pdf_upload.type}")
        st.write(f"File Size: {pdf_upload.size} bytes")

        # Secrets for Azure setup
        secrets = {
            "AI_DOCS_BASE": "https://xxxxxxx.cognitiveservices.azure.com/",
            "AI_DOCS_KEY": "xxxxx",
        }

        try:
            # Analyze PDF
            analysis = analyze_general_document(pdf_upload, secrets)
            st.success("PDF analysis completed!")
            st.json(analysis.to_dict())  # Display results in Streamlit
        except Exception as err:
            st.error(f"Error: {err}")

    # Run the app only when this file is executed as the entry script.
    if __name__ == "__main__":
        main()
    

    Output:

    PS C:\Users\xxxx> streamlit run set.py
    
      You can now view your Streamlit app in your browser.
    
      Local URL: http://localhost:8xxx1
      Network URL: http://192.168.1.8:8xxx1
    
    PDF is valid with 20 pages.
    PDF has 20 pages and is not encrypted.
    Sending file to Azure Document Intelligence...
    Analysis successful!
    

    enter image description here

    Browser:
    enter image description here

    Reference:
    azure.ai.formrecognizer.DocumentAnalysisClient class | Microsoft Learn

    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search