skip to Main Content

I see the following under my indexer settings:

enter image description here

When hovering over it I read the following:

True means the original file data obtained from your blob data source
is preserved. This allows passing the original file to a custom skill,
or to the Document Extraction skill.

How do I read the original pdf file in the associated blob data source in a custom WebApiSkill?

file_data_base64 = value.get('data', {}).get('file_data', '')
...

EDIT

I enabled Allow Skillset to read file data in the indexer. My full setup:

  • WebApiSkill inputs
inputs=[
    InputFieldMappingEntry(name="file_data", source="/document/file_data")
],
  • WebApiSkill input reading
import azure.functions as func
import datetime
import json
import logging
import base64
import fitz
from io import BytesIO

# Function App object; the @app.route decorator below registers the HTTP handler on it.
app = func.FunctionApp()
# Request INFO-level output on the root logger. Note that logging.basicConfig does
# nothing if the root logger already has handlers configured.
logging.basicConfig(level=logging.INFO)


@app.route(route="CustomSplitSkill", auth_level=func.AuthLevel.FUNCTION)
def CustomSplitSkill(req: func.HttpRequest) -> func.HttpResponse:
    """Decode each record's base64-encoded PDF and return one output record per input.

    The request body is expected to be a JSON object with a top-level
    ``values`` list. Each entry carries a ``recordId`` and the file payload
    under ``data.file_data.data`` (base64 text).

    Args:
        req: Incoming HTTP request whose body is the JSON described above.

    Returns:
        A JSON response ``{"values": [{"recordId": ..., "data": {"subdata": [...]}}]}``
        on success. A plain-text 400 response is returned when the body is not
        valid JSON, when a record has no file data, when the PDF cannot be
        opened, or when it has no pages. A plain-text 500 response is returned
        on any other failure while processing a record.
    """
    logging.info('Python HTTP trigger function processed a request.')

    try:
        req_body = req.get_json()
    except ValueError as e:
        # The exception must be bound here; logging an unbound `e` would raise
        # NameError and turn a client error into an unhandled failure.
        logging.error("Invalid input: %s", e)
        return func.HttpResponse("Invalid input", status_code=400)
    logging.info('Request body parsed successfully.')

    if not isinstance(req_body, dict):
        # Valid JSON that is not an object (e.g. a list) has no 'values' key.
        logging.error("Invalid input: request body is not a JSON object.")
        return func.HttpResponse("Invalid input", status_code=400)

    # 'values' expected top-level key in the request body
    response_body = {"values": []}
    for value in req_body.get('values') or []:
        recordId = value.get('recordId')

        # 'file_data' may be absent or null; only call .get on it when it is a
        # dict so that a missing payload reaches the 400 branch below instead
        # of raising AttributeError.
        file_entry = (value.get('data') or {}).get('file_data')
        file_data_base64 = file_entry.get('data', '') if isinstance(file_entry, dict) else ''
        if not file_data_base64:
            logging.error("No file_data found in the request.")
            return func.HttpResponse("Invalid input: No file_data found", status_code=400)

        try:
            file_data = base64.b64decode(file_data_base64)

            try:
                pdf_document = fitz.open(stream=BytesIO(file_data), filetype='pdf')
            except fitz.FileDataError as e:
                logging.error("Failed to open PDF document: %s", e)
                return func.HttpResponse("Failed to open PDF document", status_code=400)
            except Exception as e:
                logging.error("An unexpected error occurred while opening the PDF document: %s", e)
                return func.HttpResponse("An unexpected error occurred", status_code=500)

            try:
                if pdf_document.page_count == 0:
                    logging.error("No pages found in the PDF document.")
                    return func.HttpResponse("Invalid PDF: No pages found", status_code=400)

                # Text of all pages in order; input for the chunking step that
                # was removed from this example.
                extracted_text = "".join(
                    pdf_document.load_page(page_num).get_text()
                    for page_num in range(pdf_document.page_count)
                )
            finally:
                # Release the document on every path, including early returns.
                pdf_document.close()

            combined_list = [{'textItems': ['text1', 'text2'], 'numberItems': [0, 1]}]  # i deleted the chunking and associated page extraction for simplicity

            response_record = {
                "recordId": recordId,
                "data": {
                    "subdata": combined_list
                }
            }
            response_body['values'].append(response_record)
        except Exception as e:
            logging.error("Error processing file_data: %s", e)
            return func.HttpResponse("Error processing file_data", status_code=500)

    logging.info('Function executed successfully.')
    return func.HttpResponse(json.dumps(response_body), mimetype="application/json")

The error:

Message:
Could not execute skill because the Web Api request failed.

Details:
Web Api response status: 'NotFound', Web Api response details: ''

Because my skillset uses projections, I cannot debug this properly: debugging is not supported with projections. The logs do not show the specific error either, despite the error handling and checks.

2

Answers


  1. Chosen as BEST ANSWER

    The input to your WebApiSkill in your skillset has to be set to:

    "inputs": [
          {
            "name": "file_data",
            "source": "/document/file_data"
          }
        ]
    

    The file_data input is base64 encoded, so first it has to be decoded, and then opened as a byte-stream with the PDF reader of your choice:

    import re
    import azure.functions as func
    import datetime
    import json
    import logging
    import base64
    import fitz
    from io import BytesIO
    
    # Function App object; the @app.route decorator below registers the HTTP handler on it.
    app = func.FunctionApp()
    # Request INFO-level output on the root logger. Note that logging.basicConfig does
    # nothing if the root logger already has handlers configured.
    logging.basicConfig(level=logging.INFO)
    
    
    @app.route(route="CustomSplitSkill", auth_level=func.AuthLevel.FUNCTION)
    def CustomSplitSkill(req: func.HttpRequest) -> func.HttpResponse:
        """Decode each record's base64-encoded PDF and return one output record per input.

        The request body is expected to be a JSON object with a top-level
        ``values`` list. Each entry carries a ``recordId`` and the file payload
        under ``data.file_data.data`` (base64 text).

        Args:
            req: Incoming HTTP request whose body is the JSON described above.

        Returns:
            A JSON response ``{"values": [{"recordId": ..., "data": {"subdata": [...]}}]}``
            on success. A plain-text 400 response is returned when the body is
            not valid JSON, when a record has no file data, when the PDF cannot
            be opened, or when it has no pages. A plain-text 500 response is
            returned on any other failure while processing a record.
        """
        logging.info('Python HTTP trigger function processed a request.')

        try:
            req_body = req.get_json()
        except ValueError as e:
            # The exception must be bound here; logging an unbound `e` would
            # raise NameError and turn a client error into an unhandled failure.
            logging.error("Invalid input: %s", e)
            return func.HttpResponse("Invalid input", status_code=400)
        logging.info('Request body parsed successfully.')

        if not isinstance(req_body, dict):
            # Valid JSON that is not an object (e.g. a list) has no 'values' key.
            logging.error("Invalid input: request body is not a JSON object.")
            return func.HttpResponse("Invalid input", status_code=400)

        # 'values' expected top-level key in the request body
        response_body = {"values": []}
        for value in req_body.get('values') or []:
            recordId = value.get('recordId')

            # 'file_data' may be absent or null; only call .get on it when it
            # is a dict so that a missing payload reaches the 400 branch below
            # instead of raising AttributeError.
            file_entry = (value.get('data') or {}).get('file_data')
            file_data_base64 = file_entry.get('data', '') if isinstance(file_entry, dict) else ''
            if not file_data_base64:
                logging.error("No file_data found in the request.")
                return func.HttpResponse("Invalid input: No file_data found", status_code=400)

            try:
                file_data = base64.b64decode(file_data_base64)

                try:
                    pdf_document = fitz.open(stream=BytesIO(file_data), filetype='pdf')
                except fitz.FileDataError as e:
                    logging.error("Failed to open PDF document: %s", e)
                    return func.HttpResponse("Failed to open PDF document", status_code=400)
                except Exception as e:
                    logging.error("An unexpected error occurred while opening the PDF document: %s", e)
                    return func.HttpResponse("An unexpected error occurred", status_code=500)

                try:
                    if pdf_document.page_count == 0:
                        logging.error("No pages found in the PDF document.")
                        return func.HttpResponse("Invalid PDF: No pages found", status_code=400)
                finally:
                    # Release the document on every path, including the early return.
                    pdf_document.close()

                combined_list = [{'textItems': ['text1', 'text2'], 'numberItems': [0, 1]}]  # i deleted the chunking and associated page extraction for simplicity

                response_record = {
                    "recordId": recordId,
                    "data": {
                        "subdata": combined_list
                    }
                }
                response_body['values'].append(response_record)
            except Exception as e:
                logging.error("Error processing file_data: %s", e)
                return func.HttpResponse("Error processing file_data", status_code=500)

        logging.info('Function executed successfully.')
        return func.HttpResponse(json.dumps(response_body), mimetype="application/json")
    

  2. According to this documentation, when you enable the allowSkillsetToReadFileData parameter in the indexer,
    the file data becomes available at the path /document/file_data in the document context.

    The same path is then passed as an input to your custom Web API skill in the skillset.

    "inputs": [
          {
            "name": "file_data",
            "source": "/document/file_data"
          }
        ]
    

    and it is accessed in the function like below.

    file_data = value.get('data', {}).get('file_data', '')
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search