skip to Main Content

I see the following under my indexer settings:

enter image description here

When hovering over it I read the following:

True means the original file data obtained from your blob data source
is preserved. This allows passing the original file to a custom skill,
or to the Document Extraction skill.

How do I read the original pdf file in the associated blob data source in a custom WebApiSkill?

file_data_base64 = value.get('data', {}).get('file_data', '')


I enabled Allow Skillset to read file data in the indexer. My full setup:

  • WebApiSkill inputs
    InputFieldMappingEntry(name="file_data", source="/document/file_data")
  • WebApiSkill input reading
import azure.functions as func
import datetime
import json
import logging
import base64
import fitz
from io import BytesIO

# Function app object; the HTTP route below is registered on it via @app.route.
app = func.FunctionApp()

@app.route(route="CustomSplitSkill", auth_level=func.AuthLevel.FUNCTION)
def CustomSplitSkill(req: func.HttpRequest) -> func.HttpResponse:
    """Handle a custom WebApiSkill request carrying base64-encoded PDF data.

    The JSON body is expected to hold a top-level ``values`` list. Each entry
    provides a ``recordId`` and the base64 payload under
    ``data -> file_data -> data``. Every record is decoded, opened with
    PyMuPDF (``fitz``) and its page text extracted; one response record per
    input record is appended to the returned ``values`` list.

    Args:
        req: Incoming HTTP request whose body is the skill payload.

    Returns:
        A JSON response of the form ``{"values": [...]}`` on success. A 400
        response when the body is not a JSON object, a record carries no file
        data, or the PDF cannot be opened or has no pages. A 500 response on
        any other processing failure.
    """
    logging.info('Python HTTP trigger function processed a request.')

    try:
        req_body = req.get_json()
    except ValueError as e:
        # The exception must be bound to a name for the log message to use it.
        logging.error(f"Invalid input: {e}")
        return func.HttpResponse("Invalid input", status_code=400)
    if not isinstance(req_body, dict):
        # A JSON array or scalar has no 'values' key to look up.
        logging.error("Invalid input: request body is not a JSON object.")
        return func.HttpResponse("Invalid input", status_code=400)
    logging.info('Request body parsed successfully.')

    # 'values' expected top-level key in the request body
    response_body = {"values": []}
    for value in req_body.get('values', []):
        recordId = value.get('recordId')
        # Fall back to a dict (not a string) so the nested lookup cannot raise
        # AttributeError when 'file_data' is missing or null.
        file_data_entry = value.get('data', {}).get('file_data') or {}
        if isinstance(file_data_entry, dict):
            file_data_base64 = file_data_entry.get('data', '')
        else:
            file_data_base64 = ''
        if not file_data_base64:
            logging.error("No file_data found in the request.")
            return func.HttpResponse("Invalid input: No file_data found", status_code=400)

        try:
            file_data = base64.b64decode(file_data_base64)

            try:
                pdf_document = fitz.open(stream=file_data, filetype='pdf')
            except fitz.FileDataError as e:
                logging.error(f"Failed to open PDF document: {e}")
                return func.HttpResponse("Failed to open PDF document", status_code=400)
            except Exception as e:
                logging.error(f"An unexpected error occurred while opening the PDF document: {e}")
                return func.HttpResponse("An unexpected error occurred", status_code=500)

            # The context manager closes the document on every exit path,
            # including the early return below.
            with pdf_document:
                if pdf_document.page_count == 0:
                    logging.error("No pages found in the PDF document.")
                    return func.HttpResponse("Invalid PDF: No pages found", status_code=400)

                # Not used further here: the chunking step that consumed this
                # text was removed from the example for simplicity.
                extracted_text = "".join(
                    pdf_document.load_page(page_num).get_text()
                    for page_num in range(pdf_document.page_count)
                )

            combined_list = [{'textItems': ['text1', 'text2'], 'numberItems': [0, 1]}]  # i deleted the chunking and associated page extraction for simplicity

            response_record = {
                "recordId": recordId,
                "data": {
                    "subdata": combined_list
                }
            }
            response_body["values"].append(response_record)
        except Exception as e:
            logging.error(f"Error processing file_data: {e}")
            return func.HttpResponse("Error processing file_data", status_code=500)

    logging.info('Function executed successfully.')
    return func.HttpResponse(json.dumps(response_body), mimetype="application/json")

The error:

Could not execute skill because the Web Api request failed.

Web Api response status: 'NotFound', Web Api response details: ''

Because my skillset uses projections, I cannot debug this properly: debugging is not supported with projections. The logging does not show the specific error either, despite the error handling and checks.



  1. Chosen as BEST ANSWER

    The input to your WebApiSkill in your skillset has to be set to:

    "inputs": [
        {
            "name": "file_data",
            "source": "/document/file_data"
        }
    ]

    The file_data input is base64 encoded, so first it has to be decoded, and then opened as a byte-stream with the PDF reader of your choice:

    import re
    import azure.functions as func
    import datetime
    import json
    import logging
    import base64
    import fitz
    from io import BytesIO
    # Function app object; the HTTP route below is registered on it via @app.route.
    app = func.FunctionApp()
    @app.route(route="CustomSplitSkill", auth_level=func.AuthLevel.FUNCTION)
    def CustomSplitSkill(req: func.HttpRequest) -> func.HttpResponse:
        """Handle a custom WebApiSkill request carrying base64-encoded PDF data.

        The JSON body is expected to hold a top-level ``values`` list. Each
        entry provides a ``recordId`` and the base64 payload under
        ``data -> file_data -> data``. Every record is decoded and opened with
        PyMuPDF (``fitz``); one response record per input record is appended
        to the returned ``values`` list.

        Args:
            req: Incoming HTTP request whose body is the skill payload.

        Returns:
            A JSON response of the form ``{"values": [...]}`` on success. A
            400 response when the body is not a JSON object, a record carries
            no file data, or the PDF cannot be opened or has no pages. A 500
            response on any other processing failure.
        """
        logging.info('Python HTTP trigger function processed a request.')
        try:
            req_body = req.get_json()
        except ValueError as e:
            # The exception must be bound to a name for the log message to use it.
            logging.error(f"Invalid input: {e}")
            return func.HttpResponse("Invalid input", status_code=400)
        if not isinstance(req_body, dict):
            # A JSON array or scalar has no 'values' key to look up.
            logging.error("Invalid input: request body is not a JSON object.")
            return func.HttpResponse("Invalid input", status_code=400)
        logging.info('Request body parsed successfully.')
        # 'values' expected top-level key in the request body
        response_body = {"values": []}
        for value in req_body.get('values', []):
            recordId = value.get('recordId')
            # Fall back to a dict (not a string) so the nested lookup cannot
            # raise AttributeError when 'file_data' is missing or null.
            file_data_entry = value.get('data', {}).get('file_data') or {}
            if isinstance(file_data_entry, dict):
                file_data_base64 = file_data_entry.get('data', '')
            else:
                file_data_base64 = ''
            if not file_data_base64:
                logging.error("No file_data found in the request.")
                return func.HttpResponse("Invalid input: No file_data found", status_code=400)
            try:
                file_data = base64.b64decode(file_data_base64)
                try:
                    pdf_document = fitz.open(stream=file_data, filetype='pdf')
                except fitz.FileDataError as e:
                    logging.error(f"Failed to open PDF document: {e}")
                    return func.HttpResponse("Failed to open PDF document", status_code=400)
                except Exception as e:
                    logging.error(f"An unexpected error occurred while opening the PDF document: {e}")
                    return func.HttpResponse("An unexpected error occurred", status_code=500)
                # The context manager closes the document on every exit path,
                # including the early return below.
                with pdf_document:
                    if pdf_document.page_count == 0:
                        logging.error("No pages found in the PDF document.")
                        return func.HttpResponse("Invalid PDF: No pages found", status_code=400)
                combined_list = [{'textItems': ['text1', 'text2'], 'numberItems': [0, 1]}]  # i deleted the chunking and associated page extraction for simplicity
                response_record = {
                    "recordId": recordId,
                    "data": {
                        "subdata": combined_list
                    }
                }
                response_body["values"].append(response_record)
            except Exception as e:
                logging.error(f"Error processing file_data: {e}")
                return func.HttpResponse("Error processing file_data", status_code=500)
        logging.info('Function executed successfully.')
        return func.HttpResponse(json.dumps(response_body), mimetype="application/json")

  2. According to this documentation, when you enable the allowSkillsetToReadFileData parameter in the indexer,
    the file data is made available at the path /document/file_data in the document context.

    The same path is passed as an input to your custom Web API skill:

    "inputs": [
        {
            "name": "file_data",
            "source": "/document/file_data"
        }
    ]

    and it is accessed in the function like below.

    file_data = value.get('data', {}).get('file_data', '')
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top