skip to Main Content

My problem is with Textract's asynchronous method start_document_analysis: it has an option for the type of analysis you want to perform, but when I try to use the "Queries" feature =>

FeatureTypes=[
        'TABLES'|'FORMS'|'QUERIES',
    ], 

you also have to pass another parameter containing the list of queries =>

QueriesConfig={
        'Queries': [
            {
                'Text': 'string',
                'Alias': 'string',
                'Pages': [
                    'string',
                ]
            },
        ]
    }

Once I pass this parameter, boto3 throws an exception saying that QueriesConfig is not recognized as one of the accepted parameters. Has anyone used this feature with Python before?

3

Answers


  1. You can do it this way:

    def getJobResults(jobId):
        """Collect every result page of a Textract document-analysis job.

        Calls ``get_document_analysis`` for ``jobId`` and keeps following the
        ``NextToken`` of each response until a response carries none.

        Args:
            jobId: Identifier of the analysis job whose results are fetched.

        Returns:
            A list of the raw ``get_document_analysis`` responses, in the order
            they were received.
        """
        textract = boto3.client('textract')
        collected = []
        request = {'JobId': jobId}
        while True:
            result = textract.get_document_analysis(**request)
            collected.append(result)
            print("Resultset page recieved: {}".format(len(collected)))
            token = result.get('NextToken')
            if not token:
                # No (or empty) continuation token: this was the last page.
                return collected
            request['NextToken'] = token
    
    
    def get_kv_map(s3BucketName, documentName, queries=None, pollInterval=3):
        """Run a Textract QUERIES analysis on an S3 document and return all result pages.

        Starts an asynchronous ``start_document_analysis`` job with
        ``FeatureTypes=['QUERIES']``, polls ``get_document_analysis`` until the
        job status is no longer ``IN_PROGRESS``, then gathers every result page
        through ``getJobResults``.

        Args:
            s3BucketName: Name of the S3 bucket that holds the document.
            documentName: Object name (key) of the document inside the bucket.
            queries: Optional list of query dicts sent as
                ``QueriesConfig['Queries']``. Each dict carries a ``'Text'``
                entry and may also carry ``'Alias'`` and ``'Pages'``. When
                omitted, the single query this function originally hard-coded
                is used, so existing two-argument callers behave as before.
            pollInterval: Seconds to sleep between status polls (default 3).

        Returns:
            The list of ``get_document_analysis`` responses produced by
            ``getJobResults``. The results are fetched whatever the final job
            status is; callers that care should inspect ``JobStatus`` in the
            returned pages.
        """
        if queries is None:
            # Built per call so the default list is never shared between calls.
            queries = [{"Text": "is 1. A. checkbox seleted"}]

        client = boto3.client('textract')
        response = client.start_document_analysis(
            DocumentLocation={
                'S3Object': {
                    'Bucket': s3BucketName,
                    'Name': documentName,
                },
            },
            FeatureTypes=['QUERIES'],
            QueriesConfig={'Queries': queries},
        )
        job_id = response['JobId']

        # Poll until the job leaves the IN_PROGRESS state.
        status = client.get_document_analysis(JobId=job_id)["JobStatus"]
        while status == "IN_PROGRESS":
            time.sleep(pollInterval)
            status = client.get_document_analysis(JobId=job_id)["JobStatus"]
            print("Job status2: {}".format(status))

        return getJobResults(job_id)
    
    
    def query_extraction():
        """Run the query analysis on the example document and return its result.

        The bucket and document names are placeholders; the return value is
        whatever ``get_kv_map`` returns for them.
        """
        return get_kv_map("bucket-name", "xyz.pdf")

    # Kick off the extraction and keep the result at module level.
    data = query_extraction()
    

    Hope this solves your issue.

    Login or Signup to reply.
  2. 
    import trp.trp2 as t2
    
    def get_kv_map(s3BucketName, documentName):
        """Run a Textract QUERIES analysis on an S3 document and wait for it.

        Starts an asynchronous ``start_document_analysis`` job and polls
        ``get_document_analysis`` every 3 seconds while the job status is
        ``IN_PROGRESS``.

        Args:
            s3BucketName: Name of the S3 bucket that holds the document.
            documentName: Object name (key) of the document inside the bucket.

        Returns:
            The last ``get_document_analysis`` response that was polled. No
            ``NextToken`` paging is performed here.
        """
        textract = boto3.client('textract')
        location = {'S3Object': {'Bucket': s3BucketName, 'Name': documentName}}
        query_config = {'Queries': [{"Text": "is 1. A. checkbox seleted"}]}
        started = textract.start_document_analysis(
            DocumentLocation=location,
            FeatureTypes=['QUERIES'],
            QueriesConfig=query_config,
        )
        job_id = started['JobId']

        # Keep polling for as long as the job reports IN_PROGRESS.
        result = textract.get_document_analysis(JobId=job_id)
        while result["JobStatus"] == "IN_PROGRESS":
            time.sleep(3)
            result = textract.get_document_analysis(JobId=job_id)
            print("Job status: {}".format(result["JobStatus"]))
        return result
    
    
    
    # Placeholder S3 location of the document to analyze.
    s3BucketName = "bucket-name"
    documentName = "xyz.pdf"
    
    # Run the query analysis; get_kv_map returns once the job is no longer IN_PROGRESS.
    data = get_kv_map(s3BucketName, documentName)
        
    # Load the raw Textract response into the trp2 document schema.
    d = t2.TDocumentSchema().load(data)
    # Only the first page's query answers are read.
    page = d.pages[0]
    query_answers = d.get_query_answers(page=page)
    for x in query_answers:
        # NOTE(review): presumably x[1] is the query alias and x[2] the answer text -- confirm against the trp2 docs.
        print(f"{x[1]},{x[2]}")
    
    Login or Signup to reply.
  3. A simpler solution would be to use the amazon-textract-textractor package that wraps all this for you and helps you parse the responses. https://aws-samples.github.io/amazon-textract-textractor/index.html

    For example this calls textract with the Queries, Forms and Tables API all at once:

    from textractor import Textractor
    from textractor.data.constants import TextractFeatures
    from textractcaller import QueriesConfig, Query
    
    # Create the Textractor client with profile_name "default"
    # (presumably an AWS credentials profile -- confirm against the textractor docs).
    extractor = Textractor(profile_name="default")
    
    # Start one analysis of ./multipage.pdf with the QUERIES, FORMS and TABLES features enabled together.
    document1 = extractor.start_document_analysis(    
        file_source='./multipage.pdf',    
        features=[TextractFeatures.QUERIES, TextractFeatures.FORMS, TextractFeatures.TABLES],
        s3_upload_path='s3://textractor-tests/debug/',
        s3_output_path='s3://textractor-tests/debug/',
        save_image=True,
        queries=QueriesConfig([Query("What is the first cell value")])
    )
    
    # Result of the first (here the only) query passed above.
    document1.queries[0].result
    
    0.129853474
    

    This calls the asynchronous API for you and, once you try to retrieve the value, automatically blocks until the processing is completed.

    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search