skip to Main Content

I used unstructured.io to parse table data from a PDF. I wrote the following code to extract tables from my PDF, which consists primarily of bank statements. Direct PDF-to-CSV conversion was not working, so I converted to JSON first, as it captures the table, title and narrative text better. The Python code is shown below:

import csv
import json

import pandas as pd
from unstructured.documents.elements import Title, NarrativeText, Table
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import convert_to_csv
from unstructured.staging.base import convert_to_dict
from unstructured.staging.base import elements_to_json, elements_from_json

pdf_path = "a1.pdf"
output = "outputs.json"

# infer_table_structure=True is what makes Table elements carry
# metadata.text_as_html alongside their plain text.
elements = partition_pdf(filename=pdf_path, infer_table_structure=True)

# Persist the parsed elements as JSON, then reload them as Element objects so
# the rest of the script works from what is actually stored on disk.
elements_to_json(elements, filename=output)
elements = elements_from_json(filename=output)

# Keep Element objects here: convert_to_csv() below needs them, not dicts.
tables = [el for el in elements if el.category == "Table"]

# Guard the preview so a PDF with no detected table does not raise IndexError.
if tables:
    print(tables[0].text)
    print(tables[0].metadata.text_as_html)

# Read the JSON back as plain dicts purely for inspection.
with open(output, "r", encoding="utf-8") as file:
    data = json.load(file)

title_records = [element for element in data if element["type"] == "Title"]
table_records = [element for element in data if element["type"] == "Table"]

print(title_records)
print(table_records)

# convert_to_csv() takes Element objects and returns the CSV as a single
# string, so write it out verbatim; feeding its lines to
# csv.writer.writerows() would split every line into one-character columns.
isd_csv = convert_to_csv(tables)

with open("output_tables.csv", "w", newline="", encoding="utf-8") as csvfile:
    csvfile.write(isd_csv)

The table is extracted quite accurately, but it is in JSON, as shown below. The CSV function is not working. Is it possible to run some script on the JSON file and get the table, title and narrative text in CSV or HTML form, whichever gives more accuracy (CSV is preferable)? A Python script is preferred, since all of the other portions of my project are Python-based.

[
    {
        "element_id": "c1eb5f7eb3b8a8ebd25ddcc63e305006",
        "metadata": {
            "coordinates": {
                "layout_height": 2200,
                "layout_width": 1700,
                "points": [
                    [
                        144.7,
                        309.2
                    ],
                    [
                        144.7,
                        353.6
                    ],
                    [
                        675.2,
                        353.6
                    ],
                    [
                        675.2,
                        309.2
                    ]
                ],
                "system": "PixelSpace"
            },
            "detection_class_prob": 0.55205,
            "filename": "a1.pdf",
            "filetype": "application/pdf",
            "languages": [
                "eng"
            ],
            "last_modified": "2024-01-09T13:02:21",
            "page_number": 1
        },
        "text": "CIBC Account Statement",
        "type": "Title"
    },
    {
        "element_id": "bfe2311a88365d06d632e2bc1df2e5bd",
        "metadata": {
            "coordinates": {
                "layout_height": 2200,
                "layout_width": 1700,
                "points": [
                    [
                        144.1,
                        430.4
                    ],
                    [
                        144.1,
                        458.1
                    ],
                    [
                        419.5,
                        458.1
                    ],
                    [
                        419.5,
                        430.4
                    ]
                ],
                "system": "PixelSpace"
            },
            "detection_class_prob": 0.63581,
            "filename": "a1.pdf",
            "filetype": "application/pdf",
            "languages": [
                "eng"
            ],
            "last_modified": "2024-01-09T13:02:21",
            "page_number": 1,
            "parent_id": "c1eb5f7eb3b8a8ebd25ddcc63e305006"
        },
        "text": "2431536 ONTARIO INC.",
        "type": "NarrativeText"
    },
    {
        "element_id": "045c030ba3939ff17e7c1f26bc38175f",
        "metadata": {
            "coordinates": {
                "layout_height": 2200,
                "layout_width": 1700,
                "points": [
                    [
                        1110.8,
                        433.0
                    ],
                    [
                        1110.8,
                        458.0
                    ],
                    [
                        1423.9,
                        458.0
                    ],
                    [
                        1423.9,
                        433.0
                    ]
                ],
                "system": "PixelSpace"
            },
            "detection_class_prob": 0.34244,
            "filename": "a1.pdf",
            "filetype": "application/pdf",
            "languages": [
                "eng"
            ],
            "last_modified": "2024-01-09T13:02:21",
            "page_number": 1
        },
        "text": "For Aug 1 to Aug 31, 2021",
        "type": "Title"
    },
    {
        "element_id": "146dab2a99fac51981d5b136459f7be7",
        "metadata": {
            "coordinates": {
                "layout_height": 2200,
                "layout_width": 1700,
                "points": [
                    [
                        1114.1,
                        496.8
                    ],
                    [
                        1114.1,
                        552.4
                    ],
                    [
                        1318.1,
                        552.4
                    ],
                    [
                        1318.1,
                        496.8
                    ]
                ],
                "system": "PixelSpace"
            },
            "detection_class_prob": 0.30347,
            "filename": "a1.pdf",
            "filetype": "application/pdf",
            "languages": [
                "eng"
            ],
            "last_modified": "2024-01-09T13:02:21",
            "page_number": 1,
            "parent_id": "045c030ba3939ff17e7c1f26bc38175f"
        },
        "text": "Account number 70-10117",
        "type": "NarrativeText"
    },
    {
        "element_id": "6b525f6ae0b734f9a6869972775aa16e",
        "metadata": {
            "coordinates": {
                "layout_height": 2200,
                "layout_width": 1700,
                "points": [
                    [
                        143.5,
                        574.1
                    ],
                    [
                        143.5,
                        659.1
                    ],
                    [
                        920.8,
                        659.1
                    ],
                    [
                        920.8,
                        574.1
                    ]
                ],
                "system": "PixelSpace"
            },
            "detection_class_prob": 0.86173,
            "filename": "a1.pdf",
            "filetype": "application/pdf",
            "languages": [
                "eng"
            ],
            "last_modified": "2024-01-09T13:02:21",
            "page_number": 1,
            "parent_id": "045c030ba3939ff17e7c1f26bc38175f"
        },
        "text": "The names shown are based on our current records, as of March 2, 2022. This statement does not reflect any changes in account holders and account holder names that may have occurred prior to this date.",
        "type": "NarrativeText"
    },
    {
        "element_id": "cbd860785f81791a1c73b9c75ba6407a",
        "metadata": {
            "coordinates": {
                "layout_height": 2200,
                "layout_width": 1700,
                "points": [
                    [
                        1114.1,
                        591.3
                    ],
                    [
                        1114.1,
                        646.8
                    ],
                    [
                        1383.9,
                        646.8
                    ],
                    [
                        1383.9,
                        591.3
                    ]
                ],
                "system": "PixelSpace"
            },
            "detection_class_prob": 0.42099,
            "filename": "a1.pdf",
            "filetype": "application/pdf",
            "languages": [
                "eng"
            ],
            "last_modified": "2024-01-09T13:02:21",
            "page_number": 1,
            "parent_id": "045c030ba3939ff17e7c1f26bc38175f"
        },
        "text": "Branch transit number 07342",
        "type": "NarrativeText"
    },
    {
        "element_id": "7aa85348022f61c9fc791f63346fd68f",
        "metadata": {
            "coordinates": {
                "layout_height": 2200,
                "layout_width": 1700,
                "points": [
                    [
                        1116.2,
                        803.6
                    ],
                    [
                        1116.2,
                        837.5
                    ],
                    [
                        1437.6,
                        837.5
                    ],
                    [
                        1437.6,
                        803.6
                    ]
                ],
                "system": "PixelSpace"
            },
            "detection_class_prob": 0.69574,
            "filename": "a1.pdf",
            "filetype": "application/pdf",
            "languages": [
                "eng"
            ],
            "last_modified": "2024-01-09T13:02:21",
            "page_number": 1
        },
        "text": "Contact information",
        "type": "Title"
    },
    {
        "element_id": "048adaba40bd8c5501ca9a5219891e32",
        "metadata": {
            "coordinates": {
                "layout_height": 2200,
                "layout_width": 1700,
                "points": [
                    [
                        126.0,
                        813.0
                    ],
                    [
                        126.0,
                        1050.9
                    ],
                    [
                        1579.5,
                        1050.9
                    ],
                    [
                        1579.5,
                        813.0
                    ]
                ],
                "system": "PixelSpace"
            },
            "detection_class_prob": 0.53915,
            "filename": "a1.pdf",
            "filetype": "application/pdf",
            "languages": [
                "eng"
            ],
            "last_modified": "2024-01-09T13:02:21",
            "page_number": 1,
            "parent_id": "7aa85348022f61c9fc791f63346fd68f",
            "text_as_html": "<table><tr><td>Opening balance on Aug 1, 2021</td><td>$43,719.76</td><td>@</td><td>1800 465 CIBC (2422)</td></tr><tr><td>Withdrawals</td><td>14,316.36</td><td></td><td>Contact us by phone for questions this of</td></tr><tr><td>Deposits</td><td>20,801.01</td><td></td><td>on update, change personal information, and general inquiries,</td></tr><tr><td>Closing balance on Aug 31, 2021</td><td>$50,204.41</td><td></td><td>24 hours a day, 7 days a week.</td></tr></table>"
        },
        "text": "Opening balance on Aug 1, 2021 $43,719.76 Withdrawals - 14,316.36 Deposits + 20,801.01 Closing balance on Aug 31, 2021 = $50,204.41 1 800 465 CIBC (2422) Contact us by phone for questions on this update, change of personal information, and general inquiries, 24 hours a day, 7 days a week.",
        "type": "Table"
    },
    {
        "element_id": "0e6ab9db7f9a8c1b8cefa24c9ccff170",
        "metadata": {
            "coordinates": {
                "layout_height": 2200,
                "layout_width": 1700,
                "points": [
                    [
                        1159.8,
                        1030.2
                    ],
                    [
                        1159.8,
                        1085.7
                    ],
                    [
                        1415.5,
                        1085.7
                    ],
                    [
                        1415.5,
                        1030.2
                    ]
                ],
                "system": "PixelSpace"
            },
            "filename": "a1.pdf",
            "filetype": "application/pdf",
            "languages": [
                "eng"
            ],
            "last_modified": "2024-01-09T13:02:21",
            "page_number": 1,
            "parent_id": "7aa85348022f61c9fc791f63346fd68f"
        },
        "text": "TTY hearing impaired 1 800 465 7401",
        "type": "NarrativeText"
    },
    {
        "element_id": "43ea2d37a7b84d6e13db58e27ead0f01",
        "metadata": {
            "coordinates": {
                "layout_height": 2200,
                "layout_width": 1700,
                "points": [
                    [
                        1159.8,
                        1099.6
                    ],
                    [
                        1159.8,
                        1194.6
                    ],
                    [
                        1501.6,
                        1194.6
                    ],
                    [
                        1501.6,
                        1099.6
                    ]
                ],
                "system": "PixelSpace"
            },
            "filename": "a1.pdf",
            "filetype": "application/pdf",
            "languages": [
                "eng"
            ],
            "last_modified": "2024-01-09T13:02:21",
            "page_number": 1
        },
        "text": "Outside Canada and the U.S. 1 902 420 CIBC (2422) www.cibc.com",
        "type": "Title"
    },
    {
        "element_id": "15b618d7fc57f900d275074b3651a1fb",
        "metadata": {
            "coordinates": {
                "layout_height": 2200,
                "layout_width": 1700,
                "points": [
                    [
                        147.2,
                        1306.2
                    ],
                    [
                        147.2,
                        1340.2
                    ],
                    [
                        451.8,
                        1340.2
                    ],
                    [
                        451.8,
                        1306.2
                    ]
                ],
                "system": "PixelSpace"
            },
            "detection_class_prob": 0.64498,
            "filename": "a1.pdf",
            "filetype": "application/pdf",
            "languages": [
                "eng"
            ],
            "last_modified": "2024-01-09T13:02:21",
            "page_number": 1
        },
        "text": "Transaction details",
        "type": "Title"
    },
    {
        "element_id": "0c9edae94cc9cec1eb6c559448a54385",
        "metadata": {
            "coordinates": {
                "layout_height": 2200,
                "layout_width": 1700,
                "points": [
                    [
                        122.4,
                        1348.7
                    ],
                    [
                        122.4,
                        1998.8
                    ],
                    [
                        1593.5,
                        1998.8
                    ],
                    [
                        1593.5,
                        1348.7
                    ]
                ],
                "system": "PixelSpace"
            },
            "detection_class_prob": 0.90372,
            "filename": "a1.pdf",
            "filetype": "application/pdf",
            "languages": [
                "eng"
            ],
            "last_modified": "2024-01-09T13:02:21",
            "page_number": 1,
            "parent_id": "15b618d7fc57f900d275074b3651a1fb",
            "text_as_html": "<table><thead><th rowspan="2">Date Aug1</th><th>Description</th><th>Withdrawals ($) Deposits</th><th rowspan="2">($) Balance ($) $43,719.76</th></thead><thead><th></th><th>_ Opening balance</th><th></th><th></th></thead><tr><td rowspan="5">Aug 3</td><td>INTERNET BILL PMT000000214174</td><td>113.70</td><td>43,606.06</td></tr><tr><td></td><td>FIDO</td><td></td><td></td></tr><tr><td></td><td>A506*#ERRRKT QD</td><td></td><td></td></tr><tr><td></td><td>E-TRANSFER103853228862 Erhan Bakirci OD</td><td>2,000.00</td><td>41,606.06</td></tr><tr><td></td><td>ABO GEERT</td><td></td><td></td></tr><tr><td rowspan="5">Aug 5</td><td>INTERNET BILL PMT000000235565 MASTERCARD, MBNA CANADA B A506*#ERRRKT</td><td>80.00</td><td>41,526.06</td></tr><tr><td></td><td>QD</td><td></td><td></td></tr><tr><td></td><td>INTERNET BILL PMT000000216308</td><td>54.50</td><td>41,471.56</td></tr><tr><td></td><td>PROVIDENT ENERGY MANAGEME A506*#ERRRKT</td><td></td><td></td></tr><tr><td></td><td>QD</td><td></td><td></td></tr><tr><td rowspan="2">Aug 23.</td><td>INTERNET BILL PMT000000233266 ROGERS (9 DIGIT ACCOUNT N A506*#ERRRHET QD</td><td>142.30</td><td>41,329.26.</td></tr><tr><td></td><td></td><td></td><td></td></tr></table>"
        },
        "text": "Date Aug 1 Aug 3 Aug 5 Aug 23 Description Opening balance INTERNET BILL PMT000000214174 FIDO 4506*********792 E-TRANSFER103853228862 Erhan Bakirci 4506*********792 INTERNET BILL PMT000000235565 MASTERCARD, MBNA CANADA B 4506*********792 INTERNET BILL PMT000000216308 PROVIDENT ENERGY MANAGEME 4506*********792 INTERNET BILL PMT000000233266 ROGERS (9 DIGIT ACCOUNT N 4506*********792 Withdrawals ($) 113.70 2,000.00 80.00 54.50 142.30 Deposits ($) Balance ($) $43,719.76 43,606.06 41,606.06 41,526.06 41,471.56 41,329.26 (continued on next page)",
        "type": "Table"
    },
    {
        "element_id": "6c8bfe0f2340b6f05354176802ea337c",
        "metadata": {
            "coordinates": {
                "layout_height": 2200,
                "layout_width": 1700,
                "points": [
                    [
                        138.9,
                        2118.5
                    ],
                    [
                        138.9,
                        2140.1
                    ],
                    [
                        328.2,
                        2140.1
                    ],
                    [
                        328.2,
                        2118.5
                    ]
                ],
                "system": "PixelSpace"
            },
            "detection_class_prob": 0.59505,
            "filename": "a1.pdf",
            "filetype": "application/pdf",
            "languages": [
                "eng"
            ],
            "last_modified": "2024-01-09T13:02:21",
            "page_number": 1,
            "parent_id": "15b618d7fc57f900d275074b3651a1fb"
        },
        "text": "10774E BUS-2018/09",
        "type": "NarrativeText"
    },
    {
        "element_id": "33ed8c9803ae2dbbee4c676a19442128",
        "metadata": {
            "coordinates": {
                "layout_height": 2200,
                "layout_width": 1700,
                "points": [
                    [
                        1435.3,
                        2119.9
                    ],
                    [
                        1435.3,
                        2143.5
                    ],
                    [
                        1553.4,
                        2143.5
                    ],
                    [
                        1553.4,
                        2119.9
                    ]
                ],
                "system": "PixelSpace"
            },
            "detection_class_prob": 0.44888,
            "filename": "a1.pdf",
            "filetype": "application/pdf",
            "languages": [
                "eng"
            ],
            "last_modified": "2024-01-09T13:02:21",
            "page_number": 1,
            "parent_id": "15b618d7fc57f900d275074b3651a1fb"
        },
        "text": "Page 1 of 2",
        "type": "NarrativeText"
    },
    {
        "element_id": "c1eb5f7eb3b8a8ebd25ddcc63e305006",
        "metadata": {
            "coordinates": {
                "layout_height": 2200,
                "layout_width": 1700,
                "points": [
                    [
                        153.8,
                        106.9
                    ],
                    [
                        153.8,
                        140.2
                    ],
                    [
                        553.5,
                        140.2
                    ],
                    [
                        553.5,
                        106.9
                    ]
                ],
                "system": "PixelSpace"
            },
            "detection_class_prob": 0.49285,
            "filename": "a1.pdf",
            "filetype": "application/pdf",
            "languages": [
                "eng"
            ],
            "last_modified": "2024-01-09T13:02:21",
            "page_number": 2
        },
        "text": "CIBC Account Statement",
        "type": "Title"
    },
    {
        "element_id": "369b08001627723a6f47814a95a5405b",
        "metadata": {
            "coordinates": {
                "layout_height": 2200,
                "layout_width": 1700,
                "points": [
                    [
                        1110.6,
                        103.4
                    ],
                    [
                        1110.6,
                        211.6
                    ],
                    [
                        1467.0,
                        211.6
                    ],
                    [
                        1467.0,
                        103.4
                    ]
                ],
                "system": "PixelSpace"
            },
            "detection_class_prob": 0.36904,
            "filename": "a1.pdf",
            "filetype": "application/pdf",
            "languages": [
                "eng"
            ],
            "last_modified": "2024-01-09T13:02:21",
            "page_number": 2,
            "parent_id": "c1eb5f7eb3b8a8ebd25ddcc63e305006"
        },
        "text": "Aug 1 to Aug 31, 2021 Account number: 70-10117 Branch transit number: 07342",
        "type": "NarrativeText"
    },
    {
        "element_id": "2babb7fc184bfcfac14a46fded4eaef4",
        "metadata": {
            "coordinates": {
                "layout_height": 2200,
                "layout_width": 1700,
                "points": [
                    [
                        147.2,
                        286.2
                    ],
                    [
                        147.2,
                        320.8
                    ],
                    [
                        611.2,
                        320.8
                    ],
                    [
                        611.2,
                        286.2
                    ]
                ],
                "system": "PixelSpace"
            },
            "detection_class_prob": 0.76239,
            "filename": "a1.pdf",
            "filetype": "application/pdf",
            "languages": [
                "eng"
            ],
            "last_modified": "2024-01-09T13:02:21",
            "page_number": 2
        },
        "text": "Transaction details (continued)",
        "type": "Title"
    },
    {
        "element_id": "73d209c6bf0ebc45f357118b41f43977",
        "metadata": {
            "coordinates": {
                "layout_height": 2200,
                "layout_width": 1700,
                "points": [
                    [
                        137.0,
                        329.6
                    ],
                    [
                        137.0,
                        1122.5
                    ],
                    [
                        1588.8,
                        1122.5
                    ],
                    [
                        1588.8,
                        329.6
                    ]
                ],
                "system": "PixelSpace"
            },
            "detection_class_prob": 0.92363,
            "filename": "a1.pdf",
            "filetype": "application/pdf",
            "languages": [
                "eng"
            ],
            "last_modified": "2024-01-09T13:02:21",
            "page_number": 2,
            "parent_id": "2babb7fc184bfcfac14a46fded4eaef4",
            "text_as_html": "<table><thead><th rowspan="2">Date Aug 23__</th><th rowspan="2">Description Balance forward</th><th>Withdrawals ($)</th><th>Deposits ($)</th><th>Balance ($)</th></thead><thead><th></th><th></th><th></th><th></th><th>$41,329.26</th></thead><tr><td></td><td>NTERNET BILL PMT000000233444</td><td>70.00</td><td></td><td>41,259.26</td></tr><tr><td></td><td>AMERICAN EXPRESS REGULAR</td><td></td><td></td><td></td></tr><tr><td></td><td>A506 HHH TOD</td><td></td><td></td><td></td></tr><tr><td></td><td>nnn</td><td></td><td>Beeeoo</td><td>650836"</td></tr><tr><td></td><td>DIXIE &amp; MEYERSIDE BANKING CENT</td><td></td><td></td><td></td></tr><tr><td></td><td>DEPOSIT 06122</td><td></td><td>5,650.00</td><td>. 52,559.26</td></tr><tr><td></td><td>DIXIE CENT</td><td></td><td></td><td></td></tr><tr><td></td><td>&amp; MEYERSIDE BANKING</td><td></td><td>eee</td><td>een</td></tr><tr><td></td><td>DEPOSIT 06122</td><td></td><td>3,826.01</td><td>56,385.27</td></tr><tr><td>co!</td><td>DIXIE &amp; MEYERSIDE BANKING CENT</td><td></td><td>enn</td><td>nnn</td></tr><tr><td>Aug 24</td><td>CHEQUE 76179759 | OO</td><td>1888.00</td><td></td><td>54,502.27 |</td></tr><tr><td>25</td><td>DEPOSIT 06122</td><td></td><td></td><td></td></tr><tr><td>Aug</td><td></td><td></td><td>5,650.00</td><td>60,152.27</td></tr><tr><td>co!</td><td>DIXIE &amp; MEYERSIDE BANKING CENT</td><td></td><td>eee</td><td>neeneeeeneeneneneee</td></tr><tr><td>Aug 27</td><td>NTERNET TRANSFERO00000249234 A500 HHH ZOD</td><td>9,857.86</td><td></td><td>50,294.41</td></tr><tr><td rowspan="3">u2018Aug 30</td><td>INTERNET BILLPMTO00000254696</td><td>9.00.</td><td></td><td>50,204.41</td></tr><tr><td></td><td>BELL CANADA - ONE BILL</td><td></td><td></td><td></td></tr><tr><td></td><td>A506 HHH TOD</td><td></td><td></td><td></td></tr><tr><td>Aug 31</td><td>ACC FEE- FULL SERV</td><td></td><td></td><td></td></tr></table>"
        },
        "text": "Date Aug 23 Balance forward Description Withdrawals ($) Deposits ($) Aug 24 Aug 25 Aug 27 Aug 30 INTERNET BILL PMT000000233444 AMERICAN EXPRESS REGULAR 4506*********792 DEPOSIT 06122 DIXIE & MEYERSIDE BANKING CENT DEPOSIT 06122 DIXIE & MEYERSIDE BANKING CENT DEPOSIT 06122 DIXIE & MEYERSIDE BANKING CENT CHEQUE 78179759 6 DEPOSIT 06122 DIXIE & MEYERSIDE BANKING CENT INTERNET TRANSFER000000249234 4500*********302 INTERNET BILL PMT000000254696 BELL CANADA - ONE BILL 4506*********792 ACC FEE- FULL SERV BALANCE FEE WAIVER 70.00 1,883.00 9,857.86 90.00 5,650.00 5,650.00 3,826.01 5,650.00 Aug 31 25.00 25.00 Balance ($) $41,329.26 41,259.26 46,909.26 52,559.26 56,385.27 54,502.27 60,152.27 50,294.41 50,204.41 50,179.41 50,204.41",
        "type": "Table"
    },
    {
        "element_id": "0d88225a095dfbc5407269264a9c59be",
        "metadata": {
            "coordinates": {
                "layout_height": 2200,
                "layout_width": 1700,
                "points": [
                    [
                        263.0,
                        1117.1
                    ],
                    [
                        263.0,
                        1142.1
                    ],
                    [
                        454.7,
                        1142.1
                    ],
                    [
                        454.7,
                        1117.1
                    ]
...

2

Answers


  1. We can put the JSON-to-CSV code in a separate file, and have the first script (the one you wrote) call the code in that second file, which will then convert the JSON to CSV.

    The code can be written as

    import json
    import csv
    
    
    # Load the whole JSON document; the records to export live under
    # the 'emp_details' key.
    with open('data.json') as json_file:
        data = json.load(json_file)

    employee_data = data['emp_details']

    # newline='' is what the csv module asks for on files it writes to (it
    # avoids blank rows on Windows); the with-block closes the file even if
    # writing a row fails.
    with open('data_file.csv', 'w', newline='') as data_file:
        csv_writer = csv.writer(data_file)

        for index, emp in enumerate(employee_data):
            # The first record's keys double as the header row.
            if index == 0:
                csv_writer.writerow(emp.keys())

            csv_writer.writerow(emp.values())
    

    More in-depth details are at https://www.geeksforgeeks.org/convert-json-to-csv-in-python/

    A second way could be to use pandas

    import pandas as pd
    
    # Parse the JSON file into a DataFrame, then write that frame out as CSV.
    df = pd.read_json(path_or_buf='sample.json')
    df.to_csv(path_or_buf='file.csv')
    

    More in-depth details are at
    https://datagy.io/python-json-to-csv/

    Login or Signup to reply.
  2. I had much the same problem working with the unstructured module for my project. The solution I opted for was to extract tables as HTML. The unstructured Python library can be used to extract the data from a PDF into the following JSON array:

    [
        {
            "element_id": "1fa58806a9a99bd55e50fac8979ba4f8",
            "metadata": {
                "coordinates": {
                    "layout_height": 1654,
                    "layout_width": 2339,
                    "points": [
                        [
                            157.5,
                            928.2
                        ],
                        [
                            157.5,
                            1333.8
                        ],
                        [
                            1178.3,
                            1333.8
                        ],
                        [
                            1178.3,
                            928.2
                        ]
                    ],
                    "system": "PixelSpace"
                },
                "file_directory": ".../Dataset",
                "filename": "A1.pdf",
                "filetype": "application/pdf",
                "languages": [
                    "eng"
                
                ],
                "last_modified": "2000-00-00T00:00:00",
                "page_number": 1,
                "parent_id": "3de9063918f5d08449d850b6e410c6c4", 
                "text_as_html": "<table><tr><td></td></tr></table>" #available only if "type"=="Table"
            },
            "text": "some text",
            "type": "Table"
        }
    ]
    

    Then, to extract the text_as_html value, you can define a simple function in Python which reads the 'text_as_html' field from the metadata of each Table element.
    Sample code :

    def extract_table_text(json_data):
        """Collect the HTML rendering of every Table element.

        Args:
            json_data: Iterable of element dicts; each is read for a "type"
                key and a nested "metadata" dict.

        Returns:
            A list with one entry per element whose "type" is "Table", in
            input order: the element's metadata "text_as_html" string, or ""
            when that key (or the metadata itself) is missing.
        """
        extracted_data = []
        for item in json_data:
            if item.get("type") != "Table":
                continue
            # "or {}" also covers a metadata key that is present but null.
            metadata = item.get("metadata") or {}
            extracted_data.append(metadata.get("text_as_html", ""))
        return extracted_data
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search