skip to Main Content

I want to convert the following nested JSON file to a CSV file using Python.

{
    "page": {
        "page": 1,
        "pageSize": 250
    },
    "dataRows": [
        {
            "entityId": 349255,
            "Id": "41432-95P",
            "disabled": false,
            "followed": false,
            "suggestion": false,
            "inactive": false,
            "pinned": false,
            "highlighted": false,
            "columnValues": {
                "lastName": [
                    {
                        "columnValueType": "ENTITY",
                        "accessStatus": "OK",
                        "columnValueType": "ENTITY",
                        "name": "McBrady",
                        "Id": "41432-95P",
                        "unpublished": false
                    }
                ],
                "gender": [
                    {
                        "columnValueType": "STRING",
                        "accessStatus": "OK",
                        "columnValueType": "STRING",
                        "value": "Male"
                    }
                ],
                "hqCity": [
                    {
                        "columnValueType": "STRING",
                        "accessStatus": "OK",
                        "columnValueType": "STRING",
                        "value": "Seattle"
                    }
                ],
                "prefix": [
                    {
                        "columnValueType": "STRING",
                        "accessStatus": "OK",
                        "columnValueType": "STRING",
                        "value": "Dr."
                    }
                ],
                "lastUpdateDate": [
                    {
                        "columnValueType": "DATE",
                        "accessStatus": "OK",
                        "columnValueType": "DATE",
                        "expected": false,
                        "asOfdate": "2023-06-26"
                    }
                ],
                "companyName": [
                    {
                        "columnValueType": "BUSINESS_ENTITY",
                        "accessStatus": "OK",
                        "columnValueType": "BUSINESS_ENTITY",
                        "name": "Global Partnerships",
                        "Id": "56347-39",
                        "unpublished": false,
                        "profileType": "INVESTOR"
                    }
                ],
                "roles": [
                    {
                        "columnValueType": "INT_COLUMN_VALUE",
                        "accessStatus": "OK",
                        "columnValueType": "INT_COLUMN_VALUE",
                        "marked": false,
                        "value": 3
                    }
                ],
                "dailyUpdates": [],
                "assetClass": [],
                "hqCountry": [
                    {
                        "columnValueType": "STRING",
                        "accessStatus": "OK",
                        "columnValueType": "STRING",
                        "value": "United States"
                    }
                ],
                "latestNoteAuthor": [],
                "primaryPosition": [
                    {
                        "columnValueType": "STRING",
                        "accessStatus": "OK",
                        "columnValueType": "STRING",
                        "value": "Chair, Enterprise Risk, Compliance and Audit Committee and Member of the Board of Directors"
                    }
                ],
                "boardSeats": [
                    {
                        "columnValueType": "INT_COLUMN_VALUE",
                        "accessStatus": "OK",
                        "columnValueType": "INT_COLUMN_VALUE",
                        "marked": false,
                        "value": 2
                    }
                ],
                "fundRoles": [],
                "institution": [
                    {
                        "columnValueType": "STRING",
                        "accessStatus": "OK",
                        "columnValueType": "STRING",
                        "value": "Harvard University"
                    },
                    {
                        "columnValueType": "STRING",
                        "accessStatus": "OK",
                        "columnValueType": "STRING",
                        "value": "University of Oxford"
                    }
                ],
                "latestNote": [],
                "Id": [
                    {
                        "columnValueType": "STRING",
                        "accessStatus": "OK",
                        "columnValueType": "STRING",
                        "value": "41432-95P"
                    }
                ],
                "hqRegion": [
                    {
                        "columnValueType": "STRING",
                        "accessStatus": "OK",
                        "columnValueType": "STRING",
                        "value": "Americas"
                    }
                ],
                "email": [],
                "dealRoles": [],
                "PrimaryCompanyType": [
                    {
                        "columnValueType": "STRING",
                        "accessStatus": "OK",
                        "columnValueType": "STRING",
                        "value": "Not-For-Profit Venture Capital"
                    }
                ],
                "mgtRoles": [],
                "hqStateProvince": [
                    {
                        "columnValueType": "STRING",
                        "accessStatus": "OK",
                        "columnValueType": "STRING",
                        "value": "Washington"
                    }
                ],
                "fullName": [
                    {
                        "columnValueType": "ENTITY_WITH_NOTE",
                        "accessStatus": "OK",
                        "columnValueType": "ENTITY_WITH_NOTE",
                        "name": "Matthew McBrady Ph.D",
                        "Id": "41432-95P",
                        "unpublished": false
                    }
                ],
                "hqLocation": [
                    {
                        "columnValueType": "STRING",
                        "accessStatus": "OK",
                        "columnValueType": "STRING",
                        "value": "Seattle, WA"
                    }
                ],
                "biography": [
                    {
                        "columnValueType": "DESCRIPTION_WITH_SOURCE",
                        "accessStatus": "OK",
                        "columnValueType": "DESCRIPTION_WITH_SOURCE",
                        "value": "Dr. Matthew McBrady serves as Chair, of the Enterprise Risk, Compliance, and Audit Committee.",
                        "morningstarSource": true
                    }
                ],
                "firstName": [
                    {
                        "columnValueType": "ENTITY",
                        "accessStatus": "OK",
                        "columnValueType": "ENTITY",
                        "name": "Matthew",
                        "Id": "41432-95P",
                        "unpublished": false
                    }
                ],
                "phone": [
                    {
                        "columnValueType": "STRING",
                        "accessStatus": "OK",
                        "columnValueType": "STRING",
                        "value": "+1 (206) 652-8773"
                    }
                ],
                "hqSubRegion": [
                    {
                        "columnValueType": "STRING",
                        "accessStatus": "OK",
                        "columnValueType": "STRING",
                        "value": "North America"
                    }
                ],
                "hqAddressLine2": [
                    {
                        "columnValueType": "STRING",
                        "accessStatus": "OK",
                        "columnValueType": "STRING",
                        "value": "Suite 410"
                    }
                ],
                "hqAddressLine1": [
                    {
                        "columnValueType": "STRING",
                        "accessStatus": "OK",
                        "columnValueType": "STRING",
                        "value": "1201 Western Avenue"
                    }
                ],
                "hqFax": [
                    {
                        "columnValueType": "STRING",
                        "accessStatus": "OK",
                        "columnValueType": "STRING",
                        "value": "+1 (206) 456-7877"
                    }
                ],
                "middleName": [
                    {
                        "columnValueType": "STRING",
                        "accessStatus": "OK",
                        "columnValueType": "STRING",
                        "value": "R."
                    }
                ],
                "companyWebsite": [
                    {
                        "columnValueType": "STRING",
                        "accessStatus": "OK",
                        "columnValueType": "STRING",
                        "value": "www.globalpartnerships.org"
                    }
                ],
                "hqZipCode": [
                    {
                        "columnValueType": "STRING",
                        "accessStatus": "OK",
                        "columnValueType": "STRING",
                        "value": "98101"
                    }
                ],
                "weeklyUpdates": []
            }
        }
    ]
}

I got the following code after iterating using chatgpt. I, however, couldn’t get it to capture the first nest that includes entityId, Id. Moreover, in each subsequent nest, I want to capture all fields; for example, in "lastName", I want to capture both "name" and "Id", similarly in "companyName", I want the "name", "Id" and "profileType" to be in separate columns. As I mentioned to chatgpt, I don’t care about "columnValueType", "accessStatus", or "unpublished".

Here is python code:

import csv
import json

def extract_field_value(data):
    if isinstance(data, dict):
        if 'value' in data:
            return str(data['value'])
        elif 'columnValueType' in data and data['columnValueType'] == 'ENTITY':
            return str(data['name'])
        else:
            values = []
            for key, value in data.items():
                if key not in ['columnValueType', 'accessStatus', 'unpublished']:
                    values.append(extract_field_value(value))
            return ', '.join(values) if values else ''
    elif isinstance(data, list):
        values = []
        for item in data:
            value = extract_field_value(item)
            if value:
                values.append(value)
        return ', '.join(values) if values else ''
    else:
        return str(data) if data is not None else ''

# Read the JSON data
with open('data.json') as file:
    data = json.load(file)

# Extract the nested data rows
data_rows = data['dataRows']

# Extract the column headers from the first data row
column_headers = list(data_rows[0]['columnValues'].keys())

# Create a CSV file
with open('data.csv', 'w', newline='') as file:
    writer = csv.writer(file)

    # Write the column headers as the first row
    writer.writerow(column_headers)

    # Write each data row as a separate row in the CSV file
    for row in data_rows:
        column_values = row['columnValues']
        csv_row = []
        for column_header in column_headers:
            values = column_values.get(column_header, [])
            value = extract_field_value(values)
            csv_row.append(value)
        writer.writerow(csv_row)

print("CSV file created successfully.")

2

Answers


  1. I see you trying to write a more general program that figures out the structure of the JSON. I think it’d be easier, in the beginning at least, since you know this structure to just make your code aware of it… be very explicit, and in this case it comes out looking much simpler.

    This approach also leverages the DictWriter class in the csv module, so your final row has the header in it… no need to track that separately. I also like typing, so I’ve added type hints for the structure of the JSON I saw (especially looking at columnValues).

    import csv
    import json
    
    from typing import Any
    
    
    def extract_columns(col_values: dict[str, list[dict[str, Any]]]) -> dict[str, Any]:
        """
        Flatten the dict of columnValues down to a single dict.  Each columnValue key becomes the
        prefix for the value keys that follow.  Assumes that each column value is a single-item item list.
        """
        row: dict[str, Any] = {}
    
        for col_name, list_of_vals in col_values.items():
            if list_of_vals == []:
                continue
    
            col_name = col_name.upper()
    
            vals = list_of_vals[0]
            for k, v in vals.items():
                if k in ["columnValueType", "accessStatus", "unpublished"]:
                    continue
    
                row[col_name + "_" + k] = v
    
        return row
    
    
    def extract_row(data_row: dict[str, Any]) -> dict[str, Any]:
        row: dict[str, Any] = {}
    
        for k in ["Id", "disabled", "followed", "suggestion", "inactive", "pinned", "highlighted"]:
            row[k] = data_row[k]
    
        row.update(extract_columns(data_row["columnValues"]))
    
        return row
    
    
    csv_rows: list[dict[str, Any]] = []
    with open("input.json") as f:
        data = json.load(f)
    
        for data_row in data["dataRows"]:
            row = extract_row(data_row)
            csv_rows.append(row)
    
    
    with open("output.csv", "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=csv_rows[0])
        writer.writeheader()
        writer.writerows(csv_rows)
    

    When I run that on your sample JSON, the first row ends up looking like:

    Final column name Final value
    Id 41432-95P
    disabled False
    followed False
    suggestion False
    inactive False
    pinned False
    highlighted False
    LASTNAME_name McBrady
    LASTNAME_Id 41432-95P
    GENDER_value Male
    HQCITY_value Seattle
    PREFIX_value Dr.
    LASTUPDATEDATE_expected False
    LASTUPDATEDATE_asOfdate 2023-06-26
    COMPANYNAME_name Global Partnerships
    COMPANYNAME_Id 56347-39
    COMPANYNAME_profileType INVESTOR
    ROLES_marked False
    ROLES_value 3
    HQCOUNTRY_value United States
    PRIMARYPOSITION_value Chair, Enterprise Risk, Compliance and Audit Committee and Member of the Board of Directors
    BOARDSEATS_marked False
    BOARDSEATS_value 2
    INSTITUTION_value Harvard University
    ID_value 41432-95P
    HQREGION_value Americas
    PRIMARYCOMPANYTYPE_value Not-For-Profit Venture Capital
    HQSTATEPROVINCE_value Washington
    FULLNAME_name Matthew McBrady Ph.D
    FULLNAME_Id 41432-95P
    HQLOCATION_value Seattle, WA
    BIOGRAPHY_value Dr. Matthew McBrady serves as Chair, of the Enterprise Risk, Compliance, and Audit Committee.
    BIOGRAPHY_morningstarSource True
    FIRSTNAME_name Matthew
    FIRSTNAME_Id 41432-95P
    PHONE_value +1 (206) 652-8773
    HQSUBREGION_value North America
    HQADDRESSLINE2_value Suite 410
    HQADDRESSLINE1_value 1201 Western Avenue
    HQFAX_value +1 (206) 456-7877
    MIDDLENAME_value R.
    COMPANYWEBSITE_value <www.globalpartnerships.org>
    HQZIPCODE_value 98101

    I know this deviates from what your code shows, but I only want to give you the rough idea (the approach). I believe you can adapt the code, especically extract_columns, to fit your needs.

    Login or Signup to reply.
  2. You can try this example how to parse the Json to a DataFrame:

    import json
    import pandas as pd
    
    with open('your_file.json', 'r') as f_in:
        data = json.load(f_in)
    
    out = []
    for r in data['dataRows']:
        d = {}
        for k, v in r['columnValues'].items():
            if not v:
                d[k] = None
            else:
                del v[0]['accessStatus']
                del v[0]['columnValueType']
                for kk, vv in v[0].items():
                    d[f'{k}.{kk}'] = vv
        out.append(d)
    
    df = pd.DataFrame(out)
    print(df)
    

    Prints:

      lastName.name lastName.Id  lastName.unpublished gender.value hqCity.value prefix.value  lastUpdateDate.expected lastUpdateDate.asOfdate     companyName.name companyName.Id  companyName.unpublished companyName.profileType  roles.marked  roles.value dailyUpdates assetClass hqCountry.value latestNoteAuthor                                                                        primaryPosition.value  boardSeats.marked  boardSeats.value fundRoles   institution.value latestNote   Id.value hqRegion.value email dealRoles        PrimaryCompanyType.value mgtRoles hqStateProvince.value         fullName.name fullName.Id  fullName.unpublished hqLocation.value                                                                                biography.value  biography.morningstarSource firstName.name firstName.Id  firstName.unpublished        phone.value hqSubRegion.value hqAddressLine2.value hqAddressLine1.value        hqFax.value middleName.value        companyWebsite.value hqZipCode.value weeklyUpdates
    0       McBrady   41432-95P                 False         Male      Seattle          Dr.                    False              2023-06-26  Global Partnerships       56347-39                    False                INVESTOR         False            3         None       None   United States             None  Chair, Enterprise Risk, Compliance and Audit Committee and Member of the Board of Directors              False                 2      None  Harvard University       None  41432-95P       Americas  None      None  Not-For-Profit Venture Capital     None            Washington  Matthew McBrady Ph.D   41432-95P                 False      Seattle, WA  Dr. Matthew McBrady serves as Chair, of the Enterprise Risk, Compliance, and Audit Committee.                         True        Matthew    41432-95P                  False  +1 (206) 652-8773     North America            Suite 410  1201 Western Avenue  +1 (206) 456-7877               R.  www.globalpartnerships.org           98101          None
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search