How to handle a nested JSON?

test
April 29, 2024
128 views
2 votes
3 Answers

I have a problem. I have a nested JSON file:

json_data = '''
{
    "appVersion": "",
    "device": {
        "model": ""
    },
    "bef": {
        "catalog": ""
    },
    "data": [
        {
            "timestamp": "",
            "label": "",
            "category": ""
        }
    ]
}

I would like to extract all data and if it is nested I would like it to be separated with a _.
I have tried to normalise the nested JSON file. I use json_normalise for this.
Unfortunately, the desired output is not what I want and need.
Furthermore, I want that there can be any possible number of nested values, so I tried to solve it with a loop.

How can I produce the desired output?

import pandas as pd
import json

json_data = '''
{
    "appVersion": "0.0.3",
    "device": {
        "model": "Lenovo"
    },
    "bef": {
        "catalog": "Manual"
    },
    "data": [
        {
            "timestamp": "2024-04-24 12:08:02.415077",
            "label": "zuf",
            "category": "50"
        }
    ]
}
'''
parsed_json = json.loads(json_data)

def extract_metadata(json_data):
    metadata = {}
    for key, value in json_data.items():
        if isinstance(value, dict):
            for k, v in value.items():
                metadata[f'{key}_{k}'] = v
        else:
            metadata[key] = value
    return metadata

meta_data = extract_metadata(parsed_json)
df_main = pd.json_normalize(parsed_json['data'], sep='_')
df_meta = pd.DataFrame([meta_data])
df = pd.concat([df_main, df_meta], axis=1)


print(df)

What I got

                    timestamp label category appVersion device_model  
0  2024-04-24 12:08:02.415077   zuf       50      0.0.3       Lenovo   

  bef_catalog                                               data  
0     Manual  [{'timestamp': '2024-04-24 12:08:02.415077', '...

What I want

appVersion  device_model    bef_catalog data_timestamp          data_label  data_category
0.0.3       Lenovo          Manual      2024-04-24 12:08:02.415 zuf         50

Answers

import pandas as pd
import json

def flatten_json(json_obj, parent_key='', sep='_'):
items = {}
for k, v in json_obj.items():
    new_key = parent_key + sep + k if parent_key else k
    if isinstance(v, dict):
        items.update(flatten_json(v, new_key, sep=sep))
    elif isinstance(v, list):
        for i, item in enumerate(v):
            items.update(flatten_json(item, f"{new_key}{sep}{i}", sep=sep))
    else:
        items[new_key] = v
return items



#Your JSON data


  json_data = '''
{
   "appVersion": "0.0.3",
   "device": {
    "model": "Lenovo"
},
"bef": {
    "catalog": "Manual"
},
"data": [
    {
        "timestamp": "2024-04-24 12:08:02.415",
        "label": "zuf",
        "category": 50
    }
]

}
”’

# Parse the JSON data
 parsed_data = json.loads(json_data)

 # Flatten the JSON data
 flattened_data = flatten_json(parsed_data)

   # Convert flattened data to DataFrame
 df = pd.DataFrame(flattened_data, index=[0])

   print(df)

You can first flatten your dictionary like this:

def flatten_dict(d: dict, pre=''):
    new_d = {}
    for key, item in d.items():
        if isinstance(item, dict):
            new_d = {**new_d, **flatten_dict(item, pre=f'{pre}{key}_')}
        elif isinstance(item, list):
            for i, ele in enumerate(item):
                if isinstance(ele, dict):
                    new_d = {**new_d, **flatten_dict(ele, pre=f'{pre}{key}_{i+1}_')}
                else:
                    new_d[f'{pre}{key}_{i+1}'] = ele
        else:
            new_d[f'{pre}{key}'] = item
    return new_d

In your code you missed the list type. I added the enumerator since it is possible to have multiple dictionaries in a list. If you are sure that is is always max 1 element per list you can remove the i. Or include a check or something to see if the list has length 1.

To convert it to pandas:

pd.json_normalize(flatten_dict(parsed_json))

The output of the function:

flatten_dict(parsed_json)
{'appVersion': '0.0.3',
 'device_model': 'Lenovo',
 'bef_catalog': 'Manual',
 'data_1_timestamp': '2024-04-24 12:08:02.415077',
 'data_1_label': 'zuf',
 'data_1_category': '50'}

The problems is that you are trying to normalize the data field which is a list of dictionaries, and then concatenate it with the matadata. Instead, iterate over the data list and for each dictionary in it, merge the metadata and append it to your dataframe:

import pandas as pd
import json
json_data = {your_json_data}
parsed_json = json.loads(json_data)
def extract_metadata(json_data):
    metadata = {}
    for key, value in json_data.items():
        if isinstance(value, dict):
            for k, v in value.items():
                metadata[f'{key}_{k}'] = v
        elif key != 'data':
            metadata[key] = value
    return metadata
meta_data = extract_metadata(parsed_json)
df = pd.DataFrame()
for data in parsed_json['data']:
    data.update(meta_data)
    df = df.append(data, ignore_index=True)
print(df)

Please signup or login to give your own answer.

Click here to cancel reply.