Question posted in Json
Our archive of expertly curated questions and answers provides insights and solutions to common problems related to this popular data interchange format. From parsing and manipulating JSON data to integrating it with various programming languages and web services, our archive has got you covered. Start exploring today and take your JSON skills to the next level

exploding deeply nested json (json flatten) / flattening pandas df

PavanAithal
September 19, 2023
231 views
1 vote
2 Answers

I have the data coming via REST api with nested json, Trying to explode the response but its flatteing in only the first level. Need to explode the nested part also.
First step im converting data into pd df then exploding each node (bdata, edata). But not able to explode the columns(topping) inside edata.

sample data:

{
    "adata": {
        "1": {
            "xid": "012",
            "xtype": "donut",
            "xname": "xCake",
            "xppu": 0.55
        },
        "2": {
            "xid": "015",
            "xtype": "donut",
            "xname": "Cake",
            "xppu": 0.565
        },
        "3": {
            "xid": "018",
            "xtype": "donut",
            "xname": "Cakex",
            "xppu": 0.559
        }
    },
    "bdata": {
        "1": [
            {
                "yid": "00012",
                "ytype": "donut",
                "yname": "Cake",
                "yppu": 0.55
            },
            {
                "yid": "00023",
                "ytype": "donut",
                "yname": "Raised",
                "yppu": 0.554
            },
            {
                "yid": "00024",
                "ytype": "donut",
                "yname": "Raised",
                "yppu": 0.554
            }
        ],
        "2": [
            {
                "yid": "00015",
                "ytype": "donut",
                "yname": "Cake",
                "yppu": 0.565
            },
            {
                "yid": "00026",
                "ytype": "donut",
                "yname": "Raised",
                "yppu": 0.557
            },
            {
                "yid": "00027",
                "ytype": "donut",
                "yname": "Raised",
                "yppu": 0.525
            }
        ],
        "3": [
            {
                "yid": "00018",
                "ytype": "donut",
                "yname": "Cake",
                "yppu": 0.559
            },
            {
                "yid": "00039",
                "ytype": "donut",
                "yname": "Old Fashioned",
                "yppu": 0.558
            }
        ]
    },
    "edata": {
        "1": [
            {
                "eid": "03001",
                "etype": "donut",
                "name": "Cake",
                "ppu": 0.55,
                "topping": [
                    {"id": "51", "type": "None"},
                    {"id": "002", "type": "zGlazed"},
                    {"id": "05", "type": "Sugar"},
                    {"id": "5007", "type": "Powdered Sugar"},
                    {"id": "06", "type": "Chocolate with Sprinkles"},
                    {"id": "53", "type": "Chocolate"},
                    {"id": "04", "type": "Maple"}
                ]
            },
            {
                "eid": "0302",
                "etype": "donut",
                "name": "Raised",
                "ppu": 0.55
            },
            {
                "eid": "0302",
                "etype": "donut",
                "name": "Raisedz",
                "ppu": 0.55,
                "topping": "None"
            },
            {
                "eid": "03003",
                "etype": "donut",
                "name": "zOld Fashioned",
                "ppu": 0.55,
                "topping": [
                    {"id": "501", "type": "Nonex"},
                    {"id": "52", "type": "xGlazed"},
                    {"id": "503", "type": "Chocolatez"}
                ]
            }
        ],
        "2": [
            {
                "eid": "00401",
                "etype": "donut",
                "name": "Cake",
                "ppu": 0.55,
                "topping": [
                    {"id": "01", "type": "None"},
                    {"id": "2", "type": "xGlazed"},
                    {"id": "55", "type": "xSugar"},
                    {"id": "507", "type": "Powdered Sugar"},
                    {"id": "506", "type": "xChocolate with Sprinkles"},
                    {"id": "03", "type": "xChocolate"},
                    {"id": "54", "type": "xMaple"}
                ]
            },
            {
                "eid": "042",
                "etype": "donut",
                "name": "Raised",
                "ppu": 0.55
            },
            {
                "eid": "042",
                "etype": "donut",
                "name": "Raisedx",
                "ppu": 0.55,
                "topping": "None"
            }
        ],
        "3": [
            {
                "eid": "051",
                "etype": "donut",
                "name": "Cake",
                "ppu": 0.55,
                "topping": [
                    {"id": "50407", "type": "Powdered Sugarx"},
                    {"id": "50406", "type": "Chocolate with Sprinklesx"},
                    {"id": "50403", "type": "Chocolatex"},
                    {"id": "50404", "type": "Maplex"}
                ]
            },
            {
                "eid": "050403",
                "etype": "donut",
                "name": "Old Fashioned",
                "ppu": 0.55,
                "topping": [
                    {"id": "5071", "type": "None"},
                    {"id": "5072", "type": "Glazedx"},
                    {"id": "5703", "type": "Chocolatex"}
                ]
            }
        ]
    }
}

Im trying to get the result somewhat like below.

      eid   etype   name    ppu     id                   type      xid     xtype   xname   xppu    yid     ytype   yname   yppu
0   03001   donut   Cake   0.55     51                  None      012     donut   xCake   0.55  00012     donut    Cake   0.55
1   03001   donut   Cake   0.55    002                zGlazed     012     donut   xCake   0.55  00012     donut    Cake   0.55
2   03001   donut   Cake   0.55     05                 Sugar      012     donut   xCake   0.55  00012     donut    Cake   0.55
3   03001   donut   Cake   0.55   5007       Powdered Sugar      012     donut   xCake   0.55  00012     donut    Cake   0.55
4   03001   donut   Cake   0.55     06  Chocolate with Sprinkles 012     donut   xCake   0.55  00012     donut    Cake   0.55
5   03001   donut   Cake   0.55     53             Chocolate     012     donut   xCake   0.55  00012     donut    Cake   0.55
6   03001   donut   Cake   0.55     04                Maple      012     donut   xCake   0.55  00012     donut    Cake   0.55
7   0302   donut   Raised   0.55   NaN                  None     015     donut   Cake   0.565  00015     donut    Cake   0.565
8   0302   donut   Raisedz  0.55  None                  None     015     donut   Cake   0.565  00015     donut    Cake   0.565

Answers

You can extract data with pd.json_normalize:

def extract_bdata(bdata):
    bdata = {k: pd.json_normalize(records)
                for k, records in bdata.items()}
    return pd.concat(bdata, names=['group']).reset_index('group').reset_index(drop=True)

def extract_edata(edata):
    edata = {k: pd.json_normalize(record, 'topping', ['eid', 'etype', 'name', 'ppu'])
                for k, records in edata.items() for record in records
                if 'topping' in record and isinstance(record['topping'], list)}
    return pd.concat(edata, names=['group']).reset_index('group').reset_index(drop=True)

# Convert your json as python data structure
# import json
# data = json.loads(<your input as json string>)
    
bdata = extract_bdata(data['bdata'])
edata = extract_edata(data['edata'])

Output:

>>> bdata
  group    yid  ytype          yname   yppu
0     1  00012  donut           Cake  0.550
1     1  00023  donut         Raised  0.554
2     1  00024  donut         Raised  0.554
3     2  00015  donut           Cake  0.565
4     2  00026  donut         Raised  0.557
5     2  00027  donut         Raised  0.525
6     3  00018  donut           Cake  0.559
7     3  00039  donut  Old Fashioned  0.558

>>> edata
   group    id                       type     eid  etype            name   ppu
0      1   501                      Nonex   03003  donut  zOld Fashioned  0.55
1      1    52                    xGlazed   03003  donut  zOld Fashioned  0.55
2      1   503                 Chocolatez   03003  donut  zOld Fashioned  0.55
3      2    01                       None   00401  donut            Cake  0.55
4      2     2                    xGlazed   00401  donut            Cake  0.55
5      2    55                     xSugar   00401  donut            Cake  0.55
6      2   507             Powdered Sugar   00401  donut            Cake  0.55
7      2   506  xChocolate with Sprinkles   00401  donut            Cake  0.55
8      2    03                 xChocolate   00401  donut            Cake  0.55
9      2    54                     xMaple   00401  donut            Cake  0.55
10     3  5071                       None  050403  donut   Old Fashioned  0.55
11     3  5072                    Glazedx  050403  donut   Old Fashioned  0.55
12     3  5703                 Chocolatex  050403  donut   Old Fashioned  0.55

However, I don’t see how to merge bdata and edata.

Had to build a recursive iterator to get through the lists embedded within the dictionaries which is what the drill_through function is doing. The dictionaries with a topping were duplicated and a type was assigned for those, otherwise it was set to None.

import pandas as pd

x = {} # the input that you have in question

def drill_through(inner_val, list_save=[]):
    if isinstance(inner_val, dict):
        list_save.append(inner_val)
    elif isinstance(inner_val, list):
        for ele in inner_val:
            drill_through(ele,list_save)
    else:
        print(inner_val) # incase there is another data type
    return list_save


def get_flattened(val: dict) -> dict:
    flattened_result = []
    for name in val:
        for num in val[name]:
            inner_val = val[name][num]
            drilled = drill_through(inner_val)
            for drill_value in drilled:
                flattened_result.append(drill_value)

    result = []
    for new_dict in flattened_result:
        d = {}
        if 'topping' in new_dict.keys():
            iter_array = new_dict['topping']
            del new_dict['topping']
            for toppin in iter_array:
                d = new_dict
                try:
                    d['type'] = toppin['type']
                except:
                    d['type'] = None
                result.append(d)
        else:
            d = new_dict
            d['type'] = None
            result.append(d)
    return flattened_result
                  

df = pd.DataFrame(get_flattened(x))

Please signup or login to give your own answer.

Click here to cancel reply.