skip to Main Content

I’m generating a CSV file from JSON.
How can I flatten the JSON so that I get exactly one row per element of the nested list of dicts?

The rows will be almost identical; only a few columns will vary.

Example: I have

{
  "transportOrder": {
    "customerId": "877299",
    "customerOrder": "155564649",
    "customerReference": "reference2",
    "creationDateTime": "2022-08-26T16:30:56.000Z",
    "orderDetail": {
      "AdditionalInfo": {
        "info1": "abc",
        "info2": "cds",
        "name1": "Jonathan",
        "name2": "Grulich"
      }},
    "orderLines": [
      {
        "amount": 7,
        "code": "EUP"
      },
      {
        "amount": 8,
        "code": "ENP"
      },
      {
        "amount": 17,
        "code": "ERP"
      }
    ]
  }
}

And I want output like the following (as in the attached image):

customerId    customerOrder    customerReference    creationDateTime    info1    info2    name1    name2    amount    code
877299    155564649    reference2    26.08.2022    abc    cds    Jonathan    Grulich    7    EUP
877299    155564649    reference2    26.08.2022    abc    cds    Jonathan    Grulich    8    ENP
877299    155564649    reference2    26.08.2022    abc    cds    Jonathan    Grulich    17    ERP
import flatdict

data = {
  "Order": {
    "customerId":"877299" ,
    "customerOrder": "155564649",
    "customerReference": "reference2",
    "creationDateTime": "2022-08-26T16:30:56.000Z",
    "orderDetail": {
      "AdditionalInfo": {
        "info1": "abc",
        "info2": "cds",
        "name1": "Jonathan",
        "name2": "Grulich",
      }},
    "orderLines": [
      {
        "amount": 7,
        "code": "EUP"
      },
      {
        "amount": 8,
        "code": "ENP"
      },
      {
        "amount": 17,
        "code": "ERP"
      }
    ]
  }
}
flat = flatdict.FlatDict(data, delimiter='.')

# Key under which the flattened mapping exposes the list of order lines.
lines_key = 'Order.orderLines'

# Fields shared by every output row: everything except the order-line list,
# which is expanded into one row per element below.
common = {key: value for key, value in flat.items() if key != lines_key}

# Build one *new* dict per order line. The previous version did `temp = flat`,
# which only aliases the same object: every entry appended to the list was the
# same mapping, so all rows ended up with the amount/code of the last line.
result_list = [
    {**common, 'amount': line['amount'], 'code': line['code']}
    for line in flat[lines_key]
]

print(result_list)

But I don't think this is the best solution. What would be a better approach?

2

Answers


  1. Chosen as BEST ANSWER

    Also I found this solution:

    import pandas as pd
    # Sample order: shared header fields plus a list of order lines.
    data = {
      "Order": {
        "customerId":"877299" ,
        "customerOrder": "155564649",
        "customerReference": "reference2",
        "creationDateTime": "2022-08-26T16:30:56.000Z",
        "orderDetail": {
          "AdditionalInfo": {
            # Key restored to "info1" (it had been garbled to "Order"),
            # matching the sample data used in the question.
            "info1": "abc",
            "info2": "cds",
            "name1": "Jonathan",
            "name2": "Grulich",
          }},
        "orderLines": [
          {
            "amount": 7,
            "code": "EUP"
          },
          {
            "amount": 8,
            "code": "ENP"
          },
          {
            "amount": 17,
            "code": "ERP"
          }
        ]
      }
    }
    
    # One output row per element of "orderLines"; every entry in `meta` is
    # repeated on each of those rows. A nested list in `meta` is a path into
    # the nested dicts and becomes a dot-joined column name.
    df = pd.json_normalize(
        data['Order'],
        record_path=['orderLines'],
        meta=['customerId','customerReference' ,['orderDetail','AdditionalInfo','info2']]
    )
    
    
    print(df)
    

    Output:

     amount code customerId customerReference orderDetail.AdditionalInfo.info2
    0       7  EUP     877299        reference2                              cds
    1       8  ENP     877299        reference2                              cds
    2      17  ERP     877299        reference2                              cds
    

  2. I would be very explicit in building such a DataFrame.

    import pandas as pd
    
    # Column layout of the final DataFrame. Starting from an empty frame with
    # these columns fixes their order before any rows are added.
    column_names = [
        "customerId",
        "customerOrder",
        "customerReference",
        "creationDateTime",
        "info1",
        "info2",
        "name1",
        "name2",
        "amount",
        "code",
    ]
    df = pd.DataFrame({name: [] for name in column_names})
    
    # Walk every order in `data` and append one single-row frame per order
    # line. Each single-row frame carries its own index label 0 and
    # pd.concat keeps it, so the result has a repeated index.
    for order in data.values():
        for line in order["orderLines"]:
            row = {
                "customerId": order["customerId"],
                "customerOrder": order["customerOrder"],
                "customerReference": order["customerReference"],
                "creationDateTime": order["creationDateTime"],
                "info1": order["orderDetail"]["AdditionalInfo"]["info1"],
                "info2": order["orderDetail"]["AdditionalInfo"]["info2"],
                "name1": order["orderDetail"]["AdditionalInfo"]["name1"],
                "name2": order["orderDetail"]["AdditionalInfo"]["name2"],
                "amount": line["amount"],
                "code": line["code"],
            }
            # Wrap each value in a one-element list to get a one-row frame.
            entry = pd.DataFrame({name: [value] for name, value in row.items()})
            df = pd.concat([df, entry])
    
    print(df)
    

    Output:

      customerId customerOrder customerReference          creationDateTime info1  
    0     877299     155564649        reference2  2022-08-26T16:30:56.000Z   abc
    0     877299     155564649        reference2  2022-08-26T16:30:56.000Z   abc
    0     877299     155564649        reference2  2022-08-26T16:30:56.000Z   abc
    
      info2     name1    name2  amount code
    0   cds  Jonathan  Grulich     7.0  EUP
    0   cds  Jonathan  Grulich     8.0  ENP
    0   cds  Jonathan  Grulich    17.0  ERP
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search