skip to Main Content

I would like to extract data simultaneously from the same directory but they have different file names and different data in each. I did a simple separate loop like this, where I just specify which files to read and extract data and it works. But it seemed inefficient and I wanted to combine both loops to get one single data frame.


for pos_json in os.listdir(path_to_json):
    if pos_json.endswith('.json') and 'XYZ' not in pos_json: #specify which files to read: to extract X and Y -> files with XYZ not in file name. To extract A, B, C -> XYZ in file name 
        json_files.append(pos_json)
      


data = []
for json_file in json_files:
    with open(os.path.join(path_to_json, json_file)) as file:
        json_data = json.load(file)
        X = json_data['X']
        Y = json_data['Y']
        data.extend(zip(X, Y))


df = pd.DataFrame(data, columns=[
                  'X', 'Y'])

print(df)


print(json_files)
print(pos_json)


This is my attempt to run both loops on the same directory and I get the correct 4-column data. I don’t get why I get None under X and Y, I can specify the same conditions as the first loop, but it does not work and I get a syntax error or it double-reads files and registers a NaN value if X and Y are not found.

import os
import json
import pandas as pd
import matplotlib.pyplot as plt


path_to_json = '/Users/XXXX/XXXXXX/XXXXX/'

json_files = []

for pos_json in os.listdir(path_to_json):
    if pos_json.endswith('.json'):
        json_files.append(pos_json)
        

data = []


for json_file in json_files:
    with open(os.path.join(path_to_json, json_file)) as file:
        json_data = json.load(file)
        X = json_data.get('X')
        Y = json_data.get('Y')
        A = json_data.get('A')
        parameters = json_data.get('parameters')
        if parameters is not None:
            B = parameters.get('B')
            C = parameters.get(
                'C')
            if B is not None and C is not None:
                data.append({'A': A, 'B': B, 'C': C,
                             'X': X, 'Y': Y})

df = pd.DataFrame(data, columns=['A',
                                 'B', 'C', 'X', 'Y'])


#df.to_csv('/Users/XXX/XXXX/XXXX/XXXXXX', index=False, sep=';')

print(df)
print(json_files)


output:

    |    A              |   B   |    C    |   X   |   Y   |
|------------------|-------|---------|-------|-------|
|    .....         |   0   |  700.0  |  None |  None |
|  ............... |   5   |   5.00  |  None |  None |







Sample data:

the first type of file does not include "XYZ" in its file name: 

{
    "A": "20220821_174040",
    "X": [966.6331929293214,
        966.7065979819918,
        966.7799774914224,
        967.5123668787631,
        967.5854651639233,
        967.6585378606378,
        967.7315849651395,
        967.8046064736611,
        967.8776023824357,
        967.950572687696,
        968.0235173856747,
        968.096436472605,
        968.1693299447195,
        968.242197798251,
        
    ]

"y": [
        84.375,
        93.91666666666666,
        98.95833333333334,
        99.27083333333334,
        72.625
    ]
}



Second file, with XYZ in name:

{
    "A": "20220821_174040",
    "parameters": {
        "N": 10.5,
        "B": 1000,
        "C": 0
    },
    "comments": "",
    "errors": "",
    "dataset": {
        "": [
            "",
            ""
        ]
    }
}





2

Answers


  1. If you want simultaneous processing of your JSON files (except those with ‘XYZ’ in their name) then you should consider multithreading.

    Something like this:

    from concurrent.futures import ThreadPoolExecutor
    import json
    import glob
    import os
    import pandas
    
    data = []
    
    def process(filename):
        if not 'XYZ' in os.path.basename(filename):
            with open(filename) as jf:
                jdata = json.load(jf)
                data.extend(zip(jdata['X'], jdata['Y']))
    
    path_to_json = 'your_path_goes_here'
    
    with ThreadPoolExecutor() as executor:
        files = glob.glob(os.path.join(path_to_json, '*.json'))
        executor.map(process, files)
    
    df = pandas.DataFrame(data, columns=['X', 'Y'])
    
    Login or Signup to reply.
  2. Judging from your input examples, after this

    X = json_data.get('X')
    Y = json_data.get('Y')
    A = json_data.get('A')
    parameters = json_data.get('parameters')
    

    either X and Y are not None, or parameters is not None.

    Then this code snippet is executed:

    if parameters is not None:
        B = parameters.get('B')
        C = parameters.get('C')
        if B is not None and C is not None:
            data.append({'A': A, 'B': B, 'C': C, 'X': X, 'Y': Y})
    

    Here it’s important that data.append ONLY happens if parameters is not None. So all the entries where X and Y are not None are getting skipped. And your dataset only gets None values for X and Y.

    What you probably can do is append the data each time or check if you received at least one not None value for a row. Depends on your task logic. For example:

    for json_file in json_files:
        with open(os.path.join(path_to_json, json_file)) as file:
            json_data = json.load(file)
            X = json_data.get('X')
            Y = json_data.get('Y')
            A = json_data.get('A')
            parameters = json_data.get('parameters')
            if parameters is not None:
                B = parameters.get('B')
                C = parameters.get('C')
    
        # your task logic here:
        if (B is not None and C is not None) or (X is not None and Y is not None):
            data.append({'A': A, 'B': B, 'C': C, 'X': X, 'Y': Y})
    
    df = pd.DataFrame(data, columns=['A', 'B', 'C', 'X', 'Y'])
    

    That would result in

    A                 B      C     X     Y
    20220821_174040   1000   0     None  None
    20220821_174040   None   None  [..]  [..]
    

    I am guessing you then would want to unite the rows for the same A value.

    def get_not_none_value(values):
        return next(filter(lambda x: x is not None, values))
    
    df 
        .groupby("A", as_index=False) 
        .agg(get_not_na_value)
    

    Output:

                     A       B    C       X       Y
    0  20220821_174040  1000.0  0.0     [..]   [..]
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search