skip to Main Content

I have many json files (example below), I am interested in converting these json to a table version in the csv file format [example below] with the help of python script from jupyter notebook provided my one of my collaborator [example below]. The notebook works on extracting one gene_name along with summary, and scores, but I am looking to extract two gene_name columns. Primarily, from the json file, I am interested in extracting the "gene_name" field, and make it as two separate columns such as gene_name_A, and gene_name_B, Summary_Gene_A, and Summary_Gene_B from "brief_summary" field, and scores for all the 6 statements in json file (well_characterized, "biology", cancer", tumor, drug_targets, clinical_marker).

Error:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[4], line 76
     74     print (k)
     75     q1 = json.load(open(k,'r'))
---> 76     tmpDF = get_qScore(q1,mainkey_qFLU, subkey=subkey_name)
     77     q1_DF = pd.concat([q1_DF,tmpDF],axis=0)
     79 print (q1_DF.shape)

Cell In[4], line 60, in get_qScore(q, question_dict, subkey)
     58 q_scores = []
     59 for gname in q.keys():
---> 60     for model in q[gname].keys():
     61         kx = convert_stringtodict(q[gname][model],question_dict)
     62         kx.update({'gene_name':gname,
     63                     'runID':model,
     64                     "model_version":model.lstrip("datasvc-openai-paluckalab-poc-").split("_")[0],
     65                     "subjectKey":subkey,})

AttributeError: 'int' object has no attribute 'keys'

Example json file:

{
    "KAT2A": {
        "official_gene_symbol": "KAT2A",
        "brief_summary": "KAT2A, also known as GCN5, is a histone acetyltransferase that plays a key role."
    },
    "E2F1": {
        "official_gene_symbol": "E2F1",
        "brief_summary": "E2F1 is a member of the E2F family of transcription factors."
    },
    "The genes encode molecules that are well characterized protein-protein pairs": 7,
    "The protein-protein pair is relevant to biology": 8,
    "The protein-protein pair is relevant to cancer": 3,
    "The protein-protein pair is relevant to interactions between tumor": 4,
    "One or both of the genes are known drug targets": 5,
    "One or both of the genes are known clinical marker": 6
}

Example expected csv file:

#>   gene_name_A gene_name_B               Summary_Gene_A
#> 1       KAT2A        E2F1   KAT2A, also known as GCN5.
#> 2       KRT30       KRT31 Keratin 30 is a cytokeratin.
#>                                         Summary_Gene_B well_characterized
#> 1 E2F1 is a member of the E2F family of transcription.                  7
#> 2                         Keratin 31 is a cytokeratin.                  7
#>   Biology Cancer Tumor drug_targets clinical_biomarkers
#> 1       8      3     4            5                   6
#> 2       8      2     3            1                   5

Jupyter notebook

# test azure
import sys, time, json
# from openai import OpenAI
import pandas as pd
import re
from glob import glob
# define key dictionary for each question for concrete formatting

mainkey_qFLU = {'summary':'Summary',
                'well characterized':'well_characterized',
                'biology':'Biology',
                'cancer':'Cancer',
                'tumor':'tumor',
                'drug targets':'drug_targets',
                'clinical markers':'clinical_markers'
               }


def find_keyword(sline, keyLib):
    for mk in keyLib.keys():

        # Regular expression pattern to find all combinations of the letters in 'gene'
        pattern = r'{}'.format(mk)

        # Finding all matches in the sample text
        matches = re.findall(pattern, sline, re.IGNORECASE)
        if matches:
            return keyLib[mk]
        else:
            next
    return False



def convert_stringtodict(lines, keylib):
    dict_line = {}
    for k in lines:
        ksplit = k.split(":")

        if len(ksplit) ==2:
            key_tmp = find_keyword(ksplit[0].strip("'|"|', |").strip(), keylib)
            val_tmp = ksplit[1].strip("'|"|',|{|} ").strip()
            if key_tmp and val_tmp:
                if key_tmp == "Summary":
                    dict_line[key_tmp] = val_tmp
                else:
                    try:
                        dict_line[key_tmp] = float(val_tmp)
                    except:
                        dict_line[key_tmp] = 0
            else:
                next
                # print ("error in ", ksplit)

    return dict_line

def get_qScore(q, question_dict, subkey):
    q_scores = []
    for gname in q.keys():
        for model in q[gname].keys():
            kx = convert_stringtodict(q[gname][model],question_dict)
            kx.update({'gene_name':gname,
                        'runID':model,
                        "model_version":model.lstrip("datasvc-openai-testinglab-poc-").split("_")[0],
                        "subjectKey":subkey,})
            q_scores.append(kx)
    print (len(q_scores))
    return pd.DataFrame(q_scores)


q1_DF = pd.DataFrame()
for k in glob("/Users/Documents/Projects/json_files/*.json"):
    subkey_name = "-".join(k.split("/")[-1].split("_")[1:3])
    print (k)
    q1 = json.load(open(k,'r'))
    tmpDF = get_qScore(q1,mainkey_qFLU, subkey=subkey_name)
    q1_DF = pd.concat([q1_DF,tmpDF],axis=0)

print (q1_DF.shape)

q1_DF.to_csv("./Score_parse_1_2.csv")

``





I tried using the jupyter notebook [code mentioned above] that works on extracting one gene_name along with summary, and scores, but I am looking to extract two gene_name columns. 

2

Answers


  1. Chosen as BEST ANSWER

    @JonSG, thank you very much for the earlier suggestions, and inputs. I managed to pool all *.json. Here is the code:

    import csv
    import json
    import glob
    
    
    fieldnames_mapping = {
        "gene_name_A": lambda keys, values: keys[0],
        "gene_name_B": lambda keys, values: keys[1],
        # Check for 'brief_summary' first, then 'summary', otherwise use 'Not Available'
        "Summary_Gene_A": lambda keys, values: values[0].get("brief_summary", values[0].get("summary", 'Not Available')),
        "Summary_Gene_B": lambda keys, values: values[1].get("brief_summary", values[1].get("summary", 'Not Available')),
        "well_characterized": lambda keys, values: values[2],
        "biology": lambda keys, values: values[3],
        "cancer": lambda keys, values: values[4],
        "tumor": lambda keys, values: values[5],
        "drug_targets": lambda keys, values: values[6],
        "clinical_biomarkers": lambda keys, values: values[7],
    }
    
    
    
    def reshape(row, fieldnames_mapping):
        keys = list(row.keys())
        values = list(row.values())
        return {key: fn(keys, values) for key, fn in fieldnames_mapping.items()}
    
    # Path to your JSON files
    json_files_path = "/Users/Documents/Projects/json_files/*.json"
    
    # List all JSON files in the directory
    json_files = glob.glob(json_files_path)
    
    # Open the CSV file for writing
    with open("results.csv", "w", encoding="utf-8", newline="") as file_out:
        writer = csv.DictWriter(file_out, fieldnames=list(fieldnames_mapping))
        writer.writeheader()
        
        # Loop through each JSON file
        for json_file in json_files:
            with open(json_file, 'r', encoding='utf-8') as file_in:
                # Load the JSON content
                data = json.load(file_in)
                
                # Check if the JSON content is a list or a single dict
                if isinstance(data, list):
                    # If it's a list, loop through each item
                    for row in data:
                        writer.writerow(reshape(row, fieldnames_mapping))
                else:
                    # If it's a single dict, process it directly
                    writer.writerow(reshape(data, fieldnames_mapping))
    
    

  2. I would approach this by defining a dictionary of keys and lambda methods to calculate the value of the given key. Each lambda will expect to be passed the list of keys and the list of values from the original row/dictionary and it will do it’s thing

    In this case:

    fieldnames_mapping = {
        "gene_name_A": lambda keys, values: keys[0],
        "gene_name_B": lambda keys, values: keys[1],
        "Summary_Gene_A": lambda keys, values: values[0]["brief_summary"],
        "Summary_Gene_B": lambda keys, values: values[1]["brief_summary"],
        "well_characterized": lambda keys, values: values[2],
        "biology": lambda keys, values: values[3],
        "cancer": lambda keys, values: values[4],
        "tumor": lambda keys, values: values[5],
        "drug_targets": lambda keys, values: values[6],
        "clinical_biomarkers": lambda keys, values: values[7],
    }
    

    and the given data:

    data = [
        {
            "KAT2A": {
                "official_gene_symbol": "KAT2A",
                "brief_summary": "KAT2A, also known as GCN5, is a histone acetyltransferase that plays a key role."
            },
            "E2F1": {
                "official_gene_symbol": "E2F1",
                "brief_summary": "E2F1 is a member of the E2F family of transcription factors."
            },
            "The genes encode molecules that are well characterized protein-protein pairs": 7,
            "The protein-protein pair is relevant to biology": 8,
            "The protein-protein pair is relevant to cancer": 3,
            "The protein-protein pair is relevant to interactions between tumor": 4,
            "One or both of the genes are known drug targets": 5,
            "One or both of the genes are known clinical marker": 6
        }
    ]
    

    With that out of the way, the rest is fairly simple:

    import csv
    
    def reshape(row, fieldnames_mapping):
        keys = list(row.keys())
        values = list(row.values())
        return {key: fn(keys, values) for key, fn in fieldnames_mapping.items()}
    
    ##---------------
    ## generate a CSV
    ##---------------
    with open("results.csv", "w", encoding="utf-8", newline="") as file_out:
        writer = csv.DictWriter(file_out, fieldnames=list(fieldnames_mapping))
        writer.writeheader()
        for row in data:
            writer.writerow(reshape(row, fieldnames_mapping))
    ##---------------
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search