skip to Main Content

I have this JSON:

{
    "journal.pbio.0050304.xml": {
        "sentence": [
            [
                {"entity_group": "literal", "score": 0.9961686, "word": "The anterior\u2013posterior (A\u2013P) axis ", "start": 0, "end": 299}
            ],
            [
                {"entity_group": "literal", "score": 0.9932352, "word": "RA, Fgfs, and Wnts are all produced at the posterior of the embryo, and might therefore be expected to form posterior-", "start": 0, "end": 118},
                {"entity_group": "metaphoric", "score": 0.874372, "word": "to", "start": 118, "end": 120},
                {"entity_group": "literal", "score": 0.99049604, "word": "-anterior gradients (for Fgf8", "start": 120, "end": 149},
                {"entity_group": "metaphoric", "score": 0.9993481, "word": "this", "start": 150, "end": 154}
            ]
        ]
    },
    "journal.pbio.0050093.xml": {
        "sentence": [
            [
                {"entity_group": "literal", "score": 0.9961686, "word": "The anterior\u2013posterior (A\u2013P) axis ", "start": 0, "end": 299}
            ]
        ]
    }
}

I would like to extract only the entity group, start, and end of each entry and convert them into tuples, like this:
[(0, 299, 'literal'), (186, 194, 'literal'), (196, 199, 'metaphoric')], and so on. How can I do it?

2

Answers


  1. Something like this?

    from pprint import pprint

    # Sample input: maps a file name to a dict whose "sentence" key holds a list
    # of sentences, each sentence being a list of entity dicts.
    data = {
        "journal.pbio.0050304.xml": {
            "sentence": [
                [
                    {"entity_group": "literal", "score": 0.9961686, "word": "The anterioru2013posterior (Au2013P) axis ", "start": 0, "end": 299}
                ],
                [
                    {"entity_group": "literal", "score": 0.9932352, "word": "RA, Fgfs, and Wnts are all produced at the posterior of the embryo, and might therefore be expected to form posterior-", "start": 0, "end": 118},
                    {"entity_group": "metaphoric", "score": 0.874372, "word": "to", "start": 118, "end": 120},
                    {"entity_group": "literal", "score": 0.99049604, "word": "-anterior gradients (for Fgf8", "start": 120, "end": 149},
                    {"entity_group": "metaphoric", "score": 0.9993481, "word": "this", "start": 150, "end": 154}
                ]
            ]
        },
        "journal.pbio.0050093.xml": {
            "sentence": [
                [
                    {"entity_group": "literal", "score": 0.9961686, "word": "The anterioru2013posterior (Au2013P) axis ", "start": 0, "end": 299}
                ]
            ]
        }
    }


    # Replace every entity dict with a (start, end, entity_group) tuple while
    # keeping the per-file / per-sentence nesting of `data`. Each sentence is
    # built directly with a comprehension, so no throwaway copy of the list is
    # made and no index bookkeeping is needed. A missing key raises KeyError.
    for entry in data.values():
        entry["sentence"] = [
            [(span["start"], span["end"], span["entity_group"]) for span in sentence]
            for sentence in entry["sentence"]
        ]

    pprint(data)
    

    Result:

    {'journal.pbio.0050093.xml': {'sentence': [[(0, 299, 'literal')]]},
     'journal.pbio.0050304.xml': {'sentence': [[(0, 299, 'literal')],
                                               [(0, 118, 'literal'),
                                                (118, 120, 'metaphoric'),
                                                (120, 149, 'literal'),
                                                (150, 154, 'metaphoric')]]}}
    
    Login or Signup to reply.
  2. You just need to iterate over the dictionary values first, then over the lists and sub-lists, as follows:

    # Sample input: maps a file name to a dict whose "sentence" key holds a list
    # of sentences, each sentence being a list of entity dicts.
    data = {
        "journal.pbio.0050304.xml": {
            "sentence": [
                [
                    {"entity_group": "literal", "score": 0.9961686, "word": "The anterioru2013posterior (Au2013P) axis ", "start": 0, "end": 299}
                ],
                [
                    {"entity_group": "literal", "score": 0.9932352, "word": "RA, Fgfs, and Wnts are all produced at the posterior of the embryo, and might therefore be expected to form posterior-", "start": 0, "end": 118},
                    {"entity_group": "metaphoric", "score": 0.874372, "word": "to", "start": 118, "end": 120},
                    {"entity_group": "literal", "score": 0.99049604, "word": "-anterior gradients (for Fgf8", "start": 120, "end": 149},
                    {"entity_group": "metaphoric", "score": 0.9993481, "word": "this", "start": 150, "end": 154}
                ]
            ]
        },
        "journal.pbio.0050093.xml": {
            "sentence": [
                [
                    {"entity_group": "literal", "score": 0.9961686, "word": "The anterioru2013posterior (Au2013P) axis ", "start": 0, "end": 299}
                ]
            ]
        }
    }

    # Flatten files -> sentences -> entities into one list of
    # (start, end, entity_group) tuples. The .get() lookups keep the traversal
    # lenient: a file without 'sentence' contributes nothing, and a missing
    # entity field becomes None instead of raising.
    output = [
        (entity.get('start'), entity.get('end'), entity.get('entity_group'))
        for file_entry in data.values()
        for sentence in file_entry.get('sentence', [])
        for entity in sentence
    ]

    print(output)
    

    Output:

    [(0, 299, 'literal'), (0, 118, 'literal'), (118, 120, 'metaphoric'), (120, 149, 'literal'), (150, 154, 'metaphoric'), (0, 299, 'literal')]
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search