skip to Main Content

Here is my input file.xml :

<?xml version="1.0" encoding="UTF-8" ?>
<project name="so_project" id="Project-9999">
    <schema name="database1">
        <table name="table1">
            <column name="foo" type="int"/>
            <column name="bar" type="string"/>
            <column name="details_resolution" type="array[object]">
                <column name="timestamp" type="timestamp"/>
                <column name="user_id" type="string"/>
                <column name="user_name" type="string"/>
            </column>
            <column name="details_closure" type="array[object]">
                <column name="timestamp" type="timestamp"/>
                <column name="auto_closure" type="bool"/>
            </column>
        </table>
    </schema>
    <schema name="database2">
        <table name="table1">
            <column name="foo" type="int"/>
            <column name="bar" type="string"/>
            <column name="details" type="array[object]">
                <column name="timestamp" type="timestamp"/>
                <column name="value" type="float"/>
            </column>
        </table>
    </schema>
</project>

.. and I’m trying to make this classical nested dict :

{
    "database1": {
        "table1": {
            "foo": "int",
            "bar": "string",
            "details_resolution": {
                "timestamp": "timestamp",
                "user_id": "string",
                "user_name": "string"
            },
            "details_closure": {
                "timestamp": "timestamp",
                "auto_closure": "bool"
            }
        }
    },
    "database2": {
        "table1": {
            "foo": "int",
            "bar": "string",
            "details": {
                "timestamp": "timestamp",
                "value": "float"
            }
        }
    }
}

PS : Each database can eventually have more than one table.

I tried some AI codes but none of them gave me the expected result..
I’m sorry guys to not being able to show my attempts !

SO, any help would be greately appreciated.

3

Answers


  1. You can use xml.etree.ElementTree

    import xml.etree.ElementTree as ET
    
    def parse_column(column_elem):
        column_data = {}
        column_data['name'] = column_elem.get('name')
        column_data['type'] = column_elem.get('type')
        return column_data
    
    def parse_table(table_elem):
        table_data = {}
        table_name = table_elem.get('name')
        for column_elem in table_elem.findall('column'):
            column_data = parse_column(column_elem)
            table_data[column_data['name']] = column_data['type']
        return {table_name: table_data}
    
    def parse_schema(schema_elem):
        schema_data = {}
        schema_name = schema_elem.get('name')
        for table_elem in schema_elem.findall('table'):
            table_data = parse_table(table_elem)
            schema_data.update(table_data)
        return {schema_name: schema_data}
    
    def parse_xml(xml_content):
        root = ET.fromstring(xml_content)
        project_data = {}
        for schema_elem in root.findall('schema'):
            schema_data = parse_schema(schema_elem)
            project_data.update(schema_data)
        return project_data
    
    # Read XML file
    with open('file.xml', 'r') as f:
        xml_content = f.read()
    
    # Parse XML and generate nested dictionary
    nested_dict = parse_xml(xml_content)
    print(nested_dict)
    
    Login or Signup to reply.
  2. Solution using :

    from bs4 import BeautifulSoup
    
    with open("your_file.xml", "r") as f_in:
        soup = BeautifulSoup(f_in.read(), "xml")
    
    
    def parse_columns(t):
        out = {}
        for c in t.find_all("column", recursive=False):
            if c.find("column"):
                out[c["name"]] = parse_columns(c)
            else:
                out[c["name"]] = c["type"]
        return out
    
    
    def parse_schema(sch):
        out = {}
        for t in sch.select("table"):
            out[t["name"]] = parse_columns(t)
        return out
    
    
    out = {}
    for sch in soup.select("schema"):
        out[sch["name"]] = parse_schema(sch)
    
    print(out)
    

    Prints:

    {
        "database1": {
            "table1": {
                "foo": "int",
                "bar": "string",
                "details_resolution": {
                    "timestamp": "timestamp",
                    "user_id": "string",
                    "user_name": "string",
                },
                "details_closure": {"timestamp": "timestamp", "auto_closure": "bool"},
            }
        },
        "database2": {
            "table1": {
                "foo": "int",
                "bar": "string",
                "details": {"timestamp": "timestamp", "value": "float"},
            }
        },
    }
    
    Login or Signup to reply.
  3. The simple way to do it using xmltodict module.

    from pprint import pprint
    import xmltodict
    
    xml = '''
    <?xml version="1.0" encoding="UTF-8" ?>
    <project name="so_project" id="Project-9999">
        <schema name="database1">
            <table name="table1">
                <column name="foo" type="int"/>
                <column name="bar" type="string"/>
                <column name="details_resolution" type="array[object]">
                    <column name="timestamp" type="timestamp"/>
                    <column name="user_id" type="string"/>
                    <column name="user_name" type="string"/>
                </column>
                <column name="details_closure" type="array[object]">
                    <column name="timestamp" type="timestamp"/>
                    <column name="auto_closure" type="bool"/>
                </column>
            </table>
        </schema>
        <schema name="database2">
            <table name="table1">
                <column name="foo" type="int"/>
                <column name="bar" type="string"/>
                <column name="details" type="array[object]">
                    <column name="timestamp" type="timestamp"/>
                    <column name="value" type="float"/>
                </column>
            </table>
        </schema>
    </project>
    '''
    
    json_data = xmltodict.parse(xml.strip())
    pprint(json_data)
    

    Output:

    {'project': {'@id': 'Project-9999',
                 '@name': 'so_project',
                 'schema': [{'@name': 'database1',
                             'table': {'@name': 'table1',
                                       'column': [{'@name': 'foo', '@type': 'int'},
                                                  {'@name': 'bar',
                                                   '@type': 'string'},
                                                  {'@name': 'details_resolution',
                                                   '@type': 'array[object]',
                                                   'column': [{'@name': 'timestamp',
                                                               '@type': 'timestamp'},
                                                              {'@name': 'user_id',
                                                               '@type': 'string'},
                                                              {'@name': 'user_name',
                                                               '@type': 'string'}]},
                                                  {'@name': 'details_closure',
                                                   '@type': 'array[object]',
                                                   'column': [{'@name': 'timestamp',
                                                               '@type': 'timestamp'},
                                                              {'@name': 'auto_closure',
                                                               '@type': 'bool'}]}]}},
                            {'@name': 'database2',
                             'table': {'@name': 'table1',
                                       'column': [{'@name': 'foo', '@type': 'int'},
                                                  {'@name': 'bar',
                                                   '@type': 'string'},
                                                  {'@name': 'details',
                                                   '@type': 'array[object]',
                                                   'column': [{'@name': 'timestamp',
                                                               '@type': 'timestamp'},
                                                              {'@name': 'value',
                                                               '@type': 'float'}]}]}}]}}
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search