skip to Main Content

I am trying to read through a large JSONL file, anywhere from a couple hundred lines up to thousands or possibly a million lines. Below is a sample of the data.

{"id":"gid://shopify/Product/1921569226808"}
{"id":"gid://shopify/ProductVariant/19435458986040","__parentId":"gid://shopify/Product/1921569226808"}
{"id":"gid://shopify/Product/1921569259576"}
{"id":"gid://shopify/ProductVariant/19435459018808","__parentId":"gid://shopify/Product/1921569259576"}
{"id":"gid://shopify/Product/1921569292344"}
{"id":"gid://shopify/ProductVariant/19435459051576","__parentId":"gid://shopify/Product/1921569292344"}
{"id":"gid://shopify/Product/1921569325112"}
{"id":"gid://shopify/ProductVariant/19435459084344","__parentId":"gid://shopify/Product/1921569325112"}
{"id":"gid://shopify/Product/1921569357880"}
{"id":"gid://shopify/ProductVariant/19435459117112","__parentId":"gid://shopify/Product/1921569357880"}
{"id":"gid://shopify/ProductVariant/19435458986123","__parentId":"gid://shopify/Product/1921569226808"}

So each line is a JSON object: either a Product, or a child of a Product identified by `__parentId`. Given that the data may contain thousands of lines, what's the best way to read through it and return a regular JSON object, like this?

{
  "id": "gid://shopify/Product/1921569226808",
  "childrens": [
    {"id":"gid://shopify/ProductImage/20771195224224","__parentId":"gid://shopify/Product/1921569226808"},
    {"id":"gid://shopify/ProductImage/20771195344224","__parentId":"gid://shopify/Product/1921569226808"},
    {"id":"gid://shopify/ProductImage/20771329344224","__parentId":"gid://shopify/Product/1921569226808"}
  ]
}

The data is coming back from Shopify, and they advise the following:

Because nested connections are no longer nested in the response data
structure, the results contain the __parentId field, which is a
reference to an object’s parent. This field doesn’t exist in the API
schema, so you can’t explicitly query it. It’s included automatically
in bulk operation result.

Read the JSONL file in reverse. Reading the JSONL file in reverse makes
it easier to group child nodes and avoids missing any that appear
after the parent node. For example, while collecting variants, there
won’t be more variants further up the file when you come to the
product that the variants belong to. After you download the JSONL
file, read it in reverse, and then parse it so that any child nodes
are tracked before the parent node is discovered.

You can read more about all of this here.

2

Answers


  1. Here’s a technique that:

    1. forms an object with properties of the parent ids
    2. converts that object to an array

    (input lines converted to an array for simplicity)

    const lines = [
      { "id": "gid://shopify/Product/1921569226808" },
      {  "id": "gid://shopify/ProductVariant/19435458986040",  "__parentId": "gid://shopify/Product/1921569226808" },
      { "id": "gid://shopify/Product/1921569259576" },
      { "id": "gid://shopify/ProductVariant/19435459018808", "__parentId": "gid://shopify/Product/1921569259576" },
      { "id": "gid://shopify/Product/1921569292344" },
      { "id": "gid://shopify/ProductVariant/19435459051576", "__parentId": "gid://shopify/Product/1921569292344" },
      { "id": "gid://shopify/Product/1921569325112" },
      { "id": "gid://shopify/ProductVariant/19435459084344", "__parentId": "gid://shopify/Product/1921569325112" },
      { "id": "gid://shopify/Product/1921569357880" },
      { "id": "gid://shopify/ProductVariant/19435459117112", "__parentId": "gid://shopify/Product/1921569357880" },
      { "id": "gid://shopify/ProductVariant/19435458986123", "__parentId": "gid://shopify/Product/1921569226808" }
    ];

    /**
     * Groups flat records into parents keyed by id, each holding its children.
     *
     * A record without `__parentId` is a parent; any other record is a child
     * of the parent named by `__parentId`. Records may arrive in any order:
     * a parent seen after its children keeps the children already collected.
     *
     * @param {Array<{id: string, __parentId?: string}>} records - Parsed records.
     * @returns {Object<string, {id: string, childrens: Array<Object>}>}
     *   Parents keyed by their id.
     */
    function groupByParent(records) {
      const parents = {};

      // Returns the entry for `parentId`, creating it only when absent so an
      // entry that already holds children is never replaced by an empty one.
      const ensureParent = (parentId) => {
        if (typeof parents[parentId] === 'undefined') {
          parents[parentId] = {
            id: parentId,
            childrens: []
          };
        }
        return parents[parentId];
      };

      for (const record of records) {
        const {id, __parentId} = record;

        // if there is no `__parentId`, this is a parent
        if (typeof __parentId === 'undefined') {
          ensureParent(id);
          continue;
        }

        // this is a child, add it to its (possibly not yet seen) parent
        ensureParent(__parentId).childrens.push(record);
      }

      return parents;
    }

    // form object keyed to parent ids
    const result = groupByParent(lines);
    
    // flatten the id-keyed lookup into a plain list of parent entries
    const resultArray = Object.values(result);

    // show the grouped data as pretty-printed JSON in the page's first <pre>
    const pre = document.querySelector('pre');
    pre.innerText = `resultArray: ${JSON.stringify(resultArray, null, 2)}`;
    <pre></pre>
    Login or Signup to reply.
  2. Consider using streams so that you don’t have to load the entire file in memory.

    You can use readline (a native module) to process each line individually.

    I took the line processing part from @terrymorse https://stackoverflow.com/a/65484413/14793527

    const readline = require('readline');
    const fs = require('fs');
    
    // Parents collected so far, keyed by id; each entry is `{ id, childrens }`.
    let res = {};

    /**
     * Folds one JSONL record into `res`.
     *
     * A record without `__parentId` is a parent; any other record is a child
     * of the parent named by `__parentId`. Records may arrive in any order:
     * a parent seen after its children keeps the children already collected.
     *
     * @param {string|Object} line - One raw line of JSONL text, or an
     *   already-parsed record. Blank lines are ignored.
     * @returns {Object} The accumulator `res`.
     * @throws {SyntaxError} If a non-blank text line is not valid JSON.
     */
    function processLine(line) {
      let record = line;

      // readline hands over raw text, so it must be parsed before use
      if (typeof line === 'string') {
        if (line.trim() === '') {
          return res;
        }
        record = JSON.parse(line);
      }

      const {id, __parentId} = record;

      // if there is no `__parentId`, this record is itself the parent
      const isParent = typeof __parentId === 'undefined';
      const parentId = isParent ? id : __parentId;

      // create the parent entry only when absent, so a parent line that shows
      // up after its children does not wipe the children already collected
      if (typeof res[parentId] === 'undefined') {
        res[parentId] = {
          id: parentId,
          childrens: []
        };
      }

      // add child to parent's children
      if (!isParent) {
        res[parentId].childrens.push(record);
      }
      return res;
    }
    
    // Stream the file line by line so it never has to fit in memory at once.
    // No `output` stream is given: the interface is only used for reading, and
    // attaching stdout could echo the file's contents when stdout is a terminal.
    const readInterface = readline.createInterface({
        input: fs.createReadStream('large.jsonl'),
        // treat every "\r\n" as a single line break, however slowly it arrives
        crlfDelay: Infinity
    });

    // each line of the file is handed to processLine as it is read
    readInterface.on('line', processLine);

    // 'close' fires once the input has ended; by then `res` holds every parent
    readInterface.on('close', function() {
        const resultArray = Object.values(res);
        console.log(resultArray);
    });
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search