How to aggregate nested comments using mongodb - CentOS

PriyankaJalan
March 21, 2020
198 views
0 votes
2 Answers

I have one document for blogpost like this:

{
            "_id": "5d8051cdf0b1017da7bff23c",
            "description": "<p>will update soon</p>",
            "topic": "How to setup Kafka Cluster on CentOS",
            "comments": [
                {
                    "created_at": "2019-10-12T02:13:01.859Z",
                    "updated_at": "2019-10-12T02:13:01.859Z",
                    "edited": false,
                    "_id": "5da182aa013d12567e340a2d",
                    "message": "test"
                    "replies": [
                        {
                            "created_at": "2019-10-12T02:13:01.859Z",
                            "updated_at": "2019-10-12T02:13:01.859Z",
                            "edited": false,
                            "_id": "5da182aa013d12567e340a2a",
                            "message": "I am replying to first comment",
                            "commentator": "5daae8b8af029ec4533fe317"
                        },
                        {
                            "created_at": "2019-10-12T02:13:01.859Z",
                            "updated_at": "2019-10-12T02:13:01.859Z",
                            "edited": false,
                            "_id": "5da182aa013d12567e340a2c",
                            "message": "Helpful second Comment",
                            "commentator": "5d7f936544dac213e3f650ec"
                        }
                    ]
                }
            ]
        }
}

I want to do a nested aggregation using mongodb.

My query so far is

{ $unwind: { path: '$comments', preserveNullAndEmptyArrays: true } },
{ $unwind: { path: '$comments.replies', preserveNullAndEmptyArrays: true } },
{
  $lookup: {
    from: 'users',
    let: { thread_reply_commentator: '$comments.replies.commentator' },
    pipeline: [
       { $match: { $expr: { $eq: ['$_id', '$$thread_reply_commentator'] } } },
       { $project: AUTHOR_PROJECTION },
    ],
    as: 'comments.replies.commentator'
  }
},
{ $unwind: { path: '$comments.replies.commentator', preserveNullAndEmptyArrays: true } },
{ 
   $group: {
      _id: { _id: '$_id', comment: "$comments._id" },
      root: { $mergeObjects: '$$ROOT' },
      replies: { $push: '$comments.replies' }
   }
},
{
   $replaceRoot: {
      newRoot: {
        $mergeObjects: ['$$ROOT.replies']
      }
   }
}

And now my above query results in

{
  "_id": "5d8051cdf0b1017da7bff23c",
  "description": "<p>will update soon</p>",
  "topic": "How to setup Kafka Cluster on CentOS",
   "comments": [
      {
        "_id": "5da182aa013d12567e340a2d",
        "message": "test",
        "replies": {
           "created_at": "2019-10-12T02:13:01.859Z",
           "updated_at": "2019-10-12T02:13:01.859Z",
           "edited": false,
           "_id": "5da182aa013d12567e340a2a",
           "message": "I am replying to first comment",
           "commentator": { 
             "first_name":"test",
             "last_name":"test"
            }
         }
      },
      {
        "_id": "5da182aa013d12567e340a2d",
        "message": "test",
        "replies": {
           "created_at": "2019-10-12T02:13:01.859Z",
           "updated_at": "2019-10-12T02:13:01.859Z",
           "edited": false,
           "_id": "5da182aa013d12567e340a2a",
           "message": "Helpful second Comment",
           "commentator": { 
             "first_name":"test",
             "last_name":"test"
            }
         }
      }
    ]
}

But my desired result is:

{
  "_id": "5d8051cdf0b1017da7bff23c",
  "description": "<p>will update soon</p>",
  "topic": "How to setup Kafka Cluster on CentOS",
   "comments": [
      {
        "_id": "5da182aa013d12567e340a2d",
        "message": "test",
        "replies": [
          {
           "created_at": "2019-10-12T02:13:01.859Z",
           "updated_at": "2019-10-12T02:13:01.859Z",
           "edited": false,
           "_id": "5da182aa013d12567e340a2a",
           "message": "I am replying to first comment",
           "commentator": { 
             "first_name":"test",
             "last_name":"test"
            }
          },
          {
             "created_at": "2019-10-12T02:13:01.859Z",
             "updated_at": "2019-10-12T02:13:01.859Z",
             "edited": false,
             "_id": "5da182aa013d12567e340a2a",
             "message": "Helpful second Comment",
             "commentator": { 
               "first_name":"test",
               "last_name":"test"
              }
            }
         ]
      }
    ]
}

Please help how can I achieve this. I know it is easily possible using mongoose but I donot have access to the model Schema so I can only use mongodb and cannot use mongoose.

Tags: aggregation-framework mongodb

Answers

Try this one:

db.collection.aggregate([
  {
    $lookup: {
      from: "users",
      let: {
        thread_reply_commentator: {
          $reduce: {
            input: "$comments.replies.commentator",
            initialValue: [],
            in: {
              $concatArrays: [
                "$$value",
                "$$this"
              ]
            }
          }
        }
      },
      pipeline: [
        {
          $match: {
            $expr: {
              $in: [
                "$_id",
                "$$thread_reply_commentator"
              ]
            }
          }
        },
        { $project: AUTHOR_PROJECTION }
      ],
      as: "comentators"
    }
  },
  {
    $addFields: {
      comments: {
        $map: {
          input: "$comments",
          as: "comments",
          in: {
            $mergeObjects: [
              "$$comments",
              {
                replies: {
                  $map: {
                    input: "$$comments.replies",
                    as: "replies",
                    in: {
                      $mergeObjects: [
                        "$$replies",
                        {
                          commentator: {
                            $arrayElemAt: [
                              {
                                $filter: {
                                  input: "$comentators",
                                  cond: {
                                    $eq: [
                                      "$$this._id",
                                      "$$replies.commentator"
                                    ]
                                  }
                                }
                              },
                              0
                            ]
                          }
                        }
                      ]
                    }
                  }
                }
              }
            ]
          }
        }
      }
    }
  },
  {
    $project: {
      comentators: 0,
      "comments.replies.commentator._id": 0
    }
  }
])

MongoPlayground

Alternate approach: don’t make the DB do anything that you cannot do as easily or performantly including the transfer of material over the network.

We (ultimately) want to substitute the commentator ID in the replies with info about the commentator like first and last name. We see that the commentator ID is a unique ID into a users collection. At worst, every single reply will have a different commentator ID. This means that a whole bunch of first and last names will have to be looked up. This takes time in DB engine but there is no getting around that. The engine then marries the replies info with the users info and sends the doc across the wire. Some commentators, however, will comment on more than one reply. Depending on the sophistication of the DB engine, we may only need that commentator’s first and last name to be looked up and transmitted once for all replies in all docs. But the belief is that is probably not the case. It is use-case likely that doc-to-doc overlap of commentators is low. In addition, there is no getting around the fact that the first and last name — same though they may be in many docs — is sent over the wire again and again so there is no network/data performance gained through this approach. So given this setup, arguably the ideal query is simply this:

c = db.repliesColl.aggregate([
{$lookup: {
        from: "users",
        localField: "comments.replies.commentator",
        foreignField: "commentator",
        as: "z"
    }
}
]);

That’s it. What will this do? The “double dive” through two arrays (comments and replies) will in each doc create an array z containing the unique lookups for that doc. As stated before, if commentator C1 showed up over and over again, then yes that information is passed over the wire again and again (doc by doc) but make no mistake: there is no getting around the initial lookup to find it in the first place on the server side. And we assert that the “repeat rate” of commentators across docs is likely low.

So in practice, data like this (some extra fields eliminated for clarity):

{
     "_id": "5d8051cdf0b1017da7bff23c",
     "description": "<p>will update soon</p>",
     "topic": "How to setup Kafka Cluster on CentOS",
     "comments": [
     {
         "_id": "5da182aa013d12567e340a2d",
         "message": "original msg",
         "replies": [
           {"commentator": "C1", "message": "I am replying to first comment"},
           {"commentator": "C1", "message": "Forgot something"},
           {"commentator": "C2", "message": "Second comment"},
           {"commentator": "C2", "message": "Third comment, same guy"}
                     ]
     }
    ]
  }
}

will yield this output:

(everything in the doc above plus):
    "z" : [
        {
            "commentator" : "C1",
            "fname" : "Steve",
            "lname" : "Jones"
        },
        {
            "commentator" : "C2",
            "fname" : "Dan",
            "lname" : "Dare"
        }
    ]

So now, in the client side code, we can do this pseudocode:

  Map cidmap = new HashMap();                                                                                                        

  while(cursor.hasNext()) {                                                                                                          
    Document doc = cursor.next();                                                                                                 

    // Capture id->name mappings:                                                                                                    
    for(Map m : (List)doc.get("z")) {                                                                                                
      String cid = m.get("commentator");                                                                                             
        if(!cidmap.containsKey(cid)) {                                                                                               
          cidmap.set(cid, m));                                                                                                       
        }                                                                                                                            
    }                                                                                                                                

    // Process comments and, where necessary, substitute the value for cid.                                                           
  }

The attraction here is that you are eliminating much of the work from the central resource by doing it yourself. The more “unique” the set of commentators, the more efficient this scheme becomes. In summary we are balancing load on the DB engine to manipulate the data, expected cardinality of unique commentators, network transfer speed, and complexity of client-side “post processing” of the query.

Please signup or login to give your own answer.

Click here to cancel reply.

How to aggregate nested comments using mongodb – CentOS

Answers