skip to Main Content

In my mongodb collection documents are stored in the following format:

{ "_id" : ObjectId("62XXXXXX"), "res" : 12, ... }
{ "_id" : ObjectId("63XXXXXX"), "res" : 23, ... }
{ "_id" : ObjectId("64XXXXXX"), "res" : 78, ... }
...

I need to extract id’s for the document for which the value of "res" is outlier (i.e. value < Q1 – 1.5 * IQR or value > Q3 + 1.5 * IQR (Q1, Q3 are percentiles)). I have done this using pandas functionality by retrieving all documents from the collection, which may become slow if the number of documents in collection become too big.

Is there a way to do this using mongodb aggregation pipeline (or just calculating percentiles)?

2

Answers


  1. If I understand how you want to retrieve outliers, here’s one way you might be able to do it.

    db.collection.aggregate([
      { // partition res into quartiles
        "$bucketAuto": {
          "groupBy": "$res",
          "buckets": 4
        }
      },
      { // get the max of each quartile
        "$group": {
          "_id": "$_id.max"
        }
      },
      { // sort the quartile maxs
        "$sort": {
          "_id": 1
        }
      },
      { // put sorted quartile maxs into array
        "$group": {
          "_id": null,
          "maxs": {"$push": "$_id"}
        }
      },
      { // assign Q1 and Q3
        "$project": {
          "_id": 0,
          "q1": {"$arrayElemAt": ["$maxs", 0]},
          "q3": {"$arrayElemAt": ["$maxs", 2]}
        }
      },
      { // set IQR
        "$set": {
          "iqr": {
            "$subtract": ["$q3", "$q1"]
          }
        }
      },
      { // assign upper/lower outlier thresholds
        "$project": {
          "outlierThresholdLower": {
            "$subtract": [
              "$q1",
              {"$multiply": ["$iqr", 1.5]}
            ]
          },
          "outlierThresholdUpper": {
            "$add": [
              "$q3",
              {"$multiply": ["$iqr", 1.5]}
            ]
          }
        }
      },
      { // get outlier _id's
        "$lookup": {
          "from": "collection",
          "as": "outliers",
          "let": {
            "oTL": "$outlierThresholdLower",
            "oTU": "$outlierThresholdUpper"
          },
          "pipeline": [
            {
              "$match": {
                "$expr": {
                  "$or": [
                    {"$lt": ["$res", "$$oTL"]},
                    {"$gt": ["$res", "$$oTU"]}
                  ]
                }
              }
            },
            {
              "$project": {
                "_id": 1
              }
            }
          ]
        }
      }
    ])
    

    Try it on mongoplayground.net.

    Login or Signup to reply.
  2. One more option based on @rickhg12hs’s answer, is to use $setWindowFields:

    db.collection.aggregate([
      {$setWindowFields: {
          sortBy: {res: 1},
          output: {
            totalCount: {$count: {}},
            index: {$sum: 1, window: {documents: ["unbounded", "current"]}}
          }
        }
      },
      {$match: {
          $expr: {$lte: [
              {$abs: {$subtract: [
                    {$mod: [
                        {$multiply: [
                            {$add: ["$index", {$round: {$divide: ["$totalCount", 4]}}]}, 2]},
                        "$totalCount"
                      ]}, 0]}
              }, 1]}
      }},
      {$group: {_id: null, res: {$push: "$res"}}},
      {$project: {_id: 0, q1: {$first: "$res"}, q3: {$last: "$res"},
          iqr: {"$subtract": [{$last: "$res"}, {$first: "$res"}]}
      }},
      {$project: {
          outlierThresholdLower: {$subtract: ["$q1", {$multiply: ["$iqr", 1.5]}]},
          outlierThresholdUpper: {$add: ["$q3", {$multiply: ["$iqr", 1.5]}]}
        }
      },
      {$lookup: {
          from: "collection",
          as: "outliers",
          let: {oTL: "$outlierThresholdLower", oTU: "$outlierThresholdUpper"},
          pipeline: [
            {$match: {$expr: {$or: [{$lt: ["$res", "$$oTL"]}, {$gt: ["$res", "$$oTU"]}]}}},
            {$project: {_id: 1}}
          ]
        }
      }
    ])
    

    See how it works on the playground example

    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search