M121: Chapter 4: Core Aggregation - Multidimensional Grouping

The MongoDB Aggregation Framework(Chapter 4 of 6)

Facets: Introduction

Facets: Single Facet Query

db.companies.findOne()
db.companies.createIndex({'description': 'text', 'overview': 'text'})

db.companies.aggregate([{$match: {$text :{ $search: 'network' }}}])

// $sortByCount
mongos> db.companies.aggregate([
...   {"$match": { "$text": {"$search": "network"}  }  },
...   {"$sortByCount": "$category_code"}] )
{ "_id" : "web", "count" : 788 }
{ "_id" : "software", "count" : 463 }
{ "_id" : "network_hosting", "count" : 306 }
{ "_id" : "games_video", "count" : 276 }
{ "_id" : "mobile", "count" : 264 }
{ "_id" : "advertising", "count" : 205 }
{ "_id" : "enterprise", "count" : 151 }
{ "_id" : "other", "count" : 150 }
{ "_id" : null, "count" : 138 }
{ "_id" : "ecommerce", "count" : 101 }
{ "_id" : "hardware", "count" : 89 }
{ "_id" : "consulting", "count" : 88 }
{ "_id" : "public_relations", "count" : 86 }
{ "_id" : "security", "count" : 55 }
{ "_id" : "semiconductor", "count" : 41 }
{ "_id" : "search", "count" : 34 }
{ "_id" : "social", "count" : 30 }
{ "_id" : "cleantech", "count" : 17 }
{ "_id" : "news", "count" : 14 }
{ "_id" : "biotech", "count" : 13 }
Type "it" for more

mongos> db.companies.aggregate([
...   {"$match": { "$text": {"$search": "network"}  }  } ,
...   {"$unwind": "$offices"},
...   {"$match": { "offices.city": {"$ne": ""}  }}   ,
...   {"$sortByCount": "$offices.city"}] )

{ "_id" : "San Francisco", "count" : 245 }
{ "_id" : "New York", "count" : 218 }
{ "_id" : "London", "count" : 133 }
{ "_id" : "Los Angeles", "count" : 66 }
{ "_id" : "Palo Alto", "count" : 62 }
{ "_id" : "Sunnyvale", "count" : 58 }
{ "_id" : "San Jose", "count" : 53 }
{ "_id" : "Santa Clara", "count" : 49 }
{ "_id" : "Seattle", "count" : 44 }
{ "_id" : "Mountain View", "count" : 41 }
{ "_id" : "Paris", "count" : 41 }
{ "_id" : "Chicago", "count" : 36 }
{ "_id" : "Austin", "count" : 34 }
{ "_id" : "Boston", "count" : 33 }
{ "_id" : "San Diego", "count" : 33 }
{ "_id" : "Toronto", "count" : 32 }
{ "_id" : "Cambridge", "count" : 31 }
{ "_id" : "Santa Monica", "count" : 28 }
{ "_id" : "Atlanta", "count" : 24 }
{ "_id" : null, "count" : 23 }
Type "it" for more

Single query facets are supported by the new aggregation pipeline stage $sortByCount.

Facets: Manual Buckets

Histogram



// create manual buckets using $ bucket

db.companies.aggregate( [
  { "$match": {"founded_year": {"$gt": 1980}, "number_of_employees": {"$ne": null}}  },
  {"$bucket": {
     "groupBy": "$number_of_employees",
     "boundaries": [ 0, 20, 50, 100, 500, 1000, Infinity  ]}
}] )


// reproduce error message for non matching documents

db.coll.insert({ x: "a" });
db.coll.aggregate([{ $bucket: {groupBy: "$x", boundaries: [0, 50, 100]}}])


// set `default` option to collect documents that do not match boundaries

db.companies.aggregate( [
  { "$match": {"founded_year": {"$gt": 1980}}},
  { "$bucket": {
    "groupBy": "$number_of_employees",
    "boundaries": [ 0, 20, 50, 100, 500, 1000, Infinity  ],
    "default": "Other" }
}] )


// reproduce error message for inconsitent boundaries datatype

db.coll.aggregate([{ $bucket: {groupBy: "$x", boundaries: ["a", "b", 100]}}])


// set `output` option for $bucket stage

db.companies.aggregate([
  { "$match":
    {"founded_year": {"$gt": 1980}}
  },
  { "$bucket": {
      "groupBy": "$number_of_employees",
      "boundaries": [ 0, 20, 50, 100, 500, 1000, Infinity  ],
      "default": "Other",
      "output": {
        "total": {"$sum":1},
        "average": {"$avg": "$number_of_employees" },
        "categories": {"$addToSet": "$category_code"}
      }
    }
  }
]
)

Facets: Auto Buckets

#!/bin/sh

// generate buckets automatically with $bucktAuto stage
db.companies.aggregate([
  { "$match": {"offices.city": "New York" }},
  {"$bucketAuto": {
    "groupBy": "$founded_year",
    "buckets": 5
}}])


//set `output` option for $bucketAuto
db.companies.aggregate([
  { "$match": {"offices.city": "New York" }},
  {"$bucketAuto": {
    "groupBy": "$founded_year",
    "buckets": 5,
    "output": {
        "total": {"$sum":1},
        "average": {"$avg": "$number_of_employees" }  }}}
])


// default $buckeAuto behaviour

for(i=1; i <= 1000; i++) {  db.series.insert( {_id: i}  ) };
db.series.aggregate(
  {$bucketAuto:
    {groupBy: "$_id", buckets: 5 }
})


// generate automatic buckets using granularity numerical series R20
db.series.aggregate(
  {$bucketAuto:
    {groupBy: "$_id", buckets: 5 , granularity: "R20"}
  })

Auto Bucketing will, given a number of buckets, try to distribute documents evenly across buckets. Auto Bucketing will adhere bucket boundaries to a numerical series set by the granularity option

Facets: Multiple Facets


//  render several different facets using $facet stage
db.companies.aggregate( [
    {"$match": { "$text": {"$search": "Databases"} } },
    { "$facet": {
      "Categories": [{"$sortByCount": "$category_code"}],
      "Employees": [
        { "$match": {"founded_year": {"$gt": 1980}}},
        {"$bucket": {
          "groupBy": "$number_of_employees",
          "boundaries": [ 0, 20, 50, 100, 500, 1000, Infinity  ],
          "default": "Other"
        }}],
      "Founded": [
        { "$match": {"offices.city": "New York" }},
        {"$bucketAuto": {
          "groupBy": "$founded_year",
          "buckets": 5   }
        }
      ]
  }}]).pretty()
  • The $facet stage allows several sub-pipelines to be executed to produce multiple facets.

  • The $facet stage allows the applications to generate several different facets with one single database request. The $facet stage allows other stages to be included on the sub-pipelines, except for:

  • $facet

  • $out

  • $geoNear

  • $indexStats

  • $collStats Also, the sub-pipelines, defined for each individual facet, cannot share their output accross other parallel facets. Each sub-pipeline will receive the same input data set but does not share the result dataset with parallel facets.

Lab - $facets

Problem:

How many movies are in both the top ten highest rated movies according to the imdb.rating and the metacritic fields? We should get these results with exactly one access to the database.

db.movies.aggregate([
	{$project:{"imdb.rating":1, metacritic:1}}

])

db.movies.aggregate([
  {
    $match: {
      metacritic: { $gte: 0 },
      "imdb.rating": { $gte: 0 }
    }
  },
  {
    $project: {
      _id: 0,
      metacritic: 1,
      imdb: 1,
      title: 1
    }
  },
  {
    $facet: {
      top_metacritic: [
        {
          $sort: {
            metacritic: -1,
            title: 1
          }
        },
        {
          $limit: 10
        },
        {
          $project: {
            title: 1
          }
        }
      ],
      top_imdb: [
        {
          $sort: {
            "imdb.rating": -1,
            title: 1
          }
        },
        {
          $limit: 10
        },
        {
          $project: {
            title: 1
          }
        }
      ]
    }
  },
  {
    $project: {
      movies_in_both: {
        $setIntersection: ["$top_metacritic", "$top_imdb"]
      }
    }
  }
])

The $sortByCount Stage

// performing a group followed by a sort to rank occurence
db.movies.aggregate([
  {
    "$group": {
      "_id": "$imdb.rating",
      "count": { "$sum": 1 }
    }
  },
  {
    "$sort": { "count": -1 }
  }
])

// sortByCount is equivalent to the above. In fact, if you execute this pipeline
// with { explain: true } you will see that it is transformed to the above!
db.movies.aggregate([
  {
    "$sortByCount": "$imdb.rating"
  }
])