M121: Chapter 6: Aggregation Performance and Pipeline Optimization

The MongoDB Aggregation Framework(Chapter 6 of 6)

Aggregation Performance

  • Index usage

  • Memory Constraints

  • Realtime processing(online application)

  • Batch processing(offline analytics)

Index usage

db.orders.aggregate([
	{$<operator>: <predicate>},
	{$<operator>: <predicate>},
	...
], {explain: true})
	

query optimizer.

Memory Constraints

  • Results are subject to 16MB document limit(use $limit, $project to reduce)
  • 100MB of RAM per stage
    • use indexes
    • db.orders.aggregrate([...],{allowDiskUse: true})

With regards to aggregation performance, which of the following are true? When $limit and $sort are close together a very performant top-k sort can be performed Transforming data in a pipeline stage prevents us from using indexes in the stages that follow

Aggregation Pipeline on a Sharded Cluster

Problem:

What operators will cause a merge stage on the primary shard for a database?

  • $lookup
  • $out

No FETCH stage is a covered query. Avoid needless $project

// an initial aggregatioin finding all movies where the title begins
// with a vowel. Notice the $project stage that will prevent a covered
// query!
db.movies.aggregate([
  {
    $match: {
      title: /^[aeiou]/i
    }
  },
  {
    $project: {
      title_size: { $size: { $split: ["$title", " "] } }
    }
  },
  {
    $group: {
      _id: "$title_size",
      count: { $sum: 1 }
    }
  },
  {
    $sort: { count: -1 }
  }
])

// showing the query isn't covered
db.movies.aggregate(
  [
    {
      $match: {
        title: /^[aeiou]/i
      }
    },
    {
      $project: {
        title_size: { $size: { $split: ["$title", " "] } }
      }
    },
    {
      $group: {
        _id: "$title_size",
        count: { $sum: 1 }
      }
    },
    {
      $sort: { count: -1 }
    }
  ],
  { explain: true }
)

// this is better, we are projecting away the _id field. But this seems like
// a lot of manual work...
db.movies.aggregate([
  {
    $match: {
      title: /^[aeiou]/i
    }
  },
  {
    $project: {
      _id: 0,
      title_size: { $size: { $split: ["$title", " "] } }
    }
  },
  {
    $group: {
      _id: "$title_size",
      count: { $sum: 1 }
    }
  },
  {
    $sort: { count: -1 }
  }
])

// verifying that it is a covered query
db.movies.aggregate(
  [
    {
      $match: {
        title: /^[aeiou]/i
      }
    },
    {
      $project: {
        _id: 0,
        title_size: { $size: { $split: ["$title", " "] } }
      }
    },
    {
      $group: {
        _id: "$title_size",
        count: { $sum: 1 }
      }
    },
    {
      $sort: { count: -1 }
    }
  ],
  { explain: true }
)

// can we... do this? Yes, yes we can.
db.movies.aggregate([
  {
    $match: {
      title: /^[aeiou]/i
    }
  },
  {
    $group: {
      _id: {
        $size: { $split: ["$title", " "] }
      },
      count: { $sum: 1 }
    }
  },
  {
    $sort: { count: -1 }
  }
])

// proof
db.movies.aggregate(
  [
    {
      $match: {
        title: /^[aeiou]/i
      }
    },
    {
      $group: {
        _id: {
          $size: { $split: ["$title", " "] }
        },
        count: { $sum: 1 }
      }
    },
    {
      $sort: { count: -1 }
    }
  ],
  { explain: true }
)

// and a very succinct way of expressing what we wanted all along
db.movies.aggregate([
  {
    $match: {
      title: /^[aeiou]/i
    }
  },
  {
    $sortByCount: {
      $size: { $split: ["$title", " "] }
    }
  }
])

Pipeline Optimization - Part 2

  • The Aggregation Framework will automatically reorder stages in certain conditions

  • The Aggregation Framework can automatically project fields if the shape of the final document is only dependent upon those fields in the input document.

  • Causing a merge in a sharded deployment will cause all subsequent pipeline stages to be performed in the same location as the merge

  • The query in a $match stage can be entirely covered by an index

Final

Final: Question 3

Consider the following collection documents:

db.people.find()
{ "_id" : 0, "name" : "Bernice Pope", "age" : 69, "date" : ISODate("2017-10-04T18:35:44.011Z") }
{ "_id" : 1, "name" : "Eric Malone", "age" : 57, "date" : ISODate("2017-10-04T18:35:44.014Z") }
{ "_id" : 2, "name" : "Blanche Miller", "age" : 35, "date" : ISODate("2017-10-04T18:35:44.015Z") }
{ "_id" : 3, "name" : "Sue Perez", "age" : 64, "date" : ISODate("2017-10-04T18:35:44.016Z") }
{ "_id" : 4, "name" : "Ryan White", "age" : 39, "date" : ISODate("2017-10-04T18:35:44.019Z") }
{ "_id" : 5, "name" : "Grace Payne", "age" : 56, "date" : ISODate("2017-10-04T18:35:44.020Z") }
{ "_id" : 6, "name" : "Jessie Yates", "age" : 53, "date" : ISODate("2017-10-04T18:35:44.020Z") }
{ "_id" : 7, "name" : "Herbert Mason", "age" : 37, "date" : ISODate("2017-10-04T18:35:44.020Z") }
{ "_id" : 8, "name" : "Jesse Jordan", "age" : 47, "date" : ISODate("2017-10-04T18:35:44.020Z") }
{ "_id" : 9, "name" : "Hulda Fuller", "age" : 25, "date" : ISODate("2017-10-04T18:35:44.020Z") }

And the aggregation pipeline execution result:

db.people.aggregate(pipeline)
{ "_id" : 8, "names" : [ "Sue Perez" ], "word" : "P" }
{ "_id" : 9, "names" : [ "Ryan White" ], "word" : "W" }
{ "_id" : 10, "names" : [ "Eric Malone", "Grace Payne" ], "word" : "MP" }
{ "_id" : 11, "names" : [ "Bernice Pope", "Jessie Yates", "Jesse Jordan", "Hulda Fuller" ], "word" : "PYJF" }
{ "_id" : 12, "names" : [ "Herbert Mason" ], "word" : "M" }
{ "_id" : 13, "names" : [ "Blanche Miller" ], "word" : "M" }

Which of the following pipelines generates the output result?

var pipeline = [{
    "$project": {
      "surname_capital": { "$substr": [{"$arrayElemAt": [ {"$split": [ "$name", " " ] }, 1]}, 0, 1 ] },
      "name_size": {  "$add" : [{"$strLenCP": "$name"}, -1]},
      "name": 1
    }
  },
  {
    "$group": {
      "_id": "$name_size",
      "word": { "$push": "$surname_capital" },
      "names": {"$push": "$name"}
    }
  },
  {
    "$project": {
      "word": {
        "$reduce": {
          "input": "$word",
          "initialValue": "",
          "in": { "$concat": ["$$value", "$$this"] }
        }
      },
      "names": 1
    }
  },
  {
    "$sort": { "_id": 1}
  }
]

Final: Question 5

Problem:

Consider a company producing solar panels and looking for the next markets they want to target in the USA. We have a collection with all the major cities (more than 100,000 inhabitants) from all over the World with recorded number of sunny days for some of the last years.

A sample document looks like the following:

db.cities.findOne()
{
"_id": 10,
"city": "San Diego",
"region": "CA",
"country": "USA",
"sunnydays": [220, 232, 205, 211, 242, 270]
}

The collection also has these indexes:

db.cities.getIndexes()
[
{
  "v": 2,
  "key": {
    "_id": 1
  },
  "name": "_id_",
  "ns": "test.cities"
},
{
  "v": 2,
  "key": {
    "city": 1
  },
  "name": "city_1",
  "ns": "test.cities"
},
{
  "v": 2,
  "key": {
    "country": 1
  },
  "name": "country_1",
  "ns": "test.cities"
}
]

We would like to find the cities in the USA where the minimum number of sunny days is 200 and the average number of sunny days is at least 220. Lastly, we’d like to have the results sorted by the city’s name. The matching documents may or may not have a different shape than the initial one.

We have the following query:

var pipeline = [
    {"$addFields": { "min": {"$min": "$sunnydays"}}},
    {"$addFields": { "mean": {"$avg": "$sunnydays" }}},
    {"$sort": {"city": 1}},
    {"$match": { "country": "USA", "min": {"$gte": 200}, "mean": {"$gte": 220}}}
]
db.cities.aggregate(pipeline)

However, this pipeline execution can be optimized!

Which of the following choices is still going to produce the expected results and likely improve the most the execution of this aggregation pipeline?

var pipeline = [
    {"$match": { "country": "USA"}},
    {"$addFields": { "mean": {"$avg": "$sunnydays"}}},
    {"$match": { "mean": {"$gte": 220}, "sunnydays": {"$not": {"$lt": 200 }}}},
    {"$sort": {"city": 1}}
]

Question 6

Consider the following people collection:

db.people.find().limit(5)
{ "_id" : 0, "name" : "Iva Estrada", "age" : 95, "state" : "WA", "phone" : "(739) 557-2576", "ssn" : "901-34-4492" }
{ "_id" : 1, "name" : "Roger Walton", "age" : 92, "state" : "ID", "phone" : "(948) 527-2370", "ssn" : "498-61-9106" }
{ "_id" : 2, "name" : "Isaiah Norton", "age" : 26, "state" : "FL", "phone" : "(344) 479-5646", "ssn" : "052-49-6049" }
{ "_id" : 3, "name" : "Tillie Salazar", "age" : 88, "state" : "ND", "phone" : "(216) 414-5981", "ssn" : "708-26-3486" }
{ "_id" : 4, "name" : "Cecelia Wells", "age" : 16, "state" : "SD", "phone" : "(669) 809-9128", "ssn" : "977-00-7372" }

And the corresponding people_contacts view:

db.people_contacts.find().limit(5)
{ "_id" : 6585, "name" : "Aaron Alvarado", "phone" : "(631)*********", "ssn" : "********8014" }
{ "_id" : 8510, "name" : "Aaron Barnes", "phone" : "(944)*********", "ssn" : "********6820" }
{ "_id" : 6441, "name" : "Aaron Barton", "phone" : "(234)*********", "ssn" : "********1937" }
{ "_id" : 8180, "name" : "Aaron Coleman", "phone" : "(431)*********", "ssn" : "********7559" }
{ "_id" : 9738, "name" : "Aaron Fernandez", "phone" : "(578)*********", "ssn" : "********0211" }

Which of the of the following commands generates this people_contacts view?

var pipeline = [
  {
    "$sort": {"name": 1}
  },
  {
    "$project": {"name":1,
    "phone": {
      "$concat": [
        {"$arrayElemAt": [{"$split": ["$phone", " "]}, 0]} ,
        "*********"  ]
      },
    "ssn": {
      "$concat": [
        "********",
        {"$arrayElemAt": [{"$split": ["$ssn", "-"]}, 2]}
      ]
    }
  }
}
];
db.createView("people_contacts", "people", pipeline);

Question 7

// my 
db.air_routes.aggregate([{$match: { src_airport: "JFK", dst_airport: "LHR" } }])
db.air_routes.aggregate([{$match: { src_airport: "LHR", dst_airport: "JFK" } }])

db.air_routes.aggregate([
  {$match: {   
    $or: [
      { src_airport: "JFK", dst_airport: "LHR" },
      { src_airport: "LHR", dst_airport: "JFK" }
    ]
  }},

  {
    $project:{
      _id:0,
      name:"$airline.name"
    }
  },

  {
    "$lookup": {
      "from": "air_alliances",
      "localField": "name",
      "foreignField": "airlines",
      "as": "airlines"
    }
  }
  ,
  {
    $unwind: "$airlines"
  },
  {
    $project: { Bigname: "$airlines.name", name:1}
  }

])

The answer

db.air_routes.aggregate([
  {
    $match: {
      src_airport: { $in: ["LHR", "JFK"] },
      dst_airport: { $in: ["LHR", "JFK"] }
    }
  },
  {
    $lookup: {
      from: "air_alliances",
      foreignField: "airlines",
      localField: "airline.name",
      as: "alliance"
    }
  },
  {
    $match: { alliance: { $ne: [] } }
  },
  {
    $addFields: {
      alliance: { $arrayElemAt: ["$alliance.name", 0] }
    }
  },
  {
    $group: {
      _id: "$airline.id",
      alliance: { $first: "$alliance" }
    }
  },
  {
    $sortByCount: "$alliance"
  }
])