MongoDB common pitfalls and remedies

Below is a compiled list of common pitfalls I have seen over the years on Stack Overflow, along with suggested remedies for each of them.


Storing dates as non-date objects

What is the Symptom?

Dates are not stored as date objects, but as strings or numeric timestamps, e.g.

{
    "dateInString1": "2024-01-01",
    "dateInString2": "13/09/2024",
    "dateInString3": "13/Feb/2024",
    "dateInTimestamp": 1709777005000
}

Why is this bad?

MongoDB offers a rich set of date APIs like $dateAdd, $dateSubtract, $dateTrunc... You cannot leverage these APIs when dates are not stored as proper date objects.
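
For instance, once dates are stored as real date objects, truncating them to month boundaries is a single $dateTrunc stage (a minimal sketch, assuming MongoDB 5.0+ and a field named date; the same stage raises an error when date is a string):

db.collection.aggregate([
  {
    "$set": {
      "month": {
        "$dateTrunc": {
          // requires a proper Date (or Timestamp/ObjectId); errors on strings
          "date": "$date",
          "unit": "month"
        }
      }
    }
  }
])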

What would be the suggested way(s) to avoid this?

Store dates as proper date objects.

{
    "date": ISODate("2024-01-01")
}
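
In mongosh, for example, such a value can be written with ISODate(...) or the plain JavaScript Date constructor (a minimal sketch; collection name assumed):

db.collection.insertOne({ "date": new Date("2024-01-01") })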

What would be the remedy if this issue has already happened?

  1. Clean up existing data using $toDate, if the values are in a standard format or are timestamps:
db.collection.updateMany({},
[
  {
    "$set": {
      "date": {
        "$toDate": "$dateInString1"
      }
    }
  }
])

Mongo Playground

  2. Clean up existing data using $dateFromString, if the values are formatted date strings:
db.collection.aggregate([
  {
    "$set": {
      "date": {
        "$dateFromString": {
          "dateString": "$dateInString2",
          "format": "%d/%m/%Y"
        }
      }
    }
  }
])

Mongo Playground

  3. Break the date string into tokens using $split and use $dateFromParts to rebuild the date object, if the values are in a non-standard format or need extra parsing.

The example below assumes:

  1. the date format is dd MMM yyyy HH:mm:ss zzz
    • i.e. "01 Aug 2020 06:26:09 GMT"
  2. the timezone is always GMT (timezone info is ignored in the following conversion script)

Steps:

  1. $split the date string into tokens for processing
  2. locate the time token by checking $indexOfCP for ":"; if a token is a time string, $split it further and put the parts back into the array
  3. use an array of month names with $indexOfArray to convert the month token into an int (i.e. Jan to 1, Feb to 2, ...); meanwhile, convert the other string tokens into ints
  4. use $dateFromParts with the tokens to construct a proper date object
  5. $merge back into the collection to apply the update
db.collection.aggregate([
  // break into tokens for processing
  {
    "$addFields": {
      "tokens": {
        "$split": [
          "$date",
          " "
        ]
      }
    }
  },
  // try to parse time part and break into hh, mm, ss
  {
    "$addFields": {
      "tokens": {
        "$reduce": {
          "input": "$tokens",
          "initialValue": [],
          "in": {
            "$cond": {
              "if": {
                $ne: [
                  -1,
                  {
                    "$indexOfCP": [
                      "$$this",
                      ":"
                    ]
                  }
                ]
              },
              "then": {
                "$concatArrays": [
                  "$$value",
                  {
                    "$split": [
                      "$$this",
                      ":"
                    ]
                  }
                ]
              },
              "else": {
                "$concatArrays": [
                  "$$value",
                  [
                    "$$this"
                  ]
                ]
              }
            }
          }
        }
      }
    }
  },
  // try to 1. parse month part and 2. convert into int
  {
    "$addFields": {
      "tokens": {
        $let: {
          vars: {
            tokens: "$tokens",
            monthArray: [
              "dummy",
              "Jan",
              "Feb",
              "Mar",
              "Apr",
              "May",
              "Jun",
              "Jul",
              "Aug",
              "Sep",
              "Oct",
              "Nov",
              "Dec"
            ]
          },
          in: {
            "$map": {
              "input": "$$tokens",
              "as": "t",
              "in": {
                "$switch": {
                  "branches": [
                    {
                      "case": {
                        "$in": [
                          "$$t",
                          "$$monthArray"
                        ]
                      },
                      "then": {
                        "$indexOfArray": [
                          "$$monthArray",
                          "$$t"
                        ]
                      }
                    }
                  ],
                  default: {
                    "$convert": {
                      "input": "$$t",
                      "to": "int",
                      "onError": "$$t"
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  },
  {
    "$addFields": {
      "parsedDate": {
        "$dateFromParts": {
          "year": {
            "$arrayElemAt": [
              "$tokens",
              2
            ]
          },
          "month": {
            "$arrayElemAt": [
              "$tokens",
              1
            ]
          },
          "day": {
            "$arrayElemAt": [
              "$tokens",
              0
            ]
          },
          "hour": {
            "$arrayElemAt": [
              "$tokens",
              3
            ]
          },
          "minute": {
            "$arrayElemAt": [
              "$tokens",
              4
            ]
          },
          "second": {
            "$arrayElemAt": [
              "$tokens",
              5
            ]
          }
        }
      }
    }
  },
  // cosmetics
  {
    "$project": {
      "date": "$parsedDate"
    }
  },
  // update back to collection
  {
    "$merge": {
      "into": "collection",
      "on": "_id",
      "whenMatched": "merge"
    }
  }
])

Mongo Playground


Storing at wrong query level

What is the Symptom?

Your information is stored at a level that does not match your most frequent query pattern. For example, in a customer data analytics system, your main queries revolve around customer transactions (such as monthly statistics aggregation), yet the transaction data is stored as array entries inside each customer document.

Why is this bad?

  1. You suffer from query complexity. Even if you can wrangle the data with a complicated aggregation, it introduces code smell and tech debt.
  2. You are unlikely to benefit from indexes.
  3. As data grows, the 16 MB MongoDB single-document size limit could be breached.

What would be the suggested way(s) to avoid this?

You should store the data your queries revolve around as individual documents.
Following our scenario of monthly transaction aggregation, suppose the customer collection currently looks like this:

db={
  "customer": [
    {
      "_id": 1,
      "name": "Alice",
      "transactions": [
        {
          _id: "tx1",
          date: ISODate("2024-01-01"),
          amount: 100
        },
        {
          _id: "tx2",
          date: ISODate("2024-01-02"),
          amount: 200
        },
        {
          _id: "tx3",
          date: ISODate("2024-02-01"),
          amount: 300
        },
        {
          _id: "tx4",
          date: ISODate("2024-03-01"),
          amount: 400
        }
      ]
    },
    {
      "_id": 2,
      "name": "Bob",
      "transactions": [
        {
          _id: "tx5",
          date: ISODate("2024-02-02"),
          amount: 1000
        },
        {
          _id: "tx6",
          date: ISODate("2024-03-03"),
          amount: 1000
        }
      ]
    }
  ]
}

With this schema, you need the following (slightly) complex aggregation to get monthly sales data, and it can get much more complex with sophisticated business logic.

db.customer.aggregate([
  {
    "$unwind": "$transactions"
  },
  {
    "$group": {
      "_id": {
        $dateTrunc: {
          date: "$transactions.date",
          unit: "month"
        }
      },
      "totalAmount": {
        "$sum": "$transactions.amount"
      }
    }
  }
])

Mongo Playground

You can refactor the schema to store transactions as individual documents. There are two ways to do that:

  1. Denormalize common fields. If customer data is not that important or does not change frequently, you can simply duplicate/denormalize the fields into the new transaction documents:
db={
  "transactions": [
    {
      "_id": "tx1",
      "customerName": "Alice",
      "date": ISODate("2024-01-01"),
      "amount": 100
    },
    {
      "_id": "tx2",
      "customerName": "Alice",
      "date": ISODate("2024-01-02"),
      "amount": 200
    },
    {
      "_id": "tx3",
      "customerName": "Alice",
      "date": ISODate("2024-02-01"),
      "amount": 300
    },
    {
      "_id": "tx4",
      "customerName": "Alice",
      "date": ISODate("2024-03-01"),
      "amount": 400
    }
  ]
}

A single $group will serve our previous purpose.

db.transactions.aggregate([
  {
    "$group": {
      "_id": {
        $dateTrunc: {
          date: "$date",
          unit: "month"
        }
      },
      "totalAmount": {
        "$sum": "$amount"
      }
    }
  }
])

Mongo Playground

  2. Separate customer data into its own collection. If customer data is important or changes frequently, it might not be a good idea to scatter it across each transaction, as that involves many updates. You can segregate the data into individual documents:
db={
  "transactions": [
    {
      "_id": "tx1",
      "customerId": 1,
      "date": ISODate("2024-01-01"),
      "amount": 100
    },
    {
      "_id": "tx2",
      "customerId": 1,
      "date": ISODate("2024-01-02"),
      "amount": 200
    },
    {
      "_id": "tx3",
      "customerId": 1,
      "date": ISODate("2024-02-01"),
      "amount": 300
    },
    {
      "_id": "tx4",
      "customerId": 1,
      "date": ISODate("2024-03-01"),
      "amount": 400
    }
  ],
  "customer": [
    {
      "_id": 1,
      "name": "Alice"
    }
  ]
}

You can perform a $lookup against the customer collection to retrieve customer data.
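
A minimal sketch of such a $lookup (collection and field names follow the schema above; customerLookup is an illustrative output field name):

db.transactions.aggregate([
  {
    "$lookup": {
      "from": "customer",
      "localField": "customerId",
      "foreignField": "_id",
      "as": "customerLookup"
    }
  }
])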

What would be the remedy if this issue has already happened?

You can refactor your schema and perform data migration according to the above suggestions. The most important thing is to identify your common query pattern; that is a rather sophisticated topic in itself, so we won't go into details here. Once you identify your most common query, you can refactor your schema towards one that better suits your query needs.
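
For instance, a one-off migration from the embedded-array schema above to a separate transactions collection might look like this (an illustrative sketch, assuming MongoDB 4.2+ for $replaceWith and $merge; collection and field names follow the earlier example):

db.customer.aggregate([
  // one document per embedded transaction
  { "$unwind": "$transactions" },
  // promote the transaction to the root, keeping a reference to the customer
  {
    "$replaceWith": {
      "$mergeObjects": [
        "$transactions",
        { "customerId": "$_id" }
      ]
    }
  },
  // write the results into the new collection
  { "$merge": { "into": "transactions", "on": "_id" } }
])

// then drop the embedded array from the customer documents
db.customer.updateMany({}, { "$unset": { "transactions": "" } })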


Using dynamic value as field name

What is the Symptom?

Instead of fixed field names, dynamic/non-constant values are used as field names. A foreign exchange (FX) schema may look like this:

[
  {
    "mainCurrency": "USD",
    "fxRate": {
      "CAD": {
        "rate": 1.35
      },
      "GBP": {
        "rate": 0.78
      },
      "EUR": {
        "rate": 0.91
      }
    }
  }
]

Why is this bad?

  1. This introduces unnecessary complexity to queries. Usually $objectToArray and $arrayToObject are required to manipulate the object when filtering/mapping, which is a non-trivial process.
  2. Indexes cannot be built on the dynamic field names to improve performance.
  3. The schema cannot be well-defined. This may hinder documentation and future integration.

What would be the suggested way(s) to avoid this?

Make the field a key-value (kv) tuple, typically stored as array entries.

[
  {
    "mainCurrency": "USD",
    "fxRate": [
      {
        "currency": "CAD",
        "rate": 1.35
      },
      {
        "currency": "GBP",
        "rate": 0.78
      },
      {
        "currency": "EUR",
        "rate": 0.91
      }
    ]
  }
]

In this way, you avoid the complexity of object-array conversion, and you can index the fxRate.currency field to improve performance.
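
A minimal sketch (collection name assumed, as in the playground examples):

// multikey index on the embedded currency field
db.collection.createIndex({ "fxRate.currency": 1 })

// this query can now be served by the index
db.collection.find({ "fxRate.currency": "CAD" })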

PS. If you find yourself frequently working at the fxRate level, consider refactoring the entries into individual documents. See Storing at wrong query level.

What would be the remedy if this issue has already happened?

  1. You can refactor your schema and perform data migration according to the above suggestions.
  2. You can use $objectToArray to convert the object into kv tuples:
db.collection.updateMany({},
[
  {
    "$set": {
      "fxRate": {
        "$map": {
          "input": {
            "$objectToArray": "$fxRate"
          },
          "as": "kv",
          "in": {
            currency: "$$kv.k",
            rate: "$$kv.v.rate"
          }
        }
      }
    }
  }
])

Mongo Playground

  3. You may partially mitigate the issue with a wildcard index, if your only concern is performance and the structure is relatively simple, as sketched below.
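
A minimal sketch of such a wildcard index (MongoDB 4.2+; collection name assumed):

// indexes every path under fxRate, including the dynamic currency keys
db.collection.createIndex({ "fxRate.$**": 1 })

// queries on a specific currency path can then use the index
db.collection.find({ "fxRate.CAD.rate": { "$gt": 1 } })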


Highly nested arrays

What is the Symptom?

You are nesting arrays in a complicated and usually unnecessary manner. In this course management schema, you can see that students are allocated inside nested arrays.

db={
  "courses": [
    {
      _id: 1,
      title: "Introduction to Computer Science",
      studentGroups: [
        {
          group: "A",
          students: [
            1,
            2
          ]
        },
        {
          group: "B",
          students: [
            3
          ]
        }
      ]
    },
    {
      _id: 2,
      title: "Chemistry",
      studentGroups: [
        {
          group: "C",
          students: [
            2,
            3
          ]
        },
        {
          group: "B",
          students: [
            4,
            5
          ]
        }
      ]
    }
  ],
  "students": [
    {
      _id: 1,
      name: "Henry",
      age: 15
    },
    {
      _id: 2,
      name: "Kim",
      age: 20
    },
    {
      _id: 3,
      name: "Michel",
      age: 14
    },
    {
      _id: 4,
      name: "Max",
      age: 16
    },
    {
      _id: 5,
      name: "Nathan",
      age: 19
    }
  ]
}

Why is this bad?

  1. This introduces unnecessary complexity to queries. A frequent use case is populating the innermost array layer from another collection, like the students collection's data in the example above. That requires convoluted $lookup, $mergeObjects, $map/$filter...
  2. Queries cannot benefit from indexes.

What would be the suggested way(s) to avoid this?

Again, you need to figure out your most frequent query pattern first. There is no generic suggestion for this issue. However, you will likely find that the suggestions in Storing at wrong query level work here too.

What would be the remedy if this issue has already happened?

See Storing at wrong query level


Storing as recursive structure

What is the Symptom?

Information is stored in a recursive structure, often with dynamic recursion depth. Consider a family tree document like this:

db={
  "family": [
    {
      _id: 1,
      familyName: "Johnson",
      familyTree: {
        member: 1,
        children: [
          {
            member: 2,
            children: [
              {
                member: 3,
                children: [
                  {
                    member: 4,
                    children: []
                  },
                  {
                    member: 5,
                    children: []
                  }
                ]
              }
            ]
          }
        ]
      }
    }
  ]
}

Why is this bad?

  1. This introduces unnecessary complexity to queries. A frequent use case is populating the member data from another collection, which requires complicated $lookup and $mergeObjects...
  2. For complex cases like a big family, the 16 MB document size limit may be breached.
  3. Queries cannot benefit from indexes.

What would be the suggested way(s) to avoid this?

Store each member node in an individual document with links to its parent/children. Use $graphLookup to reconstruct the tree.

db={
  "member": [
    {
      _id: 1,
      name: "great grandpa",
      children: [
        2
      ]
    },
    {
      _id: 2,
      name: "grandpa",
      children: [
        3
      ]
    },
    {
      _id: 3,
      name: "dad",
      children: [
        4,
        5
      ]
    },
    {
      _id: 4,
      name: "son"
    },
    {
      _id: 5,
      name: "son2"
    }
  ]
}

Construct the tree with $graphLookup:

db.member.aggregate([
  {
    $match: {
      "_id": 1
    }
  },
  {
    "$graphLookup": {
      "from": "member",
      "startWith": "$_id",
      "connectFromField": "children",
      "connectToField": "_id",
      "as": "children"
    }
  }
])

Mongo Playground

What would be the remedy if this issue has already happened?

You need to iterate over your existing recursive structure and flatten it through some application-level processing.
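
An illustrative mongosh sketch (the schema follows the familyTree example above; the parent field and the target member collection are assumptions):

// collect one flat document per family-tree node
const docs = [];
function flatten(node, parentId) {
  docs.push({
    _id: node.member,
    parent: parentId,                            // link to the parent (null for the root)
    children: node.children.map(c => c.member)   // links to the children
  });
  node.children.forEach(c => flatten(c, node.member));
}

db.family.find().forEach(f => flatten(f.familyTree, null));
db.member.insertMany(docs);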


Scattering similar data/information in different collections/database

What is the Symptom?

Similar information is scattered across different collections/databases. Consider a course schema like this:

db={
  "courses": [
    {
      _id: 1,
      title: "maths",
      members: [
        "p1",
        "p2",
        "s1",
        "s2",
        "s3",
        "s4",
        "s5"
      ]
    }
  ],
  "professors": [
    {
      _id: "p1",
      name: "Dr. Johnson"
    },
    {
      _id: "p2",
      name: "Dr. Wong"
    }
  ],
  "students": [
    {
      _id: "s1",
      name: "Alice"
    },
    {
      _id: "s2",
      name: "Bob"
    },
    {
      _id: "s3",
      name: "Chuck"
    },
    {
      _id: "s4",
      name: "Dean"
    },
    {
      _id: "s5",
      name: "Eve"
    }
  ]
}

Two $lookup stages are required to fetch the professors' and students' information:

db.courses.aggregate([
  {
    "$lookup": {
      "from": "professors",
      "localField": "members",
      "foreignField": "_id",
      "as": "professorsLookup"
    }
  },
  {
    "$lookup": {
      "from": "students",
      "localField": "members",
      "foreignField": "_id",
      "as": "studentsLookup"
    }
  }
])

Mongo Playground

Why is this bad?

  1. The structures of professors and students are actually similar, and they are often queried together. There is no point in performing the expensive $lookup twice.
  2. Separate indexes are needed to support the two $lookup stages, which incurs extra storage and management cost.

What would be the suggested way(s) to avoid this?

  1. Again, you need to identify your most common use case first. This determines whether your documents are "similar".
  2. Store the scattered information in one place. You can introduce an extra field to record where the information originally lived, e.g. a type field indicating whether a member is a student or a professor. You retain flexibility by filtering on that field in a $lookup sub-pipeline.
db={
  "courses": [
    {
      _id: 1,
      title: "maths",
      members: [
        "p1",
        "p2",
        "s1",
        "s2",
        "s3",
        "s4",
        "s5"
      ]
    }
  ],
  "members": [
    {
      _id: "p1",
      name: "Dr. Johnson",
      type: "professor"
    },
    {
      _id: "p2",
      name: "Dr. Wong",
      type: "professor"
    },
    {
      _id: "s1",
      name: "Alice",
      type: "student"
    },
    {
      _id: "s2",
      name: "Bob",
      type: "student"
    },
    {
      _id: "s3",
      name: "Chuck",
      type: "student"
    },
    {
      _id: "s4",
      name: "Dean",
      type: "student"
    },
    {
      _id: "s5",
      name: "Eve",
      type: "student"
    }
  ]
}

Querying only students:

db.courses.aggregate([
  {
    "$lookup": {
      "from": "members",
      "localField": "members",
      "foreignField": "_id",
      "pipeline": [
        {
          $match: {
            "type": "student"
          }
        }
      ],
      "as": "membersLookup"
    }
  }
])

Mongo Playground

What would be the remedy if this issue has already happened?

  1. You can refactor your schema and perform data migration according to the above suggestions.
  2. Use $unionWith and $merge / $out to consolidate the information:
db.professors.aggregate([
  {
    "$set": {
      "type": "professor"
    }
  },
  {
    "$unionWith": {
      "coll": "students",
      "pipeline": [
        {
          "$set": {
            "type": "student"
          }
        }
      ]
    }
  },
  {
    "$merge": {
      "into": "members",
      "on": "_id"
    }
  }
])

Mongo Playground


Inconsistent data types for $lookup in reference field

What is the Symptom?

Inconsistent data types are used in $lookup reference fields. A common example is storing ObjectId references as strings. Note the different data types of the itemId field in the order collection and the _id field in the item collection.

db={
  "order": [
    {
      "itemId": "5a934e000102030405000000",
      "price": 1,
      "quantity": 2
    },
    {
      "_id": 2,
      "itemId": "5a934e000102030405000001",
      "price": 2,
      "quantity": 1
    },
    {
      "_id": 3,
      "itemId": "5a934e000102030405000002",
      "price": 1,
      "quantity": 2
    }
  ],
  "item": [
    {
      "_id": ObjectId("5a934e000102030405000000"),
      "name": "apple"
    },
    {
      "_id": ObjectId("5a934e000102030405000001"),
      "name": "orange"
    },
    {
      "_id": ObjectId("5a934e000102030405000002"),
      "name": "banana"
    }
  ]
}

Why is this bad?

This results in unexpected (often empty) $lookup results.
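
For example, with the schema above, a straightforward $lookup silently matches nothing, because a string value never equals an ObjectId:

db.order.aggregate([
  {
    "$lookup": {
      "from": "item",
      "localField": "itemId",    // string
      "foreignField": "_id",     // ObjectId
      "as": "itemLookup"
    }
  }
])
// every order comes back with "itemLookup": []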

What would be the suggested way(s) to avoid this?

Standardize the field types in both collections: both strings, both ObjectIds, both numerics, etc.

What would be the remedy if this issue has already happened?

  1. Use type conversion before $lookup, e.g. $toObjectId or $toString... Note that indexes may not be leveraged after the type conversion:
db.order.aggregate([
  {
    "$lookup": {
      "from": "item",
      "let": {
        "itemId": {
          "$toObjectId": "$itemId"
        }
      },
      "pipeline": [
        {
          "$match": {
            "$expr": {
              "$eq": [
                "$_id",
                "$$itemId"
              ]
            }
          }
        }
      ],
      "as": "itemLookup"
    }
  }
])

Mongo Playground
  2. Sanitize the data with a type conversion operator:

db.order.updateMany({},
[
  {
    $set: {
      itemId: {
        $toObjectId: "$itemId"
      }
    }
  }
])

Mongo Playground
