Skip to content

Commit

Permalink
Add option to create unique ID index when inserting from JSONL
Browse files Browse the repository at this point in the history
- Make creation of default index configurable

- Lint index creation

- Add wrapper call in Elasticsearch to create_default_index

- Add hacky test for index creation

- Make index before insert
  • Loading branch information
ml-evs committed Nov 17, 2024
1 parent 642213e commit f34cff3
Show file tree
Hide file tree
Showing 8 changed files with 106 additions and 16 deletions.
10 changes: 10 additions & 0 deletions optimade/server/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,16 @@ class ServerConfig(BaseSettings):
),
] = None

create_default_index: Annotated[
bool,
Field(
description=(
"Whether to create a set of default indices "
"for supporting databases after inserting JSONL data."
)
),
] = False

use_real_mongo: Annotated[
bool | None,
Field(description="DEPRECATED: force usage of MongoDB over any other backend."),
Expand Down
15 changes: 10 additions & 5 deletions optimade/server/entry_collections/elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,19 @@ def __init__(
self.client = client if client else CLIENT
self.name = name

# If we are creating a new collection from scratch, also create the index,
# otherwise assume it has already been created externally
if CONFIG.insert_test_data:
self.create_optimade_index()

def count(self, *args, **kwargs) -> int:
raise NotImplementedError

def create_default_index(self) -> None:
"""Create the default index for the collection.
For Elastic, the default is to create a search index
over all relevant OPTIMADE fields based on the configured
mapper.
"""
return self.create_optimade_index()

def create_optimade_index(self) -> None:
"""Load or create an index that can handle aliased OPTIMADE fields and attach it
to the current client.
Expand Down
20 changes: 17 additions & 3 deletions optimade/server/entry_collections/entry_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,16 +279,30 @@ def all_fields(self) -> set[str]:

return self._all_fields

def create_index(self, fields: str | set[str], unique: bool = False) -> None:
"""Create an index on the given fields.
def create_index(self, field: str, unique: bool = False) -> None:
"""Create an index on the given field, as stored in the database.
Arguments:
fields: The fields, or single field, to index.
field: The database field to index (i.e., if different from the OPTIMADE field,
the mapper should be used to convert between the two).
unique: Whether or not the index should be unique.
"""
raise NotImplementedError

def create_default_index(self) -> None:
"""Create the default index for the collection.
For example, a database backend could override this method to
create a unique index on the `id` field, so that it can be called
on server startup.
This method should use a mapper to convert any OPTIMADE field names
to the corresponding stored names in the database.
"""
raise NotImplementedError

def get_attribute_fields(self) -> set[str]:
"""Get the set of attribute fields
Expand Down
23 changes: 17 additions & 6 deletions optimade/server/entry_collections/mongo.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,18 +105,29 @@ def insert(self, data: list[EntryResource | dict]) -> None:
"""
self.collection.insert_many(data, ordered=False)

def create_index(self, fields: str | set[str], unique: bool = False) -> None:
"""Create an index on the given fields.
def create_index(self, field: str, unique: bool = False) -> None:
"""Create an index on the given field, as stored in the database.
If any error is raised during index creation, this method should faithfully
return it, except for the simple case where an identical index already exists.
Arguments:
fields: The fields, or single field, to index.
field: The database field to index (i.e., if different from the OPTIMADE field,
the mapper should be used to convert between the two).
unique: Whether or not the index should be unique.
"""
if isinstance(fields, str):
fields = {fields}
self.collection.create_index(field, unique=unique, background=True)

def create_default_index(self) -> None:
"""Create the default index for the collection.
self.collection.create_indexes(fields, unique=unique, background=True)
For MongoDB, the default is to create a unique index
on the `id` field. This method should obey any configured
mappers.
"""
self.create_index(self.resource_mapper.get_backend_field("id"), unique=True)

def handle_query_params(
self, params: EntryListingQueryParams | SingleEntryQueryParams
Expand Down
2 changes: 1 addition & 1 deletion optimade/server/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def load_entries(endpoint_name: str, endpoint_collection: EntryCollection):
f"Requested JSONL file does not exist: {jsonl_path}. Please specify an absolute group."
)

insert_from_jsonl(jsonl_path)
insert_from_jsonl(jsonl_path, create_default_index=CONFIG.create_default_index)

LOGGER.debug("Inserted data from JSONL file: %s", jsonl_path)
if CONFIG.insert_test_data:
Expand Down
11 changes: 10 additions & 1 deletion optimade/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,12 @@
)


def insert_from_jsonl(jsonl_path: Path) -> None:
def insert_from_jsonl(jsonl_path: Path, create_default_index: bool = False) -> None:
"""Insert OPTIMADE JSON lines data into the database.
Arguments:
jsonl_path: Path to the JSON lines file.
create_default_index: Whether to create a default index on the `id` field.
"""
from collections import defaultdict
Expand All @@ -50,6 +51,14 @@ def insert_from_jsonl(jsonl_path: Path) -> None:
)
jsonl_path = _jsonl_path

# If the chosen database backend supports it, make the default indices
if create_default_index:
for entry_type in ENTRY_COLLECTIONS:
try:
ENTRY_COLLECTIONS[entry_type].create_default_index()
except NotImplementedError:
pass

bad_rows: int = 0
good_rows: int = 0
with open(jsonl_path) as handle:
Expand Down
40 changes: 40 additions & 0 deletions tests/server/entry_collections/test_indexes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import pytest
from bson import ObjectId

from optimade.server.config import CONFIG


@pytest.mark.skipif(
CONFIG.database_backend.value not in ("mongomock", "mongodb"),
reason="Skipping index test when testing the elasticsearch backend.",
)
def test_indexes_are_created_where_appropriate(client):
"""Test that with the test config, default indices are made by
supported backends. This is tested by checking that we cannot insert
an entry with the same underlying ID as the test data, and that this
returns the appopriate database-specific error.
"""
import pymongo.errors

from optimade.server.query_params import EntryListingQueryParams
from optimade.server.routers import ENTRY_COLLECTIONS

# get one structure with and try to reinsert it
for _type in ENTRY_COLLECTIONS:
result, _, _, _, _ = ENTRY_COLLECTIONS[_type].find(
EntryListingQueryParams(page_limit=1)
)
assert result is not None
if isinstance(result, list):
result = result[0]

# The ID is mapped to the test data ID (e.g., 'task_id'), so the index is actually on that
id_field = ENTRY_COLLECTIONS[_type].resource_mapper.get_backend_field("id")

# Take the raw database result, extract the OPTIMADE ID and try to insert a canary
# document containing just that ID, plus a fake MongoDB ID to avoid '_id' clashes
canary = {id_field: result["id"], "_id": ObjectId(24 * "0")}
# Match either for "Duplicate" (mongomock) or "duplicate" (mongodb)
with pytest.raises(pymongo.errors.BulkWriteError, match="uplicate"):
ENTRY_COLLECTIONS[_type].insert([canary]) # type: ignore
1 change: 1 addition & 0 deletions tests/test_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
"mongo_count_timeout": 0,
"index_base_url": "http://localhost:5001",
"insert_from_jsonl": "optimade/server/data/test_data.jsonl",
"create_default_index": true,
"provider_fields": {
"structures": [
"band_gap",
Expand Down

0 comments on commit f34cff3

Please sign in to comment.