Skip to content

Commit

Permalink
fix: fixes for db ingestion (#334)
Browse files Browse the repository at this point in the history
  • Loading branch information
jgadling authored Oct 22, 2024
1 parent 02441de commit d142394
Show file tree
Hide file tree
Showing 10 changed files with 23 additions and 20 deletions.
11 changes: 9 additions & 2 deletions apiv2/db_import/common/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from botocore.exceptions import ClientError
from database import models
from s3fs import S3FileSystem
from sqlalchemy.exc import NoResultFound
from sqlalchemy.orm import Session

if TYPE_CHECKING:
Expand Down Expand Up @@ -66,8 +67,14 @@ def get_tiltseries_by_path(self, path: str) -> int | None:
# '_' is a single-character wildcard in SQL LIKE patterns, so underscores in the path must be escaped.
escaped_path = os.path.dirname(path).replace("_", "\\_")
path = os.path.join(self.s3_prefix, escaped_path, "%")
item = session.scalars(sa.select(models.Tiltseries).where(models.Tiltseries.s3_mrc_file.like(path))).one()
return item.id
try:
item = session.scalars(sa.select(models.Tiltseries).where(models.Tiltseries.s3_mrc_file.like(path))).one()
return item.id
except NoResultFound:
# We have a few runs that (erroneously) are missing tiltseries.
# They need to be fixed, but in the meantime let's not fail ingestion
# for the entire dataset based on that problem.
return None

def find_subdirs_with_files(self, prefix: str, target_filename: str) -> list[str]:
paginator = self.s3_client.get_paginator("list_objects_v2")
Expand Down
2 changes: 1 addition & 1 deletion apiv2/db_import/tests/test_db_tomo_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def expected_tomograms_by_run(http_prefix: str) -> dict[str, dict[float, list[di
"offset_x": 0,
"offset_y": 0,
"offset_z": 0,
"neuroglancer_config": '{"foo":"bar","baz":"test"}',
"neuroglancer_config": None,
"deposition_id": 300,
"is_portal_standard": False,
"deposition_date": date(2022, 4, 2),
Expand Down
6 changes: 3 additions & 3 deletions apiv2/scripts/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def add(session, model, item, parents):
# "primary_author_status": remote_item.get["primary_annotator_status"],
# "corresponding_author_status": remote_item.get("corresponding_annotator_status"),
"run_id": parents["run_id"],
"deposition_id": parents["deposition_id"], # Doesn't exist in the old api.
"deposition_id": remote_item["deposition_id"], # Doesn't exist in the old api.
"s3_metadata_path": remote_item["s3_metadata_path"],
"https_metadata_path": remote_item["https_metadata_path"],
"annotation_publication": remote_item["annotation_publication"],
Expand Down Expand Up @@ -229,7 +229,7 @@ def add(session, model, item, parents):
if model == models.Tiltseries:
local_item_data = {
"run_id": parents["run_id"],
"deposition_id": parents["deposition_id"], # We don't have deposition id's yet
"deposition_id": remote_item["deposition_id"],
"s3_omezarr_dir": remote_item["s3_omezarr_dir"],
"s3_mrc_file": remote_item["s3_mrc_bin1"],
"https_omezarr_dir": remote_item["https_omezarr_dir"],
Expand Down Expand Up @@ -289,7 +289,7 @@ def add(session, model, item, parents):
if model == models.Tomogram:
local_item_data = {
"alignment_id": parents["alignment_id"],
"deposition_id": parents["deposition_id"],
"deposition_id": remote_item["deposition_id"],
"tomogram_voxel_spacing_id": parents["tomogram_voxel_spacing_id"],
"run_id": parents["run_id"],
"name": remote_item["name"],
Expand Down
4 changes: 2 additions & 2 deletions ingestion_tools/scripts/enqueue_runs.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ def get_datasets(
exclude_datasets = [re.compile(pattern) for pattern in exclude_dataset]
s3_config = Config(signature_version=UNSIGNED) if anonymous else None
s3_client = boto3.client("s3", config=s3_config)
config = DBImportConfig(s3_client, s3_bucket, https_prefix)
config = DBImportConfig(s3_client, None, s3_bucket, https_prefix)

datasets_to_check = []
if include_dataset:
Expand All @@ -198,7 +198,7 @@ def get_datasets(
def get_depositions(s3_bucket, include_depositions, anonymous: bool):
s3_config = Config(signature_version=UNSIGNED) if anonymous else None
s3_client = boto3.client("s3", config=s3_config)
config = DBImportConfig(s3_client, s3_bucket, "")
config = DBImportConfig(s3_client, None, s3_bucket, "")
for dep in include_depositions:
for deposition in DepositionDBImporter.get_items(config, dep):
deposition_id = os.path.basename(deposition.dir_prefix.strip("/"))
Expand Down
12 changes: 5 additions & 7 deletions ingestion_tools/scripts/importers/db/tomogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,10 @@ def get_tomogram_type(self) -> str:
return "CANONICAL"
return "UNKOWN" # TYPO that's also reflected in the db :'(

def generate_neuroglancer_data(self) -> str:
tomogram_id = self.dir_prefix.split("/").pop()
path = os.path.relpath(
os.path.join(self.dir_prefix, f"../../NeuroglancerPrecompute/{tomogram_id}-neuroglancer_config.json"),
)
config = self.config.load_key_json(path, is_file_required=True)
def generate_neuroglancer_data(self, config_path) -> str:
if not config_path:
return "{}"
config = self.config.load_key_json(config_path, is_file_required=True)
# TODO: Log a warning when the loaded config is empty and we fall back to "{}".
return json.dumps(config, separators=(",", ":")) if config else "{}"

Expand All @@ -94,7 +92,7 @@ def get_computed_fields(self) -> dict[str, Any]:
"https_mrc_scale0": self.get_https_url(self.metadata["mrc_file"]),
"key_photo_url": None,
"key_photo_thumbnail_url": None,
"neuroglancer_config": self.generate_neuroglancer_data(),
"neuroglancer_config": self.generate_neuroglancer_data(self.metadata.get("neuroglancer_config_path")),
"type": self.get_tomogram_type(),
}
if key_photos := self.metadata.get("key_photo"):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def expected_tomograms_by_run(http_prefix: str) -> dict[str, dict[float, list[di
"offset_x": 0,
"offset_y": 0,
"offset_z": 0,
"neuroglancer_config": '{"foo":"bar","baz":"test"}',
"neuroglancer_config": "{}",
"type": "CANONICAL",
"deposition_id": 300,
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@
"last_modified_date": "2023-09-02",
"release_date": "2024-06-01"
},
"alignment_metadata_path": "test-public-bucket/30001/RUN1/Alignments/100/alignment_metadata.json",
"neuroglancer_config_path": "test-public-bucket/30001/RUN1/Reconstructions/VoxelSpacing12.300/NeuroglancerPrecompute/100-neuroglancer_config.json",
"alignment_metadata_path": "30001/RUN1/Alignments/100/alignment_metadata.json",
"neuroglancer_config_path": "30001/RUN1/Reconstructions/VoxelSpacing12.300/NeuroglancerPrecompute/100/neuroglancer_config.json",
"last_updated_at": 1728676818
}

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,5 @@
"release_date": "2022-06-01"
},
"alignment_metadata_path": "foo",
"neuroglancer_config_path": "test-public-bucket/30001/RUN2/Reconstructions/VoxelSpacing3.456/NeuroglancerPrecompute/100-neuroglancer_config.json",
"last_updated_at": 1728676818
}

0 comments on commit d142394

Please sign in to comment.