From d1423949f534667fc6131eb29729c040461cc16d Mon Sep 17 00:00:00 2001 From: Jessica Gadling Date: Tue, 22 Oct 2024 06:43:28 -0700 Subject: [PATCH] fix: fixes for db ingestion (#334) --- apiv2/db_import/common/config.py | 11 +++++++++-- apiv2/db_import/tests/test_db_tomo_import.py | 2 +- apiv2/scripts/scrape.py | 6 +++--- ingestion_tools/scripts/enqueue_runs.py | 4 ++-- ingestion_tools/scripts/importers/db/tomogram.py | 12 +++++------- .../scripts/tests/db_import/test_db_tomo_import.py | 2 +- .../neuroglancer_config.json} | 0 .../Tomograms/100/tomogram_metadata.json | 4 ++-- .../100-neuroglancer_config.json | 1 - .../Tomograms/100/tomogram_metadata.json | 1 - 10 files changed, 23 insertions(+), 20 deletions(-) rename test_infra/test_files/30001/RUN1/Reconstructions/VoxelSpacing12.300/NeuroglancerPrecompute/{100-neuroglancer_config.json => 100/neuroglancer_config.json} (100%) delete mode 100644 test_infra/test_files/30001/RUN2/Reconstructions/VoxelSpacing3.456/NeuroglancerPrecompute/100-neuroglancer_config.json diff --git a/apiv2/db_import/common/config.py b/apiv2/db_import/common/config.py index c8a907e73..d838f78e1 100644 --- a/apiv2/db_import/common/config.py +++ b/apiv2/db_import/common/config.py @@ -10,6 +10,7 @@ from botocore.exceptions import ClientError from database import models from s3fs import S3FileSystem +from sqlalchemy.exc import NoResultFound from sqlalchemy.orm import Session if TYPE_CHECKING: @@ -66,8 +67,14 @@ def get_tiltseries_by_path(self, path: str) -> int | None: # '_' is a wildcard character in sql LIKE queries, so we need to escape them! escaped_path = os.path.dirname(path).replace("_", "\\_") path = os.path.join(self.s3_prefix, escaped_path, "%") - item = session.scalars(sa.select(models.Tiltseries).where(models.Tiltseries.s3_mrc_file.like(path))).one() - return item.id + try: + item = session.scalars(sa.select(models.Tiltseries).where(models.Tiltseries.s3_mrc_file.like(path))).one() + return item.id + except NoResultFound: + # We have a few runs that (erroneously) are missing tiltseries. + # They need to be fixed, but in the meantime let's not fail ingestion + # for the entire dataset based on that problem. + return None def find_subdirs_with_files(self, prefix: str, target_filename: str) -> list[str]: paginator = self.s3_client.get_paginator("list_objects_v2") diff --git a/apiv2/db_import/tests/test_db_tomo_import.py b/apiv2/db_import/tests/test_db_tomo_import.py index 400c1445e..21d95259c 100644 --- a/apiv2/db_import/tests/test_db_tomo_import.py +++ b/apiv2/db_import/tests/test_db_tomo_import.py @@ -115,7 +115,7 @@ def expected_tomograms_by_run(http_prefix: str) -> dict[str, dict[float, list[di "offset_x": 0, "offset_y": 0, "offset_z": 0, - "neuroglancer_config": '{"foo":"bar","baz":"test"}', + "neuroglancer_config": None, "deposition_id": 300, "is_portal_standard": False, "deposition_date": date(2022, 4, 2), diff --git a/apiv2/scripts/scrape.py b/apiv2/scripts/scrape.py index ad62dac45..7d1a2f94f 100644 --- a/apiv2/scripts/scrape.py +++ b/apiv2/scripts/scrape.py @@ -88,7 +88,7 @@ def add(session, model, item, parents): # "primary_author_status": remote_item.get["primary_annotator_status"], # "corresponding_author_status": remote_item.get("corresponding_annotator_status"), "run_id": parents["run_id"], - "deposition_id": parents["deposition_id"], # Doesn't exist in the old api. + "deposition_id": remote_item["deposition_id"], # Doesn't exist in the old api. "s3_metadata_path": remote_item["s3_metadata_path"], "https_metadata_path": remote_item["https_metadata_path"], "annotation_publication": remote_item["annotation_publication"], @@ -229,7 +229,7 @@ def add(session, model, item, parents): if model == models.Tiltseries: local_item_data = { "run_id": parents["run_id"], - "deposition_id": parents["deposition_id"], # We don't have deposition id's yet + "deposition_id": remote_item["deposition_id"], "s3_omezarr_dir": remote_item["s3_omezarr_dir"], "s3_mrc_file": remote_item["s3_mrc_bin1"], "https_omezarr_dir": remote_item["https_omezarr_dir"], @@ -289,7 +289,7 @@ def add(session, model, item, parents): if model == models.Tomogram: local_item_data = { "alignment_id": parents["alignment_id"], - "deposition_id": parents["deposition_id"], + "deposition_id": remote_item["deposition_id"], "tomogram_voxel_spacing_id": parents["tomogram_voxel_spacing_id"], "run_id": parents["run_id"], "name": remote_item["name"], diff --git a/ingestion_tools/scripts/enqueue_runs.py b/ingestion_tools/scripts/enqueue_runs.py index 8c2bdec85..b6d07084f 100644 --- a/ingestion_tools/scripts/enqueue_runs.py +++ b/ingestion_tools/scripts/enqueue_runs.py @@ -175,7 +175,7 @@ def get_datasets( exclude_datasets = [re.compile(pattern) for pattern in exclude_dataset] s3_config = Config(signature_version=UNSIGNED) if anonymous else None s3_client = boto3.client("s3", config=s3_config) - config = DBImportConfig(s3_client, s3_bucket, https_prefix) + config = DBImportConfig(s3_client, None, s3_bucket, https_prefix) datasets_to_check = [] if include_dataset: @@ -198,7 +198,7 @@ def get_datasets( def get_depositions(s3_bucket, include_depositions, anonymous: bool): s3_config = Config(signature_version=UNSIGNED) if anonymous else None s3_client = boto3.client("s3", config=s3_config) - config = DBImportConfig(s3_client, s3_bucket, "") + config = DBImportConfig(s3_client, None, s3_bucket, "") for dep in include_depositions: for deposition in DepositionDBImporter.get_items(config, dep): deposition_id = os.path.basename(deposition.dir_prefix.strip("/")) diff --git a/ingestion_tools/scripts/importers/db/tomogram.py b/ingestion_tools/scripts/importers/db/tomogram.py index e15dc4eb2..c55a306e8 100644 --- a/ingestion_tools/scripts/importers/db/tomogram.py +++ b/ingestion_tools/scripts/importers/db/tomogram.py @@ -72,12 +72,10 @@ def get_tomogram_type(self) -> str: return "CANONICAL" return "UNKOWN" # TYPO that's also reflected in the db :'( - def generate_neuroglancer_data(self) -> str: - tomogram_id = self.dir_prefix.split("/").pop() - path = os.path.relpath( - os.path.join(self.dir_prefix, f"../../NeuroglancerPrecompute/{tomogram_id}-neuroglancer_config.json"), - ) - config = self.config.load_key_json(path, is_file_required=True) + def generate_neuroglancer_data(self, config_path) -> str: + if not config_path: + return "{}" + config = self.config.load_key_json(config_path, is_file_required=True) # TODO: Log warning return json.dumps(config, separators=(",", ":")) if config else "{}" @@ -94,7 +92,7 @@ def get_computed_fields(self) -> dict[str, Any]: "https_mrc_scale0": self.get_https_url(self.metadata["mrc_file"]), "key_photo_url": None, "key_photo_thumbnail_url": None, - "neuroglancer_config": self.generate_neuroglancer_data(), + "neuroglancer_config": self.generate_neuroglancer_data(self.metadata.get("neuroglancer_config_path")), "type": self.get_tomogram_type(), } if key_photos := self.metadata.get("key_photo"): diff --git a/ingestion_tools/scripts/tests/db_import/test_db_tomo_import.py b/ingestion_tools/scripts/tests/db_import/test_db_tomo_import.py index e07a999a6..3cef1ea82 100644 --- a/ingestion_tools/scripts/tests/db_import/test_db_tomo_import.py +++ b/ingestion_tools/scripts/tests/db_import/test_db_tomo_import.py @@ -120,7 +120,7 @@ def expected_tomograms_by_run(http_prefix: str) -> dict[str, dict[float, list[di "offset_x": 0, "offset_y": 0, "offset_z": 0, - "neuroglancer_config": '{"foo":"bar","baz":"test"}', + "neuroglancer_config": "{}", "type": "CANONICAL", "deposition_id": 300, } diff --git a/test_infra/test_files/30001/RUN1/Reconstructions/VoxelSpacing12.300/NeuroglancerPrecompute/100-neuroglancer_config.json b/test_infra/test_files/30001/RUN1/Reconstructions/VoxelSpacing12.300/NeuroglancerPrecompute/100/neuroglancer_config.json similarity index 100% rename from test_infra/test_files/30001/RUN1/Reconstructions/VoxelSpacing12.300/NeuroglancerPrecompute/100-neuroglancer_config.json rename to test_infra/test_files/30001/RUN1/Reconstructions/VoxelSpacing12.300/NeuroglancerPrecompute/100/neuroglancer_config.json diff --git a/test_infra/test_files/30001/RUN1/Reconstructions/VoxelSpacing12.300/Tomograms/100/tomogram_metadata.json b/test_infra/test_files/30001/RUN1/Reconstructions/VoxelSpacing12.300/Tomograms/100/tomogram_metadata.json index c20f17bc8..b36362623 100644 --- a/test_infra/test_files/30001/RUN1/Reconstructions/VoxelSpacing12.300/Tomograms/100/tomogram_metadata.json +++ b/test_infra/test_files/30001/RUN1/Reconstructions/VoxelSpacing12.300/Tomograms/100/tomogram_metadata.json @@ -94,7 +94,7 @@ "last_modified_date": "2023-09-02", "release_date": "2024-06-01" }, - "alignment_metadata_path": "test-public-bucket/30001/RUN1/Alignments/100/alignment_metadata.json", - "neuroglancer_config_path": "test-public-bucket/30001/RUN1/Reconstructions/VoxelSpacing12.300/NeuroglancerPrecompute/100-neuroglancer_config.json", + "alignment_metadata_path": "30001/RUN1/Alignments/100/alignment_metadata.json", + "neuroglancer_config_path": "30001/RUN1/Reconstructions/VoxelSpacing12.300/NeuroglancerPrecompute/100/neuroglancer_config.json", "last_updated_at": 1728676818 } diff --git a/test_infra/test_files/30001/RUN2/Reconstructions/VoxelSpacing3.456/NeuroglancerPrecompute/100-neuroglancer_config.json b/test_infra/test_files/30001/RUN2/Reconstructions/VoxelSpacing3.456/NeuroglancerPrecompute/100-neuroglancer_config.json deleted file mode 100644 index 6aa34f2fe..000000000 --- a/test_infra/test_files/30001/RUN2/Reconstructions/VoxelSpacing3.456/NeuroglancerPrecompute/100-neuroglancer_config.json +++ /dev/null @@ -1 +0,0 @@ -{"foo":"bar","baz":"test"} diff --git a/test_infra/test_files/30001/RUN2/Reconstructions/VoxelSpacing3.456/Tomograms/100/tomogram_metadata.json b/test_infra/test_files/30001/RUN2/Reconstructions/VoxelSpacing3.456/Tomograms/100/tomogram_metadata.json index dc2215d7b..42deb268b 100644 --- a/test_infra/test_files/30001/RUN2/Reconstructions/VoxelSpacing3.456/Tomograms/100/tomogram_metadata.json +++ b/test_infra/test_files/30001/RUN2/Reconstructions/VoxelSpacing3.456/Tomograms/100/tomogram_metadata.json @@ -71,6 +71,5 @@ "release_date": "2022-06-01" }, "alignment_metadata_path": "foo", - "neuroglancer_config_path": "test-public-bucket/30001/RUN2/Reconstructions/VoxelSpacing3.456/NeuroglancerPrecompute/100-neuroglancer_config.json", "last_updated_at": 1728676818 }