From a0fe54cea6690e41190449d2ce7d0d6a93f88f8b Mon Sep 17 00:00:00 2001
From: kaloster
Date: Thu, 9 Jan 2025 17:17:09 -0500
Subject: [PATCH] db_import: compute and store file sizes in MB

---
 apiv2/db_import/importers/annotation.py    |  1 +
 apiv2/db_import/importers/base.py          | 18 ++++++++++++++
 apiv2/db_import/importers/base_importer.py | 17 +++++++++++++
 apiv2/db_import/importers/dataset.py       |  1 +
 apiv2/db_import/importers/frame.py         |  1 +
 apiv2/db_import/importers/tiltseries.py    |  2 ++
 apiv2/schema/schema.yaml                   | 28 +++++++++++-----------
 7 files changed, 54 insertions(+), 14 deletions(-)

diff --git a/apiv2/db_import/importers/annotation.py b/apiv2/db_import/importers/annotation.py
index e343f4f98..754e7dd38 100644
--- a/apiv2/db_import/importers/annotation.py
+++ b/apiv2/db_import/importers/annotation.py
@@ -82,6 +82,7 @@ def load_computed_fields(self):
         self.model_args["source"] = self.calculate_source()
         self.model_args["s3_path"] = self.get_s3_url(self.input_data["path"])
         self.model_args["https_path"] = self.get_https_url(self.input_data["path"])
+        self.model_args["file_size"] = self.get_file_size(self.input_data["path"])
 
 
 class AnnotationImporter(IntegratedDBImporter):
diff --git a/apiv2/db_import/importers/base.py b/apiv2/db_import/importers/base.py
index 42cf82a98..d6b082373 100644
--- a/apiv2/db_import/importers/base.py
+++ b/apiv2/db_import/importers/base.py
@@ -47,6 +47,24 @@ def get_s3_url(self, *input_path: tuple[str]) -> str:
             input_path = input_path[len(self.config.bucket_name) + 1 :]
         return os.path.join(self.config.s3_prefix, input_path)
 
+    def get_file_size(self, *input_path: tuple[str]) -> int | None:
+        input_path = os.path.join(*input_path)
+        if input_path.startswith(self.config.bucket_name):
+            input_path = input_path[len(self.config.bucket_name) + 1 :]
+        total_size = 0
+        try:
+            # Paginate so prefixes with more than 1000 objects (e.g. OME-Zarr trees) are fully counted.
+            paginator = self.config.s3_client.get_paginator("list_objects_v2")
+            for page in paginator.paginate(Bucket=self.config.bucket_name, Prefix=input_path):
+                for obj in page.get("Contents", []):
+                    total_size += obj["Size"]
+            # Whole megabytes; totals that round to 1 MB or less are stored as NULL.
+            total_size_mb = round(total_size / (1024 * 1024))
+            return total_size_mb if total_size_mb > 1 else None
+        except Exception as e:
+            print(f"Error retrieving folder size: {e}")
+            return None
+
     def _map_direct_fields(self):
         """Iterate over `self.direct_mapped_fields` and populate model args based on the data we find in the input dict."""
         for db_key, _ in self.direct_mapped_fields.items():
diff --git a/apiv2/db_import/importers/base_importer.py b/apiv2/db_import/importers/base_importer.py
index 8f3b051d1..a8331f988 100644
--- a/apiv2/db_import/importers/base_importer.py
+++ b/apiv2/db_import/importers/base_importer.py
@@ -79,6 +79,23 @@ def get_s3_url(self, *input_path: tuple[str]) -> str:
             input_path = input_path[len(self.config.bucket_name) + 1 :]
         return os.path.join(self.config.s3_prefix, input_path)
 
+    def get_file_size(self, *input_path: tuple[str]) -> int | None:
+        input_path = os.path.join(*input_path)
+        if input_path.startswith(self.config.bucket_name):
+            input_path = input_path[len(self.config.bucket_name) + 1 :]
+        total_size = 0
+        try:
+            # Paginate so prefixes with more than 1000 objects (e.g. OME-Zarr trees) are fully counted.
+            paginator = self.config.s3_client.get_paginator("list_objects_v2")
+            for page in paginator.paginate(Bucket=self.config.bucket_name, Prefix=input_path):
+                for obj in page.get("Contents", []):
+                    total_size += obj["Size"]
+            # Whole megabytes; totals that round to 1 MB or less are stored as NULL.
+            total_size_mb = round(total_size / (1024 * 1024))
+            return total_size_mb if total_size_mb > 1 else None
+        except Exception as e:
+            print(f"Error retrieving folder size: {e}")
+            return None
 
 class StaleDeletionDBImporter(BaseDBImporter):
     """
diff --git a/apiv2/db_import/importers/dataset.py b/apiv2/db_import/importers/dataset.py
index 1a65ba57c..da26e1f72 100644
--- a/apiv2/db_import/importers/dataset.py
+++ b/apiv2/db_import/importers/dataset.py
@@ -64,6 +64,7 @@ def get_computed_fields(self) -> dict[str, Any]:
         extra_data = {
             "s3_prefix": self.get_s3_url(self.dir_prefix),
             "https_prefix": self.get_https_url(self.dir_prefix),
+            "file_size": self.get_file_size(self.dir_prefix),
             "key_photo_url": None,
             "key_photo_thumbnail_url": None,
         }
diff --git a/apiv2/db_import/importers/frame.py b/apiv2/db_import/importers/frame.py
index b6d881d81..a0bd2d941 100644
--- a/apiv2/db_import/importers/frame.py
+++ b/apiv2/db_import/importers/frame.py
@@ -24,6 +24,7 @@ class FrameItem(ItemDBImporter):
     def load_computed_fields(self):
         self.model_args["s3_frame_path"] = self.get_s3_url(self.input_data["file"])
         self.model_args["https_frame_path"] = self.get_https_url(self.input_data["file"])
+        self.model_args["file_size"] = self.get_file_size(self.input_data["file"])
         self.model_args["run_id"] = self.input_data["run"].id
         self.model_args["deposition_id"] = self.input_data["deposition"].id
 
diff --git a/apiv2/db_import/importers/tiltseries.py b/apiv2/db_import/importers/tiltseries.py
index 4b34b8b38..b4e925c56 100644
--- a/apiv2/db_import/importers/tiltseries.py
+++ b/apiv2/db_import/importers/tiltseries.py
@@ -79,10 +79,12 @@ def get_computed_fields(self) -> dict[str, Any]:
         if mrc_path := self.metadata.get("mrc_file"):
             extra_data["s3_mrc_file"] = self.get_s3_url(mrc_path)
             extra_data["https_mrc_file"] = self.get_https_url(mrc_path)
+            extra_data["file_size_mrc"] = self.get_file_size(mrc_path)
 
         if omezarr_path := self.metadata.get("omezarr_dir"):
             extra_data["s3_omezarr_dir"] = self.get_s3_url(omezarr_path)
             extra_data["https_omezarr_dir"] = self.get_https_url(omezarr_path)
+            extra_data["file_size_omezarr"] = self.get_file_size(omezarr_path)
 
         if angle_list := self.get_first_match_file_name("*.rawtlt") or self.get_first_match_file_name("*.tlt"):
             extra_data["s3_angle_list"] = self.get_s3_url(angle_list)
diff --git a/apiv2/schema/schema.yaml b/apiv2/schema/schema.yaml
index 0598eab89..de326ad9e 100644
--- a/apiv2/schema/schema.yaml
+++ b/apiv2/schema/schema.yaml
@@ -850,8 +850,8 @@ classes:
         range: integer
         minimum_value: 0
         unit:
-          symbol: B
-          descriptive_name: bytes
+          symbol: MB
+          descriptive_name: megabytes
       https_path:
         name: https_path
         description: HTTPS path for this annotation file
@@ -1430,8 +1430,8 @@
         range: integer
         minimum_value: 0
         unit:
-          symbol: B
-          descriptive_name: bytes
+          symbol: MB
+          descriptive_name: megabytes
     slot_usage:
       id:
         description: An identifier for a CryoET dataset, assigned by the Data Portal.
           Used to identify the dataset as the directory name in data tree
@@ -1697,8 +1697,8 @@
         range: integer
         minimum_value: 0
         unit:
-          symbol: B
-          descriptive_name: bytes
+          symbol: MB
+          descriptive_name: megabytes
   PerSectionAlignmentParameters:
     name: PerSectionAlignmentParameters
     annotations:
@@ -1918,8 +1918,8 @@
         range: integer
         minimum_value: 0
         unit:
-          symbol: B
-          descriptive_name: bytes
+          symbol: MB
+          descriptive_name: megabytes
       s3_mrc_file:
         name: s3_mrc_file
         description: S3 path to this tiltseries in MRC format (no scaling)
@@ -1936,8 +1936,8 @@
         range: integer
         minimum_value: 0
         unit:
-          symbol: B
-          descriptive_name: bytes
+          symbol: MB
+          descriptive_name: megabytes
       https_omezarr_dir:
         name: https_omezarr_dir
         description: HTTPS path to this tiltseries in multiscale OME-Zarr format
@@ -2495,8 +2495,8 @@
         - tomogram_file_size_omezarr
         range: integer
         unit:
-          symbol: B
-          descriptive_name: bytes
+          symbol: MB
+          descriptive_name: megabytes
       s3_mrc_file:
         name: s3_mrc_file
         description: S3 path to this tomogram in MRC format (no scaling)
@@ -2519,8 +2519,8 @@
         - tomogram_file_size_mrc
         range: integer
         unit:
-          symbol: B
-          descriptive_name: bytes
+          symbol: MB
+          descriptive_name: megabytes
       scale0_dimensions:
         name: scale0_dimensions
         description: comma separated x,y,z dimensions of the unscaled tomogram
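
Reviewer note (not part of the patch): `get_file_size` totals every object under an S3 prefix and stores the result in whole megabytes, which is why the schema units above move from B/bytes to MB/megabytes. Below is a minimal standalone sketch of the same technique, assuming nothing beyond a boto3 S3 client; the `prefix_size_mb` name and the bucket/prefix values are placeholders for illustration, not portal identifiers.

import boto3


def prefix_size_mb(client, bucket: str, prefix: str) -> int | None:
    """Sum the size of every object under `prefix`, returned as whole megabytes.

    Mirrors the importer helper: paginates list_objects_v2 so prefixes
    with more than 1000 keys are fully counted, and returns None when
    the total rounds to 1 MB or less.
    """
    total_bytes = 0
    paginator = client.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # "Contents" is absent for pages with no matching keys.
        for obj in page.get("Contents", []):
            total_bytes += obj["Size"]
    total_mb = round(total_bytes / (1024 * 1024))
    return total_mb if total_mb > 1 else None


if __name__ == "__main__":
    # Placeholder bucket/prefix; substitute a real dataset path.
    s3 = boto3.client("s3")
    print(prefix_size_mb(s3, "my-example-bucket", "10000/TS_026/"))

Because a single `list_objects_v2` call returns at most 1000 keys, the paginator is what keeps many-chunk OME-Zarr directories from being undercounted; the `> 1` guard mirrors the patch, so prefixes totalling 1 MB or less are recorded as NULL rather than 0 or 1.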