Skip to content

Commit

Permalink
Update db import
Browse files Browse the repository at this point in the history
  • Loading branch information
kaloster committed Jan 9, 2025
1 parent 9271110 commit 6b40981
Show file tree
Hide file tree
Showing 8 changed files with 57 additions and 15 deletions.
1 change: 1 addition & 0 deletions apiv2/db_import/importers/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def load_computed_fields(self):
self.model_args["source"] = self.calculate_source()
self.model_args["s3_path"] = self.get_s3_url(self.input_data["path"])
self.model_args["https_path"] = self.get_https_url(self.input_data["path"])
self.model_args["file_size"] = self.get_file_size(self.input_data["path"])


class AnnotationImporter(IntegratedDBImporter):
Expand Down
18 changes: 18 additions & 0 deletions apiv2/db_import/importers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,24 @@ def get_s3_url(self, *input_path: tuple[str]) -> str:
if input_path.startswith(self.config.bucket_name):
input_path = input_path[len(self.config.bucket_name) + 1 :]
return os.path.join(self.config.s3_prefix, input_path)

def get_file_size(self, *input_path: str) -> "int | None":
    """Return the total size, in whole megabytes, of the S3 objects under a path.

    The path segments are joined with ``os.path.join``. If the result starts with
    ``self.config.bucket_name``, the bucket name and the separator that follows it
    are stripped, and the remainder is used as the ``Prefix`` for ``list_objects_v2``
    calls on ``self.config.s3_client``.

    A single listing response only carries one page of keys, so the listing is
    repeated with the continuation token until the response is no longer truncated.
    Without this, prefixes that hold many objects would be under-counted.

    Returns:
        The summed object size divided by ``1024 * 1024`` and rounded to an integer,
        or ``None`` when that value is not greater than 1 or when the listing fails.
    """
    key_prefix = os.path.join(*input_path)
    if key_prefix.startswith(self.config.bucket_name):
        # Drop "<bucket_name>" plus the separator that follows it.
        key_prefix = key_prefix[len(self.config.bucket_name) + 1 :]

    # NOTE(review): Prefix matching is purely textual, so a sibling key that merely
    # shares this prefix (e.g. "foo/bar2" for "foo/bar") is also counted - confirm
    # callers always pass unambiguous paths.
    request = {"Bucket": self.config.bucket_name, "Prefix": key_prefix}
    total_size = 0
    try:
        while True:
            response = self.config.s3_client.list_objects_v2(**request)
            total_size += sum(obj["Size"] for obj in response.get("Contents", []))
            next_token = response.get("NextContinuationToken")
            if not response.get("IsTruncated") or not next_token:
                break
            request["ContinuationToken"] = next_token
    except Exception as e:
        # Size lookup is best-effort: a listing failure must not abort the import.
        print(f"Error retrieving folder size: {e}")
        return None

    total_size_mb = round(total_size / (1024 * 1024))
    # NOTE(review): sizes that round to 0 or 1 MB are reported as None (kept as-is).
    return total_size_mb if total_size_mb > 1 else None

def _map_direct_fields(self):
"""Iterate over `self.direct_mapped_fields` and populate model args based on the data we find in the input dict."""
Expand Down
17 changes: 17 additions & 0 deletions apiv2/db_import/importers/base_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,24 @@ def get_s3_url(self, *input_path: tuple[str]) -> str:
input_path = input_path[len(self.config.bucket_name) + 1 :]
return os.path.join(self.config.s3_prefix, input_path)

def get_file_size(self, *input_path: str) -> "int | None":
    """Compute the combined size, in whole megabytes, of all S3 objects under a path.

    Path segments are joined with ``os.path.join``. A leading
    ``self.config.bucket_name`` (and the separator after it) is removed so the rest
    can serve as the ``Prefix`` for ``self.config.s3_client.list_objects_v2``.

    Listing responses are paged, so this keeps requesting pages with the returned
    continuation token until the response is no longer marked truncated. That way
    prefixes containing more keys than one response can hold are summed completely.

    Returns:
        ``round(total_bytes / (1024 * 1024))`` when that value is greater than 1,
        otherwise ``None``. ``None`` is also returned if the listing raises.
    """
    prefix = os.path.join(*input_path)
    bucket = self.config.bucket_name
    if prefix.startswith(bucket):
        # Skip the bucket name and the path separator directly after it.
        prefix = prefix[len(bucket) + 1 :]

    # NOTE(review): S3 prefix matching is textual; keys of a sibling whose name
    # starts with the same characters would be included - confirm with callers.
    list_kwargs = {"Bucket": bucket, "Prefix": prefix}
    total_bytes = 0
    try:
        while True:
            page = self.config.s3_client.list_objects_v2(**list_kwargs)
            for obj in page.get("Contents", []):
                total_bytes += obj["Size"]
            token = page.get("NextContinuationToken")
            if not page.get("IsTruncated") or not token:
                break
            list_kwargs["ContinuationToken"] = token
    except Exception as e:
        # Best-effort lookup: report the problem and let the import continue.
        print(f"Error retrieving folder size: {e}")
        return None

    size_mb = round(total_bytes / (1024 * 1024))
    # NOTE(review): values of 0 or 1 MB are deliberately left as None (unchanged).
    return size_mb if size_mb > 1 else None

class StaleDeletionDBImporter(BaseDBImporter):
"""
Supports insert of new record, update of existing record, and delete of stale records. This class iterates over a
Expand Down
1 change: 1 addition & 0 deletions apiv2/db_import/importers/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def get_computed_fields(self) -> dict[str, Any]:
extra_data = {
"s3_prefix": self.get_s3_url(self.dir_prefix),
"https_prefix": self.get_https_url(self.dir_prefix),
"file_size": self.get_file_size(self.dir_prefix),
"key_photo_url": None,
"key_photo_thumbnail_url": None,
}
Expand Down
1 change: 1 addition & 0 deletions apiv2/db_import/importers/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class FrameItem(ItemDBImporter):
def load_computed_fields(self):
# Populate self.model_args with values derived from self.input_data.
# S3 and HTTPS locations of the frame, both built from the same "file" entry.
self.model_args["s3_frame_path"] = self.get_s3_url(self.input_data["file"])
self.model_args["https_frame_path"] = self.get_https_url(self.input_data["file"])
# Size of the frame file as reported by get_file_size (may be None).
# NOTE(review): presumably whole megabytes, matching the schema unit - confirm.
self.model_args["file_size"] = self.get_file_size(self.input_data["file"])
# Foreign keys are taken from the ids of the "run" and "deposition" input objects.
self.model_args["run_id"] = self.input_data["run"].id
self.model_args["deposition_id"] = self.input_data["deposition"].id

Expand Down
4 changes: 3 additions & 1 deletion apiv2/db_import/importers/tiltseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,13 @@ def get_computed_fields(self) -> dict[str, Any]:
if mrc_path := self.metadata.get("mrc_file"):
extra_data["s3_mrc_file"] = self.get_s3_url(mrc_path)
extra_data["https_mrc_file"] = self.get_https_url(mrc_path)
extra_data["file_size_mrc"] = self.get_file_size(mrc_path)

if omezarr_path := self.metadata.get("omezarr_dir"):
extra_data["s3_omezarr_dir"] = self.get_s3_url(omezarr_path)
extra_data["https_omezarr_dir"] = self.get_https_url(omezarr_path)

extra_data["file_size_omezarr"] = self.get_file_size(omezarr_path)

if angle_list := self.get_first_match_file_name("*.rawtlt") or self.get_first_match_file_name("*.tlt"):
extra_data["s3_angle_list"] = self.get_s3_url(angle_list)
extra_data["https_angle_list"] = self.get_https_url(angle_list)
Expand Down
28 changes: 14 additions & 14 deletions apiv2/schema/schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -850,8 +850,8 @@ classes:
range: integer
minimum_value: 0
unit:
symbol: B
descriptive_name: bytes
symbol: MB
descriptive_name: megabytes
https_path:
name: https_path
description: HTTPS path for this annotation file
Expand Down Expand Up @@ -1430,8 +1430,8 @@ classes:
range: integer
minimum_value: 0
unit:
symbol: B
descriptive_name: bytes
symbol: MB
descriptive_name: megabytes
slot_usage:
id:
description: An identifier for a CryoET dataset, assigned by the Data Portal. Used to identify the dataset as the directory name in data tree
Expand Down Expand Up @@ -1697,8 +1697,8 @@ classes:
range: integer
minimum_value: 0
unit:
symbol: B
descriptive_name: bytes
symbol: MB
descriptive_name: megabytes
PerSectionAlignmentParameters:
name: PerSectionAlignmentParameters
annotations:
Expand Down Expand Up @@ -1918,8 +1918,8 @@ classes:
range: integer
minimum_value: 0
unit:
symbol: B
descriptive_name: bytes
symbol: MB
descriptive_name: megabytes
s3_mrc_file:
name: s3_mrc_file
description: S3 path to this tiltseries in MRC format (no scaling)
Expand All @@ -1936,8 +1936,8 @@ classes:
range: integer
minimum_value: 0
unit:
symbol: B
descriptive_name: bytes
symbol: MB
descriptive_name: megabytes
https_omezarr_dir:
name: https_omezarr_dir
description: HTTPS path to this tiltseries in multiscale OME-Zarr format
Expand Down Expand Up @@ -2495,8 +2495,8 @@ classes:
- tomogram_file_size_omezarr
range: integer
unit:
symbol: B
descriptive_name: bytes
symbol: MB
descriptive_name: megabytes
s3_mrc_file:
name: s3_mrc_file
description: S3 path to this tomogram in MRC format (no scaling)
Expand All @@ -2519,8 +2519,8 @@ classes:
- tomogram_file_size_mrc
range: integer
unit:
symbol: B
descriptive_name: bytes
symbol: MB
descriptive_name: megabytes
scale0_dimensions:
name: scale0_dimensions
description: comma separated x,y,z dimensions of the unscaled tomogram
Expand Down
2 changes: 2 additions & 0 deletions ingestion_tools/scripts/importers/db/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ def get_data_map(self) -> dict[str, Any]:
def update_data_map(self, data_map: dict[str, Any], metadata: dict[str, Any], index: int) -> dict[str, Any]:
    """Add the S3 and HTTPS locations of an annotation file to ``data_map``.

    Args:
        data_map: Mapping to extend; it is modified in place.
        metadata: Annotation file metadata; its ``"path"`` entry is the file location.
        index: Not used by this implementation.

    Returns:
        The same ``data_map`` object, now with ``"s3_path"`` and ``"https_path"`` set.

    Raises:
        KeyError: If ``metadata`` has no ``"path"`` entry.
    """
    file_path = metadata["path"]
    data_map["s3_path"] = self.get_s3_url(file_path)
    data_map["https_path"] = self.get_https_url(file_path)
    return data_map

@classmethod
Expand Down

0 comments on commit 6b40981

Please sign in to comment.