Skip to content

Commit

Permalink
Merge pull request #70 from SciCatProject/refactor-offline-ingestor
Browse files Browse the repository at this point in the history
Refactor origdatablock constructor
  • Loading branch information
YooSunYoung authored Aug 28, 2024
2 parents 4da7d7f + 513d3d9 commit bdafab7
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 81 deletions.
81 changes: 80 additions & 1 deletion src/scicat_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@

import h5py
from scicat_communication import retrieve_value_from_scicat
from scicat_configuration import DatasetOptions, FileHandlingOptions, SciCatOptions
from scicat_configuration import (
DatasetOptions,
FileHandlingOptions,
SciCatOptions,
)
from scicat_metadata import (
HIGH_LEVEL_METADATA_TYPE,
SCIENTIFIC_METADATA_TYPE,
Expand Down Expand Up @@ -156,6 +160,8 @@ class OrigDataBlockInstance:
size: int
chkAlg: str
dataFileList: list[DataFileListItem]
ownerGroup: str | None = None
accessGroups: list[str] | None = None


def _calculate_checksum(file_path: pathlib.Path, algorithm_name: str) -> str | None:
Expand Down Expand Up @@ -483,3 +489,76 @@ def scicat_dataset_to_dict(dataset: ScicatDataset) -> dict:
"""
return {k: v for k, v in asdict(dataset).items() if v is not None}


def _define_dataset_source_folder(datafilelist: list[DataFileListItem]) -> pathlib.Path:
"""
Return the dataset source folder, which is the common path
between all the data files associated with the dataset
"""
import os

return pathlib.Path(os.path.commonpath([item.path for item in datafilelist]))


def _path_to_relative(
datafilelist_item: DataFileListItem, dataset_source_folder: pathlib.Path
) -> DataFileListItem:
"""
Copy the datafiles item and transform the path to the relative path
to the dataset source folder
"""
from copy import copy

origdatablock_datafilelist_item = copy(datafilelist_item)
origdatablock_datafilelist_item.path = (
pathlib.Path(datafilelist_item.path)
.relative_to(dataset_source_folder)
.as_posix()
)
return origdatablock_datafilelist_item


def _prepare_origdatablock_datafilelist(
    datafiles_list: list[DataFileListItem], dataset_source_folder: pathlib.Path
) -> list[DataFileListItem]:
    """
    Build the datafile list for the origdatablock entry in scicat:
    each entry is copied with its path rewritten relative to the
    dataset source folder.
    """
    relative_entries = []
    for entry in datafiles_list:
        relative_entries.append(_path_to_relative(entry, dataset_source_folder))
    return relative_entries


def create_origdatablock_instance(
    data_file_list: list[DataFileListItem],
    scicat_dataset: dict,
    config: FileHandlingOptions,
) -> OrigDataBlockInstance:
    """
    Build the ``OrigDataBlockInstance`` describing ``data_file_list``.

    The source folder is derived as the common path of all data files,
    and every file path in the resulting block is relative to it.
    Owner/access information is taken from the already-built dataset dict.
    """
    source_folder = _define_dataset_source_folder(data_file_list)
    relative_datafiles = _prepare_origdatablock_datafilelist(
        data_file_list, source_folder
    )
    # Files without a known size are skipped in the total.
    total_size = sum(item.size for item in data_file_list if item.size is not None)
    return OrigDataBlockInstance(
        datasetId=scicat_dataset["pid"],
        size=total_size,
        chkAlg=config.file_hash_algorithm,
        dataFileList=relative_datafiles,
        ownerGroup=scicat_dataset["ownerGroup"],
        accessGroups=scicat_dataset["accessGroups"],
    )


def origdatablock_to_dict(origdatablock: OrigDataBlockInstance) -> dict:
"""
Convert the ``origdatablock`` to a dictionary.
It removes the ``None`` values from the dictionary.
You can add more handlings for specific fields here if needed.
Params
------
origdatablock:
Origdatablock instance to be sent to scicat backend.
"""
return {k: v for k, v in asdict(origdatablock).items() if v is not None}
99 changes: 19 additions & 80 deletions src/scicat_offline_ingestor.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject)
# import scippnexus as snx
import copy
import json
import os
import pathlib

import h5py
Expand All @@ -14,8 +10,10 @@
)
from scicat_dataset import (
create_data_file_list,
create_origdatablock_instance,
create_scicat_dataset_instance,
extract_variables_values,
origdatablock_to_dict,
scicat_dataset_to_dict,
)
from scicat_logging import build_logger
Expand All @@ -24,68 +22,12 @@
from system_helpers import handle_exceptions


def _prepare_scicat_origdatablock(scicat_dataset, datafilelist, config, logger):
"""
Create local copy of the orig datablock to send to scicat
"""
logger.info(
"_prepare_scicat_origdatablock: Preparing scicat origdatablock structure"
)
origdatablock = {
"ownerGroup": scicat_dataset["ownerGroup"],
"accessGroups": scicat_dataset["accessGroups"],
"size": sum([item["size"] for item in datafilelist]),
"chkAlg": config.ingestion.file_hash_algorithm,
"dataFileList": datafilelist,
"datasetId": scicat_dataset["pid"],
}

logger.info(
"_prepare_scicat_origdatablock: Scicat origdatablock: %s",
json.dumps(origdatablock),
)
return origdatablock


def _define_dataset_source_folder(datafilelist) -> pathlib.Path:
"""
Return the dataset source folder, which is the common path
between all the data files associated with the dataset
"""
return pathlib.Path(os.path.commonpath([item["path"] for item in datafilelist]))


def _path_to_relative(
datafilelist_item: dict, dataset_source_folder: pathlib.Path
) -> dict:
"""
Copy the datafiles item and transform the path to the relative path
to the dataset source folder
"""
origdatablock_datafilelist_item = copy.deepcopy(datafilelist_item)
origdatablock_datafilelist_item["path"] = str(
datafilelist_item["path"].to_relative(dataset_source_folder)
)
return origdatablock_datafilelist_item


def _prepare_origdatablock_datafilelist(
    datafiles_list: list, dataset_source_folder: pathlib.Path
) -> list:
    """
    Build the datafile list for the origdatablock entry in scicat:
    every entry's path is rewritten relative to the dataset source folder.
    """
    relative_entries = []
    for entry in datafiles_list:
        relative_entries.append(_path_to_relative(entry, dataset_source_folder))
    return relative_entries


def main() -> None:
"""Main entry point of the app."""
arg_parser = build_offline_ingestor_arg_parser()
arg_namespace = arg_parser.parse_args()
config = build_scicat_offline_ingestor_config(arg_namespace)
ingestion_options = config.ingestion
fh_options = ingestion_options.file_handling
fh_options = config.ingestion.file_handling
logger = build_logger(config)

# Log the configuration as dictionary so that it is easier to read from the logs
Expand All @@ -95,17 +37,13 @@ def main() -> None:
)

# Collect all metadata schema configurations
schemas = collect_schemas(ingestion_options.schemas_directory)
schemas = collect_schemas(config.ingestion.schemas_directory)

with handle_exceptions(logger):
nexus_file_path = pathlib.Path(config.offline_run.nexus_file)
logger.info(
"Nexus file to be ingested : %s",
nexus_file_path,
)
logger.info("Nexus file to be ingested: %s", nexus_file_path)

# define which is the directory where the ingestor should save
# the files it creates, if any is created
# Path to the directory where the ingestor saves the files it creates
ingestor_directory = compose_ingestor_directory(fh_options, nexus_file_path)

# open nexus file with h5py
Expand All @@ -127,7 +65,8 @@ def main() -> None:
# TODO: add done_writing_message_file and nexus_structure_file
)

# Create scicat dataset instance(entry)
# Prepare scicat dataset instance(entry)
logger.info("Preparing scicat dataset instance ...")
local_dataset = scicat_dataset_to_dict(
create_scicat_dataset_instance(
metadata_schema_id=metadata_schema["id"],
Expand All @@ -138,22 +77,22 @@ def main() -> None:
logger=logger,
)
)
# create dataset in scicat
logger.debug("Scicat dataset: %s", local_dataset)
# Create dataset in scicat
scicat_dataset = create_scicat_dataset(
dataset=local_dataset, config=config.scicat, logger=logger
)

dataset_source_folder = _define_dataset_source_folder(data_file_list)

origdatablock_datafiles_list = _prepare_origdatablock_datafilelist(
data_file_list, dataset_source_folder
)
# create and populate scicat origdatablock entry
# with files and hashes previously computed
local_origdatablock = _prepare_scicat_origdatablock(
scicat_dataset, origdatablock_datafiles_list, config, logger
# Prepare origdatablock
logger.info("Preparing scicat origdatablock instance ...")
local_origdatablock = origdatablock_to_dict(
create_origdatablock_instance(
data_file_list=data_file_list,
scicat_dataset=local_dataset,
config=fh_options,
)
)

logger.debug("Scicat origdatablock: %s", local_origdatablock)
# create origdatablock in scicat
scicat_origdatablock = create_scicat_origdatablock(
origdatablock=local_origdatablock, config=config.scicat, logger=logger
Expand Down

0 comments on commit bdafab7

Please sign in to comment.