From e58ab5feb0ce6b8ceacae17041ba50b5325dacda Mon Sep 17 00:00:00 2001
From: Max Novelli
Date: Mon, 30 Sep 2024 16:52:32 +0200
Subject: [PATCH 1/7] work of the day

---
 src/scicat_communication.py    |  6 +++---
 src/scicat_dataset.py          | 24 ++++++++++++++++--------
 src/scicat_metadata.py         |  8 +++++---
 src/scicat_offline_ingestor.py |  7 ++++++-
 src/scicat_online_ingestor.py  |  3 +++
 5 files changed, 33 insertions(+), 15 deletions(-)
 mode change 100644 => 100755 src/scicat_offline_ingestor.py

diff --git a/src/scicat_communication.py b/src/scicat_communication.py
index d9216c0..aef829c 100644
--- a/src/scicat_communication.py
+++ b/src/scicat_communication.py
@@ -13,11 +13,11 @@ def retrieve_value_from_scicat(
     variable_url: str,  # It should be already rendered from variable_recipe["url"]
     field_name: str,  # variable_recipe["field"]
 ) -> str:
-    url = config.host.removesuffix('/') + variable_url
+    url = config.host.removesuffix('/') + "/" + variable_url
     response: dict = requests.get(
-        url, headers={"token": config.token}, timeout=config.timeout
+        url, headers={"Authorization": config.token}, timeout=config.timeout
     ).json()
-    return response[field_name]
+    return response[field_name] if field_name else response
 
 
 class ScicatDatasetAPIError(Exception):
diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py
index c465771..4429c48 100644
--- a/src/scicat_dataset.py
+++ b/src/scicat_dataset.py
@@ -47,6 +47,8 @@ def to_date(value: Any) -> str | None:
         return datetime.datetime.fromtimestamp(value, tz=datetime.UTC).isoformat()
     return None
 
+def to_dict(value: Any) -> dict:
+    return dict(value)
 
 _DtypeConvertingMap = MappingProxyType(
     {
@@ -55,6 +57,7 @@ def to_date(value: Any) -> str | None:
         "integer": to_integer,
         "float": to_float,
         "date": to_date,
+        "dict": to_dict,
         # TODO: Add email converter
     }
 )
@@ -73,6 +76,7 @@ def convert_to_type(input_value: Any, dtype_desc: str) -> Any:
     {
         "DO_NOTHING": lambda value: value,
         "join_with_space": lambda value: ", ".join(value),
+        "evaluate": lambda value: eval(value),
     }
 )
@@ -82,28 +86,32 @@ def _get_operator(operator: str | None) -> Callable:
 
 
 def extract_variables_values(
-    variables: dict[str, dict], h5file: h5py.File, config: SciCatOptions
+    variables: dict[str, dict],
+    h5file: h5py.File,
+    config: SciCatOptions
 ) -> dict:
     variable_map = {}
     for variable_name, variable_recipe in variables.items():
-        if (source := variable_recipe["source"]) == "NXS":
-            value = h5file[variable_recipe["path"]][...]
+        print(variable_name)
+        source = variable_recipe.source
+        if source == "NXS":
+            value = h5file[variable_recipe.path][...].item().decode('utf-8')
         elif source == "SC":
             value = retrieve_value_from_scicat(
                 config=config,
                 variable_url=render_variable_value(
-                    variable_recipe["url"], variable_map
+                    variable_recipe.url, variable_map
                 ),
-                field_name=variable_recipe["field"],
+                field_name=variable_recipe.field,
             )
         elif source == "VALUE":
-            value = _get_operator(variable_recipe.get("operator"))(
-                render_variable_value(variable_recipe["value"], variable_map)
+            value = _get_operator(variable_recipe.operator)(
+                render_variable_value(variable_recipe.value, variable_map)
             )
         else:
             raise Exception("Invalid variable source: ", source)
         variable_map[variable_name] = convert_to_type(
-            value, variable_recipe["value_type"]
+            value, variable_recipe.value_type
         )
     return variable_map
diff --git a/src/scicat_metadata.py b/src/scicat_metadata.py
index 0548f6a..2a01898 100644
--- a/src/scicat_metadata.py
+++ b/src/scicat_metadata.py
@@ -138,8 +138,8 @@ def from_file(cls, schema_file_name: pathlib.Path) -> "MetadataSchema":
 
 
 def render_variable_value(var_value: str, variable_registry: dict) -> str:
-    for var_name, var_value in variable_registry.items():
-        var_value = var_value.replace("<" + var_name + ">", str(var_value))
+    for reg_var_name, reg_var_value in variable_registry.items():
+        var_value = var_value.replace("<" + reg_var_name + ">", str(reg_var_value))
 
     if "<" in var_value and ">" in var_value:
         raise Exception(f"Unresolved variable: {var_value}")
@@ -167,7 +167,9 @@ def collect_schemas(dir_path: pathlib.Path) -> OrderedDict[str, MetadataSchema]:
 
 
 def select_applicable_schema(
-    nexus_file: pathlib.Path, schemas: OrderedDict[str, MetadataSchema]
+    nexus_file: pathlib.Path,
+    h5_file: any,
+    schemas: OrderedDict[str, MetadataSchema]
 ) -> MetadataSchema:
     """
     Evaluates which metadata schema configuration is applicable to ``nexus_file``.
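
A note on the render_variable_value hunk above: the old loop bound both the template and each registry entry to the same name, var_value, so every "<name>" placeholder was effectively replaced using itself. Below is a minimal standalone sketch of the fixed behaviour; the registry contents are invented for illustration, and the trailing return is assumed from how the function is used elsewhere in the series.

    def render_variable_value(var_value: str, variable_registry: dict) -> str:
        # Distinct loop names no longer shadow the template being rendered.
        for reg_var_name, reg_var_value in variable_registry.items():
            var_value = var_value.replace("<" + reg_var_name + ">", str(reg_var_value))
        if "<" in var_value and ">" in var_value:
            raise Exception(f"Unresolved variable: {var_value}")
        return var_value

    registry = {"instrument": "coda", "proposal_id": "123456"}
    assert render_variable_value("<instrument>/<proposal_id>", registry) == "coda/123456"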
diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py
old mode 100644
new mode 100755
index 638e1da..85cbd8e
--- a/src/scicat_offline_ingestor.py
+++ b/src/scicat_offline_ingestor.py
@@ -70,7 +70,9 @@ def main() -> None:
 
         # define variables values
         variable_map = extract_variables_values(
-            metadata_schema['variables'], h5file, config.scicat
+            metadata_schema.variables,
+            h5file,
+            config.scicat
         )
 
         # Collect data-file descriptions
@@ -124,3 +126,6 @@ def main() -> None:
                 scicat_origdatablock,
             )
             raise RuntimeError("Failed to create dataset or origdatablock.")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/src/scicat_online_ingestor.py b/src/scicat_online_ingestor.py
index b7e4101..5ebb9a0 100644
--- a/src/scicat_online_ingestor.py
+++ b/src/scicat_online_ingestor.py
@@ -164,3 +164,6 @@ def main() -> None:
         # check if we need to commit the individual message
         if config.kafka.individual_message_commit:
             _individual_message_commit(offline_ingestors, consumer, logger)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From 6c0c5cb9911be6465ebce46459fdf77d776a47c2 Mon Sep 17 00:00:00 2001
From: Max Novelli
Date: Wed, 2 Oct 2024 09:33:13 +0200
Subject: [PATCH 2/7] testing and more testing

---
 src/scicat_dataset.py | 45 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 39 insertions(+), 6 deletions(-)

diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py
index 4429c48..fcac009 100644
--- a/src/scicat_dataset.py
+++ b/src/scicat_dataset.py
@@ -22,7 +22,8 @@
     VALID_METADATA_TYPES,
     render_variable_value,
 )
-
+import re
+import copy
 
 def to_string(value: Any) -> str:
     return str(value)
@@ -75,7 +76,7 @@ def convert_to_type(input_value: Any, dtype_desc: str) -> Any:
 _OPERATOR_REGISTRY = MappingProxyType(
     {
         "DO_NOTHING": lambda value: value,
-        "join_with_space": lambda value: ", ".join(value),
+        "join_with_space": lambda value: ", ".join(eval(value) if isinstance(value,str) else value),
         "evaluate": lambda value: eval(value),
     }
 )
@@ -95,7 +96,18 @@ def extract_variables_values(
         print(variable_name)
         source = variable_recipe.source
         if source == "NXS":
-            value = h5file[variable_recipe.path][...].item().decode('utf-8')
+            path = variable_recipe.path
+            if "*" in path:
+                provided_path = path.split("/")[1:]
+                provided_path[0] = "/" + provided_path[0]
+                expanded_paths = extract_paths_from_h5_file(h5file,provided_path)
+                value = [
+                    h5file[p][...].item().decode("utf-8")
+                    for p
+                    in expanded_paths
+                ]
+            else:
+                value = h5file[path][...].item().decode("utf-8")
         elif source == "SC":
             value = retrieve_value_from_scicat(
                 config=config,
@@ -105,9 +117,9 @@ def extract_variables_values(
                 field_name=variable_recipe.field,
             )
         elif source == "VALUE":
-            value = _get_operator(variable_recipe.operator)(
-                render_variable_value(variable_recipe.value, variable_map)
-            )
+            value = variable_recipe.value
+            value = render_variable_value(value, variable_map) if isinstance(value,str) else value
+            value = _get_operator(variable_recipe.operator)(value)
         else:
             raise Exception("Invalid variable source: ", source)
         variable_map[variable_name] = convert_to_type(
@@ -115,6 +127,27 @@ def extract_variables_values(
         )
     return variable_map
 
+def extract_paths_from_h5_file(
+    h5_object: Any,
+    path: list[str],
+) -> list[str]:
+    master_key = path.pop(0)
+    output_paths = [master_key]
+    if "*" in master_key:
+        temp_keys = [k2 for k2 in list(h5_object.keys()) if re.search(master_key, k2)]
+        output_paths = []
+        for key in temp_keys:
+            output_paths += [
+                key + "/" + subkey
+                for subkey
+                in extract_paths_from_h5_file(h5_object[key], copy.deepcopy(path))
+            ]
+    else:
+        if path:
+            output_paths = [master_key + "/" + subkey for subkey in extract_paths_from_h5_file(h5_object[master_key],path)]
+
+    return output_paths
+
 
 @dataclass(kw_only=True)
 class TechniqueDesc:
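
The extract_paths_from_h5_file helper added in patch 2 expands a path whose segments may be regular expressions (any segment containing "*") into all matching concrete paths. A runnable sketch follows, with a plain dict of dicts standing in for the h5py.File, since only .keys() and item access are exercised; the group layout is invented for the demonstration.

    import copy
    import re

    def extract_paths_from_h5_file(h5_object, path):
        # Mirrors the patch: consume one segment, recurse into children.
        master_key = path.pop(0)
        output_paths = [master_key]
        if "*" in master_key:
            temp_keys = [k for k in list(h5_object.keys()) if re.search(master_key, k)]
            output_paths = []
            for key in temp_keys:
                output_paths += [
                    key + "/" + subkey
                    for subkey in extract_paths_from_h5_file(h5_object[key], copy.deepcopy(path))
                ]
        elif path:
            output_paths = [
                master_key + "/" + subkey
                for subkey in extract_paths_from_h5_file(h5_object[master_key], path)
            ]
        return output_paths

    tree = {"/entry": {"detector_0": {"name": "a"}, "detector_1": {"name": "b"}}}
    print(extract_paths_from_h5_file(tree, ["/entry", "detector_.*", "name"]))
    # ['/entry/detector_0/name', '/entry/detector_1/name']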
From aa9bcb37fab92089c7873b6fd8600826cd4bf82e Mon Sep 17 00:00:00 2001
From: Max Novelli
Date: Thu, 3 Oct 2024 13:31:50 +0200
Subject: [PATCH 3/7] tested until scicat dataset local creation

---
 src/scicat_dataset.py          | 179 +++++++++++++++++--------------
 src/scicat_offline_ingestor.py |   7 +-
 2 files changed, 110 insertions(+), 76 deletions(-)

diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py
index fcac009..09efe94 100644
--- a/src/scicat_dataset.py
+++ b/src/scicat_dataset.py
@@ -2,6 +2,7 @@
 # Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject)
 import datetime
 import logging
+import os.path
 import pathlib
 import uuid
 from collections.abc import Callable, Iterable
@@ -14,7 +15,7 @@
 from scicat_configuration import (
     DatasetOptions,
     FileHandlingOptions,
-    SciCatOptions,
+    SciCatOptions, OfflineIngestorConfig,
 )
 from scicat_metadata import (
     HIGH_LEVEL_METADATA_TYPE,
@@ -30,7 +31,7 @@ def to_string(value: Any) -> str:
 
 
 def to_string_array(value: list[Any]) -> list[str]:
-    return [str(v) for v in value]
+    return [str(v) for v in (eval(value) if isinstance(value, str) else value)]
 
 
 def to_integer(value: Any) -> int:
@@ -59,6 +60,7 @@ def to_date(value: Any) -> str | None:
         "float": to_float,
         "date": to_date,
         "dict": to_dict,
+        "email": to_string
         # TODO: Add email converter
     }
 )
@@ -68,7 +70,8 @@ def convert_to_type(input_value: Any, dtype_desc: str) -> Any:
     if (converter := _DtypeConvertingMap.get(dtype_desc)) is None:
         raise ValueError(
             "Invalid dtype description. Must be one of: ",
-            "string, string[], integer, float, date.\nGot: {dtype_desc}",
+            "string, string[], integer, float, date.",
+            f"Got: {dtype_desc}",
         )
     return converter(input_value)
@@ -78,6 +81,8 @@ def convert_to_type(input_value: Any, dtype_desc: str) -> Any:
         "DO_NOTHING": lambda value: value,
         "join_with_space": lambda value: ", ".join(eval(value) if isinstance(value,str) else value),
         "evaluate": lambda value: eval(value),
+        "filename": lambda value: os.path.basename(value),
+        "dirname-2": lambda value: os.path.dirname(os.path.dirname(value))
     }
 )
@@ -89,9 +94,12 @@ def _get_operator(operator: str | None) -> Callable:
 def extract_variables_values(
     variables: dict[str, dict],
     h5file: h5py.File,
-    config: SciCatOptions
+    config: OfflineIngestorConfig
 ) -> dict:
-    variable_map = {}
+    variable_map = {
+        "filepath" : pathlib.Path(config.nexus_file),
+        "now" : datetime.datetime.now().isoformat(),
+    }
     for variable_name, variable_recipe in variables.items():
         print(variable_name)
         source = variable_recipe.source
@@ -110,7 +118,7 @@ def extract_variables_values(
                 value = h5file[path][...].item().decode("utf-8")
         elif source == "SC":
             value = retrieve_value_from_scicat(
-                config=config,
+                config=config.scicat,
                 variable_url=render_variable_value(
                     variable_recipe.url, variable_map
                 ),
@@ -164,7 +172,7 @@ class ScicatDataset:
     numberOfFiles: int
     isPublished: bool = False
     datasetName: str
-    description: str
+    description: str = field(default=None)
     principalInvestigator: str
     creationLocation: str
     scientificMetadata: dict
@@ -174,18 +182,18 @@ class ScicatDataset:
     contactEmail: str
     creationTime: str
     type: str = "raw"
-    sampleId: str
+    sampleId: str = field(default=None)
     techniques: list[TechniqueDesc] = field(default_factory=list)
     instrumentId: str | None = None
     proposalId: str | None = None
     ownerGroup: str | None = None
-    accessGroup: list[str] | None = None
+    accessGroups: list[str] | None = None
 
 
 @dataclass(kw_only=True)
 class DataFileListItem:
     path: str
-    "Absolute path to the file."
+    "Relative path of the file to the source folder."
     size: int | None = None
     "Size of the single file in bytes."
     time: str
@@ -212,7 +220,7 @@ def _calculate_checksum(file_path: pathlib.Path, algorithm_name: str) -> str | N
     if not file_path.exists():
         return None
 
-    if algorithm_name != "b2blake":
+    if algorithm_name != "blake2b":
         raise ValueError(
             "Only b2blake hash algorithm is supported for now. Got: ",
             f"{algorithm_name}",
@@ -230,33 +238,59 @@ def _calculate_checksum(file_path: pathlib.Path, algorithm_name: str) -> str | N
 def _create_single_data_file_list_item(
     *,
     file_path: pathlib.Path,
-    calculate_checksum: bool,
+    compute_file_hash: bool,
     compute_file_stats: bool,
     file_hash_algorithm: str = "",
 ) -> DataFileListItem:
     """``DataFileListItem`` constructing helper."""
-    if file_path.exists() and compute_file_stats:
-        return DataFileListItem(
-            path=file_path.absolute().as_posix(),
-            size=(file_stats := file_path.stat()).st_size,
-            time=datetime.datetime.fromtimestamp(
-                file_stats.st_ctime, tz=datetime.UTC
-            ).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
-            chk=_calculate_checksum(file_path, file_hash_algorithm)
-            if calculate_checksum
-            else None,
-            uid=str(file_stats.st_uid),
-            gid=str(file_stats.st_gid),
-            perm=oct(file_stats.st_mode),
-        )
-    else:
-        return DataFileListItem(
-            path=file_path.absolute().as_posix(),
-            time=datetime.datetime.now(tz=datetime.UTC).strftime(
-                "%Y-%m-%dT%H:%M:%S.000Z"
-            ),
-        )
+    file_info = {
+        "path" : file_path.absolute().as_posix(),
+        "time" : datetime.datetime.now(tz=datetime.UTC).strftime(
+            "%Y-%m-%dT%H:%M:%S.000Z"
+        ),
+    }
+    if file_path.exists():
+        if compute_file_stats:
+            file_stats = file_path.stat()
+            file_info = {
+                **file_info,
+                **{
+                    "size" : file_stats.st_size,
+                    "time" : datetime.datetime.fromtimestamp(
+                        file_stats.st_ctime, tz=datetime.UTC
+                    ).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
+                    "uid" : str(file_stats.st_uid),
+                    "gid" : str(file_stats.st_gid),
+                    "perm" : oct(file_stats.st_mode),
+                }
+            }
+
+        if compute_file_hash:
+            file_info["chk"] = _calculate_checksum(file_path, file_hash_algorithm)
+
+    return DataFileListItem(**file_info)
+    # if file_path.exists() and compute_file_stats:
+    #     return DataFileListItem(
+    #         path=file_path.absolute().as_posix(),
+    #         size=(file_stats := file_path.stat()).st_size,
+    #         time=datetime.datetime.fromtimestamp(
+    #             file_stats.st_ctime, tz=datetime.UTC
+    #         ).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
+    #         chk=_calculate_checksum(file_path, file_hash_algorithm)
+    #         if compute_file_hash
+    #         else None,
+    #         uid=str(file_stats.st_uid),
+    #         gid=str(file_stats.st_gid),
+    #         perm=oct(file_stats.st_mode),
+    #     )
+    # else:
+    #     return DataFileListItem(
+    #         path=file_path.absolute().as_posix(),
+    #         time=datetime.datetime.now(tz=datetime.UTC).strftime(
+    #             "%Y-%m-%dT%H:%M:%S.000Z"
+    #         ),
+    #     )
 
 
 def _build_hash_path(
@@ -289,6 +323,7 @@ def create_data_file_list(
     nexus_structure_file: pathlib.Path | None = None,
     ingestor_directory: pathlib.Path,
     config: FileHandlingOptions,
+    source_folder: pathlib.Path | str | None = None,
     logger: logging.Logger,
 ) -> list[DataFileListItem]:
     """
@@ -316,6 +351,7 @@ def create_data_file_list(
         _create_single_data_file_list_item,
         file_hash_algorithm=config.file_hash_algorithm,
         compute_file_stats=config.compute_file_stats,
+        compute_file_hash=config.compute_file_hash
     )
 
     # Collect the files that will be ingested
@@ -331,7 +367,6 @@ def create_data_file_list(
         logger.info("Adding file %s to the datafiles list", minimum_file_path)
         new_file_item = single_file_constructor(
             file_path=minimum_file_path,
-            calculate_checksum=config.compute_file_hash,
         )
         data_file_list.append(new_file_item)
         if config.save_file_hash:
@@ -344,31 +379,29 @@ def create_data_file_list(
                 hash_file_extension=config.hash_file_extension,
             )
             logger.info("Saving hash into a file ... %s", hash_file_path)
-            if new_file_item.chk is not None:
-                _save_hash_file(
-                    original_file_instance=new_file_item, hash_path=hash_file_path
-                )
-                data_file_list.append(
-                    single_file_constructor(
-                        file_path=hash_file_path, calculate_checksum=False
-                    )
-                )
-            else:
-                logger.warning(
-                    "File instance of (%s) does not have checksum. "
-                    "Probably the file does not exist. "
-                    "Skip saving...",
-                    minimum_file_path,
+            _save_hash_file(
+                original_file_instance=new_file_item, hash_path=hash_file_path
+            )
+            data_file_list.append(
+                single_file_constructor(
+                    file_path=hash_file_path,
+                    compute_file_hash=False
                 )
+            )
+    if source_folder:
+        for data_file in data_file_list:
+            data_file.path = str(
+                pathlib.Path(data_file.path).relative_to(source_folder))
 
     return data_file_list
 
 
 def _filter_by_field_type(schemas: Iterable[dict], field_type: str) -> list[dict]:
-    return [field for field in schemas if field["field_type"] == field_type]
+    return [field for field in schemas if field.field_type == field_type]
 
 
 def _render_variable_as_type(value: str, variable_map: dict, dtype: str) -> Any:
+    print(value, dtype)
     return convert_to_type(render_variable_value(value, variable_map), dtype)
@@ -399,13 +432,13 @@ def _create_scientific_metadata(
             "type": "string",
         },
         **{
-            field["machine_name"]: {
+            field.machine_name: {
                 "value": _render_variable_as_type(
-                    field["value"], variable_map, field["type"]
+                    field.value, variable_map, field.type
                 ),
-                "unit": field.get("unit", ""),
-                "human_name": field.get("human_name", field["machine_name"]),
-                "type": field["type"],
+                "unit": getattr(field,"unit", ""),
+                "human_name": getattr(field,"human_name", field.machine_name),
+                "type": field.type,
             }
             for field in sm_schemas
         },
@@ -413,15 +446,15 @@ def _create_scientific_metadata(
 
 
 def _validate_metadata_schemas(
-    metadata_schemas: dict[str, dict],
+    metadata_schema: dict[str, dict],
 ) -> None:
-    if any(
-        invalid_types := [
-            field["field_type"]
-            for field in metadata_schemas.values()
-            if field["field_type"] not in VALID_METADATA_TYPES
-        ]
-    ):
+    invalid_types = [
+        field.field_type
+        for field in metadata_schema.values()
+        if field.field_type not in VALID_METADATA_TYPES
+    ]
+
+    if any(invalid_types):
         raise ValueError(
             "Invalid metadata schema types found. Valid types are: ",
             VALID_METADATA_TYPES,
@@ -433,7 +466,7 @@ def create_scicat_dataset_instance(
     *,
     metadata_schema_id: str,  # metadata-schema["id"]
-    metadata_schemas: dict[str, dict],  # metadata-schema["schema"]
+    metadata_schema: dict[str, dict],  # metadata-schema["schema"]
     variable_map: dict,
     data_file_list: list[DataFileListItem],
     config: DatasetOptions,
@@ -456,7 +489,7 @@ def create_scicat_dataset_instance(
         Logger instance.
 
     """
-    _validate_metadata_schemas(metadata_schemas)
+    _validate_metadata_schemas(metadata_schema)
     # Create the dataset instance
     scicat_dataset = ScicatDataset(
         size=sum([file.size for file in data_file_list if file.size is not None]),
@@ -465,23 +498,23 @@ def create_scicat_dataset_instance(
         scientificMetadata=_create_scientific_metadata(
             metadata_schema_id=metadata_schema_id,
             sm_schemas=_filter_by_field_type(
-                metadata_schemas.values(), SCIENTIFIC_METADATA_TYPE
+                metadata_schema.values(), SCIENTIFIC_METADATA_TYPE
            ),  # Scientific metadata schemas
             variable_map=variable_map,
         ),
         **{
-            field["machine_name"]: _render_variable_as_type(
-                field["value"], variable_map, field["type"]
+            field.machine_name: _render_variable_as_type(
+                field.value, variable_map, field.type
            )
             for field in _filter_by_field_type(
-                metadata_schemas.values(), HIGH_LEVEL_METADATA_TYPE
+                metadata_schema.values(), HIGH_LEVEL_METADATA_TYPE
             )  # High level schemas
         },
     )
 
     # Auto generate or assign default values if needed
-    if not config.allow_dataset_pid:
+    if not config.allow_dataset_pid and scicat_dataset.pid:
         logger.info("PID is not allowed in the dataset by configuration.")
         scicat_dataset.pid = None
     elif config.generate_dataset_pid:
@@ -505,11 +538,11 @@ def create_scicat_dataset_instance(
             "Owner group is not provided. Setting to default value. %s",
             scicat_dataset.ownerGroup,
         )
-    if scicat_dataset.accessGroup is None:
-        scicat_dataset.accessGroup = config.default_access_groups
+    if scicat_dataset.accessGroups is None:
+        scicat_dataset.accessGroups = config.default_access_groups
         logger.info(
             "Access group is not provided. Setting to default value. %s",
-            scicat_dataset.accessGroup,
+            scicat_dataset.accessGroups,
         )
 
     logger.info("Dataset instance is created successfully. %s", scicat_dataset)
diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py
index 85cbd8e..cc2a465 100755
--- a/src/scicat_offline_ingestor.py
+++ b/src/scicat_offline_ingestor.py
@@ -72,7 +72,7 @@ def main() -> None:
         variable_map = extract_variables_values(
             metadata_schema.variables,
             h5file,
-            config.scicat
+            config
         )
 
         # Collect data-file descriptions
@@ -80,6 +80,7 @@ def main() -> None:
             nexus_file=nexus_file_path,
             ingestor_directory=ingestor_directory,
             config=fh_options,
+            source_folder=variable_map["source_folder"],
             logger=logger,
             # TODO: add done_writing_message_file and nexus_structure_file
         )
@@ -88,8 +89,8 @@ def main() -> None:
         logger.info("Preparing scicat dataset instance ...")
         local_dataset = scicat_dataset_to_dict(
             create_scicat_dataset_instance(
-                metadata_schema_id=metadata_schema["id"],
-                metadata_schemas=metadata_schema["schemas"],
+                metadata_schema_id=metadata_schema.id,
+                metadata_schema=metadata_schema.schema,
                 variable_map=variable_map,
                 data_file_list=data_file_list,
                 config=config.dataset,

From 7014bc6c2237c75bf800536076ac0f955963490a Mon Sep 17 00:00:00 2001
From: Max Novelli
Date: Wed, 9 Oct 2024 16:44:47 +0200
Subject: [PATCH 4/7] first successful ingestion

---
 src/scicat_communication.py    | 31 +++++++++++++++++++++++--------
 src/scicat_configuration.py    | 20 ++++++++++++++++++--
 src/scicat_dataset.py          | 10 +++++++---
 src/scicat_offline_ingestor.py | 15 +++++++++++----
 4 files changed, 59 insertions(+), 17 deletions(-)

diff --git a/src/scicat_communication.py b/src/scicat_communication.py
index aef829c..01375c5 100644
--- a/src/scicat_communication.py
+++ b/src/scicat_communication.py
@@ -10,12 +10,13 @@
 def retrieve_value_from_scicat(
     *,
     config: SciCatOptions,
-    variable_url: str,  # It should be already rendered from variable_recipe["url"]
+    scicat_endpoint_url: str,  # It should be already rendered from variable_recipe["url"]
     field_name: str,  # variable_recipe["field"]
 ) -> str:
-    url = config.host.removesuffix('/') + "/" + variable_url
     response: dict = requests.get(
-        url, headers={"Authorization": config.token}, timeout=config.timeout
+        scicat_endpoint_url,
+        headers=config.headers,
+        timeout=config.timeout
     ).json()
     return response[field_name] if field_name else response
@@ -44,9 +45,9 @@ def create_scicat_dataset(
     """
     logger.info("Sending POST request to create new dataset")
     response = _post_to_scicat(
-        url=urljoin(config.host, "datasets"),
+        url= config.urls["datasets"],
         posting_obj=dataset,
-        headers={"token": config.token, **config.headers},
+        headers=config.headers,
         timeout=config.timeout,
     )
     result: dict = response.json()
@@ -69,16 +70,19 @@ class ScicatOrigDatablockAPIError(Exception):
 
 
 def create_scicat_origdatablock(
-    *, origdatablock: dict, config: SciCatOptions, logger: logging.Logger
+    *,
+    origdatablock: dict,
+    config: SciCatOptions,
+    logger: logging.Logger
 ) -> dict:
     """
     Execute a POST request to scicat to create a new origdatablock
     """
     logger.info("Sending POST request to create new origdatablock")
     response = _post_to_scicat(
-        url=urljoin(config.host, "origdatablocks"),
+        url=config.urls["origdatablocks"],
         posting_obj=origdatablock,
-        headers={"token": config.token, **config.headers},
+        headers=config.headers,
         timeout=config.timeout,
     )
     result: dict = response.json()
@@ -97,3 +101,14 @@ def create_scicat_origdatablock(
         result['_id'],
     )
     return result
+
+def render_full_url(
+    url: str,
+    config: SciCatOptions,
+) -> str:
+    if not url.startswith("http://") and not url.startswith("https://"):
+        for endpoint in config.urls.keys():
+            if url.startswith(endpoint):
+                url = url.replace(endpoint,config.urls[endpoint])
+                break
+    return url
\ No newline at end of file
diff --git a/src/scicat_configuration.py b/src/scicat_configuration.py
index 16ce518..c4360fb 100644
--- a/src/scicat_configuration.py
+++ b/src/scicat_configuration.py
@@ -8,6 +8,7 @@
 from pathlib import Path
 from types import MappingProxyType
 from typing import Any, TypeVar, get_origin
+from urllib.parse import urljoin
 
 
 def _load_config(config_file: Path) -> dict:
@@ -237,12 +238,23 @@ class SciCatOptions:
     timeout: int = 0
     stream: bool = True
     verify: bool = False
+    urls: dict = field(default_factory=dict)
 
     @classmethod
     def from_configurations(cls, config: dict) -> "SciCatOptions":
         """Create SciCatOptions from a dictionary."""
         options = cls(**config)
-        options.headers = {"Authorization": f"Bearer {options.token}"}
+        options.host = options.host.removesuffix('/') + "/"
+        options.headers = {
+            **options.headers,
+            **{"Authorization": f"Bearer {options.token}"}
+        }
+        options.urls = {
+            "datasets" : urljoin(options.host, "datasets"),
+            "proposals" : urljoin(options.host, "proposals"),
+            "origdatablocks" : urljoin(options.host, "origdatablocks"),
+            "instruments": urljoin(options.host, "instruments"),
+        }
         return options
@@ -335,9 +347,13 @@ def merge_config_and_input_args(
 
 
 def _validate_config_file(target_type: type[T], config_file: Path) -> T:
+    config = {
+        **_load_config(config_file),
+        "config_file": config_file.as_posix()
+    }
     return build_dataclass(
         target_type,
-        {**_load_config(config_file), "config_file": config_file.as_posix()},
+        config,
     )
diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py
index 09efe94..bec2038 100644
--- a/src/scicat_dataset.py
+++ b/src/scicat_dataset.py
@@ -11,7 +11,7 @@
 from typing import Any
 
 import h5py
 
-from scicat_communication import retrieve_value_from_scicat
+from scicat_communication import retrieve_value_from_scicat, render_full_url
 from scicat_configuration import (
     DatasetOptions,
     FileHandlingOptions,
@@ -119,8 +119,12 @@ def extract_variables_values(
         elif source == "SC":
             value = retrieve_value_from_scicat(
                 config=config.scicat,
-                variable_url=render_variable_value(
-                    variable_recipe.url, variable_map
+                scicat_endpoint_url=render_full_url(
+                    render_variable_value(
+                        variable_recipe.url,
+                        variable_map
+                    ),
+                    config.scicat,
                 ),
                 field_name=variable_recipe.field,
             )
diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py
index cc2a465..bf97637 100755
--- a/src/scicat_offline_ingestor.py
+++ b/src/scicat_offline_ingestor.py
@@ -9,7 +9,7 @@
     OfflineIngestorConfig,
     build_arg_parser,
     build_dataclass,
-    merge_config_and_input_args,
+    merge_config_and_input_args, SciCatOptions,
 )
 from scicat_dataset import (
     create_data_file_list,
@@ -38,7 +38,10 @@ def build_offline_config() -> OfflineIngestorConfig:
     # with ``OnlineIngestorConfig``.
     del merged_configuration["kafka"]
 
-    return build_dataclass(OfflineIngestorConfig, merged_configuration)
+    config = build_dataclass(OfflineIngestorConfig, merged_configuration)
+    config.scicat = SciCatOptions.from_configurations(merged_configuration["scicat"])
+
+    return config
 
 
 def main() -> None:
@@ -100,7 +103,9 @@ def main() -> None:
         logger.debug("Scicat dataset: %s", local_dataset)
         # Create dataset in scicat
         scicat_dataset = create_scicat_dataset(
-            dataset=local_dataset, config=config.scicat, logger=logger
+            dataset=local_dataset,
+            config=config.scicat,
+            logger=logger
         )
 
         # Prepare origdatablock
@@ -115,7 +120,9 @@ def main() -> None:
         logger.debug("Scicat origdatablock: %s", local_origdatablock)
         # create origdatablock in scicat
         scicat_origdatablock = create_scicat_origdatablock(
-            origdatablock=local_origdatablock, config=config.scicat, logger=logger
+            origdatablock=local_origdatablock,
+            config=config.scicat,
+            logger=logger
         )
 
         # check one more time if we successfully created the entries in scicat
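
Two notes on patches 3 and 4.

First, patch 3 corrects the checksum guard from "b2blake" to "blake2b", which is the algorithm name hashlib actually registers. The body of _calculate_checksum is not shown in the hunk, so the following chunked blake2b digest is only a sketch under that assumption, not the repository implementation:

    import hashlib
    import pathlib

    def calculate_blake2b(file_path: pathlib.Path, chunk_size: int = 1024 * 1024) -> str:
        # hashlib.new("blake2b") succeeds; hashlib.new("b2blake") raises ValueError.
        digest = hashlib.new("blake2b")
        with open(file_path, "rb") as f:
            # Read in chunks so large NeXus files never sit fully in memory.
            while chunk := f.read(chunk_size):
                digest.update(chunk)
        return digest.hexdigest()

Second, render_full_url from patch 4 leaves fully qualified URLs untouched and rewrites a bare leading endpoint name to the configured full URL. A runnable sketch, with SimpleNamespace standing in for SciCatOptions and an illustrative host:

    from types import SimpleNamespace

    def render_full_url(url: str, config) -> str:
        # Same logic as the patch: only bare endpoint references are rewritten.
        if not url.startswith("http://") and not url.startswith("https://"):
            for endpoint in config.urls.keys():
                if url.startswith(endpoint):
                    url = url.replace(endpoint, config.urls[endpoint])
                    break
        return url

    config = SimpleNamespace(urls={"proposals": "https://scicat.host/proposals"})
    print(render_full_url("proposals/123", config))  # https://scicat.host/proposals/123
    print(render_full_url("https://example.org/x", config))  # unchanged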
From 8d1110c98fa7338f23a78c91fd1017f47aebc25b Mon Sep 17 00:00:00 2001
From: YooSunyoung
Date: Mon, 21 Oct 2024 15:22:35 +0200
Subject: [PATCH 5/7] Fix formatting and type-hints

---
 resources/config.sample.json   |   3 +-
 src/scicat_communication.py    |  20 ++--
 src/scicat_configuration.py    |  13 +--
 src/scicat_dataset.py          | 187 +++++++++++++++++----------------
 src/scicat_metadata.py         |   3 +-
 src/scicat_offline_ingestor.py |  20 ++--
 src/scicat_online_ingestor.py  |   3 +-
 tests/test_scicat_dataset.py   |   4 +-
 8 files changed, 125 insertions(+), 128 deletions(-)

diff --git a/resources/config.sample.json b/resources/config.sample.json
index e49c33c..d77e935 100644
--- a/resources/config.sample.json
+++ b/resources/config.sample.json
@@ -60,6 +60,7 @@
     "headers": {},
     "timeout": 0,
     "stream": true,
-    "verify": false
+    "verify": false,
+    "urls": {}
   }
 }
diff --git a/src/scicat_communication.py b/src/scicat_communication.py
index 01375c5..9199cd8 100644
--- a/src/scicat_communication.py
+++ b/src/scicat_communication.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject)
 import logging
-from urllib.parse import urljoin
 
 import requests
 from scicat_configuration import SciCatOptions
@@ -10,13 +9,12 @@
 def retrieve_value_from_scicat(
     *,
     config: SciCatOptions,
-    scicat_endpoint_url: str,  # It should be already rendered from variable_recipe["url"]
+    scicat_endpoint_url: str,  # It should be already rendered
+    # from variable_recipe["url"]
     field_name: str,  # variable_recipe["field"]
 ) -> str:
     response: dict = requests.get(
-        scicat_endpoint_url,
-        headers=config.headers,
-        timeout=config.timeout
+        scicat_endpoint_url, headers=config.headers, timeout=config.timeout
     ).json()
     return response[field_name] if field_name else response
@@ -45,7 +43,7 @@ def create_scicat_dataset(
     """
     logger.info("Sending POST request to create new dataset")
     response = _post_to_scicat(
-        url= config.urls["datasets"],
+        url=config.urls["datasets"],
         posting_obj=dataset,
         headers=config.headers,
         timeout=config.timeout,
     )
@@ -70,10 +68,7 @@ class ScicatOrigDatablockAPIError(Exception):
 
 
 def create_scicat_origdatablock(
-    *,
-    origdatablock: dict,
-    config: SciCatOptions,
-    logger: logging.Logger
+    *, origdatablock: dict, config: SciCatOptions, logger: logging.Logger
 ) -> dict:
     """
     Execute a POST request to scicat to create a new origdatablock
@@ -102,6 +97,7 @@ def create_scicat_origdatablock(
     )
     return result
 
+
 def render_full_url(
     url: str,
     config: SciCatOptions,
@@ -109,6 +105,6 @@ def render_full_url(
     if not url.startswith("http://") and not url.startswith("https://"):
         for endpoint in config.urls.keys():
             if url.startswith(endpoint):
-                url = url.replace(endpoint,config.urls[endpoint])
+                url = url.replace(endpoint, config.urls[endpoint])
                 break
-    return url
\ No newline at end of file
+    return url
diff --git a/src/scicat_configuration.py b/src/scicat_configuration.py
index c4360fb..df131cf 100644
--- a/src/scicat_configuration.py
+++ b/src/scicat_configuration.py
@@ -247,12 +247,12 @@ def from_configurations(cls, config: dict) -> "SciCatOptions":
         options.host = options.host.removesuffix('/') + "/"
         options.headers = {
             **options.headers,
-            **{"Authorization": f"Bearer {options.token}"}
+            **{"Authorization": f"Bearer {options.token}"},
         }
         options.urls = {
-            "datasets" : urljoin(options.host, "datasets"),
-            "proposals" : urljoin(options.host, "proposals"),
-            "origdatablocks" : urljoin(options.host, "origdatablocks"),
+            "datasets": urljoin(options.host, "datasets"),
+            "proposals": urljoin(options.host, "proposals"),
+            "origdatablocks": urljoin(options.host, "origdatablocks"),
             "instruments": urljoin(options.host, "instruments"),
         }
         return options
@@ -347,10 +347,7 @@ def merge_config_and_input_args(
 
 
 def _validate_config_file(target_type: type[T], config_file: Path) -> T:
-    config = {
-        **_load_config(config_file),
-        "config_file": config_file.as_posix()
-    }
+    config = {**_load_config(config_file), "config_file": config_file.as_posix()}
     return build_dataclass(
         target_type,
         config,
     )
diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py
index bec2038..df81824 100644
--- a/src/scicat_dataset.py
+++ b/src/scicat_dataset.py
@@ -1,9 +1,12 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject)
+import ast
+import copy
 import datetime
 import logging
 import os.path
 import pathlib
+import re
 import uuid
 from collections.abc import Callable, Iterable
 from dataclasses import asdict, dataclass, field
@@ -11,27 +14,33 @@
 from typing import Any
 
 import h5py
 
-from scicat_communication import retrieve_value_from_scicat, render_full_url
+from scicat_communication import render_full_url, retrieve_value_from_scicat
 from scicat_configuration import (
     DatasetOptions,
     FileHandlingOptions,
-    SciCatOptions, OfflineIngestorConfig,
+    OfflineIngestorConfig,
 )
 from scicat_metadata import (
     HIGH_LEVEL_METADATA_TYPE,
     SCIENTIFIC_METADATA_TYPE,
     VALID_METADATA_TYPES,
+    MetadataItem,
+    MetadataSchemaVariable,
+    NexusFileMetadataVariable,
+    ScicatMetadataVariable,
+    ValueMetadataVariable,
     render_variable_value,
 )
-import re
-import copy
 
 def to_string(value: Any) -> str:
     return str(value)
 
 
 def to_string_array(value: list[Any]) -> list[str]:
-    return [str(v) for v in (eval(value) if isinstance(value, str) else value)]
+    return [
+        str(v) for v in (ast.literal_eval(value) if isinstance(value, str) else value)
+    ]
 
 
 def to_integer(value: Any) -> int:
@@ -49,9 +58,11 @@ def to_date(value: Any) -> str | None:
         return datetime.datetime.fromtimestamp(value, tz=datetime.UTC).isoformat()
     return None
 
+
 def to_dict(value: Any) -> dict:
     return dict(value)
 
+
 _DtypeConvertingMap = MappingProxyType(
     {
         "string": to_string,
@@ -60,7 +71,7 @@ def to_dict(value: Any) -> dict:
         "float": to_float,
         "date": to_date,
         "dict": to_dict,
-        "email": to_string
+        "email": to_string,
         # TODO: Add email converter
     }
 )
@@ -79,10 +90,12 @@ def convert_to_type(input_value: Any, dtype_desc: str) -> Any:
 _OPERATOR_REGISTRY = MappingProxyType(
     {
         "DO_NOTHING": lambda value: value,
-        "join_with_space": lambda value: ", ".join(eval(value) if isinstance(value,str) else value),
-        "evaluate": lambda value: eval(value),
+        "join_with_space": lambda value: ", ".join(
+            ast.literal_eval(value) if isinstance(value, str) else value
+        ),
+        "evaluate": lambda value: ast.literal_eval(value),
         "filename": lambda value: os.path.basename(value),
-        "dirname-2": lambda value: os.path.dirname(os.path.dirname(value))
+        "dirname-2": lambda value: os.path.dirname(os.path.dirname(value)),
     }
 )
@@ -91,72 +104,83 @@ def _get_operator(operator: str | None) -> Callable:
     return _OPERATOR_REGISTRY.get(operator or "DO_NOTHING", lambda _: _)
 
 
+def _retrieve_as_string(
+    h5file: h5py.File, path: str, *, encoding: str = "utf-8"
+) -> str:
+    return h5file[path][...].item().decode(encoding)
+
+
+def _retrieve_values_from_file(
+    variable_recipe: NexusFileMetadataVariable, h5file: h5py.File
+) -> Any:
+    if "*" in variable_recipe.path:  # Selectors are used
+        path = variable_recipe.path.split("/")[1:]
+        path[0] += "/"
+        paths = extract_paths_from_h5_file(h5file, path)
+        value = [_retrieve_as_string(h5file, p) for p in paths]
+    else:
+        value = _retrieve_as_string(h5file, variable_recipe.path)
+    return value
+
+
 def extract_variables_values(
-    variables: dict[str, dict],
+    variables: dict[str, MetadataSchemaVariable],
     h5file: h5py.File,
-    config: OfflineIngestorConfig
+    config: OfflineIngestorConfig,
 ) -> dict:
     variable_map = {
-        "filepath" : pathlib.Path(config.nexus_file),
-        "now" : datetime.datetime.now().isoformat(),
+        "filepath": pathlib.Path(config.nexus_file),
+        "now": datetime.datetime.now(tz=datetime.UTC).isoformat(),
     }
     for variable_name, variable_recipe in variables.items():
-        print(variable_name)
         source = variable_recipe.source
-        if source == "NXS":
-            path = variable_recipe.path
-            if "*" in path:
-                provided_path = path.split("/")[1:]
-                provided_path[0] = "/" + provided_path[0]
-                expanded_paths = extract_paths_from_h5_file(h5file,provided_path)
-                value = [
-                    h5file[p][...].item().decode("utf-8")
-                    for p
-                    in expanded_paths
-                ]
-            else:
-                value = h5file[path][...].item().decode("utf-8")
-        elif source == "SC":
+        if isinstance(variable_recipe, NexusFileMetadataVariable):
+            value = _retrieve_values_from_file(variable_recipe, h5file)
+        elif isinstance(variable_recipe, ScicatMetadataVariable):
             value = retrieve_value_from_scicat(
                 config=config.scicat,
                 scicat_endpoint_url=render_full_url(
-                    render_variable_value(
-                        variable_recipe.url,
-                        variable_map
-                    ),
+                    render_variable_value(variable_recipe.url, variable_map),
                     config.scicat,
                 ),
                 field_name=variable_recipe.field,
             )
-        elif source == "VALUE":
+        elif isinstance(variable_recipe, ValueMetadataVariable):
             value = variable_recipe.value
-            value = render_variable_value(value, variable_map) if isinstance(value,str) else value
+            value = (
+                render_variable_value(value, variable_map)
+                if isinstance(value, str)
+                else value
+            )
             value = _get_operator(variable_recipe.operator)(value)
         else:
             raise Exception("Invalid variable source: ", source)
-        variable_map[variable_name] = convert_to_type(
-            value, variable_recipe.value_type
-        )
+        variable_map[variable_name] = convert_to_type(value, variable_recipe.value_type)
+
     return variable_map
 
+
 def extract_paths_from_h5_file(
-    h5_object: Any,
-    path: list[str],
+    _h5_object: h5py.Group | h5py.File,
+    _path: list[str],
 ) -> list[str]:
-    master_key = path.pop(0)
+    master_key = _path.pop(0)
     output_paths = [master_key]
     if "*" in master_key:
-        temp_keys = [k2 for k2 in list(h5_object.keys()) if re.search(master_key, k2)]
-        output_paths = []
+        temp_keys = [k2 for k2 in _h5_object.keys() if re.search(master_key, k2)]
        for key in temp_keys:
             output_paths += [
                 key + "/" + subkey
-                for subkey
-                in extract_paths_from_h5_file(h5_object[key], copy.deepcopy(path))
+                for subkey in extract_paths_from_h5_file(
+                    _h5_object[key], copy.deepcopy(_path)
+                )
             ]
     else:
-        if path:
-            output_paths = [master_key + "/" + subkey for subkey in extract_paths_from_h5_file(h5_object[master_key],path)]
+        if _path:
+            output_paths = [
+                master_key + "/" + subkey
+                for subkey in extract_paths_from_h5_file(_h5_object[master_key], _path)
+            ]
 
     return output_paths
@@ -176,7 +200,7 @@ class ScicatDataset:
     numberOfFiles: int
     isPublished: bool = False
     datasetName: str
-    description: str = field(default=None)
+    description: str | None = None
     principalInvestigator: str
     creationLocation: str
     scientificMetadata: dict
@@ -186,7 +210,7 @@ class ScicatDataset:
     contactEmail: str
     creationTime: str
     type: str = "raw"
-    sampleId: str = field(default=None)
+    sampleId: str | None = None
     techniques: list[TechniqueDesc] = field(default_factory=list)
     instrumentId: str | None = None
     proposalId: str | None = None
@@ -248,53 +272,33 @@ def _create_single_data_file_list_item(
 ) -> DataFileListItem:
     """``DataFileListItem`` constructing helper."""
 
-    file_info = {
-        "path" : file_path.absolute().as_posix(),
-        "time" : datetime.datetime.now(tz=datetime.UTC).strftime(
+    file_info: dict[str, Any] = {
+        "path": file_path.absolute().as_posix(),
+        "time": datetime.datetime.now(tz=datetime.UTC).strftime(
             "%Y-%m-%dT%H:%M:%S.000Z"
         ),
     }
     if file_path.exists():
         if compute_file_stats:
             file_stats = file_path.stat()
+            timestamp_str = datetime.datetime.fromtimestamp(
+                file_stats.st_ctime, tz=datetime.UTC
+            ).strftime("%Y-%m-%dT%H:%M:%S.000Z")
             file_info = {
                 **file_info,
                 **{
-                    "size" : file_stats.st_size,
-                    "time" : datetime.datetime.fromtimestamp(
-                        file_stats.st_ctime, tz=datetime.UTC
-                    ).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
-                    "uid" : str(file_stats.st_uid),
-                    "gid" : str(file_stats.st_gid),
-                    "perm" : oct(file_stats.st_mode),
-                }
+                    "size": file_stats.st_size,
+                    "time": timestamp_str,
+                    "uid": str(file_stats.st_uid),
+                    "gid": str(file_stats.st_gid),
+                    "perm": oct(file_stats.st_mode),
+                },
             }
 
         if compute_file_hash:
             file_info["chk"] = _calculate_checksum(file_path, file_hash_algorithm)
 
     return DataFileListItem(**file_info)
-    # if file_path.exists() and compute_file_stats:
-    #     return DataFileListItem(
-    #         path=file_path.absolute().as_posix(),
-    #         size=(file_stats := file_path.stat()).st_size,
-    #         time=datetime.datetime.fromtimestamp(
-    #             file_stats.st_ctime, tz=datetime.UTC
-    #         ).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
-    #         chk=_calculate_checksum(file_path, file_hash_algorithm)
-    #         if compute_file_hash
-    #         else None,
-    #         uid=str(file_stats.st_uid),
-    #         gid=str(file_stats.st_gid),
-    #         perm=oct(file_stats.st_mode),
-    #     )
-    # else:
-    #     return DataFileListItem(
-    #         path=file_path.absolute().as_posix(),
-    #         time=datetime.datetime.now(tz=datetime.UTC).strftime(
-    #             "%Y-%m-%dT%H:%M:%S.000Z"
-    #         ),
-    #     )
 
 
 def _build_hash_path(
@@ -355,7 +359,7 @@ def create_data_file_list(
         _create_single_data_file_list_item,
         file_hash_algorithm=config.file_hash_algorithm,
         compute_file_stats=config.compute_file_stats,
-        compute_file_hash=config.compute_file_hash
+        compute_file_hash=config.compute_file_hash,
     )
@@ -388,31 +392,32 @@ def create_data_file_list(
             )
             data_file_list.append(
                 single_file_constructor(
-                    file_path=hash_file_path,
-                    compute_file_hash=False
+                    file_path=hash_file_path, compute_file_hash=False
                 )
             )
     if source_folder:
         for data_file in data_file_list:
             data_file.path = str(
-                pathlib.Path(data_file.path).relative_to(source_folder))
+                pathlib.Path(data_file.path).relative_to(source_folder)
+            )
 
     return data_file_list
 
 
-def _filter_by_field_type(schemas: Iterable[dict], field_type: str) -> list[dict]:
+def _filter_by_field_type(
+    schemas: Iterable[MetadataItem], field_type: str
+) -> list[MetadataItem]:
     return [field for field in schemas if field.field_type == field_type]
 
 
 def _render_variable_as_type(value: str, variable_map: dict, dtype: str) -> Any:
-    print(value, dtype)
     return convert_to_type(render_variable_value(value, variable_map), dtype)
 
 
 def _create_scientific_metadata(
     *,
     metadata_schema_id: str,
-    sm_schemas: list[dict],
+    sm_schemas: list[MetadataItem],
     variable_map: dict,
 ) -> dict:
     """Create scientific metadata from the metadata schema configuration.
@@ -440,8 +445,8 @@ def _create_scientific_metadata(
                 "value": _render_variable_as_type(
                     field.value, variable_map, field.type
                 ),
-                "unit": getattr(field,"unit", ""),
-                "human_name": getattr(field,"human_name", field.machine_name),
+                "unit": getattr(field, "unit", ""),
+                "human_name": getattr(field, "human_name", field.machine_name),
                 "type": field.type,
             }
             for field in sm_schemas
@@ -455,7 +460,7 @@ def _create_scientific_metadata(
 def _validate_metadata_schemas(
-    metadata_schema: dict[str, dict],
+    metadata_schema: dict[str, MetadataItem],
 ) -> None:
     invalid_types = [
         field.field_type
         for field in metadata_schema.values()
         if field.field_type not in VALID_METADATA_TYPES
@@ -475,7 +480,7 @@ def create_scicat_dataset_instance(
     *,
     metadata_schema_id: str,  # metadata-schema["id"]
-    metadata_schema: dict[str, dict],  # metadata-schema["schema"]
+    metadata_schema: dict[str, MetadataItem],  # metadata-schema["schema"]
     variable_map: dict,
     data_file_list: list[DataFileListItem],
     config: DatasetOptions,
diff --git a/src/scicat_metadata.py b/src/scicat_metadata.py
index 2a01898..9006249 100644
--- a/src/scicat_metadata.py
+++ b/src/scicat_metadata.py
@@ -168,8 +168,7 @@ def collect_schemas(dir_path: pathlib.Path) -> OrderedDict[str, MetadataSchema]:
 def select_applicable_schema(
     nexus_file: pathlib.Path,
-    h5_file: any,
-    schemas: OrderedDict[str, MetadataSchema]
+    schemas: OrderedDict[str, MetadataSchema],
 ) -> MetadataSchema:
     """
     Evaluates which metadata schema configuration is applicable to ``nexus_file``.
diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py
index bf97637..d78e354 100755
--- a/src/scicat_offline_ingestor.py
+++ b/src/scicat_offline_ingestor.py
@@ -7,9 +7,10 @@
 from scicat_communication import create_scicat_dataset, create_scicat_origdatablock
 from scicat_configuration import (
     OfflineIngestorConfig,
+    SciCatOptions,
     build_arg_parser,
     build_dataclass,
-    merge_config_and_input_args, SciCatOptions,
+    merge_config_and_input_args,
 )
 from scicat_dataset import (
     create_data_file_list,
@@ -69,13 +70,11 @@ def main() -> None:
     # open nexus file with h5py
     with h5py.File(nexus_file_path) as h5file:
         # load instrument metadata configuration
-        metadata_schema = select_applicable_schema(nexus_file_path, h5file, schemas)
+        metadata_schema = select_applicable_schema(nexus_file_path, schemas)
 
         # define variables values
         variable_map = extract_variables_values(
-            metadata_schema.variables,
-            h5file,
-            config
+            metadata_schema.variables, h5file, config
         )
 
         # Collect data-file descriptions
@@ -103,9 +102,7 @@ def main() -> None:
         logger.debug("Scicat dataset: %s", local_dataset)
         # Create dataset in scicat
         scicat_dataset = create_scicat_dataset(
-            dataset=local_dataset,
-            config=config.scicat,
-            logger=logger
+            dataset=local_dataset, config=config.scicat, logger=logger
         )
 
         # Prepare origdatablock
@@ -120,9 +117,7 @@ def main() -> None:
         logger.debug("Scicat origdatablock: %s", local_origdatablock)
         # create origdatablock in scicat
         scicat_origdatablock = create_scicat_origdatablock(
-            origdatablock=local_origdatablock,
-            config=config.scicat,
-            logger=logger
+            origdatablock=local_origdatablock, config=config.scicat, logger=logger
         )
 
         # check one more time if we successfully created the entries in scicat
@@ -135,5 +130,6 @@ def main() -> None:
         )
         raise RuntimeError("Failed to create dataset or origdatablock.")
 
+
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/src/scicat_online_ingestor.py b/src/scicat_online_ingestor.py
index 5ebb9a0..17893d4 100644
--- a/src/scicat_online_ingestor.py
+++ b/src/scicat_online_ingestor.py
@@ -165,5 +165,6 @@ def main() -> None:
     if config.kafka.individual_message_commit:
         _individual_message_commit(offline_ingestors, consumer, logger)
 
+
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/tests/test_scicat_dataset.py b/tests/test_scicat_dataset.py
index e53b492..c68fbe5 100644
--- a/tests/test_scicat_dataset.py
+++ b/tests/test_scicat_dataset.py
@@ -11,7 +11,9 @@ def test_dtype_string_converter() -> None:
 
 
 def test_dtype_string_array_converter() -> None:
-    assert convert_to_type("test", "string[]") == ["t", "e", "s", "t"]
+    assert convert_to_type("'test'", "string[]") == ["t", "e", "s", "t"]
+    assert convert_to_type("['test']", "string[]") == ["test"]
+    assert convert_to_type("['test', 'test2']", "string[]") == ["test", "test2"]
     assert convert_to_type([1, 2, 3], "string[]") == ["1", "2", "3"]
     assert convert_to_type([1.1, 2.2, 3.3], "string[]") == ["1.1", "2.2", "3.3"]
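
A note on the eval to ast.literal_eval change in patch 5: literal_eval only accepts Python literals (strings, numbers, tuples, lists, dicts, sets, booleans, None), so a string-encoded list from a metadata schema is parsed without the arbitrary-code-execution risk that eval carries. A short sketch, with values mirroring the updated tests:

    import ast

    def to_string_array(value):
        # A string such as "['test', 'test2']" is parsed as a literal first.
        return [str(v) for v in (ast.literal_eval(value) if isinstance(value, str) else value)]

    print(to_string_array("['test', 'test2']"))  # ['test', 'test2']
    print(to_string_array("'test'"))             # ['t', 'e', 's', 't'], a str iterates per character
    print(to_string_array([1, 2, 3]))            # ['1', '2', '3']
    # ast.literal_eval("__import__('os')") raises ValueError instead of executing.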
From 2cb5137ea7623575bc31cd716a6e137b3fd3c31b Mon Sep 17 00:00:00 2001
From: YooSunyoung
Date: Mon, 21 Oct 2024 15:32:05 +0200
Subject: [PATCH 6/7] Hide api url from argument and expose api-endpoints
 instead.

---
 resources/config.sample.json |  7 ++++++-
 src/scicat_configuration.py  | 33 ++++++++++++++++++++++++++-------
 2 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/resources/config.sample.json b/resources/config.sample.json
index d77e935..dd93ea6 100644
--- a/resources/config.sample.json
+++ b/resources/config.sample.json
@@ -61,6 +61,11 @@
     "timeout": 0,
     "stream": true,
     "verify": false,
-    "urls": {}
+    "api_endpoints": {
+      "datasets": "datasets",
+      "proposals": "proposals",
+      "origdatablocks": "origdatablocks",
+      "instruments": "instruments"
+    }
   }
 }
diff --git a/src/scicat_configuration.py b/src/scicat_configuration.py
index df131cf..2920823 100644
--- a/src/scicat_configuration.py
+++ b/src/scicat_configuration.py
@@ -230,6 +230,22 @@ class DatasetOptions:
     default_access_groups: list[str] = field(default_factory=default_access_groups)
 
 
+@dataclass(kw_only=True)
+class _ScicatAPIURLs:
+    datasets: str
+    proposals: str
+    origdatablocks: str
+    instruments: str
+
+
+@dataclass(kw_only=True)
+class ScicatEndpoints:
+    datasets: str = "datasets"
+    proposals: str = "proposals"
+    origdatablocks: str = "origdatablocks"
+    instruments: str = "instruments"
+
+
 @dataclass(kw_only=True)
 class SciCatOptions:
     host: str = "https://scicat.host"
@@ -238,7 +254,16 @@ class SciCatOptions:
     timeout: int = 0
     stream: bool = True
     verify: bool = False
-    urls: dict = field(default_factory=dict)
+    api_endpoints: ScicatEndpoints = field(default_factory=ScicatEndpoints)
+
+    @property
+    def urls(self) -> _ScicatAPIURLs:
+        return _ScicatAPIURLs(
+            datasets=urljoin(self.host, self.api_endpoints.datasets),
+            proposals=urljoin(self.host, self.api_endpoints.proposals),
+            origdatablocks=urljoin(self.host, self.api_endpoints.origdatablocks),
+            instruments=urljoin(self.host, self.api_endpoints.instruments),
+        )
 
     @classmethod
     def from_configurations(cls, config: dict) -> "SciCatOptions":
@@ -249,12 +274,6 @@ def from_configurations(cls, config: dict) -> "SciCatOptions":
         **options.headers,
         **{"Authorization": f"Bearer {options.token}"},
     }
-    options.urls = {
-        "datasets": urljoin(options.host, "datasets"),
-        "proposals": urljoin(options.host, "proposals"),
-        "origdatablocks": urljoin(options.host, "origdatablocks"),
-        "instruments": urljoin(options.host, "instruments"),
-    }
     return options
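
Patch 6 replaces the precomputed urls dict with a ScicatEndpoints dataclass plus a urls property, so the sample config's "api_endpoints" section feeds straight into dataclass fields. A condensed, runnable sketch of that flow; the host value is illustrative, the real property returns a _ScicatAPIURLs dataclass rather than the dict used here to keep the sketch short:

    from dataclasses import dataclass, field
    from urllib.parse import urljoin

    @dataclass(kw_only=True)
    class ScicatEndpoints:
        datasets: str = "datasets"
        proposals: str = "proposals"
        origdatablocks: str = "origdatablocks"
        instruments: str = "instruments"

    @dataclass(kw_only=True)
    class SciCatOptions:
        host: str = "https://scicat.host/"
        api_endpoints: ScicatEndpoints = field(default_factory=ScicatEndpoints)

        @property
        def urls(self) -> dict:
            # Each endpoint name is resolved against the host lazily,
            # so overriding api_endpoints in the config just works.
            return {
                name: urljoin(self.host, getattr(self.api_endpoints, name))
                for name in ("datasets", "proposals", "origdatablocks", "instruments")
            }

    options = SciCatOptions(api_endpoints=ScicatEndpoints(datasets="Datasets"))
    print(options.urls["datasets"])  # https://scicat.host/Datasets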
From 5cc22a7ba80abebee946621dd0be1a0a7b233cb1 Mon Sep 17 00:00:00 2001
From: YooSunyoung
Date: Thu, 31 Oct 2024 13:15:50 +0100
Subject: [PATCH 7/7] Use attributes instead of keys.

---
 resources/config.sample.json | 2 +-
 src/scicat_communication.py  | 8 ++++----
 src/scicat_configuration.py  | 10 ++++++----
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/resources/config.sample.json b/resources/config.sample.json
index f7204f8..b6718ac 100644
--- a/resources/config.sample.json
+++ b/resources/config.sample.json
@@ -59,7 +59,7 @@
   "scicat": {
     "host": "https://scicat.host",
     "token": "JWT_TOKEN",
-    "headers": {},
+    "additional_headers": {},
     "timeout": 0,
     "stream": true,
     "verify": false,
diff --git a/src/scicat_communication.py b/src/scicat_communication.py
index c169c15..1257564 100644
--- a/src/scicat_communication.py
+++ b/src/scicat_communication.py
@@ -52,7 +52,7 @@ def create_scicat_dataset(
     """
     logger.info("Sending POST request to create new dataset")
     response = _post_to_scicat(
-        url=config.urls["datasets"],
+        url=config.urls.datasets,
         posting_obj=dataset,
         headers=config.additional_headers,
         timeout=config.timeout,
@@ -84,7 +84,7 @@ def create_scicat_origdatablock(
     """
     logger.info("Sending POST request to create new origdatablock")
     response = _post_to_scicat(
-        url=config.urls["origdatablocks"],
+        url=config.urls.origdatablocks,
         posting_obj=origdatablock,
         headers=config.additional_headers,
         timeout=config.timeout,
@@ -123,7 +123,7 @@ def check_dataset_by_pid(
     pid: str, config: SciCatOptions, logger: logging.Logger
 ) -> bool:
     response = _get_from_scicat(
-        url=urljoin(config.host, f"datasets/{quote(pid)}"),
+        url=urljoin(config.host_address, f"datasets/{quote(pid)}"),
         headers=config.additional_headers,
         timeout=config.timeout,
         stream=config.stream,
@@ -157,7 +157,7 @@ def check_dataset_by_metadata(
 ) -> bool:
     metadata_dict = {f"scientificMetadata.{metadata_key}.value": metadata_value}
     filter_string = '?filter={"where":' + json.dumps(metadata_dict) + "}"
-    url = urljoin(config.host, "datasets") + filter_string
+    url = urljoin(config.host_address, "datasets") + filter_string
     logger.info("Checking if dataset exists by metadata with url: %s", url)
     response = _get_from_scicat(
         url=url,
diff --git a/src/scicat_configuration.py b/src/scicat_configuration.py
index 895b770..12de863 100644
--- a/src/scicat_configuration.py
+++ b/src/scicat_configuration.py
@@ -261,10 +261,12 @@ class SciCatOptions:
     @property
     def urls(self) -> _ScicatAPIURLs:
         return _ScicatAPIURLs(
-            datasets=urljoin(self.host, self.api_endpoints.datasets),
-            proposals=urljoin(self.host, self.api_endpoints.proposals),
-            origdatablocks=urljoin(self.host, self.api_endpoints.origdatablocks),
-            instruments=urljoin(self.host, self.api_endpoints.instruments),
+            datasets=urljoin(self.host_address, self.api_endpoints.datasets),
+            proposals=urljoin(self.host_address, self.api_endpoints.proposals),
+            origdatablocks=urljoin(
+                self.host_address, self.api_endpoints.origdatablocks
+            ),
+            instruments=urljoin(self.host_address, self.api_endpoints.instruments),
        )
 
     @property
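
The series is cut off above inside scicat_configuration.py, at a second @property whose body is missing. The host_address that the urls property now builds on is not defined anywhere in these patches; judging from the removesuffix('/') + "/" normalisation in patch 4 it plausibly re-adds the trailing slash that urljoin needs, but that is an assumption. The slash matters because urljoin drops the last path segment of a slash-less base:

    from urllib.parse import urljoin

    print(urljoin("https://scicat.host/api/v3", "datasets"))   # https://scicat.host/api/datasets
    print(urljoin("https://scicat.host/api/v3/", "datasets"))  # https://scicat.host/api/v3/datasets

    # Assumed shape of the truncated property (hypothetical, based on patch 4):
    def host_address(host: str) -> str:
        return host.removesuffix("/") + "/"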