From 874e1ccbbe40ce1206d30e5b84275d9887877450 Mon Sep 17 00:00:00 2001 From: Max Novelli Date: Thu, 7 Nov 2024 16:04:39 +0100 Subject: [PATCH 1/3] All the fixes introduced while testing the production deployments --- resources/small-coda.imsc.json.example | 272 +++++++++++++++++++ src/scicat_dataset.py | 18 +- src/scicat_logging.py | 2 +- src/scicat_metadata.py | 16 +- src/scicat_offline_ingestor.py | 1 + test-data/README.md | 4 +- test-data/{small_coda.hdf => small-coda.hdf} | Bin test-data/{small_ymir.hdf => small-ymir.hdf} | Bin 8 files changed, 298 insertions(+), 15 deletions(-) create mode 100644 resources/small-coda.imsc.json.example rename test-data/{small_coda.hdf => small-coda.hdf} (100%) rename test-data/{small_ymir.hdf => small-ymir.hdf} (100%) diff --git a/resources/small-coda.imsc.json.example b/resources/small-coda.imsc.json.example new file mode 100644 index 0000000..74fbd63 --- /dev/null +++ b/resources/small-coda.imsc.json.example @@ -0,0 +1,272 @@ +{ + "id" : "628b28d6-9c26-11ef-948d-0b2d405fc82f", + "name" : "Test Coda Metadata Schema", + "instrument": "coda", + "selector": "filename:starts_with:/ess/services/scicat-ingestor/software/test-data/small-coda", + "order": 110, + "variables" : { + "job_id": { + "source": "NXS", + "path": "/entry/entry_identifier_uuid", + "value_type": "string" + }, + "pid": { + "source": "VALUE", + "value": "20.500.12269/", + "value_type": "string" + }, + "proposal_id": { + "source": "NXS", + "path": "/entry/experiment_identifier", + "value_type": "string" + }, + "proposal_data": { + "source": "SC", + "url": "proposals/", + "field" : "", + "value_type": "dict" + }, + "pi_firstname": { + "source": "VALUE", + "operator": "getitem", + "value": "", + "field" : "pi_firstname", + "value_type": "string" + }, + "pi_lastname": { + "source": "VALUE", + "operator": "getitem", + "value": "", + "field" : "pi_lastname", + "value_type": "string" + }, + "pi_email": { + "source": "VALUE", + "operator": "getitem", + "value": "", + "field" : "pi_email", + "value_type": "string" + }, + "dataset_original_name": { + "source": "NXS", + "path": "/entry/title", + "value_type": "string" + }, + "dataset_name": { + "source": "VALUE", + "value" : "coda test - - ", + "value_type": "string" + }, + "instrument_name": { + "source": "NXS", + "path": "/entry/instrument/name", + "value_type": "string" + }, + "instruments_data": { + "source": "SC", + "url": "instruments?filter=%7B%22where%22%20%3A%20%7B%20%22name%22%20%3A%20%22coda%22%20%7D%20%7D", + "field": "", + "value_type": "list" + }, + "instrument_data": { + "source": "VALUE", + "operator": "getitem", + "value": "", + "field" : 0, + "value_type": "dict" + }, + "instrument_id": { + "source": "VALUE", + "operator": "getitem", + "value": "", + "field" : "id", + "value_type": "string" + }, + "start_time": { + "source": "NXS", + "path": "/entry/start_time", + "value_type": "date" + }, + "end_time": { + "source": "NXS", + "path": "/entry/end_time", + "value_type": "date" + }, + "run_number": { + "source": "NXS", + "path": "/entry/entry_identifier", + "value_type": "integer" + }, + "acquisition_team_members_list": { + "source": "NXS", + "path" : "/entry/user_*/name", + "value_type": "string[]" + }, + "acquisition_team_members": { + "source": "VALUE", + "operator" : "join_with_space", + "value" : "", + "value_type": "string" + }, + "owner_group": { + "source": "VALUE", + "value": "", + "value_type": "string" + }, + "access_groups": { + "source": "VALUE", + "value": ["scientific information management systems group"], + "value_type": "string[]" + }, + "source_folder": { + "source": "VALUE", + "operator": "dirname", + "value": "", + "value_type": "string" + }, + "keywords" : { + "source": "VALUE", + "value": ["TEST CODA","Scicat Ingestor 05","TEST RUN","CODA","","CODA "], + "value_type": "string[]" + } + }, + "schema": { + "pid": { + "field_type": "high_level", + "machine_name": "pid", + "value": "", + "type": "string" + }, + "type" : { + "field_type": "high_level", + "machine_name": "type", + "value": "raw", + "type": "string" + }, + "proposal_id": { + "field_type": "high_level", + "machine_name": "proposalId", + "value": "", + "type": "string" + }, + "dataset_name": { + "field_type": "high_level", + "machine_name": "datasetName", + "value": "", + "type": "string" + }, + "principal_investigator": { + "field_type": "high_level", + "machine_name": "principalInvestigator", + "value": " ", + "type": "string" + }, + "owner": { + "field_type": "high_level", + "machine_name": "owner", + "value": " ", + "type": "string" + }, + "owner_email": { + "field_type": "high_level", + "machine_name": "ownerEmail", + "value": "", + "type": "email" + }, + "contact_email": { + "field_type": "high_level", + "machine_name": "contactEmail", + "value": "", + "type": "email" + }, + "instrument_id": { + "field_type": "high_level", + "machine_name": "instrumentId", + "value": "", + "type": "string" + }, + "creation_location": { + "field_type": "high_level", + "machine_name": "creationLocation", + "value": "ESS:CODA:", + "type": "string" + }, + "start_time_hl": { + "field_type": "high_level", + "machine_name": "startTime", + "value": "", + "type": "date" + }, + "end_time_hl": { + "field_type": "high_level", + "machine_name": "endTime", + "value": "", + "type": "date" + }, + "start_time_sm": { + "field_type": "scientific_metadata", + "machine_name": "start_time", + "human_name": "Start Time", + "value": "", + "type": "date" + }, + "end_time_sm": { + "field_type": "scientific_metadata", + "machine_name": "end_time", + "human_name": "End Time", + "value": "", + "type": "date" + }, + "run_number_sm": { + "field_type": "scientific_metadata", + "machine_name": "run_number", + "human_name": "Run Number", + "value": "", + "type": "integer" + }, + "job_id": { + "field_type": "scientific_metadata", + "machine_name": "job_id", + "human_name": "ESS Data Collection Job Id", + "value": "", + "type": "string" + }, + "acquisition_team_members": { + "field_type": "scientific_metadata", + "machine_name": "acquisition_team_members", + "human_name": "Acquisition Team Members", + "value": "", + "type": "string" + }, + "owner_group": { + "field_type": "high_level", + "machine_name": "ownerGroup", + "value": "", + "type": "string" + }, + "access_groups": { + "field_type": "high_level", + "machine_name": "accessGroups", + "value": "", + "type": "string[]" + }, + "source_folder": { + "field_type": "high_level", + "machine_name": "sourceFolder", + "value": "", + "type": "string" + }, + "creation_time": { + "field_type": "high_level", + "machine_name": "creationTime", + "value": "", + "type": "date" + }, + "keywords": { + "field_type": "high_level", + "machine_name": "keywords", + "value": "", + "type": "string[]" + } + } +} diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py index a4cdd1d..5c9e910 100644 --- a/src/scicat_dataset.py +++ b/src/scicat_dataset.py @@ -168,8 +168,9 @@ def extract_variables_values( ) -> dict: variable_map = { "ingestor_run_id": str(uuid.uuid4()), - "filepath": pathlib.Path(config.nexus_file), + "data_file_path": pathlib.Path(config.nexus_file), "now": datetime.datetime.now(tz=datetime.UTC).isoformat(), + "ingestor_files_directory": config.ingestion.file_handling.ingestor_files_directory } for variable_name, variable_recipe in variables.items(): source = variable_recipe.source @@ -187,11 +188,7 @@ def extract_variables_values( ) elif isinstance(variable_recipe, ValueMetadataVariable): value = variable_recipe.value - value = ( - render_variable_value(value, variable_map) - if isinstance(value, str) - else value - ) + value = render_variable_value(value, variable_map) _operator = _get_operator(variable_recipe.operator) if variable_recipe.field is not None: value = _operator(value, variable_recipe.field) @@ -265,7 +262,8 @@ class ScicatDataset: accessGroups: list[str] | None = None startTime: str | None = None endTime: str | None = None - + runNumber: str | None = None + keywords: list[str] | None = None @dataclass(kw_only=True) class DataFileListItem: @@ -459,7 +457,11 @@ def _filter_by_field_type( return [field for field in schemas if field.field_type == field_type] -def _render_variable_as_type(value: str, variable_map: dict, dtype: str) -> Any: +def _render_variable_as_type( + value: Any, + variable_map: dict, + dtype: str +) -> Any: return convert_to_type(render_variable_value(value, variable_map), dtype) diff --git a/src/scicat_logging.py b/src/scicat_logging.py index d9ff480..3db56d1 100644 --- a/src/scicat_logging.py +++ b/src/scicat_logging.py @@ -47,7 +47,7 @@ def build_logger( # Add graylog handler if logging_options.graylog: - graylog_handler = graypy.GELFTCPHandler( + graylog_handler = graypy.GELFUDPHandler( logging_options.graylog_host, int(logging_options.graylog_port), facility=logging_options.graylog_facility, diff --git a/src/scicat_metadata.py b/src/scicat_metadata.py index eacaab0..503d153 100644 --- a/src/scicat_metadata.py +++ b/src/scicat_metadata.py @@ -6,6 +6,7 @@ from collections.abc import Callable from dataclasses import dataclass from importlib.metadata import entry_points +from typing import Any SCIENTIFIC_METADATA_TYPE = "scientific_metadata" HIGH_LEVEL_METADATA_TYPE = "high_level" @@ -140,20 +141,27 @@ def from_file(cls, schema_file_name: pathlib.Path) -> "MetadataSchema": return cls.from_dict(_load_json_schema(schema_file_name)) -def render_variable_value(var_value: str, variable_registry: dict) -> str: +def render_variable_value( + var_value: Any, + variable_registry: dict +) -> str: + # if input is not a string it converts it to string + output_value = var_value if isinstance(var_value,str) else json.dumps(var_value) + # If it is only one variable, then it is a simple replacement - if (var_key := var_value.removesuffix(">").removeprefix("<")) in variable_registry: + if (var_key := output_value.removesuffix(">").removeprefix("<")) in variable_registry: return variable_registry[var_key] # If it is a complex variable, then it is a combination of variables # similar to f-string in python for reg_var_name, reg_var_value in variable_registry.items(): - var_value = var_value.replace("<" + reg_var_name + ">", str(reg_var_value)) + output_value = output_value.replace("<" + reg_var_name + ">", str(reg_var_value)) if "<" in var_value and ">" in var_value: raise Exception(f"Unresolved variable: {var_value}") - return var_value + output_value = output_value if isinstance(var_value,str) else json.loads(output_value) + return output_value def collect_schemas(dir_path: pathlib.Path) -> OrderedDict[str, MetadataSchema]: diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py index d7d88b0..0ee9c30 100755 --- a/src/scicat_offline_ingestor.py +++ b/src/scicat_offline_ingestor.py @@ -139,6 +139,7 @@ def main() -> None: with h5py.File(nexus_file_path) as h5file: # load instrument metadata configuration metadata_schema = select_applicable_schema(nexus_file_path, schemas) + logger.info("Metadata Schema selected : %s (Id: %s)", metadata_schema.name, metadata_schema.id) # define variables values variable_map = extract_variables_values( diff --git a/test-data/README.md b/test-data/README.md index b784480..ed15fd1 100644 --- a/test-data/README.md +++ b/test-data/README.md @@ -18,7 +18,7 @@ with h5py.File('copied_coda.hdf', 'r+') as f: del instrument_gr[name] # Copy the rest of the file - with h5py.File('small_coda.hdf', 'w') as new_f: + with h5py.File('small-coda.hdf', 'w') as new_f: # copy everything f.copy('entry', new_f) @@ -39,7 +39,7 @@ with h5py.File('copied_ymir.hdf', 'r+') as f: del instrument_gr[name] # Copy the rest of the file - with h5py.File('small_ymir.hdf', 'w') as new_f: + with h5py.File('small-ymir.hdf', 'w') as new_f: # copy everything f.copy('entry', new_f) diff --git a/test-data/small_coda.hdf b/test-data/small-coda.hdf similarity index 100% rename from test-data/small_coda.hdf rename to test-data/small-coda.hdf diff --git a/test-data/small_ymir.hdf b/test-data/small-ymir.hdf similarity index 100% rename from test-data/small_ymir.hdf rename to test-data/small-ymir.hdf From 9f37a65296e06251de02540db4dce0e87ff07dde Mon Sep 17 00:00:00 2001 From: "pre-commit-ci-lite[bot]" <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com> Date: Thu, 7 Nov 2024 15:06:31 +0000 Subject: [PATCH 2/3] Apply automatic formatting --- src/scicat_dataset.py | 9 +++------ src/scicat_metadata.py | 19 +++++++++++-------- src/scicat_offline_ingestor.py | 6 +++++- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py index 5c9e910..523aac2 100644 --- a/src/scicat_dataset.py +++ b/src/scicat_dataset.py @@ -170,7 +170,7 @@ def extract_variables_values( "ingestor_run_id": str(uuid.uuid4()), "data_file_path": pathlib.Path(config.nexus_file), "now": datetime.datetime.now(tz=datetime.UTC).isoformat(), - "ingestor_files_directory": config.ingestion.file_handling.ingestor_files_directory + "ingestor_files_directory": config.ingestion.file_handling.ingestor_files_directory, } for variable_name, variable_recipe in variables.items(): source = variable_recipe.source @@ -265,6 +265,7 @@ class ScicatDataset: runNumber: str | None = None keywords: list[str] | None = None + @dataclass(kw_only=True) class DataFileListItem: path: str @@ -457,11 +458,7 @@ def _filter_by_field_type( return [field for field in schemas if field.field_type == field_type] -def _render_variable_as_type( - value: Any, - variable_map: dict, - dtype: str -) -> Any: +def _render_variable_as_type(value: Any, variable_map: dict, dtype: str) -> Any: return convert_to_type(render_variable_value(value, variable_map), dtype) diff --git a/src/scicat_metadata.py b/src/scicat_metadata.py index 503d153..ba9f60e 100644 --- a/src/scicat_metadata.py +++ b/src/scicat_metadata.py @@ -141,26 +141,29 @@ def from_file(cls, schema_file_name: pathlib.Path) -> "MetadataSchema": return cls.from_dict(_load_json_schema(schema_file_name)) -def render_variable_value( - var_value: Any, - variable_registry: dict -) -> str: +def render_variable_value(var_value: Any, variable_registry: dict) -> str: # if input is not a string it converts it to string - output_value = var_value if isinstance(var_value,str) else json.dumps(var_value) + output_value = var_value if isinstance(var_value, str) else json.dumps(var_value) # If it is only one variable, then it is a simple replacement - if (var_key := output_value.removesuffix(">").removeprefix("<")) in variable_registry: + if ( + var_key := output_value.removesuffix(">").removeprefix("<") + ) in variable_registry: return variable_registry[var_key] # If it is a complex variable, then it is a combination of variables # similar to f-string in python for reg_var_name, reg_var_value in variable_registry.items(): - output_value = output_value.replace("<" + reg_var_name + ">", str(reg_var_value)) + output_value = output_value.replace( + "<" + reg_var_name + ">", str(reg_var_value) + ) if "<" in var_value and ">" in var_value: raise Exception(f"Unresolved variable: {var_value}") - output_value = output_value if isinstance(var_value,str) else json.loads(output_value) + output_value = ( + output_value if isinstance(var_value, str) else json.loads(output_value) + ) return output_value diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py index 0ee9c30..a266d9f 100755 --- a/src/scicat_offline_ingestor.py +++ b/src/scicat_offline_ingestor.py @@ -139,7 +139,11 @@ def main() -> None: with h5py.File(nexus_file_path) as h5file: # load instrument metadata configuration metadata_schema = select_applicable_schema(nexus_file_path, schemas) - logger.info("Metadata Schema selected : %s (Id: %s)", metadata_schema.name, metadata_schema.id) + logger.info( + "Metadata Schema selected : %s (Id: %s)", + metadata_schema.name, + metadata_schema.id, + ) # define variables values variable_map = extract_variables_values( From 85e8ac2187c94c5c8574501b0572d2badc795dec Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Fri, 8 Nov 2024 13:53:51 +0100 Subject: [PATCH 3/3] Add new small-coda metadata file in the test. --- tests/test_scicat_metadata_schema.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tests/test_scicat_metadata_schema.py b/tests/test_scicat_metadata_schema.py index 04c7a20..e5ee699 100644 --- a/tests/test_scicat_metadata_schema.py +++ b/tests/test_scicat_metadata_schema.py @@ -54,12 +54,16 @@ def test_collect_metadata_schema() -> None: # Check if the schema is ordered by the schema order and name. # The expected keys are hardcoded on purpose. # Always hardcode the expected keys to avoid the test being too flexible. - assert list(schemas.keys()) == [ - "715ce7ba-3f91-11ef-932f-37a5c6fd60b1", # Coda, 1, Coda Metadata Schema - "72a991ee-437a-11ef-8fd2-1f95660accb7", # Dream, 1, dream Metadata Schema - "c5bed39a-4379-11ef-ba5a-ffbc783163b6", # Base, 1, Generic metadata schema - "891322f6-437a-11ef-980a-7bdc756bd0b3", # Loki, 1, Loki Metadata Schema - ] + assert ( + list(schemas.keys()) + == [ + "715ce7ba-3f91-11ef-932f-37a5c6fd60b1", # Coda, 1, Coda Metadata Schema + "72a991ee-437a-11ef-8fd2-1f95660accb7", # Dream, 1, dream Metadata Schema + "c5bed39a-4379-11ef-ba5a-ffbc783163b6", # Base, 1, Generic metadata schema + "891322f6-437a-11ef-980a-7bdc756bd0b3", # Loki, 1, Loki Metadata Schema + "628b28d6-9c26-11ef-948d-0b2d405fc82f", # Small-Coda, 110, Small-Coda Metadata Schema + ] + ) def test_metadata_schema_selection() -> None: