From e658db560cfb0866f062d13f405b32451044dd15 Mon Sep 17 00:00:00 2001 From: Max Novelli Date: Wed, 30 Oct 2024 14:58:50 +0100 Subject: [PATCH 1/9] chaged literal_eval to eval --- src/scicat_dataset.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py index df81824..f0e4df2 100644 --- a/src/scicat_dataset.py +++ b/src/scicat_dataset.py @@ -93,7 +93,7 @@ def convert_to_type(input_value: Any, dtype_desc: str) -> Any: "join_with_space": lambda value: ", ".join( ast.literal_eval(value) if isinstance(value, str) else value ), - "evaluate": lambda value: ast.literal_eval(value), + "evaluate": lambda value: eval(value), "filename": lambda value: os.path.basename(value), "dirname-2": lambda value: os.path.dirname(os.path.dirname(value)), } @@ -137,12 +137,13 @@ def extract_variables_values( if isinstance(variable_recipe, NexusFileMetadataVariable): value = _retrieve_values_from_file(variable_recipe, h5file) elif isinstance(variable_recipe, ScicatMetadataVariable): + full_endpoint_url = render_full_url( + render_variable_value(variable_recipe.url, variable_map), + config.scicat, + ) value = retrieve_value_from_scicat( config=config.scicat, - scicat_endpoint_url=render_full_url( - render_variable_value(variable_recipe.url, variable_map), - config.scicat, - ), + scicat_endpoint_url=full_endpoint_url, field_name=variable_recipe.field, ) elif isinstance(variable_recipe, ValueMetadataVariable): From 251aa883f2c202f0048df5f5d485dd70493f2459 Mon Sep 17 00:00:00 2001 From: Max Novelli Date: Wed, 30 Oct 2024 17:47:02 +0100 Subject: [PATCH 2/9] fixes of the day --- src/scicat_communication.py | 4 ++-- src/scicat_configuration.py | 1 + src/scicat_dataset.py | 6 +++++- src/scicat_metadata.py | 5 +++-- src/scicat_offline_ingestor.py | 2 +- 5 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/scicat_communication.py b/src/scicat_communication.py index 1257564..a63a474 100644 --- a/src/scicat_communication.py +++ b/src/scicat_communication.py @@ -112,9 +112,9 @@ def render_full_url( config: SciCatOptions, ) -> str: if not url.startswith("http://") and not url.startswith("https://"): - for endpoint in config.urls.keys(): + for endpoint in config.urls.__dict__.keys(): if url.startswith(endpoint): - url = url.replace(endpoint, config.urls[endpoint]) + url = url.replace(endpoint, config.urls.__getattribute__(endpoint)) break return url diff --git a/src/scicat_configuration.py b/src/scicat_configuration.py index 12de863..129a10a 100644 --- a/src/scicat_configuration.py +++ b/src/scicat_configuration.py @@ -204,6 +204,7 @@ class FileHandlingOptions: ingestor_files_directory: str = "../ingestor" message_to_file: bool = True message_file_extension: str = "message.json" + use_full_file_path: bool = False @dataclass(kw_only=True) diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py index f0e4df2..b43fb2f 100644 --- a/src/scicat_dataset.py +++ b/src/scicat_dataset.py @@ -95,6 +95,7 @@ def convert_to_type(input_value: Any, dtype_desc: str) -> Any: ), "evaluate": lambda value: eval(value), "filename": lambda value: os.path.basename(value), + "dirname": lambda value: os.path.dirname(value), "dirname-2": lambda value: os.path.dirname(os.path.dirname(value)), } ) @@ -129,6 +130,7 @@ def extract_variables_values( config: OfflineIngestorConfig, ) -> dict: variable_map = { + "ingestor_run_id": str(uuid.uuid4()), "filepath": pathlib.Path(config.nexus_file), "now": datetime.datetime.now(tz=datetime.UTC).isoformat(), } @@ -166,7 +168,7 @@ def extract_paths_from_h5_file( _path: list[str], ) -> list[str]: master_key = _path.pop(0) - output_paths = [master_key] + output_paths = [] if "*" in master_key: temp_keys = [k2 for k2 in _h5_object.keys() if re.search(master_key, k2)] for key in temp_keys: @@ -217,6 +219,8 @@ class ScicatDataset: proposalId: str | None = None ownerGroup: str | None = None accessGroups: list[str] | None = None + startTime: str | None = None + endTime: str | None = None @dataclass(kw_only=True) diff --git a/src/scicat_metadata.py b/src/scicat_metadata.py index 9006249..6d08c17 100644 --- a/src/scicat_metadata.py +++ b/src/scicat_metadata.py @@ -110,6 +110,7 @@ class MetadataSchema: name: str instrument: str selector: str | dict + order: int variables: dict[str, MetadataSchemaVariable] schema: dict[str, MetadataItem] @@ -158,11 +159,11 @@ def collect_schemas(dir_path: pathlib.Path) -> OrderedDict[str, MetadataSchema]: MetadataSchema.from_file(schema_file_path) for schema_file_path in list_schema_file_names(dir_path) ], - key=lambda schema: schema.name, + key=lambda schema: (schema.order, schema.name), ) schemas = OrderedDict() for metadata_schema in metadata_schemas: - schemas[metadata_schema.name] = metadata_schema + schemas[metadata_schema.id] = metadata_schema return schemas diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py index d240688..c0f8c4d 100755 --- a/src/scicat_offline_ingestor.py +++ b/src/scicat_offline_ingestor.py @@ -150,7 +150,7 @@ def main() -> None: nexus_file=nexus_file_path, ingestor_directory=ingestor_directory, config=fh_options, - source_folder=variable_map["source_folder"], + source_folder=variable_map["source_folder"] if config.ingestion.file_handling.use_full_file_path==False else "", logger=logger, # TODO: add done_writing_message_file and nexus_structure_file ) From 08d6534e7d90d11c00daaba613cf388ca6343be2 Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Thu, 31 Oct 2024 13:35:35 +0100 Subject: [PATCH 3/9] Use asdict instead of hidden interface. --- src/scicat_communication.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/scicat_communication.py b/src/scicat_communication.py index a63a474..f7f9f7a 100644 --- a/src/scicat_communication.py +++ b/src/scicat_communication.py @@ -2,6 +2,7 @@ # Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject) import json import logging +from dataclasses import asdict from typing import Any from urllib.parse import quote, urljoin @@ -107,15 +108,13 @@ def create_scicat_origdatablock( return result -def render_full_url( - url: str, - config: SciCatOptions, -) -> str: +def render_full_url(url: str, config: SciCatOptions) -> str: + urls = asdict(config.urls) if not url.startswith("http://") and not url.startswith("https://"): - for endpoint in config.urls.__dict__.keys(): + for endpoint in urls.keys(): if url.startswith(endpoint): - url = url.replace(endpoint, config.urls.__getattribute__(endpoint)) - break + return url.replace(endpoint, urls[endpoint]) + return url From d4ad32486591258d9abd4c2b7cc92186eecf7cb6 Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Thu, 31 Oct 2024 13:36:05 +0100 Subject: [PATCH 4/9] Add synchronized config file. --- resources/config.sample.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/resources/config.sample.json b/resources/config.sample.json index b6718ac..be1953a 100644 --- a/resources/config.sample.json +++ b/resources/config.sample.json @@ -27,7 +27,8 @@ "hash_file_extension": "b2b", "ingestor_files_directory": "../ingestor", "message_to_file": true, - "message_file_extension": "message.json" + "message_file_extension": "message.json", + "use_full_file_path": false } }, "kafka": { From 1b0fe49a04c185bd1dcbe894769b32d91f4d9994 Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Thu, 31 Oct 2024 13:41:47 +0100 Subject: [PATCH 5/9] Fix formatting. --- src/scicat_offline_ingestor.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py index c0f8c4d..abc2095 100755 --- a/src/scicat_offline_ingestor.py +++ b/src/scicat_offline_ingestor.py @@ -146,11 +146,16 @@ def main() -> None: ) # Collect data-file descriptions + if not config.ingestion.file_handling.use_full_file_path: + source_folder = variable_map["source_folder"] + else: + source_folder = None + data_file_list = create_data_file_list( nexus_file=nexus_file_path, ingestor_directory=ingestor_directory, config=fh_options, - source_folder=variable_map["source_folder"] if config.ingestion.file_handling.use_full_file_path==False else "", + source_folder=source_folder, logger=logger, # TODO: add done_writing_message_file and nexus_structure_file ) From 88571d900acff81f9f7816a73f9f6f070d87f89c Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Thu, 31 Oct 2024 15:03:44 +0100 Subject: [PATCH 6/9] Metadata operator update. --- src/scicat_dataset.py | 21 +++++++++++++++++++-- src/scicat_metadata.py | 8 ++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py index b43fb2f..afbff04 100644 --- a/src/scicat_dataset.py +++ b/src/scicat_dataset.py @@ -60,6 +60,15 @@ def to_date(value: Any) -> str | None: def to_dict(value: Any) -> dict: + if isinstance(value, str): + result = ast.literal_eval(value) + if isinstance(result, dict): + return result + else: + raise ValueError( + "Invalid value. Must be able to convert to a dictionary. Got ", value + ) + return dict(value) @@ -93,10 +102,13 @@ def convert_to_type(input_value: Any, dtype_desc: str) -> Any: "join_with_space": lambda value: ", ".join( ast.literal_eval(value) if isinstance(value, str) else value ), - "evaluate": lambda value: eval(value), + # "evaluate": lambda value: ast.literal_eval(value), "filename": lambda value: os.path.basename(value), "dirname": lambda value: os.path.dirname(value), "dirname-2": lambda value: os.path.dirname(os.path.dirname(value)), + "getitem": lambda value, key: value[ + key + ], # The only operator that takes an argument } ) @@ -155,7 +167,12 @@ def extract_variables_values( if isinstance(value, str) else value ) - value = _get_operator(variable_recipe.operator)(value) + _operator = _get_operator(variable_recipe.operator) + if variable_recipe.argument: + value = _operator(value, variable_recipe.argument) + else: + value = _operator(value) + else: raise Exception("Invalid variable source: ", source) variable_map[variable_name] = convert_to_type(value, variable_recipe.value_type) diff --git a/src/scicat_metadata.py b/src/scicat_metadata.py index 6d08c17..fcce189 100644 --- a/src/scicat_metadata.py +++ b/src/scicat_metadata.py @@ -67,6 +67,8 @@ class ValueMetadataVariable(MetadataSchemaVariable): operator: str = "" value: str + argument: str | None = None + # We only allow one argument for now @dataclass(kw_only=True) @@ -139,6 +141,12 @@ def from_file(cls, schema_file_name: pathlib.Path) -> "MetadataSchema": def render_variable_value(var_value: str, variable_registry: dict) -> str: + # If it is only one variable, then it is a simple replacement + if (var_key := var_value.removesuffix(">").removeprefix("<")) in variable_registry: + return variable_registry[var_key] + + # If it is a complex variable, then it is a combination of variables + # similar to f-string in python for reg_var_name, reg_var_value in variable_registry.items(): var_value = var_value.replace("<" + reg_var_name + ">", str(reg_var_value)) From 3b76f7a5d0289d8469144b4fb7606a54831e9875 Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Thu, 31 Oct 2024 15:26:05 +0100 Subject: [PATCH 7/9] Rename argument field. --- src/scicat_dataset.py | 4 ++-- src/scicat_metadata.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py index afbff04..6f67b46 100644 --- a/src/scicat_dataset.py +++ b/src/scicat_dataset.py @@ -168,8 +168,8 @@ def extract_variables_values( else value ) _operator = _get_operator(variable_recipe.operator) - if variable_recipe.argument: - value = _operator(value, variable_recipe.argument) + if variable_recipe.field: + value = _operator(value, variable_recipe.field) else: value = _operator(value) diff --git a/src/scicat_metadata.py b/src/scicat_metadata.py index fcce189..a77233d 100644 --- a/src/scicat_metadata.py +++ b/src/scicat_metadata.py @@ -67,8 +67,8 @@ class ValueMetadataVariable(MetadataSchemaVariable): operator: str = "" value: str - argument: str | None = None - # We only allow one argument for now + field: str | None = None + # We only allow one field(argument) for now @dataclass(kw_only=True) From 65470ba46f3c8287a7e3d916c378de5a02a593ab Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Fri, 1 Nov 2024 10:37:49 +0100 Subject: [PATCH 8/9] Fix metadata tests and examples. --- resources/base.imsc.json.example | 1 + resources/coda.imsc.json.example | 1 + resources/dream.imsc.json.example | 1 + resources/loki.imsc.json.example | 1 + src/scicat_metadata.py | 4 +++- tests/test_scicat_metadata_schema.py | 18 +++++++++++++++--- 6 files changed, 22 insertions(+), 4 deletions(-) diff --git a/resources/base.imsc.json.example b/resources/base.imsc.json.example index 6afc927..2033913 100644 --- a/resources/base.imsc.json.example +++ b/resources/base.imsc.json.example @@ -1,4 +1,5 @@ { + "order": 1, "id": "c5bed39a-4379-11ef-ba5a-ffbc783163b6", "name" : "Generic metadata schema", "instrument" : "", diff --git a/resources/coda.imsc.json.example b/resources/coda.imsc.json.example index f0efeb6..da2d12f 100644 --- a/resources/coda.imsc.json.example +++ b/resources/coda.imsc.json.example @@ -1,4 +1,5 @@ { + "order": 1, "id" : "715ce7ba-3f91-11ef-932f-37a5c6fd60b1", "name" : "Coda Metadata Schema", "instrument": "coda", diff --git a/resources/dream.imsc.json.example b/resources/dream.imsc.json.example index 37ed6d4..6a0c36d 100644 --- a/resources/dream.imsc.json.example +++ b/resources/dream.imsc.json.example @@ -1,4 +1,5 @@ { + "order": 1, "id" : "72a991ee-437a-11ef-8fd2-1f95660accb7", "name" : "dream Metadata Schema", "instrument": "dream", diff --git a/resources/loki.imsc.json.example b/resources/loki.imsc.json.example index cc9e813..625e86b 100644 --- a/resources/loki.imsc.json.example +++ b/resources/loki.imsc.json.example @@ -1,4 +1,5 @@ { + "order": 1, "id" : "891322f6-437a-11ef-980a-7bdc756bd0b3", "name" : "Loki Metadata Schema", "instrument": "loki", diff --git a/src/scicat_metadata.py b/src/scicat_metadata.py index a77233d..eacaab0 100644 --- a/src/scicat_metadata.py +++ b/src/scicat_metadata.py @@ -167,7 +167,9 @@ def collect_schemas(dir_path: pathlib.Path) -> OrderedDict[str, MetadataSchema]: MetadataSchema.from_file(schema_file_path) for schema_file_path in list_schema_file_names(dir_path) ], - key=lambda schema: (schema.order, schema.name), + key=lambda schema: (schema.order, schema.name.capitalize()), + # name is capitalized to make sure that the order is + # alphabetically sorted in a non-case-sensitive way ) schemas = OrderedDict() for metadata_schema in metadata_schemas: diff --git a/tests/test_scicat_metadata_schema.py b/tests/test_scicat_metadata_schema.py index 7d8876c..04c7a20 100644 --- a/tests/test_scicat_metadata_schema.py +++ b/tests/test_scicat_metadata_schema.py @@ -48,17 +48,25 @@ def test_collect_metadata_schema() -> None: assert len(schemas) == len(ALL_SCHEMA_EXAMPLES) for schema_name, schema in schemas.items(): assert isinstance(schema, MetadataSchema) - assert schema_name == schema.name + assert schema_name == schema.id assert isinstance(schemas, OrderedDict) - # Check if the schema is ordered by the schema name - assert list(schemas.keys()) == sorted(schemas.keys()) + # Check if the schema is ordered by the schema order and name. + # The expected keys are hardcoded on purpose. + # Always hardcode the expected keys to avoid the test being too flexible. + assert list(schemas.keys()) == [ + "715ce7ba-3f91-11ef-932f-37a5c6fd60b1", # Coda, 1, Coda Metadata Schema + "72a991ee-437a-11ef-8fd2-1f95660accb7", # Dream, 1, dream Metadata Schema + "c5bed39a-4379-11ef-ba5a-ffbc783163b6", # Base, 1, Generic metadata schema + "891322f6-437a-11ef-980a-7bdc756bd0b3", # Loki, 1, Loki Metadata Schema + ] def test_metadata_schema_selection() -> None: schemas = OrderedDict( { "schema1": MetadataSchema( + order=1, id="schema1", name="Schema 1", instrument="", @@ -67,6 +75,7 @@ def test_metadata_schema_selection() -> None: schema={}, ), "schema2": MetadataSchema( + order=2, id="schema2", name="Schema 2", instrument="", @@ -75,6 +84,7 @@ def test_metadata_schema_selection() -> None: schema={}, ), "schema3": MetadataSchema( + order=3, id="schema3", name="Schema 3", instrument="", @@ -96,6 +106,7 @@ def test_metadata_schema_selection_wrong_selector_target_name_raises() -> None: OrderedDict( { "schema1": MetadataSchema( + order=1, id="schema1", name="Schema 1", instrument="", @@ -115,6 +126,7 @@ def test_metadata_schema_selection_wrong_selector_function_name_raises() -> None OrderedDict( { "schema1": MetadataSchema( + order=1, id="schema1", name="Schema 1", instrument="", From c9339856c194265d975f3784907798c568384ff1 Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Fri, 1 Nov 2024 10:55:07 +0100 Subject: [PATCH 9/9] Add comments. --- src/scicat_dataset.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py index 6f67b46..b10fad7 100644 --- a/src/scicat_dataset.py +++ b/src/scicat_dataset.py @@ -103,6 +103,13 @@ def convert_to_type(input_value: Any, dtype_desc: str) -> Any: ast.literal_eval(value) if isinstance(value, str) else value ), # "evaluate": lambda value: ast.literal_eval(value), + # We are not adding the evaluate function here since + # ``evaluate`` function should be avoided if possible. + # It might seem easy to use, but it is very easy to break + # when the input is not as expected. + # It is better to use the specific converters for the types. + # However, if it is the only way to go, you can add it here. + # Please add a comment to explain why it is needed. "filename": lambda value: os.path.basename(value), "dirname": lambda value: os.path.dirname(value), "dirname-2": lambda value: os.path.dirname(os.path.dirname(value)),