From 874e1ccbbe40ce1206d30e5b84275d9887877450 Mon Sep 17 00:00:00 2001
From: Max Novelli <max.novelli@ess.eu>
Date: Thu, 7 Nov 2024 16:04:39 +0100
Subject: [PATCH 1/3] All the fixes introduced while testing the production
 deployments

---
 resources/small-coda.imsc.json.example       | 272 +++++++++++++++++++
 src/scicat_dataset.py                        |  18 +-
 src/scicat_logging.py                        |   2 +-
 src/scicat_metadata.py                       |  16 +-
 src/scicat_offline_ingestor.py               |   1 +
 test-data/README.md                          |   4 +-
 test-data/{small_coda.hdf => small-coda.hdf} | Bin
 test-data/{small_ymir.hdf => small-ymir.hdf} | Bin
 8 files changed, 298 insertions(+), 15 deletions(-)
 create mode 100644 resources/small-coda.imsc.json.example
 rename test-data/{small_coda.hdf => small-coda.hdf} (100%)
 rename test-data/{small_ymir.hdf => small-ymir.hdf} (100%)

diff --git a/resources/small-coda.imsc.json.example b/resources/small-coda.imsc.json.example
new file mode 100644
index 0000000..74fbd63
--- /dev/null
+++ b/resources/small-coda.imsc.json.example
@@ -0,0 +1,272 @@
+{
+  "id" : "628b28d6-9c26-11ef-948d-0b2d405fc82f",
+  "name" : "Test Coda Metadata Schema",
+  "instrument": "coda",
+  "selector": "filename:starts_with:/ess/services/scicat-ingestor/software/test-data/small-coda",
+  "order": 110,
+  "variables" : {
+    "job_id": {
+      "source": "NXS",
+      "path": "/entry/entry_identifier_uuid",
+      "value_type": "string"
+    },
+    "pid": {
+      "source": "VALUE",
+      "value": "20.500.12269/<ingestor_run_id>",
+      "value_type": "string"
+    },
+    "proposal_id": {
+      "source": "NXS",
+      "path": "/entry/experiment_identifier",
+      "value_type": "string"
+    },
+    "proposal_data": {
+      "source": "SC",
+      "url": "proposals/<proposal_id>",
+      "field" : "",
+      "value_type": "dict"
+    },
+    "pi_firstname": {
+      "source": "VALUE",
+      "operator": "getitem",
+      "value": "<proposal_data>",
+      "field" : "pi_firstname",
+      "value_type": "string"
+    },
+    "pi_lastname": {
+      "source": "VALUE",
+      "operator": "getitem",
+      "value": "<proposal_data>",
+      "field" : "pi_lastname",
+      "value_type": "string"
+    },
+    "pi_email": {
+      "source": "VALUE",
+      "operator": "getitem",
+      "value": "<proposal_data>",
+      "field" : "pi_email",
+      "value_type": "string"
+    },
+    "dataset_original_name": {
+      "source": "NXS",
+      "path": "/entry/title",
+      "value_type": "string"
+    },
+    "dataset_name": {
+      "source": "VALUE",
+      "value" : "coda test - <dataset_original_name> - <ingestor_run_id>",
+      "value_type": "string"
+    },
+    "instrument_name": {
+      "source": "NXS",
+      "path": "/entry/instrument/name",
+      "value_type": "string"
+    },
+    "instruments_data": {
+      "source": "SC",
+      "url": "instruments?filter=%7B%22where%22%20%3A%20%7B%20%22name%22%20%3A%20%22coda%22%20%7D%20%7D",
+      "field": "",
+      "value_type": "list"
+    },
+    "instrument_data": {
+      "source": "VALUE",
+      "operator": "getitem",
+      "value": "<instruments_data>",
+      "field" : 0,
+      "value_type": "dict"
+    },
+    "instrument_id": {
+      "source": "VALUE",
+      "operator": "getitem",
+      "value": "<instrument_data>",
+      "field" : "id",
+      "value_type": "string"
+    },
+    "start_time": {
+      "source": "NXS",
+      "path": "/entry/start_time",
+      "value_type": "date"
+    },
+    "end_time": {
+      "source": "NXS",
+      "path": "/entry/end_time",
+      "value_type": "date"
+    },
+    "run_number": {
+      "source": "NXS",
+      "path": "/entry/entry_identifier",
+      "value_type": "integer"
+    },
+    "acquisition_team_members_list": {
+      "source": "NXS",
+      "path" : "/entry/user_*/name",
+      "value_type": "string[]"
+    },
+    "acquisition_team_members": {
+      "source": "VALUE",
+      "operator" : "join_with_space",
+      "value" : "<acquisition_team_members_list>",
+      "value_type": "string"
+    },
+    "owner_group": {
+      "source": "VALUE",
+      "value": "<proposal_id>",
+      "value_type": "string"
+    },
+    "access_groups": {
+      "source": "VALUE",
+      "value": ["scientific information management systems group"],
+      "value_type": "string[]"
+    },
+    "source_folder": {
+      "source": "VALUE",
+      "operator": "dirname",
+      "value": "<data_file_path>",
+      "value_type": "string"
+    },
+    "keywords" : {
+      "source": "VALUE",
+      "value": ["TEST CODA","Scicat Ingestor 05","TEST RUN","CODA","<instrument_name>","CODA <instrument_name>"],
+      "value_type": "string[]"
+    }
+  },
+  "schema": {
+    "pid": {
+      "field_type": "high_level",
+      "machine_name": "pid",
+      "value": "<pid>",
+      "type": "string"
+    },
+    "type" : {
+      "field_type": "high_level",
+      "machine_name": "type",
+      "value": "raw",
+      "type": "string"
+    },
+    "proposal_id": {
+      "field_type": "high_level",
+      "machine_name": "proposalId",
+      "value": "<proposal_id>",
+      "type": "string"
+    },
+    "dataset_name": {
+      "field_type": "high_level",
+      "machine_name": "datasetName",
+      "value": "<dataset_name>",
+      "type": "string"
+    },
+    "principal_investigator": {
+      "field_type": "high_level",
+      "machine_name": "principalInvestigator",
+      "value": "<pi_firstname> <pi_lastname>",
+      "type": "string"
+    },
+    "owner": {
+      "field_type": "high_level",
+      "machine_name": "owner",
+      "value": "<pi_firstname> <pi_lastname>",
+      "type": "string"
+    },
+    "owner_email": {
+      "field_type": "high_level",
+      "machine_name": "ownerEmail",
+      "value": "<pi_email>",
+      "type": "email"
+    },
+    "contact_email": {
+      "field_type": "high_level",
+      "machine_name": "contactEmail",
+      "value": "<pi_email>",
+      "type": "email"
+    },
+    "instrument_id": {
+      "field_type": "high_level",
+      "machine_name": "instrumentId",
+      "value": "<instrument_id>",
+      "type": "string"
+    },
+    "creation_location": {
+      "field_type": "high_level",
+      "machine_name": "creationLocation",
+      "value": "ESS:CODA:<instrument_name>",
+      "type": "string"
+    },
+    "start_time_hl": {
+      "field_type": "high_level",
+      "machine_name": "startTime",
+      "value": "<start_time>",
+      "type": "date"
+    },
+    "end_time_hl": {
+      "field_type": "high_level",
+      "machine_name": "endTime",
+      "value": "<end_time>",
+      "type": "date"
+    },
+    "start_time_sm": {
+      "field_type": "scientific_metadata",
+      "machine_name": "start_time",
+      "human_name": "Start Time",
+      "value": "<start_time>",
+      "type": "date"
+    },
+    "end_time_sm": {
+      "field_type": "scientific_metadata",
+      "machine_name": "end_time",
+      "human_name": "End Time",
+      "value": "<end_time>",
+      "type": "date"
+    },
+    "run_number_sm": {
+      "field_type": "scientific_metadata",
+      "machine_name": "run_number",
+      "human_name": "Run Number",
+      "value": "<run_number>",
+      "type": "integer"
+    },
+    "job_id": {
+      "field_type": "scientific_metadata",
+      "machine_name": "job_id",
+      "human_name": "ESS Data Collection Job Id",
+      "value": "<job_id>",
+      "type": "string"
+    },
+    "acquisition_team_members": {
+      "field_type": "scientific_metadata",
+      "machine_name": "acquisition_team_members",
+      "human_name": "Acquisition Team Members",
+      "value": "<acquisition_team_members>",
+      "type": "string"
+    },
+    "owner_group": {
+      "field_type": "high_level",
+      "machine_name": "ownerGroup",
+      "value": "<owner_group>",
+      "type": "string"
+    },
+    "access_groups": {
+      "field_type": "high_level",
+      "machine_name": "accessGroups",
+      "value": "<access_groups>",
+      "type": "string[]"
+    },
+    "source_folder": {
+      "field_type": "high_level",
+      "machine_name": "sourceFolder",
+      "value": "<source_folder>",
+      "type": "string"
+    },
+    "creation_time": {
+      "field_type": "high_level",
+      "machine_name": "creationTime",
+      "value": "<now>",
+      "type": "date"
+    },
+    "keywords": {
+      "field_type": "high_level",
+      "machine_name": "keywords",
+      "value": "<keywords>",
+      "type": "string[]"
+    }
+  }
+}
diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py
index a4cdd1d..5c9e910 100644
--- a/src/scicat_dataset.py
+++ b/src/scicat_dataset.py
@@ -168,8 +168,9 @@ def extract_variables_values(
 ) -> dict:
     variable_map = {
         "ingestor_run_id": str(uuid.uuid4()),
-        "filepath": pathlib.Path(config.nexus_file),
+        "data_file_path": pathlib.Path(config.nexus_file),
         "now": datetime.datetime.now(tz=datetime.UTC).isoformat(),
+        "ingestor_files_directory": config.ingestion.file_handling.ingestor_files_directory
     }
     for variable_name, variable_recipe in variables.items():
         source = variable_recipe.source
@@ -187,11 +188,7 @@ def extract_variables_values(
             )
         elif isinstance(variable_recipe, ValueMetadataVariable):
             value = variable_recipe.value
-            value = (
-                render_variable_value(value, variable_map)
-                if isinstance(value, str)
-                else value
-            )
+            value = render_variable_value(value, variable_map)
             _operator = _get_operator(variable_recipe.operator)
             if variable_recipe.field is not None:
                 value = _operator(value, variable_recipe.field)
@@ -265,7 +262,8 @@ class ScicatDataset:
     accessGroups: list[str] | None = None
     startTime: str | None = None
     endTime: str | None = None
-
+    runNumber: str | None = None
+    keywords: list[str] | None = None
 
 @dataclass(kw_only=True)
 class DataFileListItem:
@@ -459,7 +457,11 @@ def _filter_by_field_type(
     return [field for field in schemas if field.field_type == field_type]
 
 
-def _render_variable_as_type(value: str, variable_map: dict, dtype: str) -> Any:
+def _render_variable_as_type(
+        value: Any,
+        variable_map: dict,
+        dtype: str
+) -> Any:
     return convert_to_type(render_variable_value(value, variable_map), dtype)
 
 
diff --git a/src/scicat_logging.py b/src/scicat_logging.py
index d9ff480..3db56d1 100644
--- a/src/scicat_logging.py
+++ b/src/scicat_logging.py
@@ -47,7 +47,7 @@ def build_logger(
 
     # Add graylog handler
     if logging_options.graylog:
-        graylog_handler = graypy.GELFTCPHandler(
+        graylog_handler = graypy.GELFUDPHandler(
             logging_options.graylog_host,
             int(logging_options.graylog_port),
             facility=logging_options.graylog_facility,
diff --git a/src/scicat_metadata.py b/src/scicat_metadata.py
index eacaab0..503d153 100644
--- a/src/scicat_metadata.py
+++ b/src/scicat_metadata.py
@@ -6,6 +6,7 @@
 from collections.abc import Callable
 from dataclasses import dataclass
 from importlib.metadata import entry_points
+from typing import Any
 
 SCIENTIFIC_METADATA_TYPE = "scientific_metadata"
 HIGH_LEVEL_METADATA_TYPE = "high_level"
@@ -140,20 +141,27 @@ def from_file(cls, schema_file_name: pathlib.Path) -> "MetadataSchema":
         return cls.from_dict(_load_json_schema(schema_file_name))
 
 
-def render_variable_value(var_value: str, variable_registry: dict) -> str:
+def render_variable_value(
+        var_value: Any,
+        variable_registry: dict
+) -> str:
+    # if input is not a string it converts it to string
+    output_value = var_value if isinstance(var_value,str) else json.dumps(var_value)
+
     # If it is only one variable, then it is a simple replacement
-    if (var_key := var_value.removesuffix(">").removeprefix("<")) in variable_registry:
+    if (var_key := output_value.removesuffix(">").removeprefix("<")) in variable_registry:
         return variable_registry[var_key]
 
     # If it is a complex variable, then it is a combination of variables
     # similar to f-string in python
     for reg_var_name, reg_var_value in variable_registry.items():
-        var_value = var_value.replace("<" + reg_var_name + ">", str(reg_var_value))
+        output_value = output_value.replace("<" + reg_var_name + ">", str(reg_var_value))
 
     if "<" in var_value and ">" in var_value:
         raise Exception(f"Unresolved variable: {var_value}")
 
-    return var_value
+    output_value = output_value if isinstance(var_value,str) else json.loads(output_value)
+    return output_value
 
 
 def collect_schemas(dir_path: pathlib.Path) -> OrderedDict[str, MetadataSchema]:
diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py
index d7d88b0..0ee9c30 100755
--- a/src/scicat_offline_ingestor.py
+++ b/src/scicat_offline_ingestor.py
@@ -139,6 +139,7 @@ def main() -> None:
         with h5py.File(nexus_file_path) as h5file:
             # load instrument metadata configuration
             metadata_schema = select_applicable_schema(nexus_file_path, schemas)
+            logger.info("Metadata Schema selected : %s (Id: %s)", metadata_schema.name, metadata_schema.id)
 
             # define variables values
             variable_map = extract_variables_values(
diff --git a/test-data/README.md b/test-data/README.md
index b784480..ed15fd1 100644
--- a/test-data/README.md
+++ b/test-data/README.md
@@ -18,7 +18,7 @@ with h5py.File('copied_coda.hdf', 'r+') as f:
             del instrument_gr[name]
 
     # Copy the rest of the file
-    with h5py.File('small_coda.hdf', 'w') as new_f:
+    with h5py.File('small-coda.hdf', 'w') as new_f:
         # copy everything
         f.copy('entry', new_f)
 
@@ -39,7 +39,7 @@ with h5py.File('copied_ymir.hdf', 'r+') as f:
             del instrument_gr[name]
 
     # Copy the rest of the file
-    with h5py.File('small_ymir.hdf', 'w') as new_f:
+    with h5py.File('small-ymir.hdf', 'w') as new_f:
         # copy everything
         f.copy('entry', new_f)
 
diff --git a/test-data/small_coda.hdf b/test-data/small-coda.hdf
similarity index 100%
rename from test-data/small_coda.hdf
rename to test-data/small-coda.hdf
diff --git a/test-data/small_ymir.hdf b/test-data/small-ymir.hdf
similarity index 100%
rename from test-data/small_ymir.hdf
rename to test-data/small-ymir.hdf

From 9f37a65296e06251de02540db4dce0e87ff07dde Mon Sep 17 00:00:00 2001
From: "pre-commit-ci-lite[bot]"
 <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
Date: Thu, 7 Nov 2024 15:06:31 +0000
Subject: [PATCH 2/3] Apply automatic formatting

---
 src/scicat_dataset.py          |  9 +++------
 src/scicat_metadata.py         | 19 +++++++++++--------
 src/scicat_offline_ingestor.py |  6 +++++-
 3 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py
index 5c9e910..523aac2 100644
--- a/src/scicat_dataset.py
+++ b/src/scicat_dataset.py
@@ -170,7 +170,7 @@ def extract_variables_values(
         "ingestor_run_id": str(uuid.uuid4()),
         "data_file_path": pathlib.Path(config.nexus_file),
         "now": datetime.datetime.now(tz=datetime.UTC).isoformat(),
-        "ingestor_files_directory": config.ingestion.file_handling.ingestor_files_directory
+        "ingestor_files_directory": config.ingestion.file_handling.ingestor_files_directory,
     }
     for variable_name, variable_recipe in variables.items():
         source = variable_recipe.source
@@ -265,6 +265,7 @@ class ScicatDataset:
     runNumber: str | None = None
     keywords: list[str] | None = None
 
+
 @dataclass(kw_only=True)
 class DataFileListItem:
     path: str
@@ -457,11 +458,7 @@ def _filter_by_field_type(
     return [field for field in schemas if field.field_type == field_type]
 
 
-def _render_variable_as_type(
-        value: Any,
-        variable_map: dict,
-        dtype: str
-) -> Any:
+def _render_variable_as_type(value: Any, variable_map: dict, dtype: str) -> Any:
     return convert_to_type(render_variable_value(value, variable_map), dtype)
 
 
diff --git a/src/scicat_metadata.py b/src/scicat_metadata.py
index 503d153..ba9f60e 100644
--- a/src/scicat_metadata.py
+++ b/src/scicat_metadata.py
@@ -141,26 +141,30 @@ def from_file(cls, schema_file_name: pathlib.Path) -> "MetadataSchema":
         return cls.from_dict(_load_json_schema(schema_file_name))
 
 
-def render_variable_value(
-        var_value: Any,
-        variable_registry: dict
-) -> str:
+def render_variable_value(var_value: Any, variable_registry: dict) -> Any:
     # if input is not a string it converts it to string
-    output_value = var_value if isinstance(var_value,str) else json.dumps(var_value)
+    output_value = var_value if isinstance(var_value, str) else json.dumps(var_value)
 
     # If it is only one variable, then it is a simple replacement
-    if (var_key := output_value.removesuffix(">").removeprefix("<")) in variable_registry:
+    if (
+        var_key := output_value.removesuffix(">").removeprefix("<")
+    ) in variable_registry:
         return variable_registry[var_key]
 
     # If it is a complex variable, then it is a combination of variables
     # similar to f-string in python
     for reg_var_name, reg_var_value in variable_registry.items():
-        output_value = output_value.replace("<" + reg_var_name + ">", str(reg_var_value))
+        output_value = output_value.replace(
+            "<" + reg_var_name + ">", str(reg_var_value)
+        )
 
-    if "<" in var_value and ">" in var_value:
-        raise Exception(f"Unresolved variable: {var_value}")
+    # Check the rendered text: the raw input always keeps its placeholders.
+    if "<" in output_value and ">" in output_value:
+        raise Exception(f"Unresolved variable: {output_value}")
 
-    output_value = output_value if isinstance(var_value,str) else json.loads(output_value)
+    output_value = (
+        output_value if isinstance(var_value, str) else json.loads(output_value)
+    )
     return output_value
 
 
diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py
index 0ee9c30..a266d9f 100755
--- a/src/scicat_offline_ingestor.py
+++ b/src/scicat_offline_ingestor.py
@@ -139,7 +139,11 @@ def main() -> None:
         with h5py.File(nexus_file_path) as h5file:
             # load instrument metadata configuration
             metadata_schema = select_applicable_schema(nexus_file_path, schemas)
-            logger.info("Metadata Schema selected : %s (Id: %s)", metadata_schema.name, metadata_schema.id)
+            logger.info(
+                "Metadata Schema selected : %s (Id: %s)",
+                metadata_schema.name,
+                metadata_schema.id,
+            )
 
             # define variables values
             variable_map = extract_variables_values(

From 85e8ac2187c94c5c8574501b0572d2badc795dec Mon Sep 17 00:00:00 2001
From: YooSunyoung <luysunyoung9@gmail.com>
Date: Fri, 8 Nov 2024 13:53:51 +0100
Subject: [PATCH 3/3] Add new small-coda metadata file in the test.

---
 tests/test_scicat_metadata_schema.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/tests/test_scicat_metadata_schema.py b/tests/test_scicat_metadata_schema.py
index 04c7a20..e5ee699 100644
--- a/tests/test_scicat_metadata_schema.py
+++ b/tests/test_scicat_metadata_schema.py
@@ -54,12 +54,16 @@ def test_collect_metadata_schema() -> None:
     # Check if the schema is ordered by the schema order and name.
     # The expected keys are hardcoded on purpose.
     # Always hardcode the expected keys to avoid the test being too flexible.
-    assert list(schemas.keys()) == [
-        "715ce7ba-3f91-11ef-932f-37a5c6fd60b1",  # Coda, 1, Coda Metadata Schema
-        "72a991ee-437a-11ef-8fd2-1f95660accb7",  # Dream, 1, dream Metadata Schema
-        "c5bed39a-4379-11ef-ba5a-ffbc783163b6",  # Base, 1, Generic metadata schema
-        "891322f6-437a-11ef-980a-7bdc756bd0b3",  # Loki, 1, Loki Metadata Schema
-    ]
+    assert (
+        list(schemas.keys())
+        == [
+            "715ce7ba-3f91-11ef-932f-37a5c6fd60b1",  # Coda, 1, Coda Metadata Schema
+            "72a991ee-437a-11ef-8fd2-1f95660accb7",  # Dream, 1, dream Metadata Schema
+            "c5bed39a-4379-11ef-ba5a-ffbc783163b6",  # Base, 1, Generic metadata schema
+            "891322f6-437a-11ef-980a-7bdc756bd0b3",  # Loki, 1, Loki Metadata Schema
+            "628b28d6-9c26-11ef-948d-0b2d405fc82f",  # Coda, 110, Test Coda Metadata Schema
+        ]
+    )
 
 
 def test_metadata_schema_selection() -> None: