From e58ab5feb0ce6b8ceacae17041ba50b5325dacda Mon Sep 17 00:00:00 2001
From: Max Novelli
Date: Mon, 30 Sep 2024 16:52:32 +0200
Subject: [PATCH 1/7] work of the day

---
 src/scicat_communication.py    |  6 +++---
 src/scicat_dataset.py          | 24 ++++++++++++++++--------
 src/scicat_metadata.py         |  8 +++++---
 src/scicat_offline_ingestor.py |  7 ++++++-
 src/scicat_online_ingestor.py  |  3 +++
 5 files changed, 33 insertions(+), 15 deletions(-)
 mode change 100644 => 100755 src/scicat_offline_ingestor.py

diff --git a/src/scicat_communication.py b/src/scicat_communication.py
index d9216c0..aef829c 100644
--- a/src/scicat_communication.py
+++ b/src/scicat_communication.py
@@ -13,11 +13,11 @@ def retrieve_value_from_scicat(
     variable_url: str,  # It should be already rendered from variable_recipe["url"]
     field_name: str,  # variable_recipe["field"]
 ) -> str:
-    url = config.host.removesuffix('/') + variable_url
+    url = config.host.removesuffix('/') + "/" + variable_url
     response: dict = requests.get(
-        url, headers={"token": config.token}, timeout=config.timeout
+        url, headers={"Authorization": config.token}, timeout=config.timeout
     ).json()
-    return response[field_name]
+    return response[field_name] if field_name else response
 
 
 class ScicatDatasetAPIError(Exception):
diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py
index c465771..4429c48 100644
--- a/src/scicat_dataset.py
+++ b/src/scicat_dataset.py
@@ -47,6 +47,8 @@ def to_date(value: Any) -> str | None:
         return datetime.datetime.fromtimestamp(value, tz=datetime.UTC).isoformat()
     return None
 
+def to_dict(value: Any) -> dict:
+    return dict(value)
 
 _DtypeConvertingMap = MappingProxyType(
     {
@@ -55,6 +57,7 @@ def to_date(value: Any) -> str | None:
         "integer": to_integer,
         "float": to_float,
         "date": to_date,
+        "dict": to_dict,
         # TODO: Add email converter
     }
 )
@@ -73,6 +76,7 @@ def convert_to_type(input_value: Any, dtype_desc: str) -> Any:
     {
         "DO_NOTHING": lambda value: value,
         "join_with_space": lambda value: ", ".join(value),
+        "evaluate": lambda value: eval(value),
     }
 )
@@ -82,28 +86,32 @@ def _get_operator(operator: str | None) -> Callable:
 
 
 def extract_variables_values(
-    variables: dict[str, dict], h5file: h5py.File, config: SciCatOptions
+    variables: dict[str, dict],
+    h5file: h5py.File,
+    config: SciCatOptions
 ) -> dict:
     variable_map = {}
     for variable_name, variable_recipe in variables.items():
-        if (source := variable_recipe["source"]) == "NXS":
-            value = h5file[variable_recipe["path"]][...]
+        print(variable_name)
+        source = variable_recipe.source
+        if source == "NXS":
+            value = h5file[variable_recipe.path][...].item().decode('utf-8')
         elif source == "SC":
             value = retrieve_value_from_scicat(
                 config=config,
                 variable_url=render_variable_value(
-                    variable_recipe["url"], variable_map
+                    variable_recipe.url, variable_map
                 ),
-                field_name=variable_recipe["field"],
+                field_name=variable_recipe.field,
             )
         elif source == "VALUE":
-            value = _get_operator(variable_recipe.get("operator"))(
-                render_variable_value(variable_recipe["value"], variable_map)
+            value = _get_operator(variable_recipe.operator)(
+                render_variable_value(variable_recipe.value, variable_map)
             )
         else:
             raise Exception("Invalid variable source: ", source)
         variable_map[variable_name] = convert_to_type(
-            value, variable_recipe["value_type"]
+            value, variable_recipe.value_type
         )
     return variable_map
diff --git a/src/scicat_metadata.py b/src/scicat_metadata.py
index 0548f6a..2a01898 100644
--- a/src/scicat_metadata.py
+++ b/src/scicat_metadata.py
@@ -138,8 +138,8 @@ def from_file(cls, schema_file_name: pathlib.Path) -> "MetadataSchema":
 
 
 def render_variable_value(var_value: str, variable_registry: dict) -> str:
-    for var_name, var_value in variable_registry.items():
-        var_value = var_value.replace("<" + var_name + ">", str(var_value))
+    for reg_var_name, reg_var_value in variable_registry.items():
+        var_value = var_value.replace("<" + reg_var_name + ">", str(reg_var_value))
 
     if "<" in var_value and ">" in var_value:
         raise Exception(f"Unresolved variable: {var_value}")
@@ -167,7 +167,9 @@ def collect_schemas(dir_path: pathlib.Path) -> OrderedDict[str, MetadataSchema]:
 
 
 def select_applicable_schema(
-    nexus_file: pathlib.Path, schemas: OrderedDict[str, MetadataSchema]
+    nexus_file: pathlib.Path,
+    h5_file: any,
+    schemas: OrderedDict[str, MetadataSchema]
 ) -> MetadataSchema:
     """
     Evaluates which metadata schema configuration is applicable to ``nexus_file``.
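
A note on the render_variable_value hunk above: the old loop bound both the template and each registry entry to the same name, var_value, so every "<name>" placeholder was effectively replaced using itself. Below is a minimal standalone sketch of the fixed behaviour; the registry contents are invented for illustration, and the trailing return is assumed from how the function is used elsewhere in the series.

    def render_variable_value(var_value: str, variable_registry: dict) -> str:
        # Distinct loop names no longer shadow the template being rendered.
        for reg_var_name, reg_var_value in variable_registry.items():
            var_value = var_value.replace("<" + reg_var_name + ">", str(reg_var_value))
        if "<" in var_value and ">" in var_value:
            raise Exception(f"Unresolved variable: {var_value}")
        return var_value

    registry = {"instrument": "coda", "proposal_id": "123456"}
    assert render_variable_value("<instrument>/<proposal_id>", registry) == "coda/123456"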
diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py
old mode 100644
new mode 100755
index 638e1da..85cbd8e
--- a/src/scicat_offline_ingestor.py
+++ b/src/scicat_offline_ingestor.py
@@ -70,7 +70,9 @@ def main() -> None:
 
         # define variables values
         variable_map = extract_variables_values(
-            metadata_schema['variables'], h5file, config.scicat
+            metadata_schema.variables,
+            h5file,
+            config.scicat
         )
 
         # Collect data-file descriptions
@@ -124,3 +126,6 @@ def main() -> None:
                 scicat_origdatablock,
             )
             raise RuntimeError("Failed to create dataset or origdatablock.")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/src/scicat_online_ingestor.py b/src/scicat_online_ingestor.py
index b7e4101..5ebb9a0 100644
--- a/src/scicat_online_ingestor.py
+++ b/src/scicat_online_ingestor.py
@@ -164,3 +164,6 @@ def main() -> None:
         # check if we need to commit the individual message
         if config.kafka.individual_message_commit:
             _individual_message_commit(offline_ingestors, consumer, logger)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From 6c0c5cb9911be6465ebce46459fdf77d776a47c2 Mon Sep 17 00:00:00 2001
From: Max Novelli
Date: Wed, 2 Oct 2024 09:33:13 +0200
Subject: [PATCH 2/7] testing and more testing

---
 src/scicat_dataset.py | 45 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 39 insertions(+), 6 deletions(-)

diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py
index 4429c48..fcac009 100644
--- a/src/scicat_dataset.py
+++ b/src/scicat_dataset.py
@@ -22,7 +22,8 @@
     VALID_METADATA_TYPES,
     render_variable_value,
 )
-
+import re
+import copy
 
 def to_string(value: Any) -> str:
     return str(value)
@@ -75,7 +76,7 @@ def convert_to_type(input_value: Any, dtype_desc: str) -> Any:
 _OPERATOR_REGISTRY = MappingProxyType(
     {
         "DO_NOTHING": lambda value: value,
-        "join_with_space": lambda value: ", ".join(value),
+        "join_with_space": lambda value: ", ".join(eval(value) if isinstance(value,str) else value),
         "evaluate": lambda value: eval(value),
     }
 )
@@ -95,7 +96,18 @@ def extract_variables_values(
         print(variable_name)
         source = variable_recipe.source
         if source == "NXS":
-            value = h5file[variable_recipe.path][...].item().decode('utf-8')
+            path = variable_recipe.path
+            if "*" in path:
+                provided_path = path.split("/")[1:]
+                provided_path[0] = "/" + provided_path[0]
+                expanded_paths = extract_paths_from_h5_file(h5file,provided_path)
+                value = [
+                    h5file[p][...].item().decode("utf-8")
+                    for p
+                    in expanded_paths
+                ]
+            else:
+                value = h5file[path][...].item().decode("utf-8")
         elif source == "SC":
             value = retrieve_value_from_scicat(
                 config=config,
@@ -105,9 +117,9 @@ def extract_variables_values(
                 field_name=variable_recipe.field,
             )
         elif source == "VALUE":
-            value = _get_operator(variable_recipe.operator)(
-                render_variable_value(variable_recipe.value, variable_map)
-            )
+            value = variable_recipe.value
+            value = render_variable_value(value, variable_map) if isinstance(value,str) else value
+            value = _get_operator(variable_recipe.operator)(value)
         else:
             raise Exception("Invalid variable source: ", source)
         variable_map[variable_name] = convert_to_type(
@@ -115,6 +127,27 @@ def extract_variables_values(
         )
     return variable_map
 
+def extract_paths_from_h5_file(
+    h5_object: Any,
+    path: list[str],
+) -> list[str]:
+    master_key = path.pop(0)
+    output_paths = [master_key]
+    if "*" in master_key:
+        temp_keys = [k2 for k2 in list(h5_object.keys()) if re.search(master_key, k2)]
+        output_paths = []
+        for key in temp_keys:
+            output_paths += [
+                key + "/" + subkey
+                for subkey
+                in extract_paths_from_h5_file(h5_object[key], copy.deepcopy(path))
+            ]
+    else:
+        if path:
+            output_paths = [master_key + "/" + subkey for subkey in extract_paths_from_h5_file(h5_object[master_key],path)]
+
+    return output_paths
+
 
 @dataclass(kw_only=True)
 class TechniqueDesc:
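
The extract_paths_from_h5_file helper added in patch 2 expands a path whose segments may be regular expressions (any segment containing "*") into all matching concrete paths. A runnable sketch follows, with a plain dict of dicts standing in for the h5py.File, since only .keys() and item access are exercised; the group layout is invented for the demonstration.

    import copy
    import re

    def extract_paths_from_h5_file(h5_object, path):
        # Mirrors the patch: consume one segment, recurse into children.
        master_key = path.pop(0)
        output_paths = [master_key]
        if "*" in master_key:
            temp_keys = [k for k in list(h5_object.keys()) if re.search(master_key, k)]
            output_paths = []
            for key in temp_keys:
                output_paths += [
                    key + "/" + subkey
                    for subkey in extract_paths_from_h5_file(h5_object[key], copy.deepcopy(path))
                ]
        elif path:
            output_paths = [
                master_key + "/" + subkey
                for subkey in extract_paths_from_h5_file(h5_object[master_key], path)
            ]
        return output_paths

    tree = {"/entry": {"detector_0": {"name": "a"}, "detector_1": {"name": "b"}}}
    print(extract_paths_from_h5_file(tree, ["/entry", "detector_.*", "name"]))
    # ['/entry/detector_0/name', '/entry/detector_1/name']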
From aa9bcb37fab92089c7873b6fd8600826cd4bf82e Mon Sep 17 00:00:00 2001
From: Max Novelli
Date: Thu, 3 Oct 2024 13:31:50 +0200
Subject: [PATCH 3/7] tested until scicat dataset local creation

---
 src/scicat_dataset.py          | 179 +++++++++++++++++--------------
 src/scicat_offline_ingestor.py |   7 +-
 2 files changed, 110 insertions(+), 76 deletions(-)

diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py
index fcac009..09efe94 100644
--- a/src/scicat_dataset.py
+++ b/src/scicat_dataset.py
@@ -2,6 +2,7 @@
 # Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject)
 import datetime
 import logging
+import os.path
 import pathlib
 import uuid
 from collections.abc import Callable, Iterable
@@ -14,7 +15,7 @@
 from scicat_configuration import (
     DatasetOptions,
     FileHandlingOptions,
-    SciCatOptions,
+    SciCatOptions, OfflineIngestorConfig,
 )
 from scicat_metadata import (
     HIGH_LEVEL_METADATA_TYPE,
@@ -30,7 +31,7 @@ def to_string(value: Any) -> str:
 
 
 def to_string_array(value: list[Any]) -> list[str]:
-    return [str(v) for v in value]
+    return [str(v) for v in (eval(value) if isinstance(value, str) else value)]
 
 
 def to_integer(value: Any) -> int:
@@ -59,6 +60,7 @@ def to_date(value: Any) -> str | None:
         "float": to_float,
         "date": to_date,
         "dict": to_dict,
+        "email": to_string
         # TODO: Add email converter
     }
 )
@@ -68,7 +70,8 @@ def convert_to_type(input_value: Any, dtype_desc: str) -> Any:
     if (converter := _DtypeConvertingMap.get(dtype_desc)) is None:
         raise ValueError(
             "Invalid dtype description. Must be one of: ",
-            "string, string[], integer, float, date.\nGot: {dtype_desc}",
+            "string, string[], integer, float, date.",
+            f"Got: {dtype_desc}",
         )
     return converter(input_value)
@@ -78,6 +81,8 @@ def convert_to_type(input_value: Any, dtype_desc: str) -> Any:
         "DO_NOTHING": lambda value: value,
         "join_with_space": lambda value: ", ".join(eval(value) if isinstance(value,str) else value),
         "evaluate": lambda value: eval(value),
+        "filename": lambda value: os.path.basename(value),
+        "dirname-2": lambda value: os.path.dirname(os.path.dirname(value))
     }
 )
@@ -89,9 +94,12 @@ def _get_operator(operator: str | None) -> Callable:
 def extract_variables_values(
     variables: dict[str, dict],
     h5file: h5py.File,
-    config: SciCatOptions
+    config: OfflineIngestorConfig
 ) -> dict:
-    variable_map = {}
+    variable_map = {
+        "filepath" : pathlib.Path(config.nexus_file),
+        "now" : datetime.datetime.now().isoformat(),
+    }
     for variable_name, variable_recipe in variables.items():
         print(variable_name)
         source = variable_recipe.source
@@ -110,7 +118,7 @@ def extract_variables_values(
                 value = h5file[path][...].item().decode("utf-8")
         elif source == "SC":
             value = retrieve_value_from_scicat(
-                config=config,
+                config=config.scicat,
                 variable_url=render_variable_value(
                     variable_recipe.url, variable_map
                 ),
@@ -164,7 +172,7 @@ class ScicatDataset:
     numberOfFiles: int
     isPublished: bool = False
     datasetName: str
-    description: str
+    description: str = field(default=None)
     principalInvestigator: str
     creationLocation: str
     scientificMetadata: dict
@@ -174,18 +182,18 @@ class ScicatDataset:
     contactEmail: str
     creationTime: str
     type: str = "raw"
-    sampleId: str
+    sampleId: str = field(default=None)
     techniques: list[TechniqueDesc] = field(default_factory=list)
     instrumentId: str | None = None
     proposalId: str | None = None
     ownerGroup: str | None = None
-    accessGroup: list[str] | None = None
+    accessGroups: list[str] | None = None
 
 
 @dataclass(kw_only=True)
 class DataFileListItem:
     path: str
-    "Absolute path to the file."
+    "Relative path of the file to the source folder."
     size: int | None = None
     "Size of the single file in bytes."
     time: str
@@ -212,7 +220,7 @@ def _calculate_checksum(file_path: pathlib.Path, algorithm_name: str) -> str | N
     if not file_path.exists():
         return None
 
-    if algorithm_name != "b2blake":
+    if algorithm_name != "blake2b":
         raise ValueError(
             "Only b2blake hash algorithm is supported for now. Got: ",
             f"{algorithm_name}",
@@ -230,33 +238,59 @@ def _calculate_checksum(file_path: pathlib.Path, algorithm_name: str) -> str | N
 def _create_single_data_file_list_item(
     *,
     file_path: pathlib.Path,
-    calculate_checksum: bool,
+    compute_file_hash: bool,
     compute_file_stats: bool,
     file_hash_algorithm: str = "",
 ) -> DataFileListItem:
     """``DataFileListItem`` constructing helper."""
-    if file_path.exists() and compute_file_stats:
-        return DataFileListItem(
-            path=file_path.absolute().as_posix(),
-            size=(file_stats := file_path.stat()).st_size,
-            time=datetime.datetime.fromtimestamp(
-                file_stats.st_ctime, tz=datetime.UTC
-            ).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
-            chk=_calculate_checksum(file_path, file_hash_algorithm)
-            if calculate_checksum
-            else None,
-            uid=str(file_stats.st_uid),
-            gid=str(file_stats.st_gid),
-            perm=oct(file_stats.st_mode),
-        )
-    else:
-        return DataFileListItem(
-            path=file_path.absolute().as_posix(),
-            time=datetime.datetime.now(tz=datetime.UTC).strftime(
-                "%Y-%m-%dT%H:%M:%S.000Z"
-            ),
-        )
+    file_info = {
+        "path" : file_path.absolute().as_posix(),
+        "time" : datetime.datetime.now(tz=datetime.UTC).strftime(
+            "%Y-%m-%dT%H:%M:%S.000Z"
+        ),
+    }
+    if file_path.exists():
+        if compute_file_stats:
+            file_stats = file_path.stat()
+            file_info = {
+                **file_info,
+                **{
+                    "size" : file_stats.st_size,
+                    "time" : datetime.datetime.fromtimestamp(
+                        file_stats.st_ctime, tz=datetime.UTC
+                    ).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
+                    "uid" : str(file_stats.st_uid),
+                    "gid" : str(file_stats.st_gid),
+                    "perm" : oct(file_stats.st_mode),
+                }
+            }
+
+        if compute_file_hash:
+            file_info["chk"] = _calculate_checksum(file_path, file_hash_algorithm)
+
+    return DataFileListItem(**file_info)
+    # if file_path.exists() and compute_file_stats:
+    #     return DataFileListItem(
+    #         path=file_path.absolute().as_posix(),
+    #         size=(file_stats := file_path.stat()).st_size,
+    #         time=datetime.datetime.fromtimestamp(
+    #             file_stats.st_ctime, tz=datetime.UTC
+    #         ).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
+    #         chk=_calculate_checksum(file_path, file_hash_algorithm)
+    #         if compute_file_hash
+    #         else None,
+    #         uid=str(file_stats.st_uid),
+    #         gid=str(file_stats.st_gid),
+    #         perm=oct(file_stats.st_mode),
+    #     )
+    # else:
+    #     return DataFileListItem(
+    #         path=file_path.absolute().as_posix(),
+    #         time=datetime.datetime.now(tz=datetime.UTC).strftime(
+    #             "%Y-%m-%dT%H:%M:%S.000Z"
+    #         ),
+    #     )
 
 
 def _build_hash_path(
@@ -289,6 +323,7 @@ def create_data_file_list(
     nexus_structure_file: pathlib.Path | None = None,
     ingestor_directory: pathlib.Path,
     config: FileHandlingOptions,
+    source_folder: pathlib.Path | str | None = None,
     logger: logging.Logger,
 ) -> list[DataFileListItem]:
     """
@@ -316,6 +351,7 @@ def create_data_file_list(
         _create_single_data_file_list_item,
         file_hash_algorithm=config.file_hash_algorithm,
         compute_file_stats=config.compute_file_stats,
+        compute_file_hash=config.compute_file_hash
     )
 
     # Collect the files that will be ingested
@@ -331,7 +367,6 @@ def create_data_file_list(
         logger.info("Adding file %s to the datafiles list", minimum_file_path)
         new_file_item = single_file_constructor(
             file_path=minimum_file_path,
-            calculate_checksum=config.compute_file_hash,
         )
         data_file_list.append(new_file_item)
         if config.save_file_hash:
@@ -344,31 +379,29 @@ def create_data_file_list(
                 hash_file_extension=config.hash_file_extension,
             )
             logger.info("Saving hash into a file ... %s", hash_file_path)
-            if new_file_item.chk is not None:
-                _save_hash_file(
-                    original_file_instance=new_file_item, hash_path=hash_file_path
-                )
-                data_file_list.append(
-                    single_file_constructor(
-                        file_path=hash_file_path, calculate_checksum=False
-                    )
-                )
-            else:
-                logger.warning(
-                    "File instance of (%s) does not have checksum. "
-                    "Probably the file does not exist. "
-                    "Skip saving...",
-                    minimum_file_path,
+            _save_hash_file(
+                original_file_instance=new_file_item, hash_path=hash_file_path
+            )
+            data_file_list.append(
+                single_file_constructor(
+                    file_path=hash_file_path,
+                    compute_file_hash=False
                 )
+            )
+    if source_folder:
+        for data_file in data_file_list:
+            data_file.path = str(
+                pathlib.Path(data_file.path).relative_to(source_folder))
 
     return data_file_list
 
 
 def _filter_by_field_type(schemas: Iterable[dict], field_type: str) -> list[dict]:
-    return [field for field in schemas if field["field_type"] == field_type]
+    return [field for field in schemas if field.field_type == field_type]
 
 
 def _render_variable_as_type(value: str, variable_map: dict, dtype: str) -> Any:
+    print(value, dtype)
     return convert_to_type(render_variable_value(value, variable_map), dtype)
@@ -399,13 +432,13 @@ def _create_scientific_metadata(
             "type": "string",
         },
         **{
-            field["machine_name"]: {
+            field.machine_name: {
                 "value": _render_variable_as_type(
-                    field["value"], variable_map, field["type"]
+                    field.value, variable_map, field.type
                 ),
-                "unit": field.get("unit", ""),
-                "human_name": field.get("human_name", field["machine_name"]),
-                "type": field["type"],
+                "unit": getattr(field,"unit", ""),
+                "human_name": getattr(field,"human_name", field.machine_name),
+                "type": field.type,
             }
             for field in sm_schemas
         },
@@ -413,15 +446,15 @@ def _create_scientific_metadata(
 
 
 def _validate_metadata_schemas(
-    metadata_schemas: dict[str, dict],
+    metadata_schema: dict[str, dict],
 ) -> None:
-    if any(
-        invalid_types := [
-            field["field_type"]
-            for field in metadata_schemas.values()
-            if field["field_type"] not in VALID_METADATA_TYPES
-        ]
-    ):
+    invalid_types = [
+        field.field_type
+        for field in metadata_schema.values()
+        if field.field_type not in VALID_METADATA_TYPES
+    ]
+
+    if any(invalid_types):
         raise ValueError(
             "Invalid metadata schema types found. Valid types are: ",
             VALID_METADATA_TYPES,
@@ -433,7 +466,7 @@ def create_scicat_dataset_instance(
     *,
     metadata_schema_id: str,  # metadata-schema["id"]
-    metadata_schemas: dict[str, dict],  # metadata-schema["schema"]
+    metadata_schema: dict[str, dict],  # metadata-schema["schema"]
     variable_map: dict,
     data_file_list: list[DataFileListItem],
     config: DatasetOptions,
@@ -456,7 +489,7 @@ def create_scicat_dataset_instance(
         Logger instance.
 
     """
-    _validate_metadata_schemas(metadata_schemas)
+    _validate_metadata_schemas(metadata_schema)
     # Create the dataset instance
     scicat_dataset = ScicatDataset(
         size=sum([file.size for file in data_file_list if file.size is not None]),
@@ -465,23 +498,23 @@ def create_scicat_dataset_instance(
         scientificMetadata=_create_scientific_metadata(
             metadata_schema_id=metadata_schema_id,
             sm_schemas=_filter_by_field_type(
-                metadata_schemas.values(), SCIENTIFIC_METADATA_TYPE
+                metadata_schema.values(), SCIENTIFIC_METADATA_TYPE
            ),  # Scientific metadata schemas
             variable_map=variable_map,
         ),
         **{
-            field["machine_name"]: _render_variable_as_type(
-                field["value"], variable_map, field["type"]
+            field.machine_name: _render_variable_as_type(
+                field.value, variable_map, field.type
            )
             for field in _filter_by_field_type(
-                metadata_schemas.values(), HIGH_LEVEL_METADATA_TYPE
+                metadata_schema.values(), HIGH_LEVEL_METADATA_TYPE
             )  # High level schemas
         },
     )
 
     # Auto generate or assign default values if needed
-    if not config.allow_dataset_pid:
+    if not config.allow_dataset_pid and scicat_dataset.pid:
         logger.info("PID is not allowed in the dataset by configuration.")
         scicat_dataset.pid = None
     elif config.generate_dataset_pid:
@@ -505,11 +538,11 @@ def create_scicat_dataset_instance(
             "Owner group is not provided. Setting to default value. %s",
             scicat_dataset.ownerGroup,
         )
-    if scicat_dataset.accessGroup is None:
-        scicat_dataset.accessGroup = config.default_access_groups
+    if scicat_dataset.accessGroups is None:
+        scicat_dataset.accessGroups = config.default_access_groups
         logger.info(
             "Access group is not provided. Setting to default value. %s",
-            scicat_dataset.accessGroup,
+            scicat_dataset.accessGroups,
         )
 
     logger.info("Dataset instance is created successfully. %s", scicat_dataset)
diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py
index 85cbd8e..cc2a465 100755
--- a/src/scicat_offline_ingestor.py
+++ b/src/scicat_offline_ingestor.py
@@ -72,7 +72,7 @@ def main() -> None:
         variable_map = extract_variables_values(
             metadata_schema.variables,
             h5file,
-            config.scicat
+            config
         )
 
         # Collect data-file descriptions
@@ -80,6 +80,7 @@ def main() -> None:
             nexus_file=nexus_file_path,
             ingestor_directory=ingestor_directory,
             config=fh_options,
+            source_folder=variable_map["source_folder"],
             logger=logger,
             # TODO: add done_writing_message_file and nexus_structure_file
         )
@@ -88,8 +89,8 @@ def main() -> None:
         logger.info("Preparing scicat dataset instance ...")
         local_dataset = scicat_dataset_to_dict(
             create_scicat_dataset_instance(
-                metadata_schema_id=metadata_schema["id"],
-                metadata_schemas=metadata_schema["schemas"],
+                metadata_schema_id=metadata_schema.id,
+                metadata_schema=metadata_schema.schema,
                 variable_map=variable_map,
                 data_file_list=data_file_list,
                 config=config.dataset,

From 7014bc6c2237c75bf800536076ac0f955963490a Mon Sep 17 00:00:00 2001
From: Max Novelli
Date: Wed, 9 Oct 2024 16:44:47 +0200
Subject: [PATCH 4/7] first successful ingestion

---
 src/scicat_communication.py    | 31 +++++++++++++++++++++++--------
 src/scicat_configuration.py    | 20 ++++++++++++++++++--
 src/scicat_dataset.py          | 10 +++++++---
 src/scicat_offline_ingestor.py | 15 +++++++++++----
 4 files changed, 59 insertions(+), 17 deletions(-)

diff --git a/src/scicat_communication.py b/src/scicat_communication.py
index aef829c..01375c5 100644
--- a/src/scicat_communication.py
+++ b/src/scicat_communication.py
@@ -10,12 +10,13 @@
 def retrieve_value_from_scicat(
     *,
     config: SciCatOptions,
-    variable_url: str,  # It should be already rendered from variable_recipe["url"]
+    scicat_endpoint_url: str,  # It should be already rendered from variable_recipe["url"]
     field_name: str,  # variable_recipe["field"]
 ) -> str:
-    url = config.host.removesuffix('/') + "/" + variable_url
     response: dict = requests.get(
-        url, headers={"Authorization": config.token}, timeout=config.timeout
+        scicat_endpoint_url,
+        headers=config.headers,
+        timeout=config.timeout
     ).json()
     return response[field_name] if field_name else response
@@ -44,9 +45,9 @@ def create_scicat_dataset(
     """
     logger.info("Sending POST request to create new dataset")
     response = _post_to_scicat(
-        url=urljoin(config.host, "datasets"),
+        url= config.urls["datasets"],
         posting_obj=dataset,
-        headers={"token": config.token, **config.headers},
+        headers=config.headers,
         timeout=config.timeout,
     )
     result: dict = response.json()
@@ -69,16 +70,19 @@ class ScicatOrigDatablockAPIError(Exception):
 
 
 def create_scicat_origdatablock(
-    *, origdatablock: dict, config: SciCatOptions, logger: logging.Logger
+    *,
+    origdatablock: dict,
+    config: SciCatOptions,
+    logger: logging.Logger
 ) -> dict:
     """
     Execute a POST request to scicat to create a new origdatablock
     """
     logger.info("Sending POST request to create new origdatablock")
     response = _post_to_scicat(
-        url=urljoin(config.host, "origdatablocks"),
+        url=config.urls["origdatablocks"],
         posting_obj=origdatablock,
-        headers={"token": config.token, **config.headers},
+        headers=config.headers,
         timeout=config.timeout,
     )
     result: dict = response.json()
@@ -97,3 +101,14 @@ def create_scicat_origdatablock(
         result['_id'],
     )
     return result
+
+def render_full_url(
+    url: str,
+    config: SciCatOptions,
+) -> str:
+    if not url.startswith("http://") and not url.startswith("https://"):
+        for endpoint in config.urls.keys():
+            if url.startswith(endpoint):
+                url = url.replace(endpoint,config.urls[endpoint])
+                break
+    return url
\ No newline at end of file
diff --git a/src/scicat_configuration.py b/src/scicat_configuration.py
index 16ce518..c4360fb 100644
--- a/src/scicat_configuration.py
+++ b/src/scicat_configuration.py
@@ -8,6 +8,7 @@
 from pathlib import Path
 from types import MappingProxyType
 from typing import Any, TypeVar, get_origin
+from urllib.parse import urljoin
 
 
 def _load_config(config_file: Path) -> dict:
@@ -237,12 +238,23 @@ class SciCatOptions:
     timeout: int = 0
     stream: bool = True
     verify: bool = False
+    urls: dict = field(default_factory=dict)
 
     @classmethod
     def from_configurations(cls, config: dict) -> "SciCatOptions":
         """Create SciCatOptions from a dictionary."""
         options = cls(**config)
-        options.headers = {"Authorization": f"Bearer {options.token}"}
+        options.host = options.host.removesuffix('/') + "/"
+        options.headers = {
+            **options.headers,
+            **{"Authorization": f"Bearer {options.token}"}
+        }
+        options.urls = {
+            "datasets" : urljoin(options.host, "datasets"),
+            "proposals" : urljoin(options.host, "proposals"),
+            "origdatablocks" : urljoin(options.host, "origdatablocks"),
+            "instruments": urljoin(options.host, "instruments"),
+        }
         return options
@@ -335,9 +347,13 @@ def merge_config_and_input_args(
 
 
 def _validate_config_file(target_type: type[T], config_file: Path) -> T:
+    config = {
+        **_load_config(config_file),
+        "config_file": config_file.as_posix()
+    }
     return build_dataclass(
         target_type,
-        {**_load_config(config_file), "config_file": config_file.as_posix()},
+        config,
     )
diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py
index 09efe94..bec2038 100644
--- a/src/scicat_dataset.py
+++ b/src/scicat_dataset.py
@@ -11,7 +11,7 @@
 from typing import Any
 
 import h5py
 
-from scicat_communication import retrieve_value_from_scicat
+from scicat_communication import retrieve_value_from_scicat, render_full_url
 from scicat_configuration import (
     DatasetOptions,
     FileHandlingOptions,
@@ -119,8 +119,12 @@ def extract_variables_values(
         elif source == "SC":
             value = retrieve_value_from_scicat(
                 config=config.scicat,
-                variable_url=render_variable_value(
-                    variable_recipe.url, variable_map
+                scicat_endpoint_url=render_full_url(
+                    render_variable_value(
+                        variable_recipe.url,
+                        variable_map
+                    ),
+                    config.scicat,
                 ),
                 field_name=variable_recipe.field,
             )
diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py
index cc2a465..bf97637 100755
--- a/src/scicat_offline_ingestor.py
+++ b/src/scicat_offline_ingestor.py
@@ -9,7 +9,7 @@
     OfflineIngestorConfig,
     build_arg_parser,
     build_dataclass,
-    merge_config_and_input_args,
+    merge_config_and_input_args, SciCatOptions,
 )
 from scicat_dataset import (
     create_data_file_list,
@@ -38,7 +38,10 @@ def build_offline_config() -> OfflineIngestorConfig:
     # with ``OnlineIngestorConfig``.
     del merged_configuration["kafka"]
 
-    return build_dataclass(OfflineIngestorConfig, merged_configuration)
+    config = build_dataclass(OfflineIngestorConfig, merged_configuration)
+    config.scicat = SciCatOptions.from_configurations(merged_configuration["scicat"])
+
+    return config
 
 
 def main() -> None:
@@ -100,7 +103,9 @@ def main() -> None:
         logger.debug("Scicat dataset: %s", local_dataset)
         # Create dataset in scicat
         scicat_dataset = create_scicat_dataset(
-            dataset=local_dataset, config=config.scicat, logger=logger
+            dataset=local_dataset,
+            config=config.scicat,
+            logger=logger
         )
 
         # Prepare origdatablock
@@ -115,7 +120,9 @@ def main() -> None:
         logger.debug("Scicat origdatablock: %s", local_origdatablock)
         # create origdatablock in scicat
         scicat_origdatablock = create_scicat_origdatablock(
-            origdatablock=local_origdatablock, config=config.scicat, logger=logger
+            origdatablock=local_origdatablock,
+            config=config.scicat,
+            logger=logger
         )
 
         # check one more time if we successfully created the entries in scicat
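
Two notes on patches 3 and 4.

First, patch 3 corrects the checksum guard from "b2blake" to "blake2b", which is the algorithm name hashlib actually registers. The body of _calculate_checksum is not shown in the hunk, so the following chunked blake2b digest is only a sketch under that assumption, not the repository implementation:

    import hashlib
    import pathlib

    def calculate_blake2b(file_path: pathlib.Path, chunk_size: int = 1024 * 1024) -> str:
        # hashlib.new("blake2b") succeeds; hashlib.new("b2blake") raises ValueError.
        digest = hashlib.new("blake2b")
        with open(file_path, "rb") as f:
            # Read in chunks so large NeXus files never sit fully in memory.
            while chunk := f.read(chunk_size):
                digest.update(chunk)
        return digest.hexdigest()

Second, render_full_url from patch 4 leaves fully qualified URLs untouched and rewrites a bare leading endpoint name to the configured full URL. A runnable sketch, with SimpleNamespace standing in for SciCatOptions and an illustrative host:

    from types import SimpleNamespace

    def render_full_url(url: str, config) -> str:
        # Same logic as the patch: only bare endpoint references are rewritten.
        if not url.startswith("http://") and not url.startswith("https://"):
            for endpoint in config.urls.keys():
                if url.startswith(endpoint):
                    url = url.replace(endpoint, config.urls[endpoint])
                    break
        return url

    config = SimpleNamespace(urls={"proposals": "https://scicat.host/proposals"})
    print(render_full_url("proposals/123", config))  # https://scicat.host/proposals/123
    print(render_full_url("https://example.org/x", config))  # unchanged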
From 8d1110c98fa7338f23a78c91fd1017f47aebc25b Mon Sep 17 00:00:00 2001
From: YooSunyoung
Date: Mon, 21 Oct 2024 15:22:35 +0200
Subject: [PATCH 5/7] Fix formatting and type-hints

---
 resources/config.sample.json   |   3 +-
 src/scicat_communication.py    |  20 ++--
 src/scicat_configuration.py    |  13 +--
 src/scicat_dataset.py          | 187 +++++++++++++++++----------------
 src/scicat_metadata.py         |   3 +-
 src/scicat_offline_ingestor.py |  20 ++--
 src/scicat_online_ingestor.py  |   3 +-
 tests/test_scicat_dataset.py   |   4 +-
 8 files changed, 125 insertions(+), 128 deletions(-)

diff --git a/resources/config.sample.json b/resources/config.sample.json
index e49c33c..d77e935 100644
--- a/resources/config.sample.json
+++ b/resources/config.sample.json
@@ -60,6 +60,7 @@
     "headers": {},
     "timeout": 0,
     "stream": true,
-    "verify": false
+    "verify": false,
+    "urls": {}
   }
 }
diff --git a/src/scicat_communication.py b/src/scicat_communication.py
index 01375c5..9199cd8 100644
--- a/src/scicat_communication.py
+++ b/src/scicat_communication.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject)
 import logging
-from urllib.parse import urljoin
 
 import requests
 from scicat_configuration import SciCatOptions
@@ -10,13 +9,12 @@
 def retrieve_value_from_scicat(
     *,
     config: SciCatOptions,
-    scicat_endpoint_url: str,  # It should be already rendered from variable_recipe["url"]
+    scicat_endpoint_url: str,  # It should be already rendered
+    # from variable_recipe["url"]
     field_name: str,  # variable_recipe["field"]
 ) -> str:
     response: dict = requests.get(
-        scicat_endpoint_url,
-        headers=config.headers,
-        timeout=config.timeout
+        scicat_endpoint_url, headers=config.headers, timeout=config.timeout
     ).json()
     return response[field_name] if field_name else response
@@ -45,7 +43,7 @@ def create_scicat_dataset(
     """
     logger.info("Sending POST request to create new dataset")
     response = _post_to_scicat(
-        url= config.urls["datasets"],
+        url=config.urls["datasets"],
         posting_obj=dataset,
         headers=config.headers,
         timeout=config.timeout,
     )
@@ -70,10 +68,7 @@ class ScicatOrigDatablockAPIError(Exception):
 
 
 def create_scicat_origdatablock(
-    *,
-    origdatablock: dict,
-    config: SciCatOptions,
-    logger: logging.Logger
+    *, origdatablock: dict, config: SciCatOptions, logger: logging.Logger
 ) -> dict:
     """
     Execute a POST request to scicat to create a new origdatablock
@@ -102,6 +97,7 @@ def create_scicat_origdatablock(
     )
     return result
 
+
 def render_full_url(
     url: str,
     config: SciCatOptions,
@@ -109,6 +105,6 @@ def render_full_url(
     if not url.startswith("http://") and not url.startswith("https://"):
         for endpoint in config.urls.keys():
             if url.startswith(endpoint):
-                url = url.replace(endpoint,config.urls[endpoint])
+                url = url.replace(endpoint, config.urls[endpoint])
                 break
-    return url
\ No newline at end of file
+    return url
diff --git a/src/scicat_configuration.py b/src/scicat_configuration.py
index c4360fb..df131cf 100644
--- a/src/scicat_configuration.py
+++ b/src/scicat_configuration.py
@@ -247,12 +247,12 @@ def from_configurations(cls, config: dict) -> "SciCatOptions":
         options.host = options.host.removesuffix('/') + "/"
         options.headers = {
             **options.headers,
-            **{"Authorization": f"Bearer {options.token}"}
+            **{"Authorization": f"Bearer {options.token}"},
         }
         options.urls = {
-            "datasets" : urljoin(options.host, "datasets"),
-            "proposals" : urljoin(options.host, "proposals"),
-            "origdatablocks" : urljoin(options.host, "origdatablocks"),
+            "datasets": urljoin(options.host, "datasets"),
+            "proposals": urljoin(options.host, "proposals"),
+            "origdatablocks": urljoin(options.host, "origdatablocks"),
             "instruments": urljoin(options.host, "instruments"),
         }
         return options
@@ -347,10 +347,7 @@ def merge_config_and_input_args(
 
 
 def _validate_config_file(target_type: type[T], config_file: Path) -> T:
-    config = {
-        **_load_config(config_file),
-        "config_file": config_file.as_posix()
-    }
+    config = {**_load_config(config_file), "config_file": config_file.as_posix()}
     return build_dataclass(
         target_type,
         config,
     )
diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py
index bec2038..df81824 100644
--- a/src/scicat_dataset.py
+++ b/src/scicat_dataset.py
@@ -1,9 +1,12 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject)
+import ast
+import copy
 import datetime
 import logging
 import os.path
 import pathlib
+import re
 import uuid
 from collections.abc import Callable, Iterable
 from dataclasses import asdict, dataclass, field
@@ -11,27 +14,33 @@
 from typing import Any
 
 import h5py
 
-from scicat_communication import retrieve_value_from_scicat, render_full_url
+from scicat_communication import render_full_url, retrieve_value_from_scicat
 from scicat_configuration import (
     DatasetOptions,
     FileHandlingOptions,
-    SciCatOptions, OfflineIngestorConfig,
+    OfflineIngestorConfig,
 )
 from scicat_metadata import (
     HIGH_LEVEL_METADATA_TYPE,
     SCIENTIFIC_METADATA_TYPE,
     VALID_METADATA_TYPES,
+    MetadataItem,
+    MetadataSchemaVariable,
+    NexusFileMetadataVariable,
+    ScicatMetadataVariable,
+    ValueMetadataVariable,
     render_variable_value,
 )
-import re
-import copy
 
 def to_string(value: Any) -> str:
     return str(value)
 
 
 def to_string_array(value: list[Any]) -> list[str]:
-    return [str(v) for v in (eval(value) if isinstance(value, str) else value)]
+    return [
+        str(v) for v in (ast.literal_eval(value) if isinstance(value, str) else value)
+    ]
 
 
 def to_integer(value: Any) -> int:
@@ -49,9 +58,11 @@ def to_date(value: Any) -> str | None:
         return datetime.datetime.fromtimestamp(value, tz=datetime.UTC).isoformat()
     return None
 
+
 def to_dict(value: Any) -> dict:
     return dict(value)
 
+
 _DtypeConvertingMap = MappingProxyType(
     {
         "string": to_string,
@@ -60,7 +71,7 @@ def to_dict(value: Any) -> dict:
         "float": to_float,
         "date": to_date,
         "dict": to_dict,
-        "email": to_string
+        "email": to_string,
         # TODO: Add email converter
     }
 )
@@ -79,10 +90,12 @@ def convert_to_type(input_value: Any, dtype_desc: str) -> Any:
 _OPERATOR_REGISTRY = MappingProxyType(
     {
         "DO_NOTHING": lambda value: value,
-        "join_with_space": lambda value: ", ".join(eval(value) if isinstance(value,str) else value),
-        "evaluate": lambda value: eval(value),
+        "join_with_space": lambda value: ", ".join(
+            ast.literal_eval(value) if isinstance(value, str) else value
+        ),
+        "evaluate": lambda value: ast.literal_eval(value),
         "filename": lambda value: os.path.basename(value),
-        "dirname-2": lambda value: os.path.dirname(os.path.dirname(value))
+        "dirname-2": lambda value: os.path.dirname(os.path.dirname(value)),
     }
 )
@@ -91,72 +104,83 @@ def _get_operator(operator: str | None) -> Callable:
     return _OPERATOR_REGISTRY.get(operator or "DO_NOTHING", lambda _: _)
 
 
+def _retrieve_as_string(
+    h5file: h5py.File, path: str, *, encoding: str = "utf-8"
+) -> str:
+    return h5file[path][...].item().decode(encoding)
+
+
+def _retrieve_values_from_file(
+    variable_recipe: NexusFileMetadataVariable, h5file: h5py.File
+) -> Any:
+    if "*" in variable_recipe.path:  # Selectors are used
+        path = variable_recipe.path.split("/")[1:]
+        path[0] += "/"
+        paths = extract_paths_from_h5_file(h5file, path)
+        value = [_retrieve_as_string(h5file, p) for p in paths]
+    else:
+        value = _retrieve_as_string(h5file, variable_recipe.path)
+    return value
+
+
 def extract_variables_values(
-    variables: dict[str, dict],
+    variables: dict[str, MetadataSchemaVariable],
     h5file: h5py.File,
-    config: OfflineIngestorConfig
+    config: OfflineIngestorConfig,
 ) -> dict:
     variable_map = {
-        "filepath" : pathlib.Path(config.nexus_file),
-        "now" : datetime.datetime.now().isoformat(),
+        "filepath": pathlib.Path(config.nexus_file),
+        "now": datetime.datetime.now(tz=datetime.UTC).isoformat(),
     }
     for variable_name, variable_recipe in variables.items():
-        print(variable_name)
         source = variable_recipe.source
-        if source == "NXS":
-            path = variable_recipe.path
-            if "*" in path:
-                provided_path = path.split("/")[1:]
-                provided_path[0] = "/" + provided_path[0]
-                expanded_paths = extract_paths_from_h5_file(h5file,provided_path)
-                value = [
-                    h5file[p][...].item().decode("utf-8")
-                    for p
-                    in expanded_paths
-                ]
-            else:
-                value = h5file[path][...].item().decode("utf-8")
-        elif source == "SC":
+        if isinstance(variable_recipe, NexusFileMetadataVariable):
+            value = _retrieve_values_from_file(variable_recipe, h5file)
+        elif isinstance(variable_recipe, ScicatMetadataVariable):
             value = retrieve_value_from_scicat(
                 config=config.scicat,
                 scicat_endpoint_url=render_full_url(
-                    render_variable_value(
-                        variable_recipe.url,
-                        variable_map
-                    ),
+                    render_variable_value(variable_recipe.url, variable_map),
                     config.scicat,
                 ),
                 field_name=variable_recipe.field,
             )
-        elif source == "VALUE":
+        elif isinstance(variable_recipe, ValueMetadataVariable):
             value = variable_recipe.value
-            value = render_variable_value(value, variable_map) if isinstance(value,str) else value
+            value = (
+                render_variable_value(value, variable_map)
+                if isinstance(value, str)
+                else value
+            )
             value = _get_operator(variable_recipe.operator)(value)
         else:
             raise Exception("Invalid variable source: ", source)
-        variable_map[variable_name] = convert_to_type(
-            value, variable_recipe.value_type
-        )
+        variable_map[variable_name] = convert_to_type(value, variable_recipe.value_type)
+
     return variable_map
 
+
 def extract_paths_from_h5_file(
-    h5_object: Any,
-    path: list[str],
+    _h5_object: h5py.Group | h5py.File,
+    _path: list[str],
 ) -> list[str]:
-    master_key = path.pop(0)
+    master_key = _path.pop(0)
     output_paths = [master_key]
     if "*" in master_key:
-        temp_keys = [k2 for k2 in list(h5_object.keys()) if re.search(master_key, k2)]
-        output_paths = []
+        temp_keys = [k2 for k2 in _h5_object.keys() if re.search(master_key, k2)]
        for key in temp_keys:
             output_paths += [
                 key + "/" + subkey
-                for subkey
-                in extract_paths_from_h5_file(h5_object[key], copy.deepcopy(path))
+                for subkey in extract_paths_from_h5_file(
+                    _h5_object[key], copy.deepcopy(_path)
+                )
             ]
     else:
-        if path:
-            output_paths = [master_key + "/" + subkey for subkey in extract_paths_from_h5_file(h5_object[master_key],path)]
+        if _path:
+            output_paths = [
+                master_key + "/" + subkey
+                for subkey in extract_paths_from_h5_file(_h5_object[master_key], _path)
+            ]
 
     return output_paths
@@ -176,7 +200,7 @@ class ScicatDataset:
     numberOfFiles: int
     isPublished: bool = False
     datasetName: str
-    description: str = field(default=None)
+    description: str | None = None
     principalInvestigator: str
     creationLocation: str
     scientificMetadata: dict
@@ -186,7 +210,7 @@ class ScicatDataset:
     contactEmail: str
     creationTime: str
     type: str = "raw"
-    sampleId: str = field(default=None)
+    sampleId: str | None = None
     techniques: list[TechniqueDesc] = field(default_factory=list)
     instrumentId: str | None = None
     proposalId: str | None = None
@@ -248,53 +272,33 @@ def _create_single_data_file_list_item(
 ) -> DataFileListItem:
     """``DataFileListItem`` constructing helper."""
 
-    file_info = {
-        "path" : file_path.absolute().as_posix(),
-        "time" : datetime.datetime.now(tz=datetime.UTC).strftime(
+    file_info: dict[str, Any] = {
+        "path": file_path.absolute().as_posix(),
+        "time": datetime.datetime.now(tz=datetime.UTC).strftime(
             "%Y-%m-%dT%H:%M:%S.000Z"
         ),
     }
     if file_path.exists():
         if compute_file_stats:
             file_stats = file_path.stat()
+            timestamp_str = datetime.datetime.fromtimestamp(
+                file_stats.st_ctime, tz=datetime.UTC
+            ).strftime("%Y-%m-%dT%H:%M:%S.000Z")
             file_info = {
                 **file_info,
                 **{
-                    "size" : file_stats.st_size,
-                    "time" : datetime.datetime.fromtimestamp(
-                        file_stats.st_ctime, tz=datetime.UTC
-                    ).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
-                    "uid" : str(file_stats.st_uid),
-                    "gid" : str(file_stats.st_gid),
-                    "perm" : oct(file_stats.st_mode),
-                }
+                    "size": file_stats.st_size,
+                    "time": timestamp_str,
+                    "uid": str(file_stats.st_uid),
+                    "gid": str(file_stats.st_gid),
+                    "perm": oct(file_stats.st_mode),
+                },
             }
 
         if compute_file_hash:
             file_info["chk"] = _calculate_checksum(file_path, file_hash_algorithm)
 
     return DataFileListItem(**file_info)
-    # if file_path.exists() and compute_file_stats:
-    #     return DataFileListItem(
-    #         path=file_path.absolute().as_posix(),
-    #         size=(file_stats := file_path.stat()).st_size,
-    #         time=datetime.datetime.fromtimestamp(
-    #             file_stats.st_ctime, tz=datetime.UTC
-    #         ).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
-    #         chk=_calculate_checksum(file_path, file_hash_algorithm)
-    #         if compute_file_hash
-    #         else None,
-    #         uid=str(file_stats.st_uid),
-    #         gid=str(file_stats.st_gid),
-    #         perm=oct(file_stats.st_mode),
-    #     )
-    # else:
-    #     return DataFileListItem(
-    #         path=file_path.absolute().as_posix(),
-    #         time=datetime.datetime.now(tz=datetime.UTC).strftime(
-    #             "%Y-%m-%dT%H:%M:%S.000Z"
-    #         ),
-    #     )
 
 
 def _build_hash_path(
@@ -355,7 +359,7 @@ def create_data_file_list(
         _create_single_data_file_list_item,
         file_hash_algorithm=config.file_hash_algorithm,
         compute_file_stats=config.compute_file_stats,
-        compute_file_hash=config.compute_file_hash
+        compute_file_hash=config.compute_file_hash,
     )
@@ -388,31 +392,32 @@ def create_data_file_list(
             )
             data_file_list.append(
                 single_file_constructor(
-                    file_path=hash_file_path,
-                    compute_file_hash=False
+                    file_path=hash_file_path, compute_file_hash=False
                 )
             )
     if source_folder:
         for data_file in data_file_list:
             data_file.path = str(
-                pathlib.Path(data_file.path).relative_to(source_folder))
+                pathlib.Path(data_file.path).relative_to(source_folder)
+            )
 
     return data_file_list
 
 
-def _filter_by_field_type(schemas: Iterable[dict], field_type: str) -> list[dict]:
+def _filter_by_field_type(
+    schemas: Iterable[MetadataItem], field_type: str
+) -> list[MetadataItem]:
     return [field for field in schemas if field.field_type == field_type]
 
 
 def _render_variable_as_type(value: str, variable_map: dict, dtype: str) -> Any:
-    print(value, dtype)
     return convert_to_type(render_variable_value(value, variable_map), dtype)
 
 
 def _create_scientific_metadata(
     *,
     metadata_schema_id: str,
-    sm_schemas: list[dict],
+    sm_schemas: list[MetadataItem],
     variable_map: dict,
 ) -> dict:
     """Create scientific metadata from the metadata schema configuration.
@@ -440,8 +445,8 @@ def _create_scientific_metadata(
                 "value": _render_variable_as_type(
                     field.value, variable_map, field.type
                 ),
-                "unit": getattr(field,"unit", ""),
-                "human_name": getattr(field,"human_name", field.machine_name),
+                "unit": getattr(field, "unit", ""),
+                "human_name": getattr(field, "human_name", field.machine_name),
                 "type": field.type,
             }
             for field in sm_schemas
@@ -455,7 +460,7 @@ def _create_scientific_metadata(
 def _validate_metadata_schemas(
-    metadata_schema: dict[str, dict],
+    metadata_schema: dict[str, MetadataItem],
 ) -> None:
     invalid_types = [
         field.field_type
         for field in metadata_schema.values()
         if field.field_type not in VALID_METADATA_TYPES
@@ -475,7 +480,7 @@ def create_scicat_dataset_instance(
     *,
     metadata_schema_id: str,  # metadata-schema["id"]
-    metadata_schema: dict[str, dict],  # metadata-schema["schema"]
+    metadata_schema: dict[str, MetadataItem],  # metadata-schema["schema"]
     variable_map: dict,
     data_file_list: list[DataFileListItem],
     config: DatasetOptions,
diff --git a/src/scicat_metadata.py b/src/scicat_metadata.py
index 2a01898..9006249 100644
--- a/src/scicat_metadata.py
+++ b/src/scicat_metadata.py
@@ -168,8 +168,7 @@ def collect_schemas(dir_path: pathlib.Path) -> OrderedDict[str, MetadataSchema]:
 def select_applicable_schema(
     nexus_file: pathlib.Path,
-    h5_file: any,
-    schemas: OrderedDict[str, MetadataSchema]
+    schemas: OrderedDict[str, MetadataSchema],
 ) -> MetadataSchema:
     """
     Evaluates which metadata schema configuration is applicable to ``nexus_file``.
diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py
index bf97637..d78e354 100755
--- a/src/scicat_offline_ingestor.py
+++ b/src/scicat_offline_ingestor.py
@@ -7,9 +7,10 @@
 from scicat_communication import create_scicat_dataset, create_scicat_origdatablock
 from scicat_configuration import (
     OfflineIngestorConfig,
+    SciCatOptions,
     build_arg_parser,
     build_dataclass,
-    merge_config_and_input_args, SciCatOptions,
+    merge_config_and_input_args,
 )
 from scicat_dataset import (
     create_data_file_list,
@@ -69,13 +70,11 @@ def main() -> None:
     # open nexus file with h5py
     with h5py.File(nexus_file_path) as h5file:
         # load instrument metadata configuration
-        metadata_schema = select_applicable_schema(nexus_file_path, h5file, schemas)
+        metadata_schema = select_applicable_schema(nexus_file_path, schemas)
 
         # define variables values
         variable_map = extract_variables_values(
-            metadata_schema.variables,
-            h5file,
-            config
+            metadata_schema.variables, h5file, config
         )
 
         # Collect data-file descriptions
@@ -103,9 +102,7 @@ def main() -> None:
         logger.debug("Scicat dataset: %s", local_dataset)
         # Create dataset in scicat
         scicat_dataset = create_scicat_dataset(
-            dataset=local_dataset,
-            config=config.scicat,
-            logger=logger
+            dataset=local_dataset, config=config.scicat, logger=logger
         )
 
         # Prepare origdatablock
@@ -120,9 +117,7 @@ def main() -> None:
         logger.debug("Scicat origdatablock: %s", local_origdatablock)
         # create origdatablock in scicat
         scicat_origdatablock = create_scicat_origdatablock(
-            origdatablock=local_origdatablock,
-            config=config.scicat,
-            logger=logger
+            origdatablock=local_origdatablock, config=config.scicat, logger=logger
         )
 
         # check one more time if we successfully created the entries in scicat
@@ -135,5 +130,6 @@ def main() -> None:
         )
         raise RuntimeError("Failed to create dataset or origdatablock.")
 
+
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/src/scicat_online_ingestor.py b/src/scicat_online_ingestor.py
index 5ebb9a0..17893d4 100644
--- a/src/scicat_online_ingestor.py
+++ b/src/scicat_online_ingestor.py
@@ -165,5 +165,6 @@ def main() -> None:
     if config.kafka.individual_message_commit:
         _individual_message_commit(offline_ingestors, consumer, logger)
 
+
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/tests/test_scicat_dataset.py b/tests/test_scicat_dataset.py
index e53b492..c68fbe5 100644
--- a/tests/test_scicat_dataset.py
+++ b/tests/test_scicat_dataset.py
@@ -11,7 +11,9 @@ def test_dtype_string_converter() -> None:
 
 
 def test_dtype_string_array_converter() -> None:
-    assert convert_to_type("test", "string[]") == ["t", "e", "s", "t"]
+    assert convert_to_type("'test'", "string[]") == ["t", "e", "s", "t"]
+    assert convert_to_type("['test']", "string[]") == ["test"]
+    assert convert_to_type("['test', 'test2']", "string[]") == ["test", "test2"]
     assert convert_to_type([1, 2, 3], "string[]") == ["1", "2", "3"]
     assert convert_to_type([1.1, 2.2, 3.3], "string[]") == ["1.1", "2.2", "3.3"]
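
A note on the eval to ast.literal_eval change in patch 5: literal_eval only accepts Python literals (strings, numbers, tuples, lists, dicts, sets, booleans, None), so a string-encoded list from a metadata schema is parsed without the arbitrary-code-execution risk that eval carries. A short sketch, with values mirroring the updated tests:

    import ast

    def to_string_array(value):
        # A string such as "['test', 'test2']" is parsed as a literal first.
        return [str(v) for v in (ast.literal_eval(value) if isinstance(value, str) else value)]

    print(to_string_array("['test', 'test2']"))  # ['test', 'test2']
    print(to_string_array("'test'"))             # ['t', 'e', 's', 't'], a str iterates per character
    print(to_string_array([1, 2, 3]))            # ['1', '2', '3']
    # ast.literal_eval("__import__('os')") raises ValueError instead of executing.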
From 2cb5137ea7623575bc31cd716a6e137b3fd3c31b Mon Sep 17 00:00:00 2001
From: YooSunyoung
Date: Mon, 21 Oct 2024 15:32:05 +0200
Subject: [PATCH 6/7] Hide api url from argument and expose api-endpoints
 instead.

---
 resources/config.sample.json |  7 ++++++-
 src/scicat_configuration.py  | 33 ++++++++++++++++++++++++++-------
 2 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/resources/config.sample.json b/resources/config.sample.json
index d77e935..dd93ea6 100644
--- a/resources/config.sample.json
+++ b/resources/config.sample.json
@@ -61,6 +61,11 @@
     "timeout": 0,
     "stream": true,
     "verify": false,
-    "urls": {}
+    "api_endpoints": {
+      "datasets": "datasets",
+      "proposals": "proposals",
+      "origdatablocks": "origdatablocks",
+      "instruments": "instruments"
+    }
   }
 }
diff --git a/src/scicat_configuration.py b/src/scicat_configuration.py
index df131cf..2920823 100644
--- a/src/scicat_configuration.py
+++ b/src/scicat_configuration.py
@@ -230,6 +230,22 @@ class DatasetOptions:
     default_access_groups: list[str] = field(default_factory=default_access_groups)
 
 
+@dataclass(kw_only=True)
+class _ScicatAPIURLs:
+    datasets: str
+    proposals: str
+    origdatablocks: str
+    instruments: str
+
+
+@dataclass(kw_only=True)
+class ScicatEndpoints:
+    datasets: str = "datasets"
+    proposals: str = "proposals"
+    origdatablocks: str = "origdatablocks"
+    instruments: str = "instruments"
+
+
 @dataclass(kw_only=True)
 class SciCatOptions:
     host: str = "https://scicat.host"
@@ -238,7 +254,16 @@ class SciCatOptions:
     timeout: int = 0
     stream: bool = True
     verify: bool = False
-    urls: dict = field(default_factory=dict)
+    api_endpoints: ScicatEndpoints = field(default_factory=ScicatEndpoints)
+
+    @property
+    def urls(self) -> _ScicatAPIURLs:
+        return _ScicatAPIURLs(
+            datasets=urljoin(self.host, self.api_endpoints.datasets),
+            proposals=urljoin(self.host, self.api_endpoints.proposals),
+            origdatablocks=urljoin(self.host, self.api_endpoints.origdatablocks),
+            instruments=urljoin(self.host, self.api_endpoints.instruments),
+        )
 
     @classmethod
     def from_configurations(cls, config: dict) -> "SciCatOptions":
@@ -249,12 +274,6 @@ def from_configurations(cls, config: dict) -> "SciCatOptions":
         **options.headers,
         **{"Authorization": f"Bearer {options.token}"},
     }
-    options.urls = {
-        "datasets": urljoin(options.host, "datasets"),
-        "proposals": urljoin(options.host, "proposals"),
-        "origdatablocks": urljoin(options.host, "origdatablocks"),
-        "instruments": urljoin(options.host, "instruments"),
-    }
     return options
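
Patch 6 replaces the precomputed urls dict with a ScicatEndpoints dataclass plus a urls property, so the sample config's "api_endpoints" section feeds straight into dataclass fields. A condensed, runnable sketch of that flow; the host value is illustrative, the real property returns a _ScicatAPIURLs dataclass rather than the dict used here to keep the sketch short:

    from dataclasses import dataclass, field
    from urllib.parse import urljoin

    @dataclass(kw_only=True)
    class ScicatEndpoints:
        datasets: str = "datasets"
        proposals: str = "proposals"
        origdatablocks: str = "origdatablocks"
        instruments: str = "instruments"

    @dataclass(kw_only=True)
    class SciCatOptions:
        host: str = "https://scicat.host/"
        api_endpoints: ScicatEndpoints = field(default_factory=ScicatEndpoints)

        @property
        def urls(self) -> dict:
            # Each endpoint name is resolved against the host lazily,
            # so overriding api_endpoints in the config just works.
            return {
                name: urljoin(self.host, getattr(self.api_endpoints, name))
                for name in ("datasets", "proposals", "origdatablocks", "instruments")
            }

    options = SciCatOptions(api_endpoints=ScicatEndpoints(datasets="Datasets"))
    print(options.urls["datasets"])  # https://scicat.host/Datasets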
From 5cc22a7ba80abebee946621dd0be1a0a7b233cb1 Mon Sep 17 00:00:00 2001
From: YooSunyoung
Date: Thu, 31 Oct 2024 13:15:50 +0100
Subject: [PATCH 7/7] Use attributes instead of keys.

---
 resources/config.sample.json | 2 +-
 src/scicat_communication.py  | 8 ++++----
 src/scicat_configuration.py  | 10 ++++++----
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/resources/config.sample.json b/resources/config.sample.json
index f7204f8..b6718ac 100644
--- a/resources/config.sample.json
+++ b/resources/config.sample.json
@@ -59,7 +59,7 @@
   "scicat": {
     "host": "https://scicat.host",
     "token": "JWT_TOKEN",
-    "headers": {},
+    "additional_headers": {},
     "timeout": 0,
     "stream": true,
     "verify": false,
diff --git a/src/scicat_communication.py b/src/scicat_communication.py
index c169c15..1257564 100644
--- a/src/scicat_communication.py
+++ b/src/scicat_communication.py
@@ -52,7 +52,7 @@ def create_scicat_dataset(
     """
     logger.info("Sending POST request to create new dataset")
     response = _post_to_scicat(
-        url=config.urls["datasets"],
+        url=config.urls.datasets,
         posting_obj=dataset,
         headers=config.additional_headers,
         timeout=config.timeout,
@@ -84,7 +84,7 @@ def create_scicat_origdatablock(
     """
     logger.info("Sending POST request to create new origdatablock")
     response = _post_to_scicat(
-        url=config.urls["origdatablocks"],
+        url=config.urls.origdatablocks,
         posting_obj=origdatablock,
         headers=config.additional_headers,
         timeout=config.timeout,
@@ -123,7 +123,7 @@ def check_dataset_by_pid(
     pid: str, config: SciCatOptions, logger: logging.Logger
 ) -> bool:
     response = _get_from_scicat(
-        url=urljoin(config.host, f"datasets/{quote(pid)}"),
+        url=urljoin(config.host_address, f"datasets/{quote(pid)}"),
         headers=config.additional_headers,
         timeout=config.timeout,
         stream=config.stream,
@@ -157,7 +157,7 @@ def check_dataset_by_metadata(
 ) -> bool:
     metadata_dict = {f"scientificMetadata.{metadata_key}.value": metadata_value}
     filter_string = '?filter={"where":' + json.dumps(metadata_dict) + "}"
-    url = urljoin(config.host, "datasets") + filter_string
+    url = urljoin(config.host_address, "datasets") + filter_string
     logger.info("Checking if dataset exists by metadata with url: %s", url)
     response = _get_from_scicat(
         url=url,
diff --git a/src/scicat_configuration.py b/src/scicat_configuration.py
index 895b770..12de863 100644
--- a/src/scicat_configuration.py
+++ b/src/scicat_configuration.py
@@ -261,10 +261,12 @@ class SciCatOptions:
     @property
     def urls(self) -> _ScicatAPIURLs:
         return _ScicatAPIURLs(
-            datasets=urljoin(self.host, self.api_endpoints.datasets),
-            proposals=urljoin(self.host, self.api_endpoints.proposals),
-            origdatablocks=urljoin(self.host, self.api_endpoints.origdatablocks),
-            instruments=urljoin(self.host, self.api_endpoints.instruments),
+            datasets=urljoin(self.host_address, self.api_endpoints.datasets),
+            proposals=urljoin(self.host_address, self.api_endpoints.proposals),
+            origdatablocks=urljoin(
+                self.host_address, self.api_endpoints.origdatablocks
+            ),
+            instruments=urljoin(self.host_address, self.api_endpoints.instruments),
        )
 
     @property
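
The series is cut off above inside scicat_configuration.py, at a second @property whose body is missing. The host_address that the urls property now builds on is not defined anywhere in these patches; judging from the removesuffix('/') + "/" normalisation in patch 4 it plausibly re-adds the trailing slash that urljoin needs, but that is an assumption. The slash matters because urljoin drops the last path segment of a slash-less base:

    from urllib.parse import urljoin

    print(urljoin("https://scicat.host/api/v3", "datasets"))   # https://scicat.host/api/datasets
    print(urljoin("https://scicat.host/api/v3/", "datasets"))  # https://scicat.host/api/v3/datasets

    # Assumed shape of the truncated property (hypothetical, based on patch 4):
    def host_address(host: str) -> str:
        return host.removesuffix("/") + "/"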