Skip to content

Commit

Permalink
Merge pull request #86 from SciCatProject/fix_from_testing_20241031
Browse files Browse the repository at this point in the history
Fix from testing 20241031
  • Loading branch information
YooSunYoung authored Nov 1, 2024
2 parents 344362e + 5e04499 commit b5a820b
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 16 deletions.
2 changes: 1 addition & 1 deletion resources/config.sample.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
"ingestor_files_directory": "../ingestor",
"message_to_file": true,
"message_file_extension": "message.json",
"use_full_file_path": false
"file_path_type": "relative"
}
},
"kafka": {
Expand Down
1 change: 0 additions & 1 deletion src/scicat_communication.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,6 @@ def render_full_url(url: str, config: SciCatOptions) -> str:
for endpoint in urls.keys():
if url.startswith(endpoint):
return url.replace(endpoint, urls[endpoint])

return url


Expand Down
2 changes: 1 addition & 1 deletion src/scicat_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ class FileHandlingOptions:
ingestor_files_directory: str = "../ingestor"
message_to_file: bool = True
message_file_extension: str = "message.json"
use_full_file_path: bool = False
file_path_type: str = "relative" # allowed values: absolute and relative


@dataclass(kw_only=True)
Expand Down
44 changes: 31 additions & 13 deletions src/scicat_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,8 @@ def extract_paths_from_h5_file(
master_key + "/" + subkey
for subkey in extract_paths_from_h5_file(_h5_object[master_key], _path)
]
else:
output_paths = [master_key]

return output_paths

Expand Down Expand Up @@ -424,7 +426,7 @@ def create_data_file_list(
file_path=hash_file_path, compute_file_hash=False
)
)
if source_folder:
if source_folder and config.file_path_type == "relative":
for data_file in data_file_list:
data_file.path = str(
pathlib.Path(data_file.path).relative_to(source_folder)
Expand Down Expand Up @@ -603,18 +605,29 @@ def scicat_dataset_to_dict(dataset: ScicatDataset) -> dict:
return {k: v for k, v in asdict(dataset).items() if v is not None}


def _define_dataset_source_folder(
    datafilelist: list[DataFileListItem],
    data_file_path: pathlib.Path,
    source_folder_config: str = "common_path",
) -> pathlib.Path | None:
    """
    Return the dataset source folder selected by ``source_folder_config``.

    Parameters
    ----------
    datafilelist:
        Data files associated with the dataset; only the ``path`` attribute
        of each item is read.
    data_file_path:
        Path of the data file; used only in ``"data_file"`` mode.
    source_folder_config:
        ``"data_file"`` returns the directory that contains
        ``data_file_path``. ``"common_path"`` (default) returns the longest
        common path shared by every item in ``datafilelist``. Any other
        value yields ``None``.

    Returns
    -------
    pathlib.Path | None
        The source folder, or ``None`` when the mode is not recognised or
        when ``"common_path"`` is requested with an empty ``datafilelist``.

    Raises
    ------
    ValueError
        Propagated from ``os.path.commonpath`` in ``"common_path"`` mode,
        e.g. when absolute and relative paths are mixed.
    """
    import os

    if source_folder_config == "data_file":
        return pathlib.Path(os.path.dirname(data_file_path))
    if source_folder_config == "common_path":
        if not datafilelist:
            # os.path.commonpath raises ValueError on an empty sequence;
            # with no files there is no folder to report, so use the same
            # "no source folder" result as an unrecognised mode.
            return None
        return pathlib.Path(os.path.commonpath([item.path for item in datafilelist]))
    return None


def _path_to_relative(
datafilelist_item: DataFileListItem, dataset_source_folder: pathlib.Path
datafilelist_item: DataFileListItem,
dataset_source_folder: pathlib.Path,
file_path_type: str = "relative",
) -> DataFileListItem:
"""
Copy the datafiles item and transform the path to the relative path
Expand All @@ -623,32 +636,37 @@ def _path_to_relative(
from copy import copy

origdatablock_datafilelist_item = copy(datafilelist_item)
origdatablock_datafilelist_item.path = (
pathlib.Path(datafilelist_item.path)
.relative_to(dataset_source_folder)
.as_posix()
)
if file_path_type == "relative":
origdatablock_datafilelist_item.path = (
pathlib.Path(datafilelist_item.path)
.relative_to(dataset_source_folder)
.as_posix()
)
return origdatablock_datafilelist_item


def _prepare_origdatablock_datafilelist(
    datafiles_list: list[DataFileListItem],
    dataset_source_folder: pathlib.Path,
    file_path_type: str = "relative",
) -> list[DataFileListItem]:
    """
    Prepare the datafiles list for the origdatablock entry in scicat.

    Every item is handed to ``_path_to_relative`` together with
    ``dataset_source_folder`` and ``file_path_type``, and the results are
    returned as a new list in the same order. With the default
    ``file_path_type="relative"`` the file paths need to be relative to
    the dataset source folder; the path handling for any other value is
    decided by ``_path_to_relative``.
    """
    return [
        _path_to_relative(item, dataset_source_folder, file_path_type)
        for item in datafiles_list
    ]


def create_origdatablock_instance(
data_file_list: list[DataFileListItem],
scicat_dataset: dict,
config: FileHandlingOptions,
) -> OrigDataBlockInstance:
dataset_source_folder = _define_dataset_source_folder(data_file_list)
origdatablock_datafiles_list = _prepare_origdatablock_datafilelist(
data_file_list, dataset_source_folder
data_file_list, scicat_dataset["sourceFolder"], config.file_path_type
)
return OrigDataBlockInstance(
datasetId=scicat_dataset["pid"],
Expand Down

0 comments on commit b5a820b

Please sign in to comment.