From 6ec691eb920b48da30139808ee0b445142508966 Mon Sep 17 00:00:00 2001 From: Peter Kraus Date: Sun, 31 Mar 2024 13:53:43 +0200 Subject: [PATCH] Technical Debt: Round 2 (#138) * custom extractors docs etc. * Agilent * Make the linter happy * More docs changes, and drycal. * Ezchrom & Tomato * panalytica. * phi * Phi.spe and Quadstar.sac * fusion * Nuke most of parsers * Linter * The big rename * Move stuff around * Purge parsers * Fix docs * Update eclab docs --- docs/apidoc_t/package.rst_t | 62 +++--- docs/source/conf.py | 3 +- docs/source/extractors.rst | 11 +- src/yadg/core.py | 82 +------ src/yadg/dgutils/__init__.py | 17 +- src/yadg/dgutils/dateutils.py | 52 ++++- src/yadg/dgutils/dsutils.py | 83 +++++++ src/yadg/dgutils/pintutils.py | 2 +- src/yadg/dgutils/{utils.py => schemautils.py} | 0 src/yadg/extractors/__init__.py | 2 +- src/yadg/extractors/agilent/__init__.py | 4 + .../agilentch.py => extractors/agilent/ch.py} | 88 ++++---- .../agilent/csv.py} | 82 +++---- src/yadg/extractors/agilent/dx.py | 68 ++++++ .../extractors/{custom => basic}/__init__.py | 0 .../main.py => extractors/basic/csv.py} | 174 ++++++--------- src/yadg/extractors/custom/basic/csv.py | 7 - src/yadg/extractors/custom/empalc/csv.py | 7 - src/yadg/extractors/custom/empalc/xlsx.py | 7 - .../extractors/custom/example/__init__.py | 85 -------- src/yadg/extractors/custom/fhimcpt/csv.py | 7 - src/yadg/extractors/custom/fhimcpt/vna.py | 7 - src/yadg/extractors/drycal/__init__.py | 4 + .../drycal.py => extractors/drycal/common.py} | 117 ++++------ src/yadg/extractors/drycal/csv.py | 57 +++++ src/yadg/extractors/drycal/rtf.py | 58 +++++ src/yadg/extractors/drycal/txt.py | 57 +++++ .../{custom/basic => eclab}/__init__.py | 0 .../empalc => eclab/common}/__init__.py | 0 .../eclab/common}/mpr_columns.py | 0 .../eclab/common}/mpt_columns.py | 0 .../eclab/common}/techniques.py | 0 .../eclabmpr.py => extractors/eclab/mpr.py} | 108 ++++----- .../eclabmpt.py => extractors/eclab/mpt.py} | 73 ++++--- src/yadg/extractors/empalc/__init__.py | 5 + .../empalccsv.py => extractors/empalc/csv.py} | 69 +++--- .../empalc/xlsx.py} | 62 ++++-- src/yadg/extractors/example/__init__.py | 50 +++++ src/yadg/extractors/ezchrom/__init__.py | 4 + .../ezchrom/asc.py} | 86 +++++--- src/yadg/extractors/fhimcpt/__init__.py | 5 + src/yadg/extractors/fhimcpt/csv.py | 99 +++++++++ .../fhimcpt/vna.py} | 67 +++--- .../{custom/fhimcpt => fusion}/__init__.py | 0 .../fusioncsv.py => extractors/fusion/csv.py} | 84 ++++--- src/yadg/extractors/fusion/json.py | 206 ++++++++++++++++++ src/yadg/extractors/fusion/zip.py | 70 ++++++ src/yadg/extractors/panalytical/__init__.py | 4 + .../panalytical}/common.py | 0 .../panalytical/csv.py} | 64 +++--- .../panalytical/xrdml.py} | 96 ++++---- .../panalytical/xy.py} | 59 +++-- .../extractors/{public => phi}/__init__.py | 0 .../phispe.py => extractors/phi/spe.py} | 69 +++--- src/yadg/extractors/public/agilent/ch.py | 7 - src/yadg/extractors/public/agilent/csv.py | 7 - src/yadg/extractors/public/agilent/dx.py | 8 - src/yadg/extractors/public/drycal/__init__.py | 0 src/yadg/extractors/public/drycal/csv.py | 40 ---- src/yadg/extractors/public/drycal/rtf.py | 37 ---- src/yadg/extractors/public/drycal/txt.py | 40 ---- src/yadg/extractors/public/eclab/__init__.py | 0 src/yadg/extractors/public/eclab/mpr.py | 7 - src/yadg/extractors/public/eclab/mpt.py | 7 - .../extractors/public/ezchrom/__init__.py | 0 src/yadg/extractors/public/ezchrom/asc.py | 7 - src/yadg/extractors/public/fusion/__init__.py | 0 
src/yadg/extractors/public/fusion/csv.py | 7 - src/yadg/extractors/public/fusion/json.py | 20 -- src/yadg/extractors/public/fusion/zip.py | 20 -- .../extractors/public/panalytical/__init__.py | 0 src/yadg/extractors/public/panalytical/csv.py | 7 - .../extractors/public/panalytical/xrdml.py | 7 - src/yadg/extractors/public/panalytical/xy.py | 7 - src/yadg/extractors/public/phi/__init__.py | 0 src/yadg/extractors/public/phi/spe.py | 7 - .../extractors/public/quadstar/__init__.py | 0 src/yadg/extractors/public/quadstar/sac.py | 7 - src/yadg/extractors/public/tomato/__init__.py | 0 src/yadg/extractors/public/tomato/json.py | 35 --- .../{public/agilent => quadstar}/__init__.py | 0 .../quadstar/sac.py} | 62 ++++-- src/yadg/extractors/tomato/__init__.py | 4 + .../tomato/json.py} | 87 ++++++-- src/yadg/parsers/__init__.py | 0 src/yadg/parsers/basiccsv/__init__.py | 49 ----- src/yadg/parsers/chromdata/__init__.py | 93 -------- src/yadg/parsers/chromdata/fusionjson.py | 154 ------------- src/yadg/parsers/chromdata/fusionzip.py | 68 ------ src/yadg/parsers/chromtrace/__init__.py | 116 ---------- src/yadg/parsers/chromtrace/agilentdx.py | 87 -------- src/yadg/parsers/chromtrace/fusionjson.py | 111 ---------- src/yadg/parsers/chromtrace/fusionzip.py | 82 ------- src/yadg/parsers/dummy/__init__.py | 98 --------- src/yadg/parsers/electrochem/__init__.py | 88 -------- .../electrochem/eclabcommon/__init__.py | 0 src/yadg/parsers/flowdata/__init__.py | 39 ---- src/yadg/parsers/flowdata/main.py | 66 ------ src/yadg/parsers/masstrace/__init__.py | 90 -------- src/yadg/parsers/meascsv/__init__.py | 115 ---------- src/yadg/parsers/qftrace/__init__.py | 71 ------ src/yadg/parsers/xpstrace/__init__.py | 70 ------ src/yadg/parsers/xrdtrace/__init__.py | 75 ------- tests/test_chromdata.py | 1 + 104 files changed, 1651 insertions(+), 2675 deletions(-) create mode 100644 src/yadg/dgutils/dsutils.py rename src/yadg/dgutils/{utils.py => schemautils.py} (100%) create mode 100644 src/yadg/extractors/agilent/__init__.py rename src/yadg/{parsers/chromtrace/agilentch.py => extractors/agilent/ch.py} (72%) rename src/yadg/{parsers/chromtrace/agilentcsv.py => extractors/agilent/csv.py} (75%) create mode 100644 src/yadg/extractors/agilent/dx.py rename src/yadg/extractors/{custom => basic}/__init__.py (100%) rename src/yadg/{parsers/basiccsv/main.py => extractors/basic/csv.py} (50%) delete mode 100644 src/yadg/extractors/custom/basic/csv.py delete mode 100644 src/yadg/extractors/custom/empalc/csv.py delete mode 100644 src/yadg/extractors/custom/empalc/xlsx.py delete mode 100644 src/yadg/extractors/custom/example/__init__.py delete mode 100644 src/yadg/extractors/custom/fhimcpt/csv.py delete mode 100644 src/yadg/extractors/custom/fhimcpt/vna.py create mode 100644 src/yadg/extractors/drycal/__init__.py rename src/yadg/{parsers/flowdata/drycal.py => extractors/drycal/common.py} (64%) create mode 100644 src/yadg/extractors/drycal/csv.py create mode 100644 src/yadg/extractors/drycal/rtf.py create mode 100644 src/yadg/extractors/drycal/txt.py rename src/yadg/extractors/{custom/basic => eclab}/__init__.py (100%) rename src/yadg/extractors/{custom/empalc => eclab/common}/__init__.py (100%) rename src/yadg/{parsers/electrochem/eclabcommon => extractors/eclab/common}/mpr_columns.py (100%) rename src/yadg/{parsers/electrochem/eclabcommon => extractors/eclab/common}/mpt_columns.py (100%) rename src/yadg/{parsers/electrochem/eclabcommon => extractors/eclab/common}/techniques.py (100%) rename src/yadg/{parsers/electrochem/eclabmpr.py => 
extractors/eclab/mpr.py} (89%) rename src/yadg/{parsers/electrochem/eclabmpt.py => extractors/eclab/mpt.py} (83%) create mode 100644 src/yadg/extractors/empalc/__init__.py rename src/yadg/{parsers/chromdata/empalccsv.py => extractors/empalc/csv.py} (85%) rename src/yadg/{parsers/chromdata/empalcxlsx.py => extractors/empalc/xlsx.py} (85%) create mode 100644 src/yadg/extractors/example/__init__.py create mode 100644 src/yadg/extractors/ezchrom/__init__.py rename src/yadg/{parsers/chromtrace/ezchromasc.py => extractors/ezchrom/asc.py} (76%) create mode 100644 src/yadg/extractors/fhimcpt/__init__.py create mode 100644 src/yadg/extractors/fhimcpt/csv.py rename src/yadg/{parsers/qftrace/labviewcsv.py => extractors/fhimcpt/vna.py} (74%) rename src/yadg/extractors/{custom/fhimcpt => fusion}/__init__.py (100%) rename src/yadg/{parsers/chromdata/fusioncsv.py => extractors/fusion/csv.py} (71%) create mode 100644 src/yadg/extractors/fusion/json.py create mode 100644 src/yadg/extractors/fusion/zip.py create mode 100644 src/yadg/extractors/panalytical/__init__.py rename src/yadg/{parsers/xrdtrace => extractors/panalytical}/common.py (100%) rename src/yadg/{parsers/xrdtrace/panalyticalcsv.py => extractors/panalytical/csv.py} (81%) rename src/yadg/{parsers/xrdtrace/panalyticalxrdml.py => extractors/panalytical/xrdml.py} (84%) rename src/yadg/{parsers/xrdtrace/panalyticalxy.py => extractors/panalytical/xy.py} (63%) rename src/yadg/extractors/{public => phi}/__init__.py (100%) rename src/yadg/{parsers/xpstrace/phispe.py => extractors/phi/spe.py} (90%) delete mode 100644 src/yadg/extractors/public/agilent/ch.py delete mode 100644 src/yadg/extractors/public/agilent/csv.py delete mode 100644 src/yadg/extractors/public/agilent/dx.py delete mode 100644 src/yadg/extractors/public/drycal/__init__.py delete mode 100644 src/yadg/extractors/public/drycal/csv.py delete mode 100644 src/yadg/extractors/public/drycal/rtf.py delete mode 100644 src/yadg/extractors/public/drycal/txt.py delete mode 100644 src/yadg/extractors/public/eclab/__init__.py delete mode 100644 src/yadg/extractors/public/eclab/mpr.py delete mode 100644 src/yadg/extractors/public/eclab/mpt.py delete mode 100644 src/yadg/extractors/public/ezchrom/__init__.py delete mode 100644 src/yadg/extractors/public/ezchrom/asc.py delete mode 100644 src/yadg/extractors/public/fusion/__init__.py delete mode 100644 src/yadg/extractors/public/fusion/csv.py delete mode 100644 src/yadg/extractors/public/fusion/json.py delete mode 100644 src/yadg/extractors/public/fusion/zip.py delete mode 100644 src/yadg/extractors/public/panalytical/__init__.py delete mode 100644 src/yadg/extractors/public/panalytical/csv.py delete mode 100644 src/yadg/extractors/public/panalytical/xrdml.py delete mode 100644 src/yadg/extractors/public/panalytical/xy.py delete mode 100644 src/yadg/extractors/public/phi/__init__.py delete mode 100644 src/yadg/extractors/public/phi/spe.py delete mode 100644 src/yadg/extractors/public/quadstar/__init__.py delete mode 100644 src/yadg/extractors/public/quadstar/sac.py delete mode 100644 src/yadg/extractors/public/tomato/__init__.py delete mode 100644 src/yadg/extractors/public/tomato/json.py rename src/yadg/extractors/{public/agilent => quadstar}/__init__.py (100%) rename src/yadg/{parsers/masstrace/quadstarsac.py => extractors/quadstar/sac.py} (89%) create mode 100644 src/yadg/extractors/tomato/__init__.py rename src/yadg/{parsers/electrochem/tomatojson.py => extractors/tomato/json.py} (63%) delete mode 100644 src/yadg/parsers/__init__.py delete mode 100644 
src/yadg/parsers/basiccsv/__init__.py delete mode 100644 src/yadg/parsers/chromdata/__init__.py delete mode 100644 src/yadg/parsers/chromdata/fusionjson.py delete mode 100644 src/yadg/parsers/chromdata/fusionzip.py delete mode 100644 src/yadg/parsers/chromtrace/__init__.py delete mode 100644 src/yadg/parsers/chromtrace/agilentdx.py delete mode 100644 src/yadg/parsers/chromtrace/fusionjson.py delete mode 100644 src/yadg/parsers/chromtrace/fusionzip.py delete mode 100644 src/yadg/parsers/dummy/__init__.py delete mode 100644 src/yadg/parsers/electrochem/__init__.py delete mode 100644 src/yadg/parsers/electrochem/eclabcommon/__init__.py delete mode 100644 src/yadg/parsers/flowdata/__init__.py delete mode 100644 src/yadg/parsers/flowdata/main.py delete mode 100644 src/yadg/parsers/masstrace/__init__.py delete mode 100644 src/yadg/parsers/meascsv/__init__.py delete mode 100644 src/yadg/parsers/qftrace/__init__.py delete mode 100644 src/yadg/parsers/xpstrace/__init__.py delete mode 100644 src/yadg/parsers/xrdtrace/__init__.py diff --git a/docs/apidoc_t/package.rst_t b/docs/apidoc_t/package.rst_t index 20ed8fc2..94f46a22 100644 --- a/docs/apidoc_t/package.rst_t +++ b/docs/apidoc_t/package.rst_t @@ -15,40 +15,32 @@ {%- if is_namespace %} {{- [pkgname, "namespace"] | join(" ") | e | heading }} -{% elif 'yadg.parsers.basiccsv' == pkgname %} -{{- "**basiccsv**: Common tabular file parser" | heading }} -{% elif 'yadg.parsers.chromdata' == pkgname %} -{{- "**chromdata**: Post-processed chromatography data parser" | heading }} -{% elif 'yadg.parsers.chromtrace' == pkgname %} -{{- "**chromtrace**: Raw chromatogram trace file parser" | heading }} -{% elif 'yadg.parsers.dummy' == pkgname %} -{{- "**dummy**: A dummy parser" | heading }} -{% elif 'yadg.parsers.electrochem' == pkgname %} -{{- "**electrochem**: Electrochemistry data parser" | heading }} -{% elif 'yadg.parsers.flowdata' == pkgname %} -{{- "**flowdata**: Flow data parser" | heading }} -{% elif 'yadg.parsers.masstrace' == pkgname %} -{{- "**masstrace**: Mass spectroscopy trace file parser" | heading }} -{% elif 'yadg.parsers.meascsv' == pkgname %} -{{- "**meascsv**: Legacy MCPT log file parser" | heading }} -{% elif 'yadg.parsers.qftrace' == pkgname %} -{{- "**qftrace**: Network analyser trace file parser" | heading }} -{% elif 'yadg.parsers.xpstrace' == pkgname %} -{{- "**xpstrace**: XPS trace file parser" | heading }} -{% elif 'yadg.parsers.xrdtrace' == pkgname %} -{{- "**xrdtrace**: X-ray diffractogram trace file parser" | heading }} -{% elif 'yadg.extractors.agilentch' == pkgname %} -{{- "**agilent-ch**: Agilent ChemStation export ``.CH``" | heading }} -{% elif 'yadg.extractors.agilentdx' == pkgname %} -{{- "**agilent-dx**: Agilent OpenLab raw data ``.dx``" | heading }} -{% elif 'yadg.extractors.eclabmpr' == pkgname %} -{{- "**eclab-mpr**: BioLogic ECLab binary ``.mpr``" | heading }} -{% elif 'yadg.extractors.eclabmpt' == pkgname %} -{{- "**eclab-mpr**: BioLogic ECLab export ``.mpt``" | heading }} -{% elif 'yadg.extractors.panalyticalxrdml' == pkgname %} -{{- "**panalytical-xrdml**: PANalytical XRDML ``.xrdml``" | heading }} -{% elif 'yadg.extractors.phispe' == pkgname %} -{{- "**phi-spe**: ULVAC-PHI Multipak ``.spe``" | heading }} +{% elif 'yadg.extractors.basic' == pkgname %} +{{- "**basic**: For tabulated data" | heading }} +{% elif 'yadg.extractors.empalc' == pkgname %} +{{- "**empalc**: For Empa's LC data" | heading }} +{% elif 'yadg.extractors.example' == pkgname %} +{{- "**example**: For yadg testing" | heading }} +{% elif 
'yadg.extractors.fhimcpt' == pkgname %} +{{- "**fhimcpt**: For MCPT set-up at FHI" | heading }} +{% elif 'yadg.extractors.agilent' == pkgname %} +{{- "**agilent**: For Agilent chromatograms" | heading }} +{% elif 'yadg.extractors.drycal' == pkgname %} +{{- "**drycal**: For MesaLabs Drycal Pro data" | heading }} +{% elif 'yadg.extractors.eclab' == pkgname %} +{{- "**eclab**: For BioLogic data files" | heading }} +{% elif 'yadg.extractors.ezchrom' == pkgname %} +{{- "**ezchrom**: For EZChrom chromatograms" | heading }} +{% elif 'yadg.extractors.fusion' == pkgname %} +{{- "**fusion**: For Fusion chromatograms" | heading }} +{% elif 'yadg.extractors.panalytical' == pkgname %} +{{- "**panalytical**: For Panalytical XRD data" | heading }} +{% elif 'yadg.extractors.phi' == pkgname %} +{{- "**phi**: For Phi XPS data" | heading }} +{% elif 'yadg.extractors.quadstar' == pkgname %} +{{- "**quadstar**: For Quadstar MS data" | heading }} +{% elif 'yadg.extractors.tomato' == pkgname %} +{{- "**tomato**: For tomato outputs" | heading }} {% else %} {{- [pkgname, "package"] | join(" ") | e | heading }} {% endif %} @@ -69,8 +61,6 @@ Subpackages {% endif %} {%- if submodules %} -Submodules ----------- {% if separatemodules %} {{ toctree(submodules) }} {% else %} diff --git a/docs/source/conf.py b/docs/source/conf.py index f3b567e6..8fa188e1 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -35,7 +35,7 @@ # "sphinx.ext.coverage", "sphinx.ext.napoleon", "sphinx.ext.intersphinx", - # "sphinx.ext.autosummary", + "sphinx.ext.autosummary", "sphinx_autodoc_typehints", "sphinx_rtd_theme", "sphinxcontrib.autodoc_pydantic", @@ -63,7 +63,6 @@ html_theme_options = { "body_max_width": "none", "sticky_navigation": True, - "navigation_depth": 6, } html_logo = "./images/yadg.png" html_favicon = "./images/yadg_ico.png" diff --git a/docs/source/extractors.rst b/docs/source/extractors.rst index 09d8e851..59994da7 100644 --- a/docs/source/extractors.rst +++ b/docs/source/extractors.rst @@ -4,12 +4,5 @@ :hidden: :glob: - apidoc/yadg.extractors.public.* - -.. toctree:: - :maxdepth: 1 - :caption: yadg custom extractors - :hidden: - :glob: - - apidoc/yadg.extractors.custom.* \ No newline at end of file + apidoc/yadg.extractors.example + apidoc/yadg.extractors.* \ No newline at end of file diff --git a/src/yadg/core.py b/src/yadg/core.py index a098e844..26cd74e0 100644 --- a/src/yadg/core.py +++ b/src/yadg/core.py @@ -1,12 +1,9 @@ from importlib import metadata import logging import importlib -import xarray as xr -import numpy as np from typing import Callable from datatree import DataTree from xarray import Dataset -from pydantic import BaseModel from dgbowl_schemas.yadg.dataschema import DataSchema from yadg import dgutils @@ -20,18 +17,13 @@ def infer_extractor(extractor: str) -> Callable: A function that finds an :func:`extract` function of the supplied ``extractor``. 
""" - modnames = [ - f"yadg.extractors.public.{extractor}", - f"yadg.extractors.custom.{extractor}", - f"yadg.extractors.{extractor.replace('.','')}", - ] - for modname in modnames: - try: - m = importlib.import_module(modname) - if hasattr(m, "extract"): - return getattr(m, "extract") - except ImportError: - logger.critical(f"could not import module '{modname}'") + modname = f"yadg.extractors.{extractor}" + try: + m = importlib.import_module(modname) + if hasattr(m, "extract"): + return getattr(m, "extract") + except ImportError: + logger.critical(f"could not import module '{modname}'") raise RuntimeError @@ -99,69 +91,13 @@ def process_schema(dataschema: DataSchema, strict_merge: bool = False) -> DataTr # there are no variables - we don't add 'uts' to those. fvals[name] = dset else: - fvals[name] = complete_uts( + fvals[name] = dgutils.complete_uts( dset, tf, step.externaldate, step.extractor.timezone ) - vals = merge_dicttrees(vals, fvals, concatmode) + vals = dgutils.merge_dicttrees(vals, fvals, concatmode) stepdt = DataTree.from_dict({} if vals is None else vals) stepdt.name = step.tag stepdt.attrs = sattrs stepdt.parent = root return root - - -def complete_uts( - ds: Dataset, - filename: str, - externaldate: BaseModel, - timezone: str, -) -> Dataset: - """ - A helper function ensuring that the Dataset ``ds`` contains a dimension ``"uts"``, - and that the timestamps in ``"uts"`` are completed as instructed in the - ``externaldate`` specification. - - """ - if not hasattr(ds, "uts"): - ds = ds.expand_dims("uts") - if len(ds.uts.coords) == 0: - ds["uts"] = np.zeros(ds.uts.size) - ds.attrs["fulldate"] = False - if not ds.attrs.get("fulldate", True) or externaldate is not None: - ts, fulldate = dgutils.complete_timestamps( - timesteps=ds.uts.values, - fn=filename, - spec=externaldate, - timezone=timezone, - ) - ds["uts"] = ts - if fulldate: - ds.attrs.pop("fulldate", None) - else: - # cannot store booleans in NetCDF files - ds.attrs["fulldate"] = int(fulldate) - - return ds - - -def merge_dicttrees(vals: dict, fvals: dict, mode: str) -> dict: - """ - A helper function that merges two ``DataTree.to_dict()`` objects by concatenating - the new values in ``fvals`` to the existing ones in ``vals``. - - """ - if vals is None: - return fvals - for k in fvals.keys(): - try: - vals[k] = xr.concat([vals[k], fvals[k]], dim="uts", combine_attrs=mode) - except xr.MergeError: - raise RuntimeError( - "Merging metadata from multiple files has failed, as some of the " - "values differ between files. This might be caused by trying to " - "parse data obtained using different techniques/protocols in a " - "single step. If you are certain this is what you want, try using " - "yadg with the '--ignore-merge-errors' option." 
- ) - return vals diff --git a/src/yadg/dgutils/__init__.py b/src/yadg/dgutils/__init__.py index dd56b226..a4d249d1 100644 --- a/src/yadg/dgutils/__init__.py +++ b/src/yadg/dgutils/__init__.py @@ -1,18 +1,31 @@ from .helpers import get_yadg_metadata -from .dateutils import now, infer_timestamp_from, ole_to_uts, complete_timestamps -from .utils import update_schema, schema_from_preset +from .dateutils import ( + now, + infer_timestamp_from, + str_to_uts, + ole_to_uts, + complete_timestamps, + complete_uts, +) +from .schemautils import update_schema, schema_from_preset from .btools import read_value from .pintutils import sanitize_units, ureg +from .dsutils import dicts_to_dataset, append_dicts, merge_dicttrees __all__ = [ "get_yadg_metadata", "now", "infer_timestamp_from", + "str_to_uts", "ole_to_uts", "complete_timestamps", + "complete_uts", "update_schema", "schema_from_preset", "read_value", "sanitize_units", "ureg", + "dicts_to_dataset", + "append_dicts", + "merge_dicttrees", ] diff --git a/src/yadg/dgutils/dateutils.py b/src/yadg/dgutils/dateutils.py index 8b731dfa..c293ef17 100644 --- a/src/yadg/dgutils/dateutils.py +++ b/src/yadg/dgutils/dateutils.py @@ -6,7 +6,9 @@ import logging from zoneinfo import ZoneInfo import numpy as np +from pydantic import BaseModel from typing import Callable, Union, Mapping, Iterable +from xarray import Dataset from dgbowl_schemas.yadg.dataschema_5_0.externaldate import ExternalDate from dgbowl_schemas.yadg.dataschema_5_0.timestamp import TimestampSpec @@ -20,7 +22,8 @@ def now( """ Wrapper around datetime.now() - A convenience function for returning the current time as a ISO 8601 or as a unix timestamp. + A convenience function for returning the current time as an ISO 8601 or as a Unix + timestamp. """ dt = datetime.datetime.now(tz=tz) if asstr: @@ -138,8 +141,8 @@ def infer_timestamp_from( spec A specification of timestamp elements with associated column indices and - optional formats. Currently accepted combinations of keys are: "uts"; "timestamp"; - "date" and / or "time". + optional formats. Currently accepted combinations of keys are: "uts"; + "timestamp"; "date" and / or "time". tz Timezone to use for conversion. By default, UTC is used. @@ -194,7 +197,7 @@ def timefn(value): else: logger.debug( - "Assuming specified column containing the time is in ISO 8601 format" + "Assuming specified column is time in ISO 8601 format." ) def timefn(value): @@ -220,14 +223,10 @@ def retfn(date, time): return cols, retfn, True elif "uts" in headers: - logger.debug( - "No timestamp spec provided, assuming column 'uts' is a valid unix timestamp" - ) + logger.debug("Assuming column 'uts' is a valid unix timestamp.") return [headers.index("uts")], float, True elif "timestamp" in headers: - logger.debug( - "No timestamp spec provided, assuming column 'timestamp' is a valid ISO 8601 timestamp" - ) + logger.debug("Assuming column 'timestamp' is a valid ISO 8601 timestamp") def retfunc(value): return str_to_uts(timestamp=value, timezone=timezone) @@ -420,3 +419,36 @@ def timestamps_from_file( ) else: return float(data) + + +def complete_uts( + ds: Dataset, + filename: str, + externaldate: BaseModel, + timezone: str, +) -> Dataset: + """ + A helper function ensuring that the Dataset ``ds`` contains a dimension ``"uts"``, + and that the timestamps in ``"uts"`` are completed as instructed in the + ``externaldate`` specification. 
+ + """ + if not hasattr(ds, "uts"): + ds = ds.expand_dims("uts") + if len(ds.uts.coords) == 0: + ds["uts"] = np.zeros(ds.uts.size) + ds.attrs["fulldate"] = False + if not ds.attrs.get("fulldate", True) or externaldate is not None: + ts, fulldate = complete_timestamps( + timesteps=ds.uts.values, + fn=filename, + spec=externaldate, + timezone=timezone, + ) + ds["uts"] = ts + if fulldate: + ds.attrs.pop("fulldate", None) + else: + # cannot store booleans in NetCDF files + ds.attrs["fulldate"] = int(fulldate) + return ds diff --git a/src/yadg/dgutils/dsutils.py b/src/yadg/dgutils/dsutils.py new file mode 100644 index 00000000..7dddffc1 --- /dev/null +++ b/src/yadg/dgutils/dsutils.py @@ -0,0 +1,83 @@ +import numpy as np +import xarray as xr +from xarray import Dataset +from typing import Any + + +def append_dicts( + vals: dict[str, Any], + devs: dict[str, Any], + data: dict[str, list[Any]], + meta: dict[str, list[Any]], + fn: str = None, + li: int = 0, +) -> None: + if "_fn" in meta and fn is not None: + meta["_fn"].append(str(fn)) + for k, v in vals.items(): + if k not in data: + data[k] = [None if isinstance(v, str) else np.nan] * li + data[k].append(v) + for k, v in devs.items(): + if k not in meta: + meta[k] = [np.nan] * li + meta[k].append(v) + + for k in set(data) - set(vals): + data[k].append(np.nan) + for k in set(meta) - set(devs): + if k != "_fn": + meta[k].append(np.nan) + + +def dicts_to_dataset( + data: dict[str, list[Any]], + meta: dict[str, list[Any]], + units: dict[str, str] = dict(), + fulldate: bool = True, +) -> Dataset: + darrs = {} + for k, v in data.items(): + attrs = {} + u = units.get(k, None) + if u is not None: + attrs["units"] = u + if k == "uts": + continue + darrs[k] = xr.DataArray(data=v, dims=["uts"], attrs=attrs) + if k in meta and darrs[k].dtype.kind in {"i", "u", "f", "c", "m", "M"}: + err = f"{k}_std_err" + darrs[k].attrs["ancillary_variables"] = err + attrs["standard_name"] = f"{k} standard error" + darrs[err] = xr.DataArray(data=meta[k], dims=["uts"], attrs=attrs) + if "uts" in data: + coords = dict(uts=data.pop("uts")) + else: + coords = dict() + if fulldate: + attrs = dict() + else: + attrs = dict(fulldate=False) + return xr.Dataset(data_vars=darrs, coords=coords, attrs=attrs) + + +def merge_dicttrees(vals: dict, fvals: dict, mode: str) -> dict: + """ + A helper function that merges two ``DataTree.to_dict()`` objects by concatenating + the new values in ``fvals`` to the existing ones in ``vals``. + + """ + if vals is None: + return fvals + for k in fvals.keys(): + try: + vals[k] = xr.concat([vals[k], fvals[k]], dim="uts", combine_attrs=mode) + except xr.MergeError: + raise RuntimeError( + "Merging metadata from multiple files has failed, as some of the " + "values differ between files. This might be caused by trying to " + "parse data obtained using different techniques/protocols in a " + "single step. If you are certain this is what you want, try using " + "yadg with the '--ignore-merge-errors' option." + ) + return vals diff --git a/src/yadg/dgutils/pintutils.py b/src/yadg/dgutils/pintutils.py index b6bb60c8..99d1b245 100644 --- a/src/yadg/dgutils/pintutils.py +++ b/src/yadg/dgutils/pintutils.py @@ -55,7 +55,7 @@ def sanitize_units( Unit sanitizer. This sanitizer should be used where user-supplied units are likely to occur, - such as in the parsers :mod:`yadg.parsers.basiccsv`. Currently, only two + such as in the parsers :mod:`yadg.extractors.basic.csv`. 
Currently, only two replacements are done: - "Bar" is replaced with "bar" diff --git a/src/yadg/dgutils/utils.py b/src/yadg/dgutils/schemautils.py similarity index 100% rename from src/yadg/dgutils/utils.py rename to src/yadg/dgutils/schemautils.py diff --git a/src/yadg/extractors/__init__.py b/src/yadg/extractors/__init__.py index b42086f0..cce38c42 100644 --- a/src/yadg/extractors/__init__.py +++ b/src/yadg/extractors/__init__.py @@ -32,7 +32,7 @@ def extract(filetype: str, path: Path) -> Union[Dataset, DataTree]: """ extractor = ExtractorFactory(extractor={"filetype": filetype}).extractor - m = importlib.import_module(f"yadg.extractors.public.{extractor.filetype}") + m = importlib.import_module(f"yadg.extractors.{extractor.filetype}") func = getattr(m, "extract") ret = func(fn=str(path), **vars(extractor)) diff --git a/src/yadg/extractors/agilent/__init__.py b/src/yadg/extractors/agilent/__init__.py new file mode 100644 index 00000000..1ce8b7e0 --- /dev/null +++ b/src/yadg/extractors/agilent/__init__.py @@ -0,0 +1,4 @@ +""" +Extractors for data files generated by various proprietary Agilent software. + +""" diff --git a/src/yadg/parsers/chromtrace/agilentch.py b/src/yadg/extractors/agilent/ch.py similarity index 72% rename from src/yadg/parsers/chromtrace/agilentch.py rename to src/yadg/extractors/agilent/ch.py index a699a2e5..e943935e 100644 --- a/src/yadg/parsers/chromtrace/agilentch.py +++ b/src/yadg/extractors/agilent/ch.py @@ -1,15 +1,42 @@ """ -**agilentch**: Processing Agilent OpenLab binary signal trace files (CH and IT). --------------------------------------------------------------------------------- - +Extractor of Agilent OpenLab binary signal trace files (``.ch`` and ``.it``). Currently supports version "179" of the files. Version information is defined in -the `magic_values` (parameters & metadata) and `data_dtypes` (data) dictionaries. +the ``magic_values`` (parameters & metadata) and `data_dtypes` (data) dictionaries. Adapted from `ImportAgilent.m `_ and `aston `_. -File Structure of ``.ch`` files -``````````````````````````````` +Usage +````` +Available since ``yadg-4.0``. + +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.Agilent_ch + +Schema +`````` +.. code-block:: yaml + + datatree.DataTree: + {{ detector_name }}: + coords: + uts: !!float # Unix timestamp + elution_time: !!float # Elution time + data_vars: + signal: (uts, elution_time) # Signal data + +Metadata +```````` +The following metadata is extracted: + + - ``sampleid``: Sample name. + - ``username``: User name used to generate the file. + - ``method``: Name of the chromatographic method. + - ``version``: Version of the CH file (only "179" is currently supported.) + + +Notes on file structure +``````````````````````` +The following magic values are used: .. code :: 0x0000 "version magic" @@ -29,14 +56,21 @@ 0x127c "y-axis slope" Data is stored in a consecutive set of `` DataTree: - """ - Agilent OpenLAB signal trace parser - - One chromatogram per file with a single trace. Binary data format. - - Parameters - ---------- - fn - Filename to process. - - encoding - Not used as the file is binary. - - timezone - Timezone information. This should be ``"localtime"``. - - Returns - ------- - class:`datatree.DataTree` - A :class:`datatree.DataTree` containing one :class:`xarray.Dataset` per detector. As - there is only one detector data in each CH file, this nesting is only for - consistency with other filetypes. 
- - """ - +def extract( + *, + fn: str, + timezone: str, + **kwargs: dict, +) -> DataTree: with open(fn, "rb") as inf: ch = inf.read() @@ -124,7 +138,7 @@ def process(*, fn: str, timezone: str, **kwargs: dict) -> DataTree: detector, title = pars["tracetitle"].split(",") - uts = str_to_uts( + uts = dgutils.str_to_uts( timestamp=pars["timestamp"], format="%d-%b-%y, %H:%M:%S", timezone=timezone ) diff --git a/src/yadg/parsers/chromtrace/agilentcsv.py b/src/yadg/extractors/agilent/csv.py similarity index 75% rename from src/yadg/parsers/chromtrace/agilentcsv.py rename to src/yadg/extractors/agilent/csv.py index 62385534..7d01808b 100644 --- a/src/yadg/parsers/chromtrace/agilentcsv.py +++ b/src/yadg/extractors/agilent/csv.py @@ -1,11 +1,8 @@ """ -**agilentcsv**: Processing Agilent Chemstation Chromtab tabulated data files (csv). ------------------------------------------------------------------------------------ - -This file format may include multiple timesteps consisting of several traces each in a -single CSV file. It contains a header section for each timestep, followed by a detector -name, and a sequence of "X, Y" datapoints, which are stored as ``elution_time`` and -``signal``. +Extractor of Agilent Chemstation Chromtab tabulated data files. This file format may +include multiple timesteps consisting of several traces each in a single CSV file. It +contains a header section for each timestep, followed by a detector name, and a sequence +of "X, Y" datapoints, which are stored as ``elution_time`` and ``signal``. .. warning :: @@ -13,16 +10,43 @@ consistent between the timesteps of the same trace. The traces are expanded to the length of the longest trace, and the shorter traces are padded with ``NaNs``. -.. warning :: +Usage +````` +Available since ``yadg-4.0``. + +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.Agilent_csv + +Schema +`````` +.. code-block:: yaml + + datatree.DataTree: + {{ detector_name }}: + coords: + uts: !!float # Unix timestamp + elution_time: !!float # Elution time + data_vars: + signal: (uts, elution_time) # Signal data + +Metadata +```````` +The following metadata is extracted: + + - ``sampleid``: Sample name. + - ``datafile``: Original path of the data file. - Unfortunately, the chromatographic ``method`` is not exposed in this file format. +Uncertainties +````````````` +All uncertainties are derived from the string representation of the floats. + +.. codeauthor:: + Peter Kraus -.. codeauthor:: Peter Kraus """ import numpy as np from uncertainties.core import str_to_number_with_uncert as tuple_fromstr -from yadg.dgutils.dateutils import str_to_uts +from yadg import dgutils import xarray as xr from datatree import DataTree @@ -33,7 +57,7 @@ def _process_headers(headers: list, columns: list, timezone: str) -> dict: columns ), "chromtab: The number of headers and columns do not match." assert "Date Acquired" in headers, "chromtab: Cannot infer date." - res["uts"] = str_to_uts( + res["uts"] = dgutils.str_to_uts( timestamp=columns[headers.index("Date Acquired")].strip(), format="%d %b %Y %H:%M", timezone=timezone, @@ -61,33 +85,13 @@ def _to_trace(tx, ty): return trace -def process(*, fn: str, encoding: str, timezone: str, **kwargs: dict) -> DataTree: - """ - Agilent Chemstation CSV (Chromtab) file parser - - Each file may contain multiple chromatograms per file with multiple traces. Each - chromatogram starts with a header section, and is followed by each trace, which - includes a header line and x,y-data. 
- - Parameters - ---------- - fn - Filename to process. - - encoding - Encoding used to open the file. - - timezone - Timezone information. This should be ``"localtime"``. - - Returns - ------- - class:`datatree.DataTree` - A :class:`datatree.DataTree` containing one :class:`xarray.Dataset` per detector. As - When multiple timesteps are present in the file, the traces of each detector are - expanded to match the longest trace, and collated along the ``uts``-dimension. - """ - +def extract( + *, + fn: str, + encoding: str, + timezone: str, + **kwargs: dict, +) -> DataTree: with open(fn, "r", encoding=encoding, errors="ignore") as infile: lines = infile.readlines() metadata = {} diff --git a/src/yadg/extractors/agilent/dx.py b/src/yadg/extractors/agilent/dx.py new file mode 100644 index 00000000..75732def --- /dev/null +++ b/src/yadg/extractors/agilent/dx.py @@ -0,0 +1,68 @@ +""" +Extractor of Agilent OpenLab DX archives. This is a wrapper extractor which unzips the +provided DX file, and then uses the :mod:`yadg.extractors.agilent.ch` extractor +to parse every CH file present in the archive. The IT files in the archive are currently +ignored. + +.. note:: + + Currently, the timesteps from multiple CH files (if present) are appended in the + timesteps array without any further sorting. + +Usage +````` +Available since ``yadg-4.0``. + +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.Agilent_dx + +Schema +`````` +.. code-block:: yaml + + datatree.DataTree: + {{ detector_name }}: + coords: + uts: !!float # Unix timestamp + elution_time: !!float # Elution time + data_vars: + signal: (uts, elution_time) # Signal data + +Metadata +```````` +The following metadata is extracted: + + - ``sampleid``: Sample name. + - ``username``: User name used to generate the file. + - ``method``: Name of the chromatographic method. + - ``version``: Version of the CH file (only "179" is currently supported). + +.. codeauthor:: + Peter Kraus + +""" + +import zipfile +import tempfile +import os +from datatree import DataTree + +from yadg.extractors.agilent.ch import extract as extract_ch +from yadg import dgutils + + +def extract( + *, + fn: str, + timezone: str, + **kwargs: dict, +) -> DataTree: + zf = zipfile.ZipFile(fn) + with tempfile.TemporaryDirectory() as tempdir: + zf.extractall(tempdir) + dt = None + filenames = [ffn for ffn in os.listdir(tempdir) if ffn.endswith("CH")] + for ffn in sorted(filenames): + path = os.path.join(tempdir, ffn) + fdt = extract_ch(fn=path, timezone=timezone, **kwargs) + dt = dgutils.merge_dicttrees(dt, fdt.to_dict(), "identical") + return DataTree.from_dict(dt) diff --git a/src/yadg/extractors/custom/__init__.py b/src/yadg/extractors/basic/__init__.py similarity index 100% rename from src/yadg/extractors/custom/__init__.py rename to src/yadg/extractors/basic/__init__.py diff --git a/src/yadg/parsers/basiccsv/main.py b/src/yadg/extractors/basic/csv.py similarity index 50% rename from src/yadg/parsers/basiccsv/main.py rename to src/yadg/extractors/basic/csv.py index 3187b267..d98eca1a 100644 --- a/src/yadg/parsers/basiccsv/main.py +++ b/src/yadg/extractors/basic/csv.py @@ -1,12 +1,57 @@ +""" +Handles the reading and processing of any tabular file, as long as the first line +contains the column headers. The columns of the table must be separated using a +separator such as ``,``, ``;``, or ``\\t``. + +.. note:: + + By default, the second line of the file should contain the units. 
Alternatively, + the units can be supplied using extractor parameters, in which case the second line + is considered to be data. + +Since ``yadg-5.0``, the **basic.csv** extractor handles sparse tables (i.e. tables with +missing data) by back-filling empty cells with ``np.NaNs``. + +The **basic.csv** extractor attempts to deduce the timestamps from the column headers, +using :func:`yadg.dgutils.dateutils.infer_timestamp_from`. Alternatively, the column(s) +containing the timestamp data and their format can be provided using extractor +parameters. + +Usage +````` +Available since ``yadg-4.0``. + +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.Basic_csv + +Schema +`````` +.. code-block:: yaml + + xarray.Dataset: + coords: + uts: !!float # Unix timestamp + data_vars: + {{ headers }}: (uts) # Populated from file headers + +Metadata +```````` +No metadata is extracted. + +.. codeauthor:: + Peter Kraus + +""" + import logging -from uncertainties.core import str_to_number_with_uncert as tuple_fromstr -from typing import Callable, Any from pydantic import BaseModel import locale as lc -from ... import dgutils +from xarray import Dataset +from uncertainties.core import str_to_number_with_uncert as tuple_fromstr +from typing import Callable + + +from yadg import dgutils -import numpy as np -import xarray as xr logger = logging.getLogger(__name__) @@ -15,12 +60,12 @@ def process_row( headers: list, items: list, datefunc: Callable, - datecolumns: list, + datecolumns: list[int], ) -> tuple[dict, dict]: """ A function that processes a row of a table. - This is the main worker function of :mod:`~yadg.parsers.basiccsv`, but is often + This is the main worker function of the :mod:`basic.csv` module, but is often re-used by any other parser that needs to process tabular data. Parameters ---------- @@ -29,10 +74,8 @@ A list of headers of the table. items - A list of values corresponding to the headers. Must be the same length as headers. - - units - A dict for looking up the units corresponding to a certain header. + A list of values corresponding to the headers. Must be the same length as + headers. datefunc A function that will generate ``uts`` given a list of values. @@ -44,13 +87,13 @@ ------- vals, devs A tuple of result dictionaries, with the first element containing the values - and the second element containing the deviations of the values. + and the second element containing the uncertainties of the values. """ - assert len(headers) == len(items), ( - f"process_row: Length mismatch between provided headers: " - f"{headers} and provided items: {items}." - ) + if len(headers) != len(items): + raise RuntimeError( + f"Length mismatch between provided headers {headers!r} and items {items!r}." 
+ ) vals = {} devs = {} @@ -73,64 +116,7 @@ def process_row( return vals, devs -def append_dicts( - vals: dict[str, Any], - devs: dict[str, Any], - data: dict[str, list[Any]], - meta: dict[str, list[Any]], - fn: str = None, - li: int = 0, -) -> None: - if "_fn" in meta and fn is not None: - meta["_fn"].append(str(fn)) - for k, v in vals.items(): - if k not in data: - data[k] = [None if isinstance(v, str) else np.nan] * li - data[k].append(v) - for k, v in devs.items(): - if k not in meta: - meta[k] = [np.nan] * li - meta[k].append(v) - - for k in set(data) - set(vals): - data[k].append(np.nan) - for k in set(meta) - set(devs): - if k != "_fn": - meta[k].append(np.nan) - - -def dicts_to_dataset( - data: dict[str, list[Any]], - meta: dict[str, list[Any]], - units: dict[str, str] = dict(), - fulldate: bool = True, -) -> xr.Dataset: - darrs = {} - for k, v in data.items(): - attrs = {} - u = units.get(k, None) - if u is not None: - attrs["units"] = u - if k == "uts": - continue - darrs[k] = xr.DataArray(data=v, dims=["uts"], attrs=attrs) - if k in meta and darrs[k].dtype.kind in {"i", "u", "f", "c", "m", "M"}: - err = f"{k}_std_err" - darrs[k].attrs["ancillary_variables"] = err - attrs["standard_name"] = f"{k} standard error" - darrs[err] = xr.DataArray(data=meta[k], dims=["uts"], attrs=attrs) - if "uts" in data: - coords = dict(uts=data.pop("uts")) - else: - coords = dict() - if fulldate: - attrs = dict() - else: - attrs = dict(fulldate=False) - return xr.Dataset(data_vars=darrs, coords=coords, attrs=attrs) - - -def process( +def extract( *, fn: str, encoding: str, @@ -138,37 +124,7 @@ def process( timezone: str, parameters: BaseModel, **kwargs: dict, -) -> xr.Dataset: - """ - A basic csv parser. - - This parser processes a csv file. The header of the csv file consists of one or two - lines, with the column headers in the first line and the units in the second. The - parser also attempts to parse column names to produce a timestamp, and save all other - columns as floats or strings. - - Parameters - ---------- - fn - File to process - - encoding - Encoding of ``fn``, by default "utf-8". - - timezone - A string description of the timezone. Default is "localtime". - - parameters - Parameters for :class:`~dgbowl_schemas.yadg.dataschema_5_0.step.BasicCSV`. - - Returns - ------- - :class:`xarray.Dataset` - No metadata is returned by the :mod:`~yadg.parsers.basiccsv` parser. The full - date might not be returned, eg. when only time is specified in columns. 
- - """ - +) -> Dataset: if hasattr(parameters, "strip"): strip = parameters.strip else: @@ -202,9 +158,7 @@ def process( else: for header in headers: if header not in units: - logger.warning( - "Using implicit dimensionless unit ' ' for '%s'.", header - ) + logger.warning(f"Using implicit dimensionless unit ' ' for {header!r}") units[header] = " " elif units[header] == "": units[header] = " " @@ -224,7 +178,7 @@ def process( datefunc, datecolumns, ) - append_dicts(vals, devs, data_vals, meta_vals, fn, li) + dgutils.append_dicts(vals, devs, data_vals, meta_vals, fn, li) lc.setlocale(category=lc.LC_NUMERIC, locale=old_loc) - return dicts_to_dataset(data_vals, meta_vals, units, fulldate) + return dgutils.dicts_to_dataset(data_vals, meta_vals, units, fulldate) diff --git a/src/yadg/extractors/custom/basic/csv.py b/src/yadg/extractors/custom/basic/csv.py deleted file mode 100644 index 1403a96e..00000000 --- a/src/yadg/extractors/custom/basic/csv.py +++ /dev/null @@ -1,7 +0,0 @@ -from yadg.parsers.basiccsv.main import process as extract - -supports = { - "basic.csv", -} - -__all__ = ["supports", "extract"] diff --git a/src/yadg/extractors/custom/empalc/csv.py b/src/yadg/extractors/custom/empalc/csv.py deleted file mode 100644 index eca08f0d..00000000 --- a/src/yadg/extractors/custom/empalc/csv.py +++ /dev/null @@ -1,7 +0,0 @@ -from yadg.parsers.chromdata.empalccsv import process as extract - -supports = { - "empalc.xlsx", -} - -__all__ = ["supports", "extract"] diff --git a/src/yadg/extractors/custom/empalc/xlsx.py b/src/yadg/extractors/custom/empalc/xlsx.py deleted file mode 100644 index 8e1694bf..00000000 --- a/src/yadg/extractors/custom/empalc/xlsx.py +++ /dev/null @@ -1,7 +0,0 @@ -from yadg.parsers.chromdata.empalcxlsx import process as extract - -supports = { - "empalc.csv", -} - -__all__ = ["supports", "extract"] diff --git a/src/yadg/extractors/custom/example/__init__.py b/src/yadg/extractors/custom/example/__init__.py deleted file mode 100644 index f102cb8b..00000000 --- a/src/yadg/extractors/custom/example/__init__.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -This is an example extractor, used mainly for testing of the :mod:`yadg` package. -It provides no real functionality. - -Usage -````` -Available since ``yadg-4.0``. The parser supports the following parameters: - -.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_0.step.Dummy - -Formats -``````` -The ``filetypes`` currently supported by the parser are: - - - tomato's JSON file (``tomato.json``) - -Schema -`````` -The output schema is only defined for the ``tomato.json`` filetype. - -.. code-block:: yaml - - xr.Dataset: - coords: - uts: !!float - data_vars: - {{ entries }} (uts) # Elements present in the "data" entry - -The value of every element of ``data`` is assigned a deviation of 0.0. - -Module Functions -```````````````` - -""" - -from pydantic import BaseModel -from yadg import dgutils -from yadg.parsers.basiccsv.main import dicts_to_dataset -from datatree import DataTree - - -def extract( - *, - fn: str, - parameters: BaseModel, - **kwargs: dict, -) -> DataTree: - """ - A dummy parser. - - This parser simply returns the current time, the filename provided, and any - ``kwargs`` passed. - - In case the provided ``filetype`` is a ``tomato.json`` file, this is a json - data file from the :mod:`tomato` package, which should contain a :class:`list` - of ``{"value": float, "time": float}`` datapoints in its ``data`` entry. 
- - Parameters - ---------- - fn - Filename to process - - filetype - Accepts ``tomato.json`` as an optional "dummy instrument" filetype from - :mod:`tomato`. - - parameters - Parameters for :class:`~dgbowl_schemas.yadg.dataschema_5_0.step.Dummy`. - - Returns - ------- - :class:`xarray.Dataset` - - """ - - kwargs = {} if parameters is None else parameters.dict() - if "parser" in kwargs: - del kwargs["parser"] - data_vals = {k: [v] for k, v in kwargs.items()} - data_vals["uts"] = [dgutils.now()] - meta_vals = {} - return dicts_to_dataset(data_vals, meta_vals, fulldate=False) - - -__all__ = ["extract"] diff --git a/src/yadg/extractors/custom/fhimcpt/csv.py b/src/yadg/extractors/custom/fhimcpt/csv.py deleted file mode 100644 index 67bb74c0..00000000 --- a/src/yadg/extractors/custom/fhimcpt/csv.py +++ /dev/null @@ -1,7 +0,0 @@ -from yadg.parsers.meascsv import process as extract - -supports = { - "fhi.csv", -} - -__all__ = ["supports", "extract"] diff --git a/src/yadg/extractors/custom/fhimcpt/vna.py b/src/yadg/extractors/custom/fhimcpt/vna.py deleted file mode 100644 index aa7083a0..00000000 --- a/src/yadg/extractors/custom/fhimcpt/vna.py +++ /dev/null @@ -1,7 +0,0 @@ -from yadg.parsers.qftrace.labviewcsv import process as extract - -supports = { - "labview.csv", -} - -__all__ = ["supports", "extract"] diff --git a/src/yadg/extractors/drycal/__init__.py b/src/yadg/extractors/drycal/__init__.py new file mode 100644 index 00000000..352d46f6 --- /dev/null +++ b/src/yadg/extractors/drycal/__init__.py @@ -0,0 +1,4 @@ +""" +Extractors for files from MesaLabs DryCal Pro software for Defender flow meters. + +""" diff --git a/src/yadg/parsers/flowdata/drycal.py b/src/yadg/extractors/drycal/common.py similarity index 64% rename from src/yadg/parsers/flowdata/drycal.py rename to src/yadg/extractors/drycal/common.py index a23f3556..a47f8b7d 100644 --- a/src/yadg/parsers/flowdata/drycal.py +++ b/src/yadg/extractors/drycal/common.py @@ -1,23 +1,25 @@ """ -**drycal**: File parser for DryCal log files. ---------------------------------------------- +This module includes shared functions for the :mod:`~yadg.extractors.drycal` +extractor, including functions for parsing the files, processing the tabulated data, +and ensuring timestamps are increasing. -This module includes functions for parsing converted documents (``rtf``) and -tabulated exports (``txt``, ``csv``). +.. codeauthor:: + Peter Kraus -The DryCal files only contain the timestamps of the datapoints, not the date. Therefore, -the date has to be supplied either using the ``date`` argument in parameters, or is -parsed from the prefix of the filename. - -.. codeauthor:: Peter Kraus """ -from striprtf.striprtf import rtf_to_text -from ..basiccsv.main import process_row, append_dicts, dicts_to_dataset -from ... import dgutils from pydantic import BaseModel from typing import Optional from datatree import DataTree +from xarray import Dataset +import logging +import xarray as xr +from striprtf.striprtf import rtf_to_text + +from yadg import dgutils +from yadg.extractors.basic.csv import process_row + +logger = logging.getLogger(__name__) class TimeDate(BaseModel): @@ -34,28 +36,6 @@ def rtf( encoding: str, timezone: str, ) -> DataTree: - """ - RTF version of the drycal parser. - - This is intended to parse legacy drycal DOC files, which have been converted to RTF - using other means. - - Parameters - ---------- - fn - Filename to parse. - - encoding - Encoding to use for parsing ``fn``. - - calib - A calibration spec. 
- - Returns - ------- - (timesteps, metadata, None): tuple[list, dict, None] - A standard data - metadata - common data output tuple. - """ with open(fn, "r", encoding=encoding) as infile: rtf = infile.read() lines = rtf_to_text(rtf).split("\n") @@ -94,9 +74,9 @@ def rtf( meta_vals = {"_fn": []} for pi, point in enumerate(data): vals, devs = process_row(headers[1:], point[1:], datefunc, datecolumns) - append_dicts(vals, devs, data_vals, meta_vals, fn, pi) + dgutils.append_dicts(vals, devs, data_vals, meta_vals, fn, pi) - return dicts_to_dataset(data_vals, meta_vals, units, False) + return dgutils.dicts_to_dataset(data_vals, meta_vals, units, False) def sep( @@ -105,34 +85,6 @@ def sep( encoding: str, timezone: str, ) -> DataTree: - """ - Generic drycal parser, using ``sep`` as separator string. - - This is intended to parse other export formats from DryCal, such as txt and csv files. - - Parameters - ---------- - fn - Filename to parse. - - date - A unix timestamp float corresponding to the day (or other offset) to be added to - each line in the measurement table. - - sep - The separator character used to split lines in ``fn``. - - encoding - Encoding to use for parsing ``fn``. - - calib - A calibration spec. - - Returns - ------- - (timesteps, metadata, None): tuple[list, dict, None] - A standard data - metadata - common data output tuple. - """ with open(fn, "r", encoding=encoding) as infile: lines = infile.readlines() for li in range(len(lines)): @@ -170,9 +122,9 @@ def sep( meta_vals = {"_fn": []} for pi, point in enumerate(data): vals, devs = process_row(headers[1:], point[1:], datefunc, datecolumns) - append_dicts(vals, devs, data_vals, meta_vals, fn, pi) + dgutils.append_dicts(vals, devs, data_vals, meta_vals, fn, pi) - return dicts_to_dataset(data_vals, meta_vals, units, False) + return dgutils.dicts_to_dataset(data_vals, meta_vals, units, False) def drycal_table(lines: list, sep: str = ",") -> tuple[list, dict, list]: @@ -183,20 +135,6 @@ def drycal_table(lines: list, sep: str = ",") -> tuple[list, dict, list]: lines, this function returns the headers, units, and data extracted from the table. The returned values are always of :class:`(str)` type, any post-processing is done in the calling routine. - - Parameters - ---------- - lines - A list containing the lines to be parsed - - sep - The separator string used to split each line into individual items - - Returns - ------- - (headers, units, data): tuple[list, dict, list] - A tuple of a list of the stripped headers, dictionary of header-unit key-value - pairs, and a list of lists containing the rows of the table. """ items = [i.strip() for i in lines[0].split(sep)] headers = [] @@ -226,3 +164,22 @@ def drycal_table(lines: list, sep: str = ",") -> tuple[list, dict, list]: units = dgutils.sanitize_units(units) return headers, units, data + + +def check_timestamps(vals: Dataset) -> Dataset: + warn = True + ndays = 0 + utslist = vals.uts.values + for i in range(1, vals.uts.size): + if utslist[i] < utslist[i - 1]: + if warn: + logger.warning("DryCal log crossing day boundary. 
Adding offset.") + warn = False + uts = utslist[i] + ndays * 86400 + while uts < utslist[i - 1]: + ndays += 1 + uts = utslist[i] + ndays * 86400 + utslist[i] = uts + vals["uts"] = xr.DataArray(data=utslist, dims=["uts"]) + vals.attrs["fulldate"] = False + return vals diff --git a/src/yadg/extractors/drycal/csv.py b/src/yadg/extractors/drycal/csv.py new file mode 100644 index 00000000..683c0cdf --- /dev/null +++ b/src/yadg/extractors/drycal/csv.py @@ -0,0 +1,57 @@ +""" +Handles the reading and processing of volumetric flow meter data exported from the +MesaLabs DryCal software as a csv file. + +.. note:: + + The date information is missing in the timestamps of the exported files and has to + be supplied externally. + +Usage +````` +Available since ``yadg-4.0``. + +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.Drycal_csv + +Schema +`````` +.. code-block:: yaml + + xarray.Dataset: + coords: + uts: !!float # Unix timestamp, without date + data_vars: + DryCal: (uts) # Standardised flow rate + DryCal Avg.: (uts) # Running average of the flow rate + Temp.: (uts) # Measured flow temperature + Pressure: (uts) # Measured flow pressure + +Metadata +```````` +The following metadata is extracted: + + - ``product``: Model name of the MesaLabs device. + - ``serial number``: Serial number of the MesaLabs device. + +Uncertainties +````````````` +All uncertainties are derived from the string representation of the floats. + +.. codeauthor:: + Peter Kraus + +""" + +from xarray import Dataset +from yadg.extractors.drycal import common + + +def extract( + *, + fn: str, + encoding: str, + timezone: str, + **kwargs: dict, +) -> Dataset: + vals = common.sep(fn, ",", encoding, timezone) + return common.check_timestamps(vals) diff --git a/src/yadg/extractors/drycal/rtf.py b/src/yadg/extractors/drycal/rtf.py new file mode 100644 index 00000000..f2ccd1a0 --- /dev/null +++ b/src/yadg/extractors/drycal/rtf.py @@ -0,0 +1,58 @@ +""" +Handles the reading and processing of volumetric flow meter data exported from the +MesaLabs DryCal software as a rtf file. + +.. note:: + + The date information is missing in the timestamps of the exported files and has to + be supplied externally. The timestamp in the header of the rtf file corresponds to + the timestamp of export / report generation, not measurement. + +Usage +````` +Available since ``yadg-4.0``. + +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.Drycal_rtf + +Schema +`````` +.. code-block:: yaml + + xarray.Dataset: + coords: + uts: !!float # Unix timestamp, without date + data_vars: + DryCal: (uts) # Standardised flow rate + DryCal Avg.: (uts) # Running average of the flow rate + Temp.: (uts) # Measured flow temperature + Pressure: (uts) # Measured flow pressure + +Metadata +```````` +The following metadata is extracted: + + - ``product``: Model name of the MesaLabs device. + - ``serial number``: Serial number of the MesaLabs device. + +Uncertainties +````````````` +All uncertainties are derived from the string representation of the floats. + +.. 
codeauthor:: + Peter Kraus + +""" + +from xarray import Dataset +from yadg.extractors.drycal import common + + +def extract( + *, + fn: str, + encoding: str, + timezone: str, + **kwargs: dict, +) -> Dataset: + vals = common.rtf(fn, encoding, timezone) + return common.check_timestamps(vals) diff --git a/src/yadg/extractors/drycal/txt.py b/src/yadg/extractors/drycal/txt.py new file mode 100644 index 00000000..5f60f8c4 --- /dev/null +++ b/src/yadg/extractors/drycal/txt.py @@ -0,0 +1,57 @@ +""" +Handles the reading and processing of volumetric flow meter data exported from the +MesaLabs DryCal software as a txt file. + +.. note:: + + The date information is missing in the timestamps of the exported files and has to + be supplied externally. + +Usage +````` +Available since ``yadg-4.0``. + +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.Drycal_txt + +Schema +`````` +.. code-block:: yaml + + xarray.Dataset: + coords: + uts: !!float # Unix timestamp, without date + data_vars: + DryCal: (uts) # Standardised flow rate + DryCal Avg.: (uts) # Running average of the flow rate + Temp.: (uts) # Measured flow temperature + Pressure: (uts) # Measured flow pressure + +Metadata +```````` +The following metadata is extracted: + + - ``product``: Model name of the MesaLabs device. + - ``serial number``: Serial number of the MesaLabs device. + +Uncertainties +````````````` +All uncertainties are derived from the string representation of the floats. + +.. codeauthor:: + Peter Kraus + +""" + +from xarray import Dataset +from yadg.extractors.drycal import common + + +def extract( + *, + fn: str, + encoding: str, + timezone: str, + **kwargs: dict, +) -> Dataset: + vals = common.sep(fn, "\t", encoding, timezone) + return common.check_timestamps(vals) diff --git a/src/yadg/extractors/custom/basic/__init__.py b/src/yadg/extractors/eclab/__init__.py similarity index 100% rename from src/yadg/extractors/custom/basic/__init__.py rename to src/yadg/extractors/eclab/__init__.py diff --git a/src/yadg/extractors/custom/empalc/__init__.py b/src/yadg/extractors/eclab/common/__init__.py similarity index 100% rename from src/yadg/extractors/custom/empalc/__init__.py rename to src/yadg/extractors/eclab/common/__init__.py diff --git a/src/yadg/parsers/electrochem/eclabcommon/mpr_columns.py b/src/yadg/extractors/eclab/common/mpr_columns.py similarity index 100% rename from src/yadg/parsers/electrochem/eclabcommon/mpr_columns.py rename to src/yadg/extractors/eclab/common/mpr_columns.py diff --git a/src/yadg/parsers/electrochem/eclabcommon/mpt_columns.py b/src/yadg/extractors/eclab/common/mpt_columns.py similarity index 100% rename from src/yadg/parsers/electrochem/eclabcommon/mpt_columns.py rename to src/yadg/extractors/eclab/common/mpt_columns.py diff --git a/src/yadg/parsers/electrochem/eclabcommon/techniques.py b/src/yadg/extractors/eclab/common/techniques.py similarity index 100% rename from src/yadg/parsers/electrochem/eclabcommon/techniques.py rename to src/yadg/extractors/eclab/common/techniques.py diff --git a/src/yadg/parsers/electrochem/eclabmpr.py b/src/yadg/extractors/eclab/mpr.py similarity index 89% rename from src/yadg/parsers/electrochem/eclabmpr.py rename to src/yadg/extractors/eclab/mpr.py index cd48c4d3..7a2efd47 100644 --- a/src/yadg/parsers/electrochem/eclabmpr.py +++ b/src/yadg/extractors/eclab/mpr.py @@ -1,7 +1,40 @@ """ -**eclabmpr**: Processing of BioLogic's EC-Lab binary modular files. 
-------------------------------------------------------------------- +For processing of BioLogic's EC-Lab binary modular files. + +Usage +````` +Available since ``yadg-4.0``. + +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.EClab_mpr + +Schema +`````` +The ``mpr`` files contain many columns that vary depending on the electrochemical +technique used. Below is shown a list of columns that can be expected to be present +in a typical ``mpr`` file. + +.. code-block:: yaml + + xarray.Dataset: + coords: + uts: !!float # Unix timestamp, without date + data_vars: + Ewe (uts) # Potential of the working electrode + Ece (uts) # Potential of the counter electrode, if present + I (uts) # Instantaneous current + time (uts) # Time elapsed since the start of the experiment + <Ewe> (uts) # Average Ewe potential since last data point + <Ece> (uts) # Average Ece potential since last data point + <I> (uts) # Average current since last data point + ... + +.. note:: + + Note that in most cases, either the instantaneous or the averaged quantities are + stored - only rarely are both available! + +Notes on file structure +``````````````````````` ``.mpr`` files are structured in a set of "modules", one concerning settings, one for actual data, one for logs, and an optional loops module. The parameter sequences can be found in the settings module. @@ -10,8 +43,6 @@ Kerr `_, and builds on the work done by the previous civilian service member working on the project, Jonas Krieger. -.. _yadg.parsers.electrochem.eclabmpr.techniques: - These are the implemented techniques for which the technique parameter sequences can be parsed: @@ -41,14 +72,6 @@ | ZIR | IR compensation (PEIS) | +------+-------------------------------------------------+ -.. note:: - - ``.mpt`` files can contain more data than the corresponding binary - ``.mpr`` file. - -File Structure of ``.mpr`` Files -```````````````````````````````` - At a top level, ``.mpr`` files are made up of a number of modules, separated by the ``MODULE`` keyword. In all the files I have seen, the first module is the settings module, followed by the data module, the @@ -192,20 +215,21 @@ of any external sensors plugged into the device), the ``log`` is usually not present and therefore the full timestamp cannot be calculated. -.. codeauthor:: Nicolas Vetsch +.. codeauthor:: + Nicolas Vetsch + """ import logging -import xarray as xr +from xarray import Dataset import numpy as np -from yadg.dgutils.dateutils import ole_to_uts -from yadg.dgutils.btools import read_value -from .eclabcommon.techniques import ( +from yadg import dgutils +from .common.techniques import ( technique_params_dtypes, param_from_key, get_resolution, ) -from .eclabcommon.mpr_columns import ( +from .common.mpr_columns import ( module_header_dtype, settings_dtypes, flag_columns, @@ -213,7 +237,6 @@ log_dtypes, extdev_dtypes, ) -from yadg.parsers.basiccsv.main import append_dicts, dicts_to_dataset logger = logging.getLogger(__name__) @@ -237,14 +260,14 @@ def process_settings(data: bytes) -> tuple[dict, list]: technique, params_dtypes = technique_params_dtypes[data[0x0000]] settings["technique"] = technique for offset, (dtype, name) in settings_dtypes.items(): - settings[name] = read_value(data, offset, dtype) + settings[name] = dgutils.read_value(data, offset, dtype) # Then determine the technique parameters. The parameters' offset # changes depending on the technique present and apparently on some # other factor that is unclear to me.
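+    # NB: the three candidate offsets probed below were found empirically; the
+    # first offset that yields a plausible parameter block wins, otherwise
+    # params_offset stays None and a NotImplementedError is raised below.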
params_offset = None for offset in (0x0572, 0x1845, 0x1846): logger.debug("Trying to find the technique parameters at 0x%x.", offset) - n_params = read_value(data, offset + 0x0002, " tuple[dict, list]: if params_offset is None: raise NotImplementedError("Unknown parameter offset or technique dtype.") logger.debug("Reading number of parameter sequences at 0x%x.", params_offset) - ns = read_value(data, params_offset, " dict: """ log = {} for offset, (dtype, name) in log_dtypes.items(): - log[name] = read_value(data, offset, dtype) + log[name] = dgutils.read_value(data, offset, dtype) return log @@ -450,7 +473,7 @@ def process_loop(data: bytes) -> dict: """ - n_indexes = read_value(data, 0x0000, " dict: """ ext = {} for offset, (dtype, name) in extdev_dtypes.items(): - ext[name] = read_value(data, offset, dtype) + ext[name] = dgutils.read_value(data, offset, dtype) return ext @@ -494,7 +517,7 @@ def process_modules(contents: bytes) -> tuple[dict, list, list, dict, dict]: modules = contents.split(b"MODULE")[1:] settings = log = loop = ext = None for module in modules: - header = read_value(module, 0x0000, module_header_dtype) + header = dgutils.read_value(module, 0x0000, module_header_dtype) name = header["short_name"].strip() logger.debug("Read '%s' module.", name) module_data = module[module_header_dtype.itemsize :] @@ -550,31 +573,12 @@ def process_modules(contents: bytes) -> tuple[dict, list, list, dict, dict]: return settings, params, ds, log, loop -def process( +def extract( *, fn: str, timezone: str, **kwargs: dict, -) -> xr.Dataset: - """Processes EC-Lab raw data binary files. - - Parameters - ---------- - fn - The file containing the data to parse. - - encoding - Encoding of ``fn``, by default "windows-1252". - - timezone - A string description of the timezone. Default is "localtime". - - Returns - ------- - :class:`xarray.Dataset` - The full date is specified only if the "LOG" module is present. - - """ +) -> Dataset: file_magic = b"BIO-LOGIC MODULAR FILE\x1a \x00\x00\x00\x00" with open(fn, "rb") as mpr_file: assert mpr_file.read(len(file_magic)) == file_magic, "invalid file magic" @@ -591,7 +595,7 @@ def process( fulldate = False else: metadata["log"] = log - start_time = ole_to_uts(log["ole_timestamp"], timezone=timezone) + start_time = dgutils.ole_to_uts(log["ole_timestamp"], timezone=timezone) fulldate = True if "time" in ds: ds["uts"] = ds["time"] + start_time diff --git a/src/yadg/parsers/electrochem/eclabmpt.py b/src/yadg/extractors/eclab/mpt.py similarity index 83% rename from src/yadg/parsers/electrochem/eclabmpt.py rename to src/yadg/extractors/eclab/mpt.py index 8fff9879..46088b0b 100644 --- a/src/yadg/parsers/electrochem/eclabmpt.py +++ b/src/yadg/extractors/eclab/mpt.py @@ -1,17 +1,40 @@ """ -**eclabmpt**: Processing of BioLogic's EC-Lab ASCII export files. ----------------------------------------------------------------- - -``.mpt`` files are made up of a header portion (with the technique -parameter sequences and an optional loops section) and a tab-separated -data table. - -A list of techniques supported by this parser is shown in `the techniques table -`_. - -File Structure of ``.mpt`` Files -```````````````````````````````` - +For processing of BioLogic's EC-Lab ASCII export files. + +Usage +````` +Available since ``yadg-4.0``. + +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.EClab_mpt + +Schema +`````` +The ``.mpt`` files contain many columns that vary depending on the electrochemical +technique used. 
Below is shown a list of columns that can be expected to be present +in a typical ``.mpt`` file. + +.. code-block:: yaml + + xarray.Dataset: + coords: + uts: !!float # Unix timestamp, without date + data_vars: + Ewe (uts) # Potential of the working electrode + Ece (uts) # Potential of the counter electrode, if present + I (uts) # Instantaneous current + time (uts) # Time elapsed since the start of the experiment + <Ewe> (uts) # Average Ewe potential since last data point + <Ece> (uts) # Average Ece potential since last data point + <I> (uts) # Average current since last data point + ... + +.. note:: + + Note that in most cases, either the instantaneous or the averaged quantities are + stored - only rarely are both available! + +Notes on file structure +``````````````````````` These human-readable files are sectioned into headerlines and datalines. The header part of the ``.mpt`` files is made up of information that can be found in the settings, log and loop modules of the binary ``.mpr`` file. @@ -19,7 +42,6 @@ If no header is present, the timestamps will instead be calculated from the file's ``mtime()``. - Metadata ```````` The metadata will contain the information from the header of the file. @@ -29,17 +51,18 @@ The mapping between metadata parameters between ``.mpr`` and ``.mpt`` files is not yet complete. -.. codeauthor:: Nicolas Vetsch +.. codeauthor:: + Nicolas Vetsch + """ import re import logging import locale as lc -import xarray as xr -from ...dgutils.dateutils import str_to_uts -from .eclabcommon.techniques import get_resolution, technique_params, param_from_key -from .eclabcommon.mpt_columns import column_units -from yadg.parsers.basiccsv.main import append_dicts, dicts_to_dataset +from xarray import Dataset +from yadg import dgutils +from .common.techniques import get_resolution, technique_params, param_from_key +from .common.mpt_columns import column_units logger = logging.getLogger(__name__) @@ -90,7 +113,7 @@ def process_header(lines: list[str], timezone: str) -> tuple[dict, list, dict]: timestamp_match = timestamp_re.search("\n".join(settings_lines)) timestamp = timestamp_match["val"] for format in ("%m/%d/%Y %H:%M:%S", "%m.%d.%Y %H:%M:%S", "%m/%d/%Y %H:%M:%S.%f"): - uts = str_to_uts( + uts = dgutils.str_to_uts( timestamp=timestamp, format=format, timezone=timezone, strict=False ) if uts is not None: @@ -192,20 +215,20 @@ def process_data( assert isinstance(val, float), "`n` should not be string" devs[col] = get_resolution(col, val, unit, Erange, Irange) - append_dicts(vals, devs, allvals, allmeta, li=li) + dgutils.append_dicts(vals, devs, allvals, allmeta, li=li) - ds = dicts_to_dataset(allvals, allmeta, units, fulldate=False) + ds = dgutils.dicts_to_dataset(allvals, allmeta, units, fulldate=False) return ds -def process( +def extract( *, fn: str, encoding: str, locale: str, timezone: str, **kwargs: dict, -) -> xr.Dataset: +) -> Dataset: """Processes EC-Lab human-readable text export files. Parameters diff --git a/src/yadg/extractors/empalc/__init__.py b/src/yadg/extractors/empalc/__init__.py new file mode 100644 index 00000000..53c2e88f --- /dev/null +++ b/src/yadg/extractors/empalc/__init__.py @@ -0,0 +1,5 @@ +""" +A custom extractor for processing liquid chromatography data exported from the Agilent +Online LC in the Materials for Energy Conversion lab at Empa. 
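+
+A minimal usage sketch (the ``extract`` functions are defined in the ``csv`` and
+``xlsx`` submodules added below; the file name here is illustrative):
+
+.. code-block:: python
+
+    from yadg.extractors.empalc.csv import extract
+
+    # Returns an xarray.Dataset indexed by (uts, species), with peak height,
+    # area, concentration, and retention time as data_vars.
+    ds = extract(fn="sequence.csv", encoding="utf-8")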
+ +""" diff --git a/src/yadg/parsers/chromdata/empalccsv.py b/src/yadg/extractors/empalc/csv.py similarity index 85% rename from src/yadg/parsers/chromdata/empalccsv.py rename to src/yadg/extractors/empalc/csv.py index 2a587931..db349f0d 100644 --- a/src/yadg/parsers/chromdata/empalccsv.py +++ b/src/yadg/extractors/empalc/csv.py @@ -1,15 +1,44 @@ """ -**empalccsv**: Processing Empa's online LC exported data (csv) --------------------------------------------------------------- - -This is a structured format produced by the export from Agilent's Online LC device -at Empa. It contains three sections: +Handles processing of the csv version of the structured format produced by Agilent's +Online LC device at Empa. It contains three sections: - metadata section, - table containing sampling information, - table containing analysed chromatography data. -.. codeauthor:: Peter Kraus +Usage +````` +Available since ``yadg-4.2``. + +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.EmpaLC_csv + +Schema +`````` +.. code-block:: yaml + + xarray.Dataset: + coords: + uts: !!float # Unix timestamp + species: !!str # Species name + data_vars: + height: (uts, species) # Peak height + area: (uts, species) # Integrated peak area + concentration: (uts, species) # Peak area with calibration applied + retention time: (uts, species) # Position of peak maximum + +Metadata +```````` +The following metadata is extracted: + + - ``sequence``: Sample / sequence name. + - ``description``: A free-form description of the experiment. + - ``username``: User name used to generate the file. + - ``datafile``: Original path of the result file. + - ``version``: Version of the export function used to generate the file. + +.. codeauthor:: + Peter Kraus + """ import logging @@ -17,30 +46,18 @@ from uncertainties.core import str_to_number_with_uncert as tuple_fromstr import xarray as xr import numpy as np +from xarray import Dataset -logger = logging.getLogger(__name__) - - -def process(*, fn: str, encoding: str, **kwargs: dict) -> xr.Dataset: - """ - Custom Agilent Online LC csv export format. - - Multiple chromatograms per file, with multiple detectors. - Parameters - ---------- - fn - Filename to process. - - encoding - Encoding used to open the file. - - Returns - ------- - :class:`xarray.Dataset` +logger = logging.getLogger(__name__) - """ +def extract( + *, + fn: str, + encoding: str, + **kwargs: dict, +) -> Dataset: with open(fn, "r", encoding=encoding, errors="ignore") as infile: lines = infile.readlines() diff --git a/src/yadg/parsers/chromdata/empalcxlsx.py b/src/yadg/extractors/empalc/xlsx.py similarity index 85% rename from src/yadg/parsers/chromdata/empalcxlsx.py rename to src/yadg/extractors/empalc/xlsx.py index fdd80855..93182eb4 100644 --- a/src/yadg/parsers/chromdata/empalcxlsx.py +++ b/src/yadg/extractors/empalc/xlsx.py @@ -1,16 +1,44 @@ """ -**empalcxlsx**: Processing Empa's online LC exported data (xlsx) ----------------------------------------------------------------- - -This is a structured format produced by the export from Agilent's Online LC device -at Empa. It contains three sections: +Handles processing of the xlsx version of the structured format produced by Agilent's +Online LC device at Empa. It contains three sections: - metadata section, - table containing sampling information, - table containing analysed chromatography data. +Usage +````` +Available since ``yadg-4.2``. + +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.EmpaLC_xlsx + +Schema +`````` +.. 
code-block:: yaml + + xarray.Dataset: + coords: + uts: !!float # Unix timestamp + species: !!str # Species name + data_vars: + height: (uts, species) # Peak height + area: (uts, species) # Integrated peak area + concentration: (uts, species) # Peak area with calibration applied + retention time: (uts, species) # Position of peak maximum + +Metadata +```````` +The following metadata is extracted: + + - ``sequence``: Sample / sequence name. + - ``description``: A free-form description of the experiment. + - ``username``: User name used to generate the file. + - ``datafile``: Original path of the result file. + - ``version``: Version of the export function used to generate the file. + +.. codeauthor:: + Peter Kraus -.. codeauthor:: Peter Kraus """ import logging @@ -18,27 +46,17 @@ import openpyxl from uncertainties.core import str_to_number_with_uncert as tuple_fromstr import xarray as xr +from xarray import Dataset import numpy as np logger = logging.getLogger(__name__) -def process(*, fn: str, **kwargs: dict) -> xr.Dataset: - """ - Fusion xlsx export format. - - Multiple chromatograms per file, with multiple detectors. - - Parameters - ---------- - fn - Filename to process. - - Returns - ------- - :class:`xarray.Dataset` - - """ +def extract( + *, + fn: str, + **kwargs: dict, +) -> Dataset: try: wb = openpyxl.load_workbook( filename=fn, diff --git a/src/yadg/extractors/example/__init__.py b/src/yadg/extractors/example/__init__.py new file mode 100644 index 00000000..d959417a --- /dev/null +++ b/src/yadg/extractors/example/__init__.py @@ -0,0 +1,50 @@ +""" +This is an example extractor, used mainly for testing of the :mod:`yadg` package. +It provides no real functionality. + +Usage +````` +Available since ``yadg-4.0``. + +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.Example + + +Schema +`````` +The output schema is only defined for the ``tomato.json`` filetype. + +.. code-block:: yaml + + xr.Dataset: + coords: + uts: !!float # The current timestamp + data_vars: + {{ param_keys }} (None) # All parameter key/value pairs + +Metadata +```````` +No metadata is returned. + +.. codeauthor:: + Peter Kraus + +""" + +from pydantic import BaseModel +from yadg import dgutils +from datatree import DataTree + + +def extract( + *, + fn: str, + parameters: BaseModel, + **kwargs: dict, +) -> DataTree: + kwargs = {} if parameters is None else parameters.dict() + if "parser" in kwargs: + del kwargs["parser"] + data_vals = {k: [v] for k, v in kwargs.items()} + data_vals["uts"] = [dgutils.now()] + meta_vals = {} + return dgutils.dicts_to_dataset(data_vals, meta_vals, fulldate=False) diff --git a/src/yadg/extractors/ezchrom/__init__.py b/src/yadg/extractors/ezchrom/__init__.py new file mode 100644 index 00000000..fba80afe --- /dev/null +++ b/src/yadg/extractors/ezchrom/__init__.py @@ -0,0 +1,4 @@ +""" +Extractors for data files generated by Agilent's EZChrom software. + +""" diff --git a/src/yadg/parsers/chromtrace/ezchromasc.py b/src/yadg/extractors/ezchrom/asc.py similarity index 76% rename from src/yadg/parsers/chromtrace/ezchromasc.py rename to src/yadg/extractors/ezchrom/asc.py index 72d102d2..7623984e 100644 --- a/src/yadg/parsers/chromtrace/ezchromasc.py +++ b/src/yadg/extractors/ezchrom/asc.py @@ -1,53 +1,67 @@ """ -**ezchromasc**: Processing EZ-Chrom ASCII export files (dat.asc). ------------------------------------------------------------------ +Handles files created using the ASCII export function in the EZChrom software. 
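+
+A minimal usage sketch (the ``extract`` function is defined below; the file name,
+encoding, and timezone values are illustrative):
+
+.. code-block:: python
+
+    from yadg.extractors.ezchrom.asc import extract
+
+    # One chromatogram per file; returns a DataTree with one Dataset per detector.
+    dt = extract(fn="sample.dat.asc", encoding="utf-8", timezone="Europe/Berlin")
+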
+This file format includes one timestep with multiple traces for each ASCII file. It +contains a header section, and a sequence of Y datapoints (``signal``) for each +detector. The X-axis (``elution_time``) is assumed to be uniform between traces, and +its units have to be deduced from the header. -This file format includes one timestep with multiple traces in each ASCII file. It -contains a header section, and a sequence of Y datapoints (``signal``) for each detector. -The X-axis (``elution_time``) is assumed to be uniform between traces, and its units have -to be deduced from the header. +Usage +````` +Available since ``yadg-4.0``. -.. codeauthor:: Peter Kraus -""" +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.EZChrom_asc -import numpy as np -import logging -from uncertainties.core import str_to_number_with_uncert as tuple_fromstr -from yadg.dgutils.dateutils import str_to_uts -import xarray as xr -from datatree import DataTree +Schema +`````` +.. code-block:: yaml -logger = logging.getLogger(__name__) + datatree.DataTree: + {{ detector_index }}: + coords: + uts: !!float # Unix timestamp + elution_time: !!float # Elution time + data_vars: + signal: (uts, elution_time) # Signal data + +Metadata +```````` +The following metadata is extracted: + - ``sampleid``: Sample name. + - ``username``: User name used to generate the file. + - ``method``: Name of the chromatographic method. + - ``version``: Version of the CH file (only "179" is currently supported.) -def process(*, fn: str, encoding: str, timezone: str, **kwargs: dict) -> DataTree: - """ - EZ-Chrome ASCII export file parser. +Uncertainties +````````````` +The uncertainties in ``signal`` are derived from the string representation of the float. - One chromatogram per file with multiple traces. A header section is followed by - y-values for each trace. x-values have to be deduced using number of points, - frequency, and x-multiplier. Method name is available, but detector names are not. - They are assigned their numerical index in the file. +For ``elution_time``, an uncertainty of one X-axis multiplier is used. - Parameters - ---------- - fn - Filename to process. - encoding - Encoding used to open the file. +.. codeauthor:: + Peter Kraus - timezone - Timezone information. This should be ``"localtime"``. +""" + +import numpy as np +import logging +from uncertainties.core import str_to_number_with_uncert as tuple_fromstr +import xarray as xr +from datatree import DataTree +from yadg import dgutils - Returns - ------- - class:`datatree.DataTree` - A :class:`datatree.DataTree` containing one :class:`xarray.Dataset` per detector. 
+logger = logging.getLogger(__name__) - """ +def extract( + *, + fn: str, + encoding: str, + timezone: str, + **kwargs: dict, +) -> DataTree: with open(fn, "r", encoding=encoding, errors="ignore") as infile: lines = infile.readlines() metadata = {} @@ -63,7 +77,7 @@ def process(*, fn: str, encoding: str, timezone: str, **kwargs: dict) -> DataTre k = key.lower().replace(" ", "") metadata[k] = line.split(f"{key}:")[1].strip() if line.startswith("Acquisition Date and Time:"): - uts = str_to_uts( + uts = dgutils.str_to_uts( timestamp=line.split("Time:")[1].strip(), format="%m/%d/%Y %I:%M:%S %p", timezone=timezone, diff --git a/src/yadg/extractors/fhimcpt/__init__.py b/src/yadg/extractors/fhimcpt/__init__.py new file mode 100644 index 00000000..50fe0ed1 --- /dev/null +++ b/src/yadg/extractors/fhimcpt/__init__.py @@ -0,0 +1,5 @@ +""" +A set of custom extractors for processing files generated by the MCPT instrument at FHI, +now in the Risse group at FU Berlin. + +""" diff --git a/src/yadg/extractors/fhimcpt/csv.py b/src/yadg/extractors/fhimcpt/csv.py new file mode 100644 index 00000000..f61c66d1 --- /dev/null +++ b/src/yadg/extractors/fhimcpt/csv.py @@ -0,0 +1,99 @@ +""" +This parser handles the reading and processing of the legacy log files created by +the LabView interface for the MCPT instrument at FHI, now FU Berlin. These files contain +information about the timestamp, temperatures, and inlet / process flows. + +Usage +````` +Available since ``yadg-3.0``. Deprecated since ``yadg-4.0``. + +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.FHI_csv + +Schema +`````` +.. code-block:: yaml + + xarray.Dataset: + coords: + uts: !!float # Unix timestamp + data_vars: + T_f: (uts) # Flow temperature + T_fs: (uts) # Flow temperature setpoint + T_fo: (uts) # Flow heater duty cycle + T_c: (uts) # Cavity temperature + T_cs: (uts) # Cavity temperature setpoint + T_co: (uts) # Cavity cooling duty cycle + T_cal: (uts) # Calibration thermocouple temperature + N2: (uts) # N2 flow + O2: (uts) # N2 flow + alkane: (uts) # alkane flow + CO_CO2: (uts) # CO or CO2 flow + saturator: (uts) # saturator flow + pressure: (uts) # Reactor flow meter back-pressure + flow low: (uts) # Reactor mix high-flow MFC + flow high: (uts) # Reactor mix low-flow MFC + cavity flush: (uts) # Cavity N2 flow + heater flow: (uts) # Heater flow + +Metadata +```````` +No metadata is returned. + +.. 
codeauthor:: + Peter Kraus + +""" + +import logging +from pydantic import BaseModel +from yadg.extractors.basic.csv import process_row +from yadg import dgutils +from xarray import Dataset + +logger = logging.getLogger(__name__) + + +def extract( + *, + fn: str, + encoding: str, + timezone: str, + parameters: BaseModel, + **kwargs: dict, +) -> Dataset: + + with open(fn, "r", encoding=encoding) as infile: + lines = [i.strip() for i in infile.readlines()] + + headers = [i.strip() for i in lines.pop(0).split(";")] + + for hi, header in enumerate(headers): + if "/" in header: + logger.warning("Replacing '/' with '_' in header '%s'.", header) + headers[hi] = header.replace("/", "_") + + _units = [i.strip() for i in lines.pop(0).split(";")] + units = {} + for h in headers: + units[h] = _units.pop(0) + + units = dgutils.sanitize_units(units) + + datecolumns, datefunc, fulldate = dgutils.infer_timestamp_from( + spec=parameters.timestamp, + timezone=timezone, + ) + + # Process rows + data_vals = {} + meta_vals = {"_fn": []} + for li, line in enumerate(lines): + vals, devs = process_row( + headers, + line.split(";"), + datefunc, + datecolumns, + ) + dgutils.append_dicts(vals, devs, data_vals, meta_vals, fn, li) + + return dgutils.dicts_to_dataset(data_vals, meta_vals, units, fulldate) diff --git a/src/yadg/parsers/qftrace/labviewcsv.py b/src/yadg/extractors/fhimcpt/vna.py similarity index 74% rename from src/yadg/parsers/qftrace/labviewcsv.py rename to src/yadg/extractors/fhimcpt/vna.py index a3cf1098..ade25ee8 100644 --- a/src/yadg/parsers/qftrace/labviewcsv.py +++ b/src/yadg/extractors/fhimcpt/vna.py @@ -1,49 +1,54 @@ """ -**labviewcsv**: Processing Agilent LabVIEW CSV files ---------------------------------------------------- - Used to process files generated using Agilent PNA-L N5320C via its LabVIEW driver. This file format includes a header, with the values of bandwidth and averaging, and three tab-separated columns containing the frequency :math:`f`, and the real and imaginary parts of the complex reflection coefficient :math:`\\Gamma(f)`. -Timestamps are determined from file name. One trace per file. As the set-up for -which this format was designed always uses the ``S11`` port, the name of the trace -is hard-coded to this value. +Note that no timestamps are present in the file and have to be supplied externally, +e.g. from the file name. One trace per file. As the MCPT set-up for which this +extractor was designed always uses the ``S11`` port, the node name is hard-coded to +this value. + +Usage +````` +Available since ``yadg-3.0``. + +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.FHI_vna + +Schema +`````` +.. code-block:: yaml + + datatree.DataTree: + S11: !!xarray.Dataset + coords: + freq: !!float # An array of measurement frequencies + data_vars: + Re(G): (freq) # Real part of Γ + Im(G): (freq) # Imaginary part of Γ + average: (None) # Number of traces averaged + bandwidth: (None) # Filter bandwidth + +Metadata +```````` +No metadata is returned. + +.. codeauthor:: + Peter Kraus + """ from uncertainties.core import str_to_number_with_uncert as tuple_fromstr import xarray as xr -import datatree +from datatree import DataTree -def process( +def extract( *, fn: str, - encoding: str = "utf-8", + encoding: str, **kwargs: dict, -) -> datatree.DataTree: - """ - VNA reflection trace parser for Agilent's LabVIEW driver. - - Parameters - ---------- - fn - File to process - - encoding - Encoding of ``fn``, by default "utf-8". 
- - Returns - ------- - :class:`datatree.DataTree` - A :class:`datatree.DataTree` containing a single :class:`xarray.Dataset` with the - ``S11`` (reflection) trace. - - """ - +) -> DataTree: with open(fn, "r", encoding=encoding) as infile: lines = infile.readlines() assert ( @@ -127,4 +132,4 @@ def process( }, ) - return datatree.DataTree.from_dict(dict(S11=vals)) + return DataTree.from_dict(dict(S11=vals)) diff --git a/src/yadg/extractors/custom/fhimcpt/__init__.py b/src/yadg/extractors/fusion/__init__.py similarity index 100% rename from src/yadg/extractors/custom/fhimcpt/__init__.py rename to src/yadg/extractors/fusion/__init__.py diff --git a/src/yadg/parsers/chromdata/fusioncsv.py b/src/yadg/extractors/fusion/csv.py similarity index 71% rename from src/yadg/parsers/chromdata/fusioncsv.py rename to src/yadg/extractors/fusion/csv.py index 8977e769..1d38cd9b 100644 --- a/src/yadg/parsers/chromdata/fusioncsv.py +++ b/src/yadg/extractors/fusion/csv.py @@ -1,9 +1,6 @@ """ -**fusioncsv**: Processing Inficon Fusion csv export format (csv). ------------------------------------------------------------------- - -This is a tabulated format, including the concentrations, mole fractions, peak -areas, and retention times. The latter is ignored by this parser. +For processing Inficon Fusion csv export format (csv). This is a tabulated format, +including the concentrations, mole fractions, peak areas, and retention times. .. warning:: @@ -11,14 +8,49 @@ is discouraged, and the ``json`` files (or a zipped archive of them) should be parsed instead. -.. codeauthor:: Peter Kraus +Usage +````` +Available since ``yadg-4.0``. + +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.Fusion_csv + +Schema +`````` +.. code-block:: yaml + + xarray.Dataset: + coords: + uts: !!float # Unix timestamp + species: !!str # Species name + data_vars: + area: (uts, species) # Integrated peak area + concentration: (uts, species) # Calibrated peak area + xout: (uts, species) # Mole fraction (normalised conc.) + retention time: (uts, species) # Retention time + +Metadata +```````` +The following metadata is extracted: + + - ``method``: Name of the chromatographic method. + +Uncertainties +````````````` +The uncertainties are derived from the string representation of the floats. + +.. codeauthor:: + Peter Kraus + """ import logging -from yadg.dgutils.dateutils import str_to_uts -from uncertainties.core import str_to_number_with_uncert as tuple_fromstr -import xarray as xr import numpy as np +import xarray as xr +from xarray import Dataset +from uncertainties.core import str_to_number_with_uncert as tuple_fromstr + +from yadg import dgutils + logger = logging.getLogger(__name__) @@ -37,29 +69,13 @@ } -def process(*, fn: str, encoding: str, timezone: str, **kwargs: dict) -> xr.Dataset: - """ - Fusion csv export format. - - Multiple chromatograms per file, with multiple detectors. - - Parameters - ---------- - fn - Filename to process. - - encoding - Encoding used to open the file. - - timezone - Timezone information. This should be ``"localtime"``. 
- - Returns - ------- - :class:`xarray.Dataset` - - """ - +def extract( + *, + fn: str, + encoding: str, + timezone: str, + **kwargs: dict, +) -> Dataset: with open(fn, "r", encoding=encoding, errors="ignore") as infile: lines = infile.readlines() @@ -98,7 +114,9 @@ def process(*, fn: str, encoding: str, timezone: str, **kwargs: dict) -> xr.Data "area": {}, "retention time": {}, "sampleid": items[sni], - "uts": str_to_uts(timestamp=f"{items[0]}{offset}", timezone=timezone), + "uts": dgutils.str_to_uts( + timestamp=f"{items[0]}{offset}", timezone=timezone + ), } for ii, i in enumerate(items[2:]): ii += 2 diff --git a/src/yadg/extractors/fusion/json.py b/src/yadg/extractors/fusion/json.py new file mode 100644 index 00000000..32fac3bc --- /dev/null +++ b/src/yadg/extractors/fusion/json.py @@ -0,0 +1,206 @@ +""" +For processing Inficon Fusion json data. Contains both the data from the raw +chromatogram and the post-processed results. + +Usage +````` +Available since ``yadg-4.0``. + +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.Fusion_json + +Schema +`````` +.. code-block:: yaml + + datatree.DataTree: + coords: + uts: !!float + species: !!str + data_vars: + height: (uts, species) # Peak height at maximum + area: (uts, species) # Integrated peak area + concentration: (uts, species) # Calibrated peak area + xout: (uts, species) # Mole fraction (normalized conc.) + retention time: (uts, species) # Peak retention time + {{ detector_name }}: + coords: + uts: !!float # Unix timestamp + elution_time: !!float # Elution time + data_vars: + signal: (uts, elution_time) # Signal data + valve: (uts) # Valve position + +Metadata +```````` +No metadata is currently extracted. + +.. codeauthor:: + Peter Kraus + +""" + +import json +import logging +from datatree import DataTree +from xarray import Dataset +import xarray as xr +import numpy as np + +from yadg import dgutils + + +logger = logging.getLogger(__name__) + + +def chromdata(jsdata: dict, uts: float) -> Dataset: + metadata = { + "method": jsdata.get("methodName", "n/a"), + "version": jsdata.get("softwareVersion", {}).get("version", None), + "datafile": jsdata.get("sequence", {}).get("location", None), + } + + sampleid = jsdata.get("annotations", {}).get("name", None) + if sampleid is not None: + metadata["sampleid"] = sampleid + + units = { + "height": None, + "area": None, + "concentration": "%", + "xout": "%", + "retention time": "s", + } + + raw = { + "height": {}, + "area": {}, + "concentration": {}, + "xout": {}, + "retention time": {}, + } + + species = set() + + # sort detector keys to ensure alphabetic order for ID matching + for detname in sorted(jsdata["detectors"].keys()): + detdict = jsdata["detectors"][detname] + if "analysis" in detdict: + for peak in detdict["analysis"]["peaks"]: + if "label" not in peak: + continue + else: + species.add(peak["label"]) + if "height" in peak: + raw["height"][peak["label"]] = (float(peak["height"]), 1.0) + if "area" in peak: + raw["area"][peak["label"]] = (float(peak["area"]), 0.01) + if "concentration" in peak: + raw["concentration"][peak["label"]] = ( + float(peak["concentration"]), + float(peak["concentration"]) * 1e-3, + ) + if "normalizedConcentration" in peak: + raw["xout"][peak["label"]] = ( + float(peak["normalizedConcentration"]), + float(peak["normalizedConcentration"]) * 1e-3, + ) + if "top" in peak: + raw["retention time"][peak["label"]] = (float(peak["top"]), 0.01) + + valve = jsdata.get("annotations", {}).get("valcoPosition", None) + if valve is not None: + raw["valve"] 
= valve + + species = sorted(species) + data_vars = {} + for k, v in units.items(): + vals, devs = zip(*[raw[k].get(s, (np.nan, np.nan)) for s in species]) + data_vars[k] = ( + ["uts", "species"], + [vals], + {"ancillary_variables": f"{k}_std_err"}, + ) + data_vars[f"{k}_std_err"] = ( + ["uts", "species"], + [devs], + {"standard_name": f"{k} standard_error"}, + ) + if v is not None: + data_vars[k][2]["units"] = v + data_vars[f"{k}_std_err"][2]["units"] = v + + ds = xr.Dataset( + data_vars=data_vars, + coords={"species": (["species"], species), "uts": (["uts"], [uts])}, + attrs=metadata, + ) + return ds + + +def chromtrace(jsdata: dict, uts: float) -> DataTree: + metadata = { + "method": jsdata.get("methodName", "n/a"), + "sampleid": jsdata.get("annotations", {}).get("name", None), + "version": jsdata.get("softwareVersion", {}).get("version", None), + "datafile": jsdata.get("sequence", {}).get("location", None), + } + + # sort detector keys to ensure alphabetic order for ID matching + traces = sorted(jsdata["detectors"].keys()) + vals = {} + for detname in traces: + detdict = jsdata["detectors"][detname] + fvals = xr.Dataset( + data_vars={ + "signal": ( + ["uts", "elution_time"], + [detdict["values"]], + {"ancillary_variables": "signal_std_err"}, + ), + "signal_std_err": ( + ["uts", "elution_time"], + [np.ones(detdict["nValuesExpected"])], + {"standard_name": "signal standard_error"}, + ), + "elution_time_std_err": ( + ["elution_time"], + np.ones(detdict["nValuesExpected"]) / detdict["nValuesPerSecond"], + {"units": "s", "standard_name": "elution_time standard_error"}, + ), + }, + coords={ + "elution_time": ( + ["elution_time"], + np.arange(detdict["nValuesExpected"]) / detdict["nValuesPerSecond"], + {"units": "s", "ancillary_variables": "elution_time_std_err"}, + ), + "uts": (["uts"], [uts]), + }, + attrs={}, + ) + valve = jsdata.get("annotations", {}).get("valcoPosition", None) + if valve is not None: + fvals["valve"] = valve + vals[detname] = fvals + + dt = DataTree.from_dict(vals) + dt.attrs = metadata + return dt + + +def extract( + *, + fn: str, + encoding: str, + timezone: str, + **kwargs: dict, +) -> DataTree: + with open(fn, "r", encoding=encoding, errors="ignore") as infile: + jsdata = json.load(infile) + uts = dgutils.str_to_uts(timestamp=jsdata["runTimeStamp"], timezone=timezone) + data = chromdata(jsdata, uts) + trace = chromtrace(jsdata, uts) + newdt = DataTree(data) + for k, v in trace.items(): + newdt[k] = v + return newdt diff --git a/src/yadg/extractors/fusion/zip.py b/src/yadg/extractors/fusion/zip.py new file mode 100644 index 00000000..a9965902 --- /dev/null +++ b/src/yadg/extractors/fusion/zip.py @@ -0,0 +1,70 @@ +""" +For processing Inficon Fusion zipped data. This is a wrapper parser which unzips the +provided zip file, and then uses the :mod:`yadg.extractors.fusion.json` extractor +to parse every fusion-data file present in the archive. + +Contains both the data from the raw chromatogram and the post-processed results. + +Usage +````` +Available since ``yadg-4.0``. + +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.Fusion_zip + +Schema +`````` +.. code-block:: yaml + + datatree.DataTree: + coords: + uts: !!float + species: !!str + data_vars: + height: (uts, species) # Peak height at maximum + area: (uts, species) # Integrated peak area + concentration: (uts, species) # Calibrated peak area + xout: (uts, species) # Mole fraction (normalized conc.) 
+ retention time: (uts, species) # Peak retention time + {{ detector_name }}: + coords: + uts: !!float # Unix timestamp + elution_time: !!float # Elution time + data_vars: + signal: (uts, elution_time) # Signal data + valve: (uts) # Valve position + +Metadata +```````` +No metadata is currently extracted. + +.. codeauthor:: + Peter Kraus + +""" + +import zipfile +import tempfile +import os +from datatree import DataTree + +from yadg.extractors.fusion.json import extract as extract_json +from yadg import dgutils + + +def extract( + *, + fn: str, + timezone: str, + encoding: str, + **kwargs: dict, +) -> DataTree: + zf = zipfile.ZipFile(fn) + with tempfile.TemporaryDirectory() as tempdir: + zf.extractall(tempdir) + dt = None + filenames = [ffn for ffn in os.listdir(tempdir) if ffn.endswith("fusion-data")] + for ffn in sorted(filenames): + path = os.path.join(tempdir, ffn) + fdt = extract_json(fn=path, timezone=timezone, encoding=encoding, **kwargs) + dt = dgutils.merge_dicttrees(dt, fdt.to_dict(), "identical") + return DataTree.from_dict(dt) diff --git a/src/yadg/extractors/panalytical/__init__.py b/src/yadg/extractors/panalytical/__init__.py new file mode 100644 index 00000000..99617bf3 --- /dev/null +++ b/src/yadg/extractors/panalytical/__init__.py @@ -0,0 +1,4 @@ +""" +Extractors for various exports of Panalytical X-ray diffractograms. + +""" diff --git a/src/yadg/parsers/xrdtrace/common.py b/src/yadg/extractors/panalytical/common.py similarity index 100% rename from src/yadg/parsers/xrdtrace/common.py rename to src/yadg/extractors/panalytical/common.py diff --git a/src/yadg/parsers/xrdtrace/panalyticalcsv.py b/src/yadg/extractors/panalytical/csv.py similarity index 81% rename from src/yadg/parsers/xrdtrace/panalyticalcsv.py rename to src/yadg/extractors/panalytical/csv.py index a5500635..6c77c53a 100644 --- a/src/yadg/parsers/xrdtrace/panalyticalcsv.py +++ b/src/yadg/extractors/panalytical/csv.py @@ -1,10 +1,30 @@ """ -panalyticalcsv: Processing of PANalytical XRD ``csv`` files ------------------------------------------------------------ +Handles processing of csv exports of Panalytical XRD files. -File Structure -`````````````` +Usage +````` +Available since ``yadg-4.2``. +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.Panalytical_csv + +Schema +`````` +.. code-block:: yaml + + xarray.Dataset: + coords: + uts: !!float # Unix timestamp + angle: !!float # 2θ angle + data_vars: + intensity: (uts, angle) # Measured intensity + +Metadata +```````` +With the exception of the ``comment``, the metadata present in the csv file is extracted +from the file header without post-processing. + +Notes on file structure +``````````````````````` These files are split into a ``[Measurement conditions]`` and a ``[Scan points]`` section. The former stores the metadata and the latter all the datapoints. @@ -19,13 +39,13 @@ Peter Kraus """ -from ...dgutils import dateutils -from .common import panalytical_comment, snake_case from uncertainties.core import str_to_number_with_uncert as tuple_fromstr +from xarray import Dataset import xarray as xr import numpy as np -# Converting camelCase xrdml keys to snake_case. 
+ +from yadg.dgutils import dateutils +from yadg.extractors.panalytical.common import panalytical_comment, snake_case def _process_comments(comments: list[str]) -> dict: @@ -90,35 +110,13 @@ def _process_data(data: str) -> tuple[list, list]: return avals, adevs, ivals, idevs -def process( +def extract( *, fn: str, - encoding: str = "utf-8", - timezone: str = "UTC", + encoding: str, + timezone: str, **kwargs: dict, -) -> xr.Dataset: - """ - Processes a PANalytical XRD csv file. All information contained in the header - of the csv file is stored in the metadata. - - Parameters - ---------- - fn - The file containing the trace(s) to parse. - - encoding - Encoding of ``fn``, by default "utf-8". - - timezone - A string description of the timezone. Default is "UTC". - - Returns - ------- - :class:`xarray.Dataset` - Data containing the timesteps and metadata. This filetype contains the full - date specification. - - """ +) -> Dataset: with open(fn, "r", encoding=encoding) as csv_file: csv = csv_file.read() # Split file into its sections. diff --git a/src/yadg/parsers/xrdtrace/panalyticalxrdml.py b/src/yadg/extractors/panalytical/xrdml.py similarity index 84% rename from src/yadg/parsers/xrdtrace/panalyticalxrdml.py rename to src/yadg/extractors/panalytical/xrdml.py index 5922cc4d..75948757 100644 --- a/src/yadg/parsers/xrdtrace/panalyticalxrdml.py +++ b/src/yadg/extractors/panalytical/xrdml.py @@ -1,29 +1,52 @@ """ -panalyticalxrdml: Processing of PANalytical XRD ``xml`` files -------------------------------------------------------------- +Handles processing of Panalytical XRDML files. -File Structure -`````````````` +Usage +````` +Available since ``yadg-4.2``. -These are xml-formatted files, which we here parse using the :mod:`xml.etree` -library into a Python :class:`dict`. +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.Panalytical_xrdml + +Schema +`````` +.. code-block:: yaml + + xarray.Dataset: + coords: + uts: !!float # Unix timestamp + angle: !!float # 2θ angle + data_vars: + intensity: (uts, angle) # Measured intensity + +Metadata +```````` +The following metadata is extracted: + + - ``sample``: Metadata information about the sample. + - ``wavelength``: Measurement wavelength. + - ``comment``: A free-form description of the experiment. + - ``incident_beam_path`` + - ``diffracted_beam_path`` + - ``counting_time`` .. note:: - The ``angle`` returned from this parser is based on a linear interpolation of - the start and end point of the scan, and is the :math:`2\\theta`. The values - of :math:`\\omega` are discarded. + The returned metadata contain only a subset of the available metadata in the XML + file. If something important is missing, please contact us! -.. warning:: +Notes on file structure +``````````````````````` +These are xml-formatted files, which we here parse using the :mod:`xml.etree` +library into a Python :class:`dict`. - This parser is fairly new and untested. As a result, the returned metadata - contain only a subset of the available metadata in the XML file. If something - important is missing, please contact us! +The ``angle`` returned from this parser is based on a linear interpolation of the start +and end point of the scan, and is the :math:`2\\theta`. The values of :math:`\\omega` +are discarded. Uncertainties ````````````` -The uncertainties of in ``"angle"`` are taken as the step-width of -the linearly spaced :math:`2\\theta` values. +The uncertainties in ``"angle"`` are taken as the step-width of the linearly spaced +:math:`2\\theta` values. 
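+
+For example (illustrative numbers): a scan recorded from 10° to 80° in 3501 points has
+a step width of (80 - 10) / 3500 = 0.02°, which is then used as the uncertainty of
+every ``angle`` value.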
The uncertainties of ``"intensity"`` are currently set to a constant value of 1.0 count as all the supported files seem to produce integer values. @@ -31,17 +54,19 @@ .. codeauthor:: Nicolas Vetsch, Peter Kraus + """ from collections import defaultdict from typing import Union from xml.etree import ElementTree import numpy as np +from xarray import Dataset import xarray as xr - from uncertainties.core import str_to_number_with_uncert as tuple_fromstr -from .common import panalytical_comment -from ...dgutils import dateutils + +from yadg.extractors.panalytical.common import panalytical_comment +from yadg.dgutils import dateutils def etree_to_dict(e: ElementTree.Element) -> dict: @@ -51,17 +76,6 @@ def etree_to_dict(e: ElementTree.Element) -> dict: Element text is stored into `"#text"` for all nodes. From https://stackoverflow.com/a/10076823. - - Parameters - ---------- - e - The ElementTree root Element. - - Returns - ------- - dict - ElementTree parsed into a dictionary. - """ d = {e.tag: {} if e.attrib else None} children = list(e) @@ -96,7 +110,6 @@ def _process_values(d: Union[dict, str]) -> Union[dict, str]: .. code:: {"key": f"{#text} {@unit}", ...} - """ # TODO # If not "#text" or @tribute just snake_case and recurse. @@ -115,7 +128,6 @@ def _process_scan(scan: dict) -> dict: """ Parses the scan section of the file. Creates the explicit positions based on the number of measured intensities and the start & end position. - """ header = scan.pop("header") dpts = scan.pop("dataPoints") @@ -148,7 +160,6 @@ def _process_scan(scan: dict) -> dict: def _process_comment(comment: dict) -> dict: - """ """ entry = comment.pop("entry") ret = {} for line in entry: @@ -186,29 +197,12 @@ def _process_measurement(measurement: dict, timezone: str): return trace, meta -def process( +def extract( *, fn: str, timezone: str, **kwargs: dict, -) -> xr.Dataset: - """Processes a PANalytical xrdml file. - - Parameters - ---------- - fn - The file containing the trace(s) to parse. - - timezone - A string description of the timezone. Default is "UTC". - - Returns - ------- - :class:`xarray.Dataset` - Data containing the timesteps, and metadata. This filetype contains the full - date specification. - - """ +) -> Dataset: it = ElementTree.iterparse(fn) # Removing xmlns prefixes from all tags. # From https://stackoverflow.com/a/25920989. diff --git a/src/yadg/parsers/xrdtrace/panalyticalxy.py b/src/yadg/extractors/panalytical/xy.py similarity index 63% rename from src/yadg/parsers/xrdtrace/panalyticalxy.py rename to src/yadg/extractors/panalytical/xy.py index f5a0c9c9..5d463266 100644 --- a/src/yadg/parsers/xrdtrace/panalyticalxy.py +++ b/src/yadg/extractors/panalytical/xy.py @@ -1,19 +1,33 @@ """ -panalyticalxy: Processing of PANalytical XRD ``xy`` files ---------------------------------------------------------- +Handles processing of xy exports of Panalytical XRD files. When possible, the xrdml or +csv files should be used instead. -File Structure -`````````````` +Usage +````` +Available since ``yadg-4.2``. -These files basically just contain the ``[Scan points]`` part of -:mod:`~yadg.parsers.xrdtrace.panalyticalcsv` files. As a consequence, no metadata -is recorded, and the format does not have an associated timestamp. +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.Panalytical_xy -Uncertainties -````````````` -The uncertainties of ``"angle"`` are taken from the number of significant figures. +Schema +`````` +.. 
code-block:: yaml -The uncertainties of ``"intensity"`` are taken from the number of significant figures. + xarray.Dataset: + coords: + angle: !!float # 2θ angle + data_vars: + intensity: (angle) # Measured intensity + +Metadata +```````` +No metadata is present in files. + +Notes on file structure +``````````````````````` + +These files basically just contain the ``[Scan points]`` part of Panalytical csv files. +As a consequence, no metadata is recorded, and the format does not have an associated +timestamp. .. codeauthor:: Nicolas Vetsch, @@ -22,33 +36,16 @@ from uncertainties.core import str_to_number_with_uncert as tuple_fromstr import numpy as np +from xarray import Dataset import xarray as xr -def process( +def extract( *, fn: str, encoding: str, **kwargs: dict, -) -> xr.Dataset: - """Processes a PANalytical XRD xy file. - - Parameters - ---------- - fn - The file containing the trace(s) to parse. - - encoding - Encoding of ``fn``, by default "utf-8". - - - Returns - ------- - :class:`xarray.Dataset` - Tuple containing the timesteps and metadata. A full timestamp is not available - in ``.xy`` files. - - """ +) -> Dataset: with open(fn, "r", encoding=encoding) as xy_file: xy = xy_file.readlines() datapoints = [li.strip().split() for li in xy] diff --git a/src/yadg/extractors/public/__init__.py b/src/yadg/extractors/phi/__init__.py similarity index 100% rename from src/yadg/extractors/public/__init__.py rename to src/yadg/extractors/phi/__init__.py diff --git a/src/yadg/parsers/xpstrace/phispe.py b/src/yadg/extractors/phi/spe.py similarity index 90% rename from src/yadg/parsers/xpstrace/phispe.py rename to src/yadg/extractors/phi/spe.py index 2730d53c..539ed6e6 100644 --- a/src/yadg/parsers/xpstrace/phispe.py +++ b/src/yadg/extractors/phi/spe.py @@ -1,12 +1,38 @@ """ -**phispe**: Processing of ULVAC PHI Multipak XPS traces. --------------------------------------------------------- +Processing of ULVAC PHI Multipak XPS traces. -The `IGOR .spe import script by jjweimer `_ -was pretty helpful for writing this parser. +The `IGOR .spe import script `_ by +jjweimer was pretty helpful for writing this extractor. -File Structure of ``.spe`` Files -```````````````````````````````` +Usage +````` +Available since ``yadg-4.0``. + +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.Phi_spe + +Schema +`````` +.. code-block:: yaml + + datatree.DataTree: + {{ trace_name }}: + coords: + E: !!float # Binding energies + data_vars: + y: (E) # Signal data + +Metadata +```````` +The following metadata is extracted: + + - ``software_id``: ID of the software used to generate the file. + - ``version``: Version of the software used to generate the file. + - ``username``: User name used to generate the file. + +Additionally, the processed header data is stored in the metadata under ``file_header``. + +Notes on file structure +``````````````````````` These binary files actually contain an ASCII file header, delimited by `"SOFH\n"` and `"EOFH\n"`. @@ -97,13 +123,16 @@ Determining the uncertainty of the counts per second signal in XPS traces from the phispe parser should be done in a better way. -.. codeauthor:: Nicolas Vetsch +.. 
codeauthor:: + Nicolas Vetsch + """ import re import numpy as np import xarray as xr import datatree +from datatree import DataTree import yadg.dgutils as dgutils data_header_dtype = np.dtype( @@ -307,35 +336,19 @@ def _process_traces(spe: list[bytes], trace_defs: list[dict]) -> dict: return traces -def process( +def extract( *, fn: str, **kwargs: dict, -) -> datatree.DataTree: - """Processes ULVAC-PHI Multipak XPS data. - - Parameters - ---------- - fn - The file containing the data to parse. - - Returns - ------- - :class:`datatree.DataTree` - Returns a :class:`datatree.DataTree` containing a :class:`xarray.Dataset` for each - XPS trace present in the input file. - - """ +) -> DataTree: with open(fn, "rb") as spe_file: spe = spe_file.readlines() header = _process_header(spe) software_id, version = header.get("software_version").split() meta = { - "params": { - "software_id": software_id, - "version": version, - "username": header.get("operator"), - }, + "software_id": software_id, + "version": version, + "username": header.get("operator"), "file_header": header, } trace_defs = _process_trace_defs(header) diff --git a/src/yadg/extractors/public/agilent/ch.py b/src/yadg/extractors/public/agilent/ch.py deleted file mode 100644 index 31805290..00000000 --- a/src/yadg/extractors/public/agilent/ch.py +++ /dev/null @@ -1,7 +0,0 @@ -from yadg.parsers.chromtrace.agilentch import process as extract - -supports = { - "agilent.ch", -} - -__all__ = ["supports", "extract"] diff --git a/src/yadg/extractors/public/agilent/csv.py b/src/yadg/extractors/public/agilent/csv.py deleted file mode 100644 index 0671ba41..00000000 --- a/src/yadg/extractors/public/agilent/csv.py +++ /dev/null @@ -1,7 +0,0 @@ -from yadg.parsers.chromtrace.agilentcsv import process as extract - -supports = { - "agilent.csv", -} - -__all__ = ["supports", "extract"] diff --git a/src/yadg/extractors/public/agilent/dx.py b/src/yadg/extractors/public/agilent/dx.py deleted file mode 100644 index 2dd473a8..00000000 --- a/src/yadg/extractors/public/agilent/dx.py +++ /dev/null @@ -1,8 +0,0 @@ -from yadg.parsers.chromtrace.agilentdx import process as extract - -supports = { - "agilent.dx", - "marda:agilent-dx", -} - -__all__ = ["supports", "extract"] diff --git a/src/yadg/extractors/public/drycal/__init__.py b/src/yadg/extractors/public/drycal/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/yadg/extractors/public/drycal/csv.py b/src/yadg/extractors/public/drycal/csv.py deleted file mode 100644 index 4d9bbf12..00000000 --- a/src/yadg/extractors/public/drycal/csv.py +++ /dev/null @@ -1,40 +0,0 @@ -import logging -import xarray as xr -from yadg.parsers.flowdata import drycal - -supports = { - "drycal.csv", -} - -logger = logging.getLogger(__name__) - - -def extract( - *, - fn: str, - encoding: str, - timezone: str, - **kwargs: dict, -) -> xr.Dataset: - """ """ - vals = drycal.sep(fn, ",", encoding, timezone) - # check timestamps are increasing: - warn = True - ndays = 0 - utslist = vals.uts.values - for i in range(1, vals.uts.size): - if utslist[i] < utslist[i - 1]: - if warn: - logger.warning("DryCal log crossing day boundary. 
Adding offset.") - warn = False - uts = utslist[i] + ndays * 86400 - while uts < utslist[i - 1]: - ndays += 1 - uts = utslist[i] + ndays * 86400 - utslist[i] = uts - vals["uts"] = xr.DataArray(data=utslist, dims=["uts"]) - vals.attrs["fulldate"] = False - return vals - - -__all__ = ["supports", "extract"] diff --git a/src/yadg/extractors/public/drycal/rtf.py b/src/yadg/extractors/public/drycal/rtf.py deleted file mode 100644 index 33fec4b4..00000000 --- a/src/yadg/extractors/public/drycal/rtf.py +++ /dev/null @@ -1,37 +0,0 @@ -import logging -from xarray import Dataset -import xarray as xr -from yadg.parsers.flowdata import drycal - -logger = logging.getLogger(__name__) - - -def extract( - *, - fn: str, - encoding: str, - timezone: str, - **kwargs: dict, -) -> Dataset: - """ """ - vals = drycal.rtf(fn, encoding, timezone) - # check timestamps are increasing: - warn = True - ndays = 0 - utslist = vals.uts.values - for i in range(1, vals.uts.size): - if utslist[i] < utslist[i - 1]: - if warn: - logger.warning("DryCal log crossing day boundary. Adding offset.") - warn = False - uts = utslist[i] + ndays * 86400 - while uts < utslist[i - 1]: - ndays += 1 - uts = utslist[i] + ndays * 86400 - utslist[i] = uts - vals["uts"] = xr.DataArray(data=utslist, dims=["uts"]) - vals.attrs["fulldate"] = False - return vals - - -__all__ = ["extract"] diff --git a/src/yadg/extractors/public/drycal/txt.py b/src/yadg/extractors/public/drycal/txt.py deleted file mode 100644 index 7f1b4118..00000000 --- a/src/yadg/extractors/public/drycal/txt.py +++ /dev/null @@ -1,40 +0,0 @@ -import logging -import xarray as xr -from yadg.parsers.flowdata import drycal - -supports = { - "drycal.txt", -} - -logger = logging.getLogger(__name__) - - -def extract( - *, - fn: str, - encoding: str, - timezone: str, - **kwargs: dict, -) -> xr.Dataset: - """ """ - vals = drycal.sep(fn, "\t", encoding, timezone) - # check timestamps are increasing: - warn = True - ndays = 0 - utslist = vals.uts.values - for i in range(1, vals.uts.size): - if utslist[i] < utslist[i - 1]: - if warn: - logger.warning("DryCal log crossing day boundary. 
Adding offset.") - warn = False - uts = utslist[i] + ndays * 86400 - while uts < utslist[i - 1]: - ndays += 1 - uts = utslist[i] + ndays * 86400 - utslist[i] = uts - vals["uts"] = xr.DataArray(data=utslist, dims=["uts"]) - vals.attrs["fulldate"] = False - return vals - - -__all__ = ["supports", "extract"] diff --git a/src/yadg/extractors/public/eclab/__init__.py b/src/yadg/extractors/public/eclab/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/yadg/extractors/public/eclab/mpr.py b/src/yadg/extractors/public/eclab/mpr.py deleted file mode 100644 index 3d58589a..00000000 --- a/src/yadg/extractors/public/eclab/mpr.py +++ /dev/null @@ -1,7 +0,0 @@ -from yadg.parsers.electrochem.eclabmpr import process as extract - -supports = { - "eclab.mpr", -} - -__all__ = ["supports", "extract"] diff --git a/src/yadg/extractors/public/eclab/mpt.py b/src/yadg/extractors/public/eclab/mpt.py deleted file mode 100644 index 1a9e37ae..00000000 --- a/src/yadg/extractors/public/eclab/mpt.py +++ /dev/null @@ -1,7 +0,0 @@ -from yadg.parsers.electrochem.eclabmpt import process as extract - -supports = { - "eclab.mpt", -} - -__all__ = ["supports", "extract"] diff --git a/src/yadg/extractors/public/ezchrom/__init__.py b/src/yadg/extractors/public/ezchrom/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/yadg/extractors/public/ezchrom/asc.py b/src/yadg/extractors/public/ezchrom/asc.py deleted file mode 100644 index dc258dcf..00000000 --- a/src/yadg/extractors/public/ezchrom/asc.py +++ /dev/null @@ -1,7 +0,0 @@ -from yadg.parsers.chromtrace.ezchromasc import process as extract - -supports = { - "ezchrom.asc", -} - -__all__ = ["supports", "extract"] diff --git a/src/yadg/extractors/public/fusion/__init__.py b/src/yadg/extractors/public/fusion/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/yadg/extractors/public/fusion/csv.py b/src/yadg/extractors/public/fusion/csv.py deleted file mode 100644 index 168a988c..00000000 --- a/src/yadg/extractors/public/fusion/csv.py +++ /dev/null @@ -1,7 +0,0 @@ -from yadg.parsers.chromdata.fusioncsv import process as extract - -supports = { - "fusion.csv", -} - -__all__ = ["supports", "extract"] diff --git a/src/yadg/extractors/public/fusion/json.py b/src/yadg/extractors/public/fusion/json.py deleted file mode 100644 index 21aa5383..00000000 --- a/src/yadg/extractors/public/fusion/json.py +++ /dev/null @@ -1,20 +0,0 @@ -from datatree import DataTree -from yadg.parsers.chromdata.fusionjson import process as extract_data -from yadg.parsers.chromtrace.fusionjson import process as extract_trace - - -supports = { - "fusion.json", -} - - -def extract(**kwargs): - data = extract_data(**kwargs) - trace = extract_trace(**kwargs) - newdt = DataTree(data) - for k, v in trace.items(): - newdt[k] = v - return newdt - - -__all__ = ["supports", "extract"] diff --git a/src/yadg/extractors/public/fusion/zip.py b/src/yadg/extractors/public/fusion/zip.py deleted file mode 100644 index b532f3f4..00000000 --- a/src/yadg/extractors/public/fusion/zip.py +++ /dev/null @@ -1,20 +0,0 @@ -from datatree import DataTree -from yadg.parsers.chromdata.fusionzip import process as extract_data -from yadg.parsers.chromtrace.fusionzip import process as extract_trace - - -supports = { - "fusion.zip", -} - - -def extract(**kwargs): - data = extract_data(**kwargs) - trace = extract_trace(**kwargs) - newdt = DataTree(data) - for k, v in trace.items(): - newdt[k] = v - return newdt - - -__all__ = ["supports", "extract"] diff --git 
a/src/yadg/extractors/public/panalytical/__init__.py b/src/yadg/extractors/public/panalytical/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/yadg/extractors/public/panalytical/csv.py b/src/yadg/extractors/public/panalytical/csv.py deleted file mode 100644 index 30485ff0..00000000 --- a/src/yadg/extractors/public/panalytical/csv.py +++ /dev/null @@ -1,7 +0,0 @@ -from yadg.parsers.xrdtrace.panalyticalcsv import process as extract - -supports = { - "panalytical.csv", -} - -__all__ = ["supports", "extract"] diff --git a/src/yadg/extractors/public/panalytical/xrdml.py b/src/yadg/extractors/public/panalytical/xrdml.py deleted file mode 100644 index 0e95bd2a..00000000 --- a/src/yadg/extractors/public/panalytical/xrdml.py +++ /dev/null @@ -1,7 +0,0 @@ -from yadg.parsers.xrdtrace.panalyticalxrdml import process as extract - -supports = { - "panalytical.xrdml", -} - -__all__ = ["supports", "extract"] diff --git a/src/yadg/extractors/public/panalytical/xy.py b/src/yadg/extractors/public/panalytical/xy.py deleted file mode 100644 index 6206b4b6..00000000 --- a/src/yadg/extractors/public/panalytical/xy.py +++ /dev/null @@ -1,7 +0,0 @@ -from yadg.parsers.xrdtrace.panalyticalxy import process as extract - -supports = { - "panalytical.xy", -} - -__all__ = ["supports", "extract"] diff --git a/src/yadg/extractors/public/phi/__init__.py b/src/yadg/extractors/public/phi/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/yadg/extractors/public/phi/spe.py b/src/yadg/extractors/public/phi/spe.py deleted file mode 100644 index 7791699c..00000000 --- a/src/yadg/extractors/public/phi/spe.py +++ /dev/null @@ -1,7 +0,0 @@ -from yadg.parsers.xpstrace.phispe import process as extract - -supports = { - "phi.spe", -} - -__all__ = ["supports", "extract"] diff --git a/src/yadg/extractors/public/quadstar/__init__.py b/src/yadg/extractors/public/quadstar/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/yadg/extractors/public/quadstar/sac.py b/src/yadg/extractors/public/quadstar/sac.py deleted file mode 100644 index 1e08dac7..00000000 --- a/src/yadg/extractors/public/quadstar/sac.py +++ /dev/null @@ -1,7 +0,0 @@ -from yadg.parsers.masstrace.quadstarsac import process as extract - -supports = { - "quadstar.sac", -} - -__all__ = ["supports", "extract"] diff --git a/src/yadg/extractors/public/tomato/__init__.py b/src/yadg/extractors/public/tomato/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/yadg/extractors/public/tomato/json.py b/src/yadg/extractors/public/tomato/json.py deleted file mode 100644 index f1df8c2e..00000000 --- a/src/yadg/extractors/public/tomato/json.py +++ /dev/null @@ -1,35 +0,0 @@ -import json -from xarray import Dataset - -from yadg.parsers.basiccsv.main import append_dicts, dicts_to_dataset -from yadg.parsers.electrochem.tomatojson import process - - -def dummy_tomato_json(*, fn: str, **kwargs: dict) -> Dataset: - with open(fn, "r") as inf: - jsdata = json.load(inf) - - data_vals = {} - meta_vals = {} - for vi, vals in enumerate(jsdata["data"]): - vals["uts"] = vals.pop("time") - devs = {} - for k, v in vals.items(): - if k not in {"time", "address", "channel"}: - devs[k] = 0.0 - append_dicts(vals, devs, data_vals, meta_vals, fn, vi) - return dicts_to_dataset(data_vals, meta_vals, fulldate=False) - - -def extract(*, fn: str, **kwargs: dict) -> Dataset: - - with open(fn, "r") as inf: - jsdata = json.load(inf) - - if "technique" in jsdata: - return process(fn=fn, **kwargs) - else: - return 
dummy_tomato_json(fn=fn, **kwargs) - - -__all__ = ["extract"] diff --git a/src/yadg/extractors/public/agilent/__init__.py b/src/yadg/extractors/quadstar/__init__.py similarity index 100% rename from src/yadg/extractors/public/agilent/__init__.py rename to src/yadg/extractors/quadstar/__init__.py diff --git a/src/yadg/parsers/masstrace/quadstarsac.py b/src/yadg/extractors/quadstar/sac.py similarity index 89% rename from src/yadg/parsers/masstrace/quadstarsac.py rename to src/yadg/extractors/quadstar/sac.py index 774deedd..64ac4d3a 100644 --- a/src/yadg/parsers/masstrace/quadstarsac.py +++ b/src/yadg/extractors/quadstar/sac.py @@ -1,16 +1,37 @@ """ -**quadstarsac**: Processing of Quadstar 32-bit scan analog data. ----------------------------------------------------------------- - The `sac2dat.c code from Dr. Moritz Bubek `_ was a really useful stepping stone for this Python file parser. +Usage +````` +Available since ``yadg-4.0``. + +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.Quadstar_sac + +Schema +`````` +.. code-block:: yaml + + datatree.DataTree: + {{ trace_index }}: + coords: + uts: !!float # Unix timestamp + mass_to_charge: !!float # M/Z ratio + data_vars: + fsr: (None) # Full scale range + y: (uts, mass_to_charge) # Signal data + + +Metadata +```````` +The "info_position" section in the below structure is stored as metadata for every +trace, without further processing. + +Notes on file structure +``````````````````````` Pretty much the entire file format has been reverse engineered. There are still one or two unknown fields. -File Structure of `.sac` Files -`````````````````````````````` - .. code-block:: python 0x00 "data_index" @@ -62,7 +83,17 @@ data_position + (n * timestep_length) + 0x06 "datapoints" ... -.. codeauthor:: Nicolas Vetsch +Uncertainties +````````````` +Uncertainties in ``mass_to_charge`` are set to one step in M/Z spacing. + +Uncertainties in the signal ``y`` are either based on the analog-to-digital conversion +(i.e. using the full scale range), or from the upper limit of contribution of +neighboring M/Z points (50 ppm). + +.. codeauthor:: + Nicolas Vetsch + """ import numpy as np @@ -128,26 +159,11 @@ def _find_first_data_position(scan_headers: list[dict]) -> int: return header["data_position"] -def process( +def extract( *, fn: str, **kwargs: dict, ) -> DataTree: - """Processes a Quadstar 32-bit analog data .sac file. - - Parameters - ---------- - fn - The file containing the trace(s) to parse. - - Returns - ------- - :class:`datatree.DataTree` - A :class:`datatree.DataTree` containing one :class:`xarray.Dataset` per mass trace. - The traces in the Quadstar ``.sac`` files are not named, therefore their index - is used as the :class:`xarray.Dataset` name. - - """ with open(fn, "rb") as sac_file: sac = sac_file.read() meta = dgutils.read_value(sac, 0x0000, general_header_dtype) diff --git a/src/yadg/extractors/tomato/__init__.py b/src/yadg/extractors/tomato/__init__.py new file mode 100644 index 00000000..3b8f76cd --- /dev/null +++ b/src/yadg/extractors/tomato/__init__.py @@ -0,0 +1,4 @@ +""" +Extractor for files generated by tomato. 
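For orientation after the rename from ``process`` to ``extract`` above, a minimal usage sketch of the relocated tomato extractor; the module path and the keyword-only signature are taken from this patch, while the file name is a hypothetical placeholder:

.. code-block:: python

    from yadg.extractors.tomato.json import extract

    # extract() dispatches on the file contents: payloads containing a
    # "technique" entry are parsed as biologic output, everything else
    # is treated as dummy output (see the json.py module further below).
    ds = extract(fn="data.json")
    print(ds)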
+ +""" diff --git a/src/yadg/parsers/electrochem/tomatojson.py b/src/yadg/extractors/tomato/json.py similarity index 63% rename from src/yadg/parsers/electrochem/tomatojson.py rename to src/yadg/extractors/tomato/json.py index e33294e4..fc79c873 100644 --- a/src/yadg/parsers/electrochem/tomatojson.py +++ b/src/yadg/extractors/tomato/json.py @@ -1,14 +1,40 @@ """ -**tomatojson**: Processing of tomato electrochemistry outputs. --------------------------------------------------------------- - -This module parses the electrochemistry ``json`` files generated by tomato. - -.. warning:: - - This parser is brand-new in `yadg-4.1` and the interface is unstable. - -Four sections are expected in each tomato data file: +This module parses the files generated by the ``dummy`` and ``biologic`` devices +within ``tomato-0.2``. As the ``dummy`` device has been mainly used for testing, +the below discusses the output of a ``biologic`` device. + +Usage +````` +Available since ``yadg-4.0``. + +.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_1.filetype.Tomato_json + +Schema +`````` +.. code-block:: yaml + + xarray.Dataset: + coords: + uts: !!float # Unix timestamp + data_vars: + Ewe: (uts) # Potential of the working electrode + Ece: (uts) # Potential of the counter electrode, if present + I: (uts) # Instantaneous current + technique: (uts) # Technique name + loop number: (uts) # Loop number (over techniques) + cycle number: (uts) # Cycle number (within technique) + index: (uts) # Technique index + +Metadata +```````` +No metadata is extracted. + +Notes on file structure +``````````````````````` +The files generated by the ``dummy`` driver do not contain the ``technique``, and all +values present in the json files are simply copied over assuming an uncertainty of 0.0. + +For the ``biologic`` driver, each tomato data file contains the following four sections: - ``technique`` section, describing the current technique, - ``previous`` section, containing status information of the previous file, @@ -21,18 +47,25 @@ a technique change happened). However, ``previous`` may not be present in the first data file of an experiment. +Uncertainties +````````````` To determine the measurement errors, the values from BioLogic manual are used: for measured voltages (:math:`E_{\\text{we}}` and :math:`E_{\\text{ce}}`) this corresponds to a constant uncertainty of 0.004% of the applied E-range with a maximum of 75 uV, while for currents (:math:`I`) this is a constant uncertainty of 0.0015% of the applied I-range with a maximum of 0.76 uA. -.. codeauthor:: Peter Kraus +.. 
codeauthor:: + Peter Kraus + """ import json import logging import xarray as xr +from xarray import Dataset + +from yadg import dgutils logger = logging.getLogger(__name__) @@ -48,10 +81,7 @@ } -def process(*, fn: str, **kwargs: dict) -> xr.Dataset: - with open(fn, "r") as infile: - jsdata = json.load(infile) - +def biologic_tomato_json(fn: str, jsdata: dict) -> Dataset: technique = jsdata["technique"] previous = jsdata.get("previous", None) current = jsdata["current"] @@ -140,3 +170,30 @@ def process(*, fn: str, **kwargs: dict) -> xr.Dataset: if not fulldate: ds.attrs["fulldate"] = False return ds + + +def dummy_tomato_json(fn: str, jsdata: dict) -> Dataset: + data_vals = {} + meta_vals = {} + for vi, vals in enumerate(jsdata["data"]): + vals["uts"] = vals.pop("time") + devs = {} + for k, v in vals.items(): + if k not in {"time", "address", "channel"}: + devs[k] = 0.0 + dgutils.append_dicts(vals, devs, data_vals, meta_vals, fn, vi) + return dgutils.dicts_to_dataset(data_vals, meta_vals, fulldate=False) + + +def extract( + *, + fn: str, + **kwargs: dict, +) -> Dataset: + with open(fn, "r") as inf: + jsdata = json.load(inf) + + if "technique" in jsdata: + return biologic_tomato_json(fn, jsdata) + else: + return dummy_tomato_json(fn, jsdata) diff --git a/src/yadg/parsers/__init__.py b/src/yadg/parsers/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/yadg/parsers/basiccsv/__init__.py b/src/yadg/parsers/basiccsv/__init__.py deleted file mode 100644 index 1cd8ba85..00000000 --- a/src/yadg/parsers/basiccsv/__init__.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -Handles the reading and processing of any tabular files, as long as the first line -contains the column headers. By default, the second should contain the units. The -columns of the table must be separated using a separator such as ``,``, ``;``, -or ``\\t``. - -.. warning:: - - Since ``yadg-5.0``, the parser handles sparse tables (i.e. tables with missing - data) by back-filling empty cells with ``np.NaNs``. - -.. note:: - - :mod:`~yadg.parsers.basiccsv` attempts to deduce the timestamp from the column - headers, using :func:`yadg.dgutils.dateutils.infer_timestamp_from`. Alternatively, - the column(s) containing the timestamp data and their format can be provided using - parameters. - -Usage -````` -Available since ``yadg-4.0``. The parser supports the following parameters: - -.. _yadg.parsers.basiccsv.model: - -.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_0.step.BasicCSV - -Schema -`````` -The primary functionality of :mod:`~yadg.parsers.basiccsv` is to load the tabular -data, and determine the Unix timestamp. The headers of the tabular data are taken -`verbatim` from the file, and appear as ``data_vars`` of the :class:`xarray.Dataset`. -The single ``coord`` for the ``data_vars`` is the deduced Unix timestamp, ``uts``. - -.. code-block:: yaml - - xr.Dataset: - coords: - uts: !!float # Unix timestamp - data_vars: - {{ headers }}: (uts) # Populated from file headers - -Module Functions -```````````````` - -""" - -from .main import process - -__all__ = ["process"] diff --git a/src/yadg/parsers/chromdata/__init__.py b/src/yadg/parsers/chromdata/__init__.py deleted file mode 100644 index 19f85f3a..00000000 --- a/src/yadg/parsers/chromdata/__init__.py +++ /dev/null @@ -1,93 +0,0 @@ -""" -Handles the reading of post-processed chromatography data, i.e. files containing peak -areas, concentrations, or mole fractions. - -.. 
note:: - - To parse trace data as present in raw chromatograms, use the - :mod:`~yadg.parsers.chromtrace` parser. - -Usage -````` -Available since ``yadg-4.2``. The parser supports the following parameters: - -.. _yadg.parsers.chromdata.model: - -.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_0.step.ChromData - -.. _yadg.parsers.chromdata.formats: - -Formats -``````` -The ``filetypes`` currently supported by the parser are: - - - Inficon Fusion JSON format (``fusion.json``): - see :mod:`~yadg.parsers.chromdata.fusionjson` - - Inficon Fusion zip archive (``fusion.zip``): - see :mod:`~yadg.parsers.chromdata.fusionzip` - - Inficon Fusion csv export (``fusion.csv``): - see :mod:`~yadg.parsers.chromdata.fusioncsv` - - Empa's Agilent LC csv export (``empalc.csv``): - see :mod:`~yadg.parsers.chromdata.empalccsv` - - Empa's Agilent LC excel export (``empalc.xlsx``): - see :mod:`~yadg.parsers.chromdata.empalcxlsx` - -Schema -`````` -Each file is processed into a single :class:`xarray.Dataset`, containing the following -``coords`` and ``data_vars`` (if present in the file): - -.. code-block:: yaml - - xr.Dataset: - coords: - uts: !!float # Unix timestamp - species: !!str # Species names - data_vars: - height: (uts, species) # Peak height maximum - area: (uts, species) # Integrated peak area - retention time: (uts, species) # Peak retention time - concentration: (uts, species) # Species concentration (mol/l) - xout: (uts, species) # Species mole fraction (-) - -Module Functions -```````````````` - -""" - -import xarray as xr - -from . import ( - fusionjson, - fusionzip, - fusioncsv, - empalccsv, - empalcxlsx, -) - - -def process(*, filetype: str, **kwargs: dict) -> xr.Dataset: - """ - Unified chromatographic data parser. Forwards ``kwargs`` to the worker functions - based on the supplied ``filetype``. - - Parameters - ---------- - filetype - Discriminator used to select the appropriate worker function. - - Returns - ------- - :class:`xarray.Dataset` - - """ - if filetype == "fusion.json": - return fusionjson.process(**kwargs) - elif filetype == "fusion.zip": - return fusionzip.process(**kwargs) - elif filetype == "fusion.csv": - return fusioncsv.process(**kwargs) - elif filetype == "empalc.csv": - return empalccsv.process(**kwargs) - elif filetype == "empalc.xlsx": - return empalcxlsx.process(**kwargs) diff --git a/src/yadg/parsers/chromdata/fusionjson.py b/src/yadg/parsers/chromdata/fusionjson.py deleted file mode 100644 index 1ebe3a9c..00000000 --- a/src/yadg/parsers/chromdata/fusionjson.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -**fusionjson**: Processing Inficon Fusion json data format (json). ------------------------------------------------------------------- - -This is a fairly detailed data format, including the traces, the calibration applied, -and also the integrated peak areas and other processed information, which are parsed -by this module. - -.. note :: - - To parse the raw trace data, use the :mod:`~yadg.parsers.chromtrace` module. - -.. warning :: - - The detectors in the json files are not necessarily in a consistent order. To - avoid inconsistent parsing of species which appear in both detectors, the - detector keys are sorted. **Species present in both detectors** will be - **overwritten by the last detector** in alphabetical order. - -Exposed metadata: -````````````````` - -.. code-block:: yaml - - params: - method: !!str - username: None - version: !!str - datafile: !!str - -.. 
codeauthor:: Peter Kraus -""" - -import json -import logging -from ...dgutils.dateutils import str_to_uts -import xarray as xr -import numpy as np - -logger = logging.getLogger(__name__) - - -def process(*, fn: str, encoding: str, timezone: str, **kwargs: dict) -> xr.Dataset: - """ - Fusion json format. - - One chromatogram per file with multiple traces, and pre-analysed results. - Only a subset of the metadata is retained, including the method name, - detector names, and information about assigned peaks. - - Parameters - ---------- - fn - Filename to process. - - encoding - Encoding used to open the file. - - timezone - Timezone information. This should be ``"localtime"``. - - Returns - ------- - :class:`xarray.Dataset` - - """ - - with open(fn, "r", encoding=encoding, errors="ignore") as infile: - jsdata = json.load(infile) - metadata = { - "method": jsdata.get("methodName", "n/a"), - "version": jsdata.get("softwareVersion", {}).get("version", None), - "datafile": jsdata.get("sequence", {}).get("location", None), - } - uts = str_to_uts(timestamp=jsdata["runTimeStamp"], timezone=timezone) - - sampleid = jsdata.get("annotations", {}).get("name", None) - if sampleid is not None: - metadata["sampleid"] = sampleid - - units = { - "height": None, - "area": None, - "concentration": "%", - "xout": "%", - "retention time": "s", - } - - raw = { - "height": {}, - "area": {}, - "concentration": {}, - "xout": {}, - "retention time": {}, - } - - species = set() - - # sort detector keys to ensure alphabetic order for ID matching - for detname in sorted(jsdata["detectors"].keys()): - detdict = jsdata["detectors"][detname] - if "analysis" in detdict: - for peak in detdict["analysis"]["peaks"]: - if "label" not in peak: - continue - else: - species.add(peak["label"]) - if "height" in peak: - raw["height"][peak["label"]] = (float(peak["height"]), 1.0) - if "area" in peak: - raw["area"][peak["label"]] = (float(peak["area"]), 0.01) - if "concentration" in peak: - raw["concentration"][peak["label"]] = ( - float(peak["concentration"]), - float(peak["concentration"]) * 1e-3, - ) - if "normalizedConcentration" in peak: - raw["xout"][peak["label"]] = ( - float(peak["normalizedConcentration"]), - float(peak["normalizedConcentration"]) * 1e-3, - ) - if "top" in peak: - raw["retention time"][peak["label"]] = (float(peak["top"]), 0.01) - else: - logger.warning("'analysis' of chromatogram not present in file '%s'", fn) - - valve = jsdata.get("annotations", {}).get("valcoPosition", None) - if valve is not None: - raw["valve"] = valve - - species = sorted(species) - data_vars = {} - for k, v in units.items(): - vals, devs = zip(*[raw[k].get(s, (np.nan, np.nan)) for s in species]) - data_vars[k] = ( - ["uts", "species"], - [vals], - {"ancillary_variables": f"{k}_std_err"}, - ) - data_vars[f"{k}_std_err"] = ( - ["uts", "species"], - [devs], - {"standard_name": f"{k} stdandard_error"}, - ) - if v is not None: - data_vars[k][2]["units"] = v - data_vars[f"{k}_std_err"][2]["units"] = v - - ds = xr.Dataset( - data_vars=data_vars, - coords={"species": (["species"], species), "uts": (["uts"], [uts])}, - attrs=metadata, - ) - return ds diff --git a/src/yadg/parsers/chromdata/fusionzip.py b/src/yadg/parsers/chromdata/fusionzip.py deleted file mode 100644 index 9755f5f5..00000000 --- a/src/yadg/parsers/chromdata/fusionzip.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -**fusionzip**: Processing Inficon Fusion zipped data format (zip). 
------------------------------------------------------------------- - -This is a wrapper parser which unzips the provided zip file, and then uses -the :mod:`yadg.parsers.chromdata.fusionjson` parser to parse every data -file present in the archive. - -.. codeauthor:: Peter Kraus -""" - -import zipfile -import tempfile -import os -import xarray as xr - -from .fusionjson import process as processjson - - -def process(*, fn: str, encoding: str, timezone: str, **kwargs: dict) -> xr.Dataset: - """ - Fusion zip file format. - - The Fusion GC's can export their json formats as a zip archive of a folder - of jsons. This parser allows for parsing of this zip archive directly, - without the user having to unzip & move the data. - - Parameters - ---------- - fn - Filename to process. - - encoding - Not used as the file is binary. - - timezone - Timezone information. This should be ``"localtime"``. - - Returns - ------- - :class:`xarray.Dataset` - The data from the inidividual json files contained in the zip archive are - concatenated into a single :class:`xarray.Dataset`. This might fail if the metadata - in the json files differs, or if the dimensions are not easily concatenable. - - """ - - zf = zipfile.ZipFile(fn) - with tempfile.TemporaryDirectory() as tempdir: - zf.extractall(tempdir) - ds = None - for ffn in sorted(os.listdir(tempdir)): - ffn = os.path.join(tempdir, ffn) - if ffn.endswith("fusion-data"): - ids = processjson(fn=ffn, encoding=encoding, timezone=timezone) - if ds is None: - ds = ids - else: - try: - ds = xr.concat([ds, ids], dim="uts", combine_attrs="identical") - except xr.MergeError: - raise RuntimeError( - "Merging metadata from the unzipped fusion-json files has failed. " - "This might be caused by trying to parse data obtained using " - "different chromatographic methods. Please check the contents " - "of the unzipped files." - ) - return ds diff --git a/src/yadg/parsers/chromtrace/__init__.py b/src/yadg/parsers/chromtrace/__init__.py deleted file mode 100644 index 1fa4da1b..00000000 --- a/src/yadg/parsers/chromtrace/__init__.py +++ /dev/null @@ -1,116 +0,0 @@ -""" -Handles the parsing of raw traces present in chromatography files, whether the source is -a liquid chromatograph (LC) or a gas chromatograph (GC). The basic function of the -parser is to: - -#. read in the raw data and create timestamped `traces` -#. collect `metadata` such as the method information, sample ID, etc. - -:mod:`~yadg.parsers.chromtrace` loads the chromatographic data from the specified -file, determines the uncertainties of the signal (y-axis), and explicitly -populates the points in the time axis (x-axis), when required. - -Usage -````` -Available since ``yadg-4.0``. The parser supports the following parameters: - -.. _yadg.parsers.chromtrace.model: - -.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_0.step.ChromTrace - -.. 
_yadg.parsers.chromtrace.formats: - -Formats -``````` -The ``filetypes`` currently supported by the parser are: - - - EZ-Chrom ASCII export (``ezchrom.asc``): - see :mod:`~yadg.parsers.chromtrace.ezchromasc` - - Agilent Chemstation Chromtab (``agilent.csv``): - see :mod:`~yadg.parsers.chromtrace.agilentcsv` - - Agilent OpenLab binary signal (``agilent.ch``): - see :mod:`~yadg.parsers.chromtrace.agilentch` - - Agilent OpenLab data archive (``agilent.dx``): - see :mod:`~yadg.parsers.chromtrace.agilentdx` - - Inficon Fusion JSON format (``fusion.json``): - see :mod:`~yadg.parsers.chromtrace.fusionjson` - - Inficon Fusion zip archive (``fusion.zip``): - see :mod:`~yadg.parsers.chromtrace.fusionzip` - -.. _yadg.parsers.chromtrace.provides: - -Schema -`````` -The data is returned as a :class:`datatree.DataTree`, containing a :class:`xarray.Dataset` -for each trace / detector name: - -.. code-block:: yaml - - datatree.DataTree: - {{ detector_name }} !!xr.Dataset - coords: - uts: !!float # Timestamp of the chromatogram - elution_time: !!float # The time axis of the chromatogram (s) - data_vars: - signal: (uts, elution_time) # The ordinate axis of the chromatogram - -When multiple chromatograms are parsed, they are concatenated separately per detector -name. An error might occur during this concatenation if the ``elution_time`` axis changes -dimensions or coordinates between different timesteps. - -.. note:: - - To parse processed data in the raw data files, such as integrated peak areas or - concentrations, use the :mod:`~yadg.parsers.chromdata` parser instead. - -Module Functions -```````````````` - -""" - -import logging -import datatree - -from . import ( - ezchromasc, - agilentcsv, - agilentch, - agilentdx, - fusionjson, - fusionzip, -) - -logger = logging.getLogger(__name__) - - -def process( - *, - filetype: str, - **kwargs: dict, -) -> datatree.DataTree: - """ - Unified raw chromatogram parser. Forwards ``kwargs`` to the worker functions - based on the supplied ``filetype``. - - Parameters - ---------- - filetype - Discriminator used to select the appropriate worker function. - - Returns - ------- - :class:`datatree.DataTree` - - """ - if filetype == "ezchrom.asc": - return ezchromasc.process(**kwargs) - elif filetype == "agilent.csv": - return agilentcsv.process(**kwargs) - elif filetype == "agilent.dx": - return agilentdx.process(**kwargs) - elif filetype == "agilent.ch": - return agilentch.process(**kwargs) - elif filetype == "fusion.json": - return fusionjson.process(**kwargs) - elif filetype == "fusion.zip": - return fusionzip.process(**kwargs) diff --git a/src/yadg/parsers/chromtrace/agilentdx.py b/src/yadg/parsers/chromtrace/agilentdx.py deleted file mode 100644 index f38dbc31..00000000 --- a/src/yadg/parsers/chromtrace/agilentdx.py +++ /dev/null @@ -1,87 +0,0 @@ -""" -**agilentch**: Processing Agilent OpenLab data archive files (DX). ------------------------------------------------------------------- - -This is a wrapper parser which unzips the provided DX file, and then uses the -:mod:`yadg.parsers.chromtrace.agilentch` parser to parse every CH file present in -the archive. The IT files in the archive are currently ignored. - -In addition to the metadata exposed by the CH parser, the ``datafile`` entry -is populated with the corresponding name of the CH file. The ``fn`` entry in each -timestep contains the parent DX file. - -.. note:: - - Currently the timesteps from multiple CH files (if present) are appended in the - timesteps array without any further sorting. - -.. 
codeauthor:: Peter Kraus -""" - -import zipfile -import tempfile -import os -from .agilentch import process as processch -from datatree import DataTree -import xarray as xr - - -def process(*, fn: str, encoding: str, timezone: str, **kwargs: dict) -> DataTree: - """ - Agilent OpenLab DX archive parser. - - This is a simple wrapper around the Agilent OpenLab signal trace parser in - :mod:`yadg.parsers.chromtrace.agilentch`. This wrapper first un-zips the DX - file into a temporary directory, and then processess all CH files found - within the archive, concatenating timesteps from multiple files. - - Parameters - ---------- - fn - Filename to process. - - encoding - Not used as the file is binary. - - timezone - Timezone information. This should be ``"localtime"``. - - Returns - ------- - class:`datatree.DataTree` - A :class:`datatree.DataTree` containing one :class:`xarray.Dataset` per detector. If - multiple timesteps are found in the zip archive, the :class:`datatree.DataTrees` - are collated along the ``uts`` dimension. - - """ - - zf = zipfile.ZipFile(fn) - with tempfile.TemporaryDirectory() as tempdir: - zf.extractall(tempdir) - dt = None - for ffn in os.listdir(tempdir): - if ffn.endswith("CH"): - path = os.path.join(tempdir, ffn) - fdt = processch(fn=path, encoding=encoding, timezone=timezone) - if dt is None: - dt = fdt - elif isinstance(dt, DataTree): - for k, v in fdt.items(): - if k in dt: # pylint: disable=E1135 - try: - newv = xr.concat( - [dt[k].ds, v.ds], # pylint: disable=E1136 - dim="uts", - combine_attrs="identical", - ) - except xr.MergeError: - raise RuntimeError( - "Merging metadata from the unzipped agilent-ch files has failed. " - "This is a bug. Please open an issue on GitHub." - ) - else: - newv = v.ds - dt[k] = DataTree(newv) # pylint: disable=E1137 - else: - raise RuntimeError("We should not get here.") - return dt diff --git a/src/yadg/parsers/chromtrace/fusionjson.py b/src/yadg/parsers/chromtrace/fusionjson.py deleted file mode 100644 index 2bd98264..00000000 --- a/src/yadg/parsers/chromtrace/fusionjson.py +++ /dev/null @@ -1,111 +0,0 @@ -""" -**fusionjson**: Processing Inficon Fusion json data format (json). ------------------------------------------------------------------- - -This is a fairly detailed data format, including the traces, the calibration applied, -and also the integrated peak areas. If the peak areas are present, this is returned -in the list of timesteps as a ``"peaks"`` entry. - -Exposed metadata: -````````````````` - -.. code-block:: yaml - - method: !!str - sampleid: !!str - version: !!str - datafile: !!str - -.. codeauthor:: Peter Kraus -""" - -import json -import numpy as np -from ...dgutils.dateutils import str_to_uts -import xarray as xr -from datatree import DataTree - - -def process(*, fn: str, encoding: str, timezone: str, **kwargs: dict) -> DataTree: - """ - Fusion json format. - - One chromatogram per file with multiple traces, and integrated peak areas. - - .. warning:: - - To parse the integrated data present in these files, use the - :mod:`~yadg.parsers.chromdata` parser. - - Only a subset of the metadata is retained, including the method name, - detector names, and information about assigned peaks. - - Parameters - ---------- - fn - Filename to process. - - encoding - Encoding used to open the file. - - timezone - Timezone information. This should be ``"localtime"``. - - Returns - ------- - class:`datatree.DataTree` - A :class:`datatree.DataTree` containing one :class:`xarray.Dataset` per detector. 
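The ``agilentdx`` wrapper deleted above and the ``fusionzip`` wrappers deleted nearby share one pattern: unpack the archive into a temporary directory, run the single-file parser on every matching member, and collate the per-detector Datasets along ``uts``. A condensed sketch of that pattern under the same assumptions as the deleted code; ``merge_archive`` and ``process_one`` are hypothetical names, and ``combine_attrs="identical"`` is what turns mismatched metadata into an :class:`xarray.MergeError`:

.. code-block:: python

    import os
    import tempfile
    import zipfile

    import xarray as xr
    from datatree import DataTree


    def merge_archive(fn: str, suffix: str, process_one) -> DataTree:
        # Unzip, parse each matching member, and concatenate per detector name.
        dt = None
        with tempfile.TemporaryDirectory() as tempdir:
            zipfile.ZipFile(fn).extractall(tempdir)
            for ffn in sorted(os.listdir(tempdir)):
                if not ffn.endswith(suffix):
                    continue
                fdt = process_one(os.path.join(tempdir, ffn))
                if dt is None:
                    dt = fdt
                    continue
                for k, v in fdt.items():
                    if k in dt:
                        merged = xr.concat(
                            [dt[k].ds, v.ds], dim="uts", combine_attrs="identical"
                        )
                    else:
                        merged = v.ds
                    dt[k] = DataTree(merged)
        return dt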
- - """ - - with open(fn, "r", encoding=encoding, errors="ignore") as infile: - jsdata = json.load(infile) - metadata = { - "method": jsdata.get("methodName", "n/a"), - "sampleid": jsdata.get("annotations", {}).get("name", None), - "version": jsdata.get("softwareVersion", {}).get("version", None), - "datafile": jsdata.get("sequence", {}).get("location", None), - } - uts = str_to_uts(timestamp=jsdata["runTimeStamp"], timezone=timezone) - - # sort detector keys to ensure alphabetic order for ID matching - traces = sorted(jsdata["detectors"].keys()) - vals = {} - for detname in traces: - detdict = jsdata["detectors"][detname] - fvals = xr.Dataset( - data_vars={ - "signal": ( - ["uts", "elution_time"], - [detdict["values"]], - {"ancillary_variables": "signal_std_err"}, - ), - "signal_std_err": ( - ["uts", "elution_time"], - [np.ones(detdict["nValuesExpected"])], - {"standard_name": "signal standard_error"}, - ), - "elution_time_std_err": ( - ["elution_time"], - np.ones(detdict["nValuesExpected"]) / detdict["nValuesPerSecond"], - {"units": "s", "standard_name": "elution_time standard_error"}, - ), - }, - coords={ - "elution_time": ( - ["elution_time"], - np.arange(detdict["nValuesExpected"]) / detdict["nValuesPerSecond"], - {"units": "s", "ancillary_variables": "elution_time_std_err"}, - ), - "uts": (["uts"], [uts]), - }, - attrs={}, - ) - valve = jsdata.get("annotations", {}).get("valcoPosition", None) - if valve is not None: - fvals["valve"] = valve - vals[detname] = fvals - - dt = DataTree.from_dict(vals) - dt.attrs = metadata - return dt diff --git a/src/yadg/parsers/chromtrace/fusionzip.py b/src/yadg/parsers/chromtrace/fusionzip.py deleted file mode 100644 index 5087f227..00000000 --- a/src/yadg/parsers/chromtrace/fusionzip.py +++ /dev/null @@ -1,82 +0,0 @@ -""" -**fusionzip**: Processing Inficon Fusion zipped data format (zip). ------------------------------------------------------------------- - -This is a wrapper parser which unzips the provided zip file, and then uses -the :mod:`yadg.parsers.chromtrace.fusionjson` parser to parse every data -file present in the archive. - -Exposed metadata: -````````````````` - -.. code-block:: yaml - - method: !!str - sampleid: !!str - version: !!str - datafile: !!str - -.. codeauthor:: Peter Kraus -""" - -import zipfile -import tempfile -import os -import xarray as xr -from datatree import DataTree - -from .fusionjson import process as processjson - - -def process(*, fn: str, encoding: str, timezone: str, **kwargs: dict) -> DataTree: - """ - Fusion zip file format. - - The Fusion GC's can export their json formats as a zip archive of a folder - of jsons. This parser allows for parsing of this zip archive directly, - without the user having to unzip & move the data. - - Parameters - ---------- - fn - Filename to process. - - encoding - Not used as the file is binary. - - timezone - Timezone information. This should be ``"localtime"``. - - Returns - ------- - class:`datatree.DataTree` - A :class:`datatree.DataTree` containing one :class:`xarray.Dataset` per detector. If - multiple timesteps are found in the zip archive, the :class:`datatree.DataTrees` - are collated along the ``uts`` dimension. 
- - """ - - zf = zipfile.ZipFile(fn) - with tempfile.TemporaryDirectory() as tempdir: - zf.extractall(tempdir) - dt = None - for ffn in sorted(os.listdir(tempdir)): - path = os.path.join(tempdir, ffn) - if ffn.endswith("fusion-data"): - fdt = processjson(fn=path, encoding=encoding, timezone=timezone) - if dt is None: - dt = fdt - elif isinstance(dt, DataTree): - for k, v in fdt.items(): - if k in dt: # pylint: disable=E1135 - newv = xr.concat( - [dt[k].ds, v.ds], # pylint: disable=E1136 - dim="uts", - combine_attrs="identical", - ) - else: - newv = v.ds - dt[k] = DataTree(newv) # pylint: disable=E1137 - else: - raise RuntimeError("We should not get here.") - return dt diff --git a/src/yadg/parsers/dummy/__init__.py b/src/yadg/parsers/dummy/__init__.py deleted file mode 100644 index d16f5f9d..00000000 --- a/src/yadg/parsers/dummy/__init__.py +++ /dev/null @@ -1,98 +0,0 @@ -""" -This is a dummy parser, used mainly for testing of the :mod:`yadg` and :mod:`tomato` -packages. It provides no real functionality. - -Usage -````` -Available since ``yadg-4.0``. The parser supports the following parameters: - -.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_0.step.Dummy - -Formats -``````` -The ``filetypes`` currently supported by the parser are: - - - tomato's JSON file (``tomato.json``) - -Schema -`````` -The output schema is only defined for the ``tomato.json`` filetype. - -.. code-block:: yaml - - xr.Dataset: - coords: - uts: !!float - data_vars: - {{ entries }} (uts) # Elements present in the "data" entry - -The value of every element of ``data`` is assigned a deviation of 0.0. - -Module Functions -```````````````` - -""" - -from pydantic import BaseModel -import json -from ... import dgutils -from ..basiccsv.main import append_dicts, dicts_to_dataset -from datatree import DataTree - - -def process( - *, - fn: str, - filetype: str, - parameters: BaseModel, - **kwargs: dict, -) -> DataTree: - """ - A dummy parser. - - This parser simply returns the current time, the filename provided, and any - ``kwargs`` passed. - - In case the provided ``filetype`` is a ``tomato.json`` file, this is a json - data file from the :mod:`tomato` package, which should contain a :class:`list` - of ``{"value": float, "time": float}`` datapoints in its ``data`` entry. - - Parameters - ---------- - fn - Filename to process - - filetype - Accepts ``tomato.json`` as an optional "dummy instrument" filetype from - :mod:`tomato`. - - parameters - Parameters for :class:`~dgbowl_schemas.yadg.dataschema_5_0.step.Dummy`. 
- - Returns - ------- - :class:`xarray.Dataset` - - """ - if filetype == "tomato.json": - with open(fn, "r") as inf: - jsdata = json.load(inf) - - data_vals = {} - meta_vals = {} - for vi, vals in enumerate(jsdata["data"]): - vals["uts"] = vals.pop("time") - devs = {} - for k, v in vals.items(): - if k not in {"time", "address", "channel"}: - devs[k] = 0.0 - append_dicts(vals, devs, data_vals, meta_vals, fn, vi) - else: - kwargs = {} if parameters is None else parameters.dict() - if "parser" in kwargs: - del kwargs["parser"] - data_vals = {k: [v] for k, v in kwargs.items()} - data_vals["uts"] = [dgutils.now()] - meta_vals = {} - - return dicts_to_dataset(data_vals, meta_vals, fulldate=False) diff --git a/src/yadg/parsers/electrochem/__init__.py b/src/yadg/parsers/electrochem/__init__.py deleted file mode 100644 index c08f4e27..00000000 --- a/src/yadg/parsers/electrochem/__init__.py +++ /dev/null @@ -1,88 +0,0 @@ -""" -This module handles the reading and processing of files containing electrochemical -data, including BioLogic's EC-Lab file formats. The basic function of the parser is to: - -#. Read in the technique data and create timesteps. -#. Collect metadata, such as the measurement settings and the loops - contained in a given file. -#. Collect data describing the technique parameter sequences. - -Usage -````` -Available since ``yadg-4.0``. The parser supports the following parameters: - -.. _yadg.parsers.electrochem.model: - -.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_0.step.ElectroChem - -.. _yadg.parsers.electrochem.formats: - -Formats -``````` -The ``filetypes`` currently supported by the parser are: - - - EC-Lab raw data binary file and parameter settings (``eclab.mpr``), - see :mod:`~yadg.parsers.electrochem.eclabmpr` - - EC-Lab human-readable text export of data (``eclab.mpt``), - see :mod:`~yadg.parsers.electrochem.eclabmpt` - - tomato's structured json output (``tomato.json``), - see :mod:`~yadg.parsers.electrochem.tomatojson` - -Schema -`````` -Depending on the filetype, the output :class:`xarray.Dataset` may contain multiple -derived values. However, all filetypes will report at least the following: - -.. code-block:: yaml - - xr.Dataset: - coords: - uts: !!float - data_vars: - Ewe: (uts) # Potential of the working electrode (V) - Ece: (uts) # Potential of the counter electrode (V) - I: (uts) # Applied current (A) - -In some cases, average values (i.e. ```` or ````) may be reported instead -of the instantaneous data. - -.. warning:: - - In previous versions of :mod:`yadg`, the :mod:`~yadg.parsers.electrochem` parser - optionally transposed data from impedance spectroscopy, grouping the datapoints - in each scan into a single "trace". This behaviour has been removed in ``yadg-5.0``. - -Module Functions -```````````````` - -""" - -import xarray as xr -from . import eclabmpr, eclabmpt, tomatojson - - -def process( - *, - filetype: str, - **kwargs: dict, -) -> xr.Dataset: - """ - Unified parser for electrochemistry data. Forwards ``kwargs`` to the worker functions - based on the supplied ``filetype``. - - Parameters - ---------- - filetype - Discriminator used to select the appropriate worker function. 
- - Returns - ------- - :class:`xarray.Dataset` - - """ - if filetype == "eclab.mpr": - return eclabmpr.process(**kwargs) - elif filetype == "eclab.mpt": - return eclabmpt.process(**kwargs) - elif filetype == "tomato.json": - return tomatojson.process(**kwargs) diff --git a/src/yadg/parsers/electrochem/eclabcommon/__init__.py b/src/yadg/parsers/electrochem/eclabcommon/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/yadg/parsers/flowdata/__init__.py b/src/yadg/parsers/flowdata/__init__.py deleted file mode 100644 index 4b9e9431..00000000 --- a/src/yadg/parsers/flowdata/__init__.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -Handles the reading and processing of flow controller or flow meter data. - -Usage -````` -Available since ``yadg-4.0``. The parser supports the following parameters: - -.. _yadg.parsers.flowdata.model: - -.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_0.step.FlowData - -.. _yadg.parsers.flowdata.formats: - -Formats -``````` -The ``filetypes`` currently supported by the parser are: - - - DryCal log file text output (``drycal.txt``), - see :mod:`~yadg.parsers.flowdata.drycal` - - DryCal log file tabulated output (``drycal.csv``), - see :mod:`~yadg.parsers.flowdata.drycal` - - DryCal log file document file (``drycal.rtf``), - see :mod:`~yadg.parsers.flowdata.drycal` - -.. _yadg.parsers.flowdata.provides: - -Schema -`````` -The parser is used to extract all tabular data in the input file. This parser processes -additional calibration information analogously to :mod:`~yadg.parsers.basiccsv`. - -Module Functions -```````````````` - -""" - -from .main import process - -__all__ = ["process"] diff --git a/src/yadg/parsers/flowdata/main.py b/src/yadg/parsers/flowdata/main.py deleted file mode 100644 index a455287c..00000000 --- a/src/yadg/parsers/flowdata/main.py +++ /dev/null @@ -1,66 +0,0 @@ -import logging -import xarray as xr -from . import drycal - -logger = logging.getLogger(__name__) - - -def process( - *, - fn: str, - filetype: str, - encoding: str, - timezone: str, - **kwargs: dict, -) -> xr.Dataset: - """ - Flow meter data processor - - This parser processes flow meter data. - - Parameters - ---------- - fn - File to process - - encoding - Encoding of ``fn``, by default "utf-8". - - timezone - A string description of the timezone. Default is "localtime". - - parameters - Parameters for :class:`~dgbowl_schemas.yadg.dataschema_5_0.step.FlowData`. - - Returns - ------- - :class:`xarray.Dataset` - - """ - - if filetype.startswith("drycal"): - - if filetype.endswith(".rtf") or fn.endswith("rtf"): - vals = drycal.rtf(fn, encoding, timezone) - elif filetype.endswith(".csv") or fn.endswith("csv"): - vals = drycal.sep(fn, ",", encoding, timezone) - elif filetype.endswith(".txt") or fn.endswith("txt"): - vals = drycal.sep(fn, "\t", encoding, timezone) - - # check timestamps are increasing: - warn = True - ndays = 0 - utslist = vals.uts.values - for i in range(1, vals.uts.size): - if utslist[i] < utslist[i - 1]: - if warn: - logger.warning("DryCal log crossing day boundary. 
Adding offset.") - warn = False - uts = utslist[i] + ndays * 86400 - while uts < utslist[i - 1]: - ndays += 1 - uts = utslist[i] + ndays * 86400 - utslist[i] = uts - vals["uts"] = xr.DataArray(data=utslist, dims=["uts"]) - vals.attrs["fulldate"] = False - return vals diff --git a/src/yadg/parsers/masstrace/__init__.py b/src/yadg/parsers/masstrace/__init__.py deleted file mode 100644 index cd4556ce..00000000 --- a/src/yadg/parsers/masstrace/__init__.py +++ /dev/null @@ -1,90 +0,0 @@ -""" -Handles the reading and processing of mass spectrometry files. The basic function of the -parser is to: - -#. read in the raw data and create timestamped traces with one :class:`xarray.Dataset` per trace -#. collect `metadata` such as the software version, author, etc. - -Usage -````` -Select :mod:`~yadg.parsers.masstrace` by supplying it to the ``parser`` keyword -in the `dataschema`. The parser supports the following parameters: - -.. _yadg.parsers.masstrace.model: - -.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_0.step.MassTrace - -.. _yadg.parsers.masstrace.formats: - -Formats -``````` -The ``filetypes`` currently supported by the parser are: - - - Pfeiffer Quadstar 32-bit scan analog data (``quadstar.sac``), - see :mod:`~yadg.parsers.masstrace.quadstarsac` - -.. _yadg.parsers.masstrace.provides: - -Schema -`````` -The raw data, loaded from the supplied files, is stored using the following format: - -.. code-block:: yaml - - datatree.DataTree: - {{ detector_name }} !!xr.Dataset - coords: - uts: !!float - mass_to_charge: !!float # m/z (amu) - data_vars: - y: (uts, mass_to_charge) # Detected signal (counts) - -The uncertainties in ``mass_to_charge`` are taken as the step-width of -the linearly spaced mass values. - -The uncertainties in of ``y`` are the largest value between: - -#. The quantization error from the ADC, its resolution assumed to be 32 - bit. Dividing F.S.R. by ``2 ** 32`` gives an error in the order of - magnitude of the smallest data value in ``y``. -#. The contribution from neighboring masses. In the operating manual of - the QMS 200 (see 2.8 QMS 200 F & 2.9 QMS 200 M), a maximum - contribution from the neighboring mass of 50 ppm is noted. - -.. note:: - - The data in ``y`` may contain ``NaN`` s. The measured ion - count/current value will occasionally exceed the specified detector - F.S.R. (e.g. 1e-9), and will then flip directly to the maximum value - of a float32. These values are set to ``float("NaN")``. - -Module Functions -```````````````` - -""" - -import datatree -from . import quadstarsac - - -def process( - *, - filetype: str, - **kwargs: dict, -) -> datatree.DataTree: - """ - Unified mass spectrometry data parser.Forwards ``kwargs`` to the worker functions - based on the supplied ``filetype``. - - Parameters - ---------- - filetype - Discriminator used to select the appropriate worker function. - - Returns - ------- - :class:`datatree.DataTree` - - """ - if filetype == "quadstar.sac": - return quadstarsac.process(**kwargs) diff --git a/src/yadg/parsers/meascsv/__init__.py b/src/yadg/parsers/meascsv/__init__.py deleted file mode 100644 index e2b72426..00000000 --- a/src/yadg/parsers/meascsv/__init__.py +++ /dev/null @@ -1,115 +0,0 @@ -""" -This parser handles the reading and processing of the legacy log files created by -the LabView interface for the MCPT instrument. These files contain information about -the timestamp, temperatures, and inlet / process flows. - -.. 
admonition:: DEPRECATED in ``yadg-4.0`` - - As of ``yadg-4.0``, this parser is deprecated and should not be used for new data. - Please consider switching to the :mod:`~yadg.parsers.basiccsv` parser. - -Usage -````` -Available since ``yadg-3.0``. Deprecated since ``yadg-4.0``. The parser supports the -following parameters: - -.. _yadg.parsers.meascsv.model: - -.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_0.step.MeasCSV - -.. _parsers_meascsv_provides: - -Schema -`````` -The parser is used to extract all of the tabular data in the input file, storing -them in the same format as :mod:`~yadg.parsers.basiccsv`, using the column headers -as keys. - -""" - -import logging -from pydantic import BaseModel -from ..basiccsv.main import process_row, append_dicts, dicts_to_dataset -from ... import dgutils -import xarray as xr - -logger = logging.getLogger(__name__) - - -def process( - *, - fn: str, - encoding: str, - timezone: str, - parameters: BaseModel, - **kwargs: dict, -) -> xr.Dataset: - """ - Legacy MCPT measurement log parser. - - This parser is included to maintain parity with older schemas and datagrams. - It is essentially a wrapper around :func:`yadg.parsers.basiccsv.main.process_row`. - - .. admonition:: DEPRECATED in ``yadg-4.0`` - - For new applications, please use the :mod:`~yadg.parsers.basiccsv` parser. - - Parameters - ---------- - fn - File to process - - encoding - Encoding of ``fn``, by default "utf-8". - - timezone - A string description of the timezone. Default is "localtime". - - parameters - Parameters for :class:`~dgbowl_schemas.yadg.dataschema_5_0.step.MeasCSV`. - - Returns - ------- - :class:`xarray.Dataset` - A :class:`xarray.Dataset` containing the timesteps, metadata, and full date tag. No - metadata is returned. The full date is always provided in :mod:`~yadg.parsers.meascsv` - compatible files. - - """ - logger.warning("This parser is deprecated. Please switch to 'basiccsv'.") - - with open(fn, "r", encoding=encoding) as infile: - lines = [i.strip() for i in infile.readlines()] - - headers = [i.strip() for i in lines.pop(0).split(";")] - - for hi, header in enumerate(headers): - if "/" in header: - logger.warning("Replacing '/' for '_' in header '%s'.", header) - headers[hi] = header.replace("/", "_") - - _units = [i.strip() for i in lines.pop(0).split(";")] - units = {} - for h in headers: - units[h] = _units.pop(0) - - units = dgutils.sanitize_units(units) - - datecolumns, datefunc, fulldate = dgutils.infer_timestamp_from( - spec=parameters.timestamp, - timezone=timezone, - ) - - # Process rows - data_vals = {} - meta_vals = {"_fn": []} - for li, line in enumerate(lines): - vals, devs = process_row( - headers, - line.split(";"), - datefunc, - datecolumns, - ) - append_dicts(vals, devs, data_vals, meta_vals, fn, li) - - return dicts_to_dataset(data_vals, meta_vals, units, fulldate) diff --git a/src/yadg/parsers/qftrace/__init__.py b/src/yadg/parsers/qftrace/__init__.py deleted file mode 100644 index 30111e9a..00000000 --- a/src/yadg/parsers/qftrace/__init__.py +++ /dev/null @@ -1,71 +0,0 @@ -""" -The module handles the reading and processing of the network analyzer -traces, containing the reflection coefficient as a function of the sweeped frequency, -:math:`\\Gamma(f)`. - -:mod:`~yadg.parsers.qftrace` loads the reflection trace data, determines the -uncertainties of the signal (y-axis), and explicitly populates the points in -the time axis (x-axis). - -Usage -````` -Available since ``yadg-3.0``. The parser supports the following parameters: - -.. 
_yadg.parsers.qftrace.model: - -.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_0.step.QFTrace - -.. _yadg.parsers.qftrace.formats: - - - LabView output in a tab-separated format (``csv``): - :mod:`~yadg.parsers.qftrace.labviewcsv` - -.. _yadg.parsers.qftrace.provides: - -Schema -`````` -For filetypes containing the reflection trace data, the schema is as follows: - -.. code-block:: yaml - - datatree.DataTree: - S11: - coords: - uts: !!float - freq: !!float # Field frequency (Hz) - data_vars: - Re(G): (uts, freq) # Imaginary part of the reflection coefficient - Im(G) (uts, freq) # Real part of the reflection coefficient - average: (uts) # Number of scans averaged to form a single trace - bandwidth: (uts) # Filter bandwidth (Hz) - -Module Functions -```````````````` - -""" - -from . import labviewcsv -import datatree - - -def process( - *, - filetype: str, - **kwargs: dict, -) -> datatree.DataTree: - """ - VNA reflection trace parser. Forwards ``kwargs`` to the worker functions - based on the supplied ``filetype``. - - Parameters - ---------- - filetype - Discriminator used to select the appropriate worker function. - - Returns - ------- - :class:`datatree.DataTree` - - """ - if filetype == "labview.csv": - return labviewcsv.process(**kwargs) diff --git a/src/yadg/parsers/xpstrace/__init__.py b/src/yadg/parsers/xpstrace/__init__.py deleted file mode 100644 index 93582b0f..00000000 --- a/src/yadg/parsers/xpstrace/__init__.py +++ /dev/null @@ -1,70 +0,0 @@ -""" -This module handles the reading and processing of X-ray photoelectron spectroscopy -data, including determining the uncertainties of the signal (y-axis), and explicitly -populating the points in the energy axis (``E``). - -Usage -````` -Available since ``yadg-4.1``. The parser supports the following parameters: - -.. _yadg.parsers.xpstrace.model: - -.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_0.step.XPSTrace - -.. _yadg.parsers.xpstrace.formats: - -Formats -``````` -The ``filetypes`` currently supported by the parser are: - - - ULVAC PHI Multipak XPS traces (``phi.spe``), - see :mod:`~yadg.parsers.xpstrace.phispe` - -.. _yadg.parsers.xpstrace.provides: - -Provides -```````` -The raw data is stored, for each timestep, using the following format: - -.. code-block:: yaml - - datatree.DataTree: - {{ trace_name }} !!xr.Dataset - coords: - uts: !!float - E: !!float # binding energies (eV) - data_vals: - y: (uts, E) # signal - -Module Functions -```````````````` - -""" - -import datatree -from . import phispe - - -def process( - *, - filetype: str, - **kwargs: dict, -) -> datatree.DataTree: - """ - Unified x-ray photoelectron spectroscopy parser. Forwards ``kwargs`` to the worker - functions based on the supplied ``filetype``. - - This parser processes XPS scans in signal(energy) format. - - Parameters - ---------- - filetype - Discriminator used to select the appropriate worker function. - - Returns - ------- - :class:`datatree.DataTree` - - """ - if filetype == "phi.spe": - return phispe.process(**kwargs) diff --git a/src/yadg/parsers/xrdtrace/__init__.py b/src/yadg/parsers/xrdtrace/__init__.py deleted file mode 100644 index 6b04f801..00000000 --- a/src/yadg/parsers/xrdtrace/__init__.py +++ /dev/null @@ -1,75 +0,0 @@ -""" -This module handles the reading and processing of X-ray diffraction data. It loads X-ray -diffraction data, determines reasonable uncertainties of the signal intensity (y-axis), -and explicitly populates the angle axis (:math:`2\\theta`), if necessary. - -Usage -````` -Available since ``yadg-4.0``. 
The parser supports the following parameters: - -.. _yadg.parsers.xrdtrace.model: - -.. autopydantic_model:: dgbowl_schemas.yadg.dataschema_5_0.step.XRDTrace - -.. _yadg.parsers.xrdtrace.formats: - -Formats -``````` -The ``filetypes`` currently supported by the parser are: - - - PANalytical ``xrdml`` files (``panalytical.xrdml``), - see :mod:`~yadg.parsers.xrdtrace.panalyticalxrdml` - - PANalytical ``csv`` files (``panalytical.csv``), - see :mod:`~yadg.parsers.xrdtrace.panalyticalcsv` - - PANalytical ``xy`` files (``panalytical.xy``), - see :mod:`~yadg.parsers.xrdtrace.panalyticalxy` - -.. _yadg.parsers.xrdtrace.provides: - -Provides -```````` -The raw data is stored, for each timestep, using the following format: - -.. code-block:: yaml - - xr.Dataset: - coords: - uts: !!float - angle: !!float # Diffraction angle (deg) - data_vals: - intensity: (uts, angle) # Detector intensity (counts) - -""" - -import xarray as xr -from . import panalyticalxrdml, panalyticalcsv, panalyticalxy - - -def process( - *, - filetype: str, - **kwargs: dict, -) -> xr.Dataset: - """ - Unified X-ray diffractogram data parser. Forwards ``kwargs`` to the worker - functions based on the supplied ``filetype``. - - This parser processes XPS scans in signal(energy) format. - - Parameters - ---------- - filetype - Discriminator used to select the appropriate worker function. - - Returns - ------- - :class:`xarray.Dataset` - - - """ - if filetype == "panalytical.xrdml": - return panalyticalxrdml.process(**kwargs) - elif filetype == "panalytical.csv": - return panalyticalcsv.process(**kwargs) - elif filetype == "panalytical.xy": - return panalyticalxy.process(**kwargs) diff --git a/tests/test_chromdata.py b/tests/test_chromdata.py index 943ce134..e7e01282 100644 --- a/tests/test_chromdata.py +++ b/tests/test_chromdata.py @@ -16,6 +16,7 @@ def special_datagram_test(datagram, testspec): if i not in testspec: continue ret = dg_get_quantity(datagram, testspec["step"], i, testspec["point"]) + print(f"{ret=}") for k, v in testspec[i].items(): rval = ret["n"].sel(dict(species=k)) rdev = ret["s"].sel(dict(species=k))
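One note on the test change that closes this patch: ``dg_get_quantity`` returns nominal values (``"n"``) and errors (``"s"``) as DataArrays indexed by ``species``, so per-species checks use label-based ``.sel``. A self-contained illustration with an invented dataset mirroring the ``(uts, species)`` layout used in these tests:

.. code-block:: python

    import xarray as xr

    ds = xr.Dataset(
        data_vars={"concentration": (["uts", "species"], [[0.1, 0.9]])},
        coords={"uts": [0.0], "species": ["CO", "CO2"]},
    )
    # Same lookup as ret["n"].sel(dict(species=k)) in the test:
    co2 = ds["concentration"].sel(species="CO2")
    assert co2.item() == 0.9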