Technical Debt: Round 2 (#138)
* custom extractors docs etc.

* Agilent

* Make the linter happy

* More docs changes, and drycal.

* Ezchrom & Tomato

* panalytica.

* phi

* Phi.spe and Quadstar.sac

* fusion

* Nuke most of parsers

* Linter

* The big rename

* Move stuff around

* Purge parsers

* Fix docs

* Update eclab docs
PeterKraus authored Mar 31, 2024
1 parent fe71ea3 commit 6ec691e
Showing 104 changed files with 1,651 additions and 2,675 deletions.
62 changes: 26 additions & 36 deletions docs/apidoc_t/package.rst_t
@@ -15,40 +15,32 @@
 
 {%- if is_namespace %}
 {{- [pkgname, "namespace"] | join(" ") | e | heading }}
-{% elif 'yadg.parsers.basiccsv' == pkgname %}
-{{- "**basiccsv**: Common tabular file parser" | heading }}
-{% elif 'yadg.parsers.chromdata' == pkgname %}
-{{- "**chromdata**: Post-processed chromatography data parser" | heading }}
-{% elif 'yadg.parsers.chromtrace' == pkgname %}
-{{- "**chromtrace**: Raw chromatogram trace file parser" | heading }}
-{% elif 'yadg.parsers.dummy' == pkgname %}
-{{- "**dummy**: A dummy parser" | heading }}
-{% elif 'yadg.parsers.electrochem' == pkgname %}
-{{- "**electrochem**: Electrochemistry data parser" | heading }}
-{% elif 'yadg.parsers.flowdata' == pkgname %}
-{{- "**flowdata**: Flow data parser" | heading }}
-{% elif 'yadg.parsers.masstrace' == pkgname %}
-{{- "**masstrace**: Mass spectroscopy trace file parser" | heading }}
-{% elif 'yadg.parsers.meascsv' == pkgname %}
-{{- "**meascsv**: Legacy MCPT log file parser" | heading }}
-{% elif 'yadg.parsers.qftrace' == pkgname %}
-{{- "**qftrace**: Network analyser trace file parser" | heading }}
-{% elif 'yadg.parsers.xpstrace' == pkgname %}
-{{- "**xpstrace**: XPS trace file parser" | heading }}
-{% elif 'yadg.parsers.xrdtrace' == pkgname %}
-{{- "**xrdtrace**: X-ray diffractogram trace file parser" | heading }}
-{% elif 'yadg.extractors.agilentch' == pkgname %}
-{{- "**agilent-ch**: Agilent ChemStation export ``.CH``" | heading }}
-{% elif 'yadg.extractors.agilentdx' == pkgname %}
-{{- "**agilent-dx**: Agilent OpenLab raw data ``.dx``" | heading }}
-{% elif 'yadg.extractors.eclabmpr' == pkgname %}
-{{- "**eclab-mpr**: BioLogic ECLab binary ``.mpr``" | heading }}
-{% elif 'yadg.extractors.eclabmpt' == pkgname %}
-{{- "**eclab-mpr**: BioLogic ECLab export ``.mpt``" | heading }}
-{% elif 'yadg.extractors.panalyticalxrdml' == pkgname %}
-{{- "**panalytical-xrdml**: PANalytical XRDML ``.xrdml``" | heading }}
-{% elif 'yadg.extractors.phispe' == pkgname %}
-{{- "**phi-spe**: ULVAC-PHI Multipak ``.spe``" | heading }}
+{% elif 'yadg.extractors.basic' == pkgname %}
+{{- "**basic**: For tabulated data" | heading }}
+{% elif 'yadg.extractors.empalc' == pkgname %}
+{{- "**empalc**: For Empa's LC data" | heading }}
+{% elif 'yadg.extractors.example' == pkgname %}
+{{- "**example**: For yadg testing" | heading }}
+{% elif 'yadg.extractors.fhimcpt' == pkgname %}
+{{- "**fhimcpt**: For MCPT set-up at FHI" | heading }}
+{% elif 'yadg.extractors.agilent' == pkgname %}
+{{- "**agilent**: For Agilent chromatograms" | heading }}
+{% elif 'yadg.extractors.drycal' == pkgname %}
+{{- "**drycal**: For MesaLabs Drycal Pro data" | heading }}
+{% elif 'yadg.extractors.eclab' == pkgname %}
+{{- "**eclab**: For BioLogic data files" | heading }}
+{% elif 'yadg.extractors.ezchrom' == pkgname %}
+{{- "**ezchrom**: For EZChrom chromatograms" | heading }}
+{% elif 'yadg.extractors.fusion' == pkgname %}
+{{- "**fusion**: For Fusion chromatograms" | heading }}
+{% elif 'yadg.extractors.panalytical' == pkgname %}
+{{- "**panalytical**: For Panalytical XRD data" | heading }}
+{% elif 'yadg.extractors.phi' == pkgname %}
+{{- "**phi**: For Phi XPS data" | heading }}
+{% elif 'yadg.extractors.quadstar' == pkgname %}
+{{- "**quadstar**: For Quadstar MS data" | heading }}
+{% elif 'yadg.extractors.tomato' == pkgname %}
+{{- "**tomato**: For tomato outputs" | heading }}
 {% else %}
 {{- [pkgname, "package"] | join(" ") | e | heading }}
 {% endif %}
@@ -69,8 +61,6 @@ Subpackages
 {% endif %}
 
 {%- if submodules %}
-Submodules
-----------
 {% if separatemodules %}
 {{ toctree(submodules) }}
 {% else %}
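The template's `elif` chain above is effectively a lookup table from package name to docs heading; after this commit it only special-cases the renamed `yadg.extractors.*` packages, and the old `yadg.parsers.*` branches are gone. A minimal Python sketch of the same logic (a subset of the branches above; `heading_for` is a hypothetical helper, not part of the template):

```python
# Subset of the heading overrides encoded by the Jinja template above.
HEADINGS = {
    "yadg.extractors.agilent": "**agilent**: For Agilent chromatograms",
    "yadg.extractors.eclab": "**eclab**: For BioLogic data files",
    "yadg.extractors.tomato": "**tomato**: For tomato outputs",
}

def heading_for(pkgname: str) -> str:
    # Fall back to the generic "<pkgname> package" heading, as the
    # template's else branch does.
    return HEADINGS.get(pkgname, f"{pkgname} package")
```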
3 changes: 1 addition & 2 deletions docs/source/conf.py
@@ -35,7 +35,7 @@
     # "sphinx.ext.coverage",
     "sphinx.ext.napoleon",
     "sphinx.ext.intersphinx",
-    # "sphinx.ext.autosummary",
+    "sphinx.ext.autosummary",
     "sphinx_autodoc_typehints",
     "sphinx_rtd_theme",
     "sphinxcontrib.autodoc_pydantic",
@@ -63,7 +63,6 @@
 html_theme_options = {
     "body_max_width": "none",
     "sticky_navigation": True,
-    "navigation_depth": 6,
 }
 html_logo = "./images/yadg.png"
 html_favicon = "./images/yadg_ico.png"
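Taken together, the two hunks re-enable the `autosummary` extension and drop the explicit `navigation_depth` override, so `sphinx_rtd_theme` falls back to its default depth of 4. A sketch of the affected `conf.py` blocks after this commit (surrounding entries as shown above):

```python
# docs/source/conf.py (sketch; only the blocks touched by this commit)
extensions = [
    # "sphinx.ext.coverage",
    "sphinx.ext.napoleon",
    "sphinx.ext.intersphinx",
    "sphinx.ext.autosummary",  # previously commented out
    "sphinx_autodoc_typehints",
    "sphinx_rtd_theme",
    "sphinxcontrib.autodoc_pydantic",
]

html_theme_options = {
    "body_max_width": "none",
    "sticky_navigation": True,
    # "navigation_depth": 6 is removed; the theme default (4) applies
}
```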
11 changes: 2 additions & 9 deletions docs/source/extractors.rst
@@ -4,12 +4,5 @@
    :hidden:
    :glob:
 
-   apidoc/yadg.extractors.public.*
-
-.. toctree::
-   :maxdepth: 1
-   :caption: yadg custom extractors
-   :hidden:
-   :glob:
-
-   apidoc/yadg.extractors.custom.*
+   apidoc/yadg.extractors.example
+   apidoc/yadg.extractors.*
82 changes: 9 additions & 73 deletions src/yadg/core.py
@@ -1,12 +1,9 @@
 from importlib import metadata
 import logging
 import importlib
-import xarray as xr
-import numpy as np
 from typing import Callable
 from datatree import DataTree
 from xarray import Dataset
-from pydantic import BaseModel
 
 from dgbowl_schemas.yadg.dataschema import DataSchema
 from yadg import dgutils
@@ -20,18 +17,13 @@ def infer_extractor(extractor: str) -> Callable:
     A function that finds an :func:`extract` function of the supplied ``extractor``.
     """
-    modnames = [
-        f"yadg.extractors.public.{extractor}",
-        f"yadg.extractors.custom.{extractor}",
-        f"yadg.extractors.{extractor.replace('.','')}",
-    ]
-    for modname in modnames:
-        try:
-            m = importlib.import_module(modname)
-            if hasattr(m, "extract"):
-                return getattr(m, "extract")
-        except ImportError:
-            logger.critical(f"could not import module '{modname}'")
+    modname = f"yadg.extractors.{extractor}"
+    try:
+        m = importlib.import_module(modname)
+        if hasattr(m, "extract"):
+            return getattr(m, "extract")
+    except ImportError:
+        logger.critical(f"could not import module '{modname}'")
     raise RuntimeError
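With the `public`/`custom` split gone, extractor lookup in `infer_extractor` reduces to a single dynamic import, as the hunk above shows. A self-contained sketch of the same pattern (`find_extract` is an illustrative stand-in, not the yadg API):

```python
import importlib
import logging

logger = logging.getLogger(__name__)

def find_extract(extractor: str):
    # One candidate module path after the rename: "eclab" maps to
    # "yadg.extractors.eclab"; no more public./custom. prefixes.
    modname = f"yadg.extractors.{extractor}"
    try:
        m = importlib.import_module(modname)
        if hasattr(m, "extract"):
            return getattr(m, "extract")
    except ImportError:
        logger.critical(f"could not import module '{modname}'")
    raise RuntimeError(f"no extract() found for extractor '{extractor}'")
```

For example, `find_extract("example")` imports `yadg.extractors.example` and returns its `extract` function.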


@@ -99,69 +91,13 @@ def process_schema(dataschema: DataSchema, strict_merge: bool = False) -> DataTree:
                     # there are no variables - we don't add 'uts' to those.
                     fvals[name] = dset
                 else:
-                    fvals[name] = complete_uts(
+                    fvals[name] = dgutils.complete_uts(
                         dset, tf, step.externaldate, step.extractor.timezone
                     )
-        vals = merge_dicttrees(vals, fvals, concatmode)
+        vals = dgutils.merge_dicttrees(vals, fvals, concatmode)
 
     stepdt = DataTree.from_dict({} if vals is None else vals)
     stepdt.name = step.tag
     stepdt.attrs = sattrs
     stepdt.parent = root
     return root
-
-
-def complete_uts(
-    ds: Dataset,
-    filename: str,
-    externaldate: BaseModel,
-    timezone: str,
-) -> Dataset:
-    """
-    A helper function ensuring that the Dataset ``ds`` contains a dimension ``"uts"``,
-    and that the timestamps in ``"uts"`` are completed as instructed in the
-    ``externaldate`` specification.
-    """
-    if not hasattr(ds, "uts"):
-        ds = ds.expand_dims("uts")
-    if len(ds.uts.coords) == 0:
-        ds["uts"] = np.zeros(ds.uts.size)
-        ds.attrs["fulldate"] = False
-    if not ds.attrs.get("fulldate", True) or externaldate is not None:
-        ts, fulldate = dgutils.complete_timestamps(
-            timesteps=ds.uts.values,
-            fn=filename,
-            spec=externaldate,
-            timezone=timezone,
-        )
-        ds["uts"] = ts
-        if fulldate:
-            ds.attrs.pop("fulldate", None)
-        else:
-            # cannot store booleans in NetCDF files
-            ds.attrs["fulldate"] = int(fulldate)
-
-    return ds
-
-
-def merge_dicttrees(vals: dict, fvals: dict, mode: str) -> dict:
-    """
-    A helper function that merges two ``DataTree.to_dict()`` objects by concatenating
-    the new values in ``fvals`` to the existing ones in ``vals``.
-    """
-    if vals is None:
-        return fvals
-    for k in fvals.keys():
-        try:
-            vals[k] = xr.concat([vals[k], fvals[k]], dim="uts", combine_attrs=mode)
-        except xr.MergeError:
-            raise RuntimeError(
-                "Merging metadata from multiple files has failed, as some of the "
-                "values differ between files. This might be caused by trying to "
-                "parse data obtained using different techniques/protocols in a "
-                "single step. If you are certain this is what you want, try using "
-                "yadg with the '--ignore-merge-errors' option."
-            )
-    return vals
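The two helpers deleted here live on in `yadg.dgutils` (see the hunks below). The mechanism behind `merge_dicttrees` is `xarray.concat` along `uts` with a `combine_attrs` mode, which is what the `--ignore-merge-errors` option in the deleted error message toggles. A sketch of that behaviour (the exact mode strings yadg passes as `concatmode` are an assumption here):

```python
import xarray as xr

ds_a = xr.Dataset({"T": ("uts", [300.0])}, coords={"uts": [0.0]})
ds_b = xr.Dataset({"T": ("uts", [301.0])}, coords={"uts": [1.0]})
ds_a.attrs["method"] = "OCV"
ds_b.attrs["method"] = "CP"  # conflicting metadata between two files

# A strict mode such as "identical" raises xr.MergeError on the conflicting
# attrs, which yadg re-raises as the RuntimeError above; a lenient mode
# drops the conflicting attribute instead:
merged = xr.concat([ds_a, ds_b], dim="uts", combine_attrs="drop_conflicts")
print(merged.attrs)  # {} - the conflicting "method" attr was dropped
```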
17 changes: 15 additions & 2 deletions src/yadg/dgutils/__init__.py
@@ -1,18 +1,31 @@
 from .helpers import get_yadg_metadata
-from .dateutils import now, infer_timestamp_from, ole_to_uts, complete_timestamps
-from .utils import update_schema, schema_from_preset
+from .dateutils import (
+    now,
+    infer_timestamp_from,
+    str_to_uts,
+    ole_to_uts,
+    complete_timestamps,
+    complete_uts,
+)
+from .schemautils import update_schema, schema_from_preset
 from .btools import read_value
 from .pintutils import sanitize_units, ureg
+from .dsutils import dicts_to_dataset, append_dicts, merge_dicttrees
 
 __all__ = [
     "get_yadg_metadata",
     "now",
     "infer_timestamp_from",
+    "str_to_uts",
     "ole_to_uts",
     "complete_timestamps",
+    "complete_uts",
     "update_schema",
     "schema_from_preset",
     "read_value",
     "sanitize_units",
     "ureg",
+    "dicts_to_dataset",
+    "append_dicts",
+    "merge_dicttrees",
 ]
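After this change the date helpers and the Dataset/DataTree utilities are importable from a single namespace. A short sketch (the timestamp value is illustrative; the `str_to_uts` keywords match the call shown in the `dateutils.py` hunks below):

```python
from yadg.dgutils import str_to_uts

# Convert an ISO 8601 string to a Unix timestamp in a given timezone.
uts = str_to_uts(timestamp="2024-03-31T12:00:00", timezone="Europe/Berlin")
```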
52 changes: 42 additions & 10 deletions src/yadg/dgutils/dateutils.py
@@ -6,7 +6,9 @@
 import logging
 from zoneinfo import ZoneInfo
 import numpy as np
+from pydantic import BaseModel
 from typing import Callable, Union, Mapping, Iterable
+from xarray import Dataset
 from dgbowl_schemas.yadg.dataschema_5_0.externaldate import ExternalDate
 from dgbowl_schemas.yadg.dataschema_5_0.timestamp import TimestampSpec
 
@@ -20,7 +22,8 @@ def now(
     """
     Wrapper around datetime.now()
-    A convenience function for returning the current time as a ISO 8601 or as a unix timestamp.
+    A convenience function for returning the current time as a ISO 8601 or as a Unix
+    timestamp.
     """
     dt = datetime.datetime.now(tz=tz)
     if asstr:
@@ -138,8 +141,8 @@ def infer_timestamp_from(
     spec
         A specification of timestamp elements with associated column indices and
-        optional formats. Currently accepted combinations of keys are: "uts"; "timestamp";
-        "date" and / or "time".
+        optional formats. Currently accepted combinations of keys are: "uts";
+        "timestamp"; "date" and / or "time".
     tz
         Timezone to use for conversion. By default, UTC is used.
@@ -194,7 +197,7 @@ def timefn(value):
 
     else:
         logger.debug(
-            "Assuming specified column containing the time is in ISO 8601 format"
+            "Assuming specified column is time in ISO 8601 format."
         )
 
     def timefn(value):
@@ -220,14 +223,10 @@ def retfn(date, time):
 
         return cols, retfn, True
     elif "uts" in headers:
-        logger.debug(
-            "No timestamp spec provided, assuming column 'uts' is a valid unix timestamp"
-        )
+        logger.debug("Assuming column 'uts' is a valid unix timestamp.")
         return [headers.index("uts")], float, True
     elif "timestamp" in headers:
-        logger.debug(
-            "No timestamp spec provided, assuming column 'timestamp' is a valid ISO 8601 timestamp"
-        )
+        logger.debug("Assuming column 'timestamp' is a valid ISO 8601 timestamp")
 
         def retfunc(value):
             return str_to_uts(timestamp=value, timezone=timezone)
@@ -420,3 +419,36 @@ def timestamps_from_file(
         )
     else:
         return float(data)
+
+
+def complete_uts(
+    ds: Dataset,
+    filename: str,
+    externaldate: BaseModel,
+    timezone: str,
+) -> Dataset:
+    """
+    A helper function ensuring that the Dataset ``ds`` contains a dimension ``"uts"``,
+    and that the timestamps in ``"uts"`` are completed as instructed in the
+    ``externaldate`` specification.
+    """
+    if not hasattr(ds, "uts"):
+        ds = ds.expand_dims("uts")
+    if len(ds.uts.coords) == 0:
+        ds["uts"] = np.zeros(ds.uts.size)
+        ds.attrs["fulldate"] = False
+    if not ds.attrs.get("fulldate", True) or externaldate is not None:
+        ts, fulldate = complete_timestamps(
+            timesteps=ds.uts.values,
+            fn=filename,
+            spec=externaldate,
+            timezone=timezone,
+        )
+        ds["uts"] = ts
+        if fulldate:
+            ds.attrs.pop("fulldate", None)
+        else:
+            # cannot store booleans in NetCDF files
+            ds.attrs["fulldate"] = int(fulldate)
+    return ds
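A short usage sketch of the relocated `complete_uts`: when the `Dataset` already carries complete Unix timestamps (no `fulldate` attribute, which defaults to true) and no `externaldate` spec is supplied, the data passes through unchanged. The filename below is hypothetical:

```python
import xarray as xr
from yadg.dgutils import complete_uts

ds = xr.Dataset(
    {"flow": ("uts", [1.2, 1.3])},
    coords={"uts": [1711886400.0, 1711886460.0]},
)
# Timestamps are already complete and there is no externaldate spec,
# so the Dataset is returned as-is.
ds = complete_uts(ds, "2024-03-31-run.csv", None, "Europe/Berlin")
```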
[The remaining 98 changed files in this commit are not shown here.]
