cadsr importer (#132)

* Adding caDSR CDE ingester * docs * Adding models * pytestified test_rdfs_importer. Fixed rdfs import bug
linkml · Feb 26, 2024 · 6186a49 · 6186a49
1 parent e9bbd52
commit 6186a49
Show file tree

Hide file tree

Showing 16 changed files with 13,302 additions and 286 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,7 +12,7 @@ packages = [
 
 [tool.poetry.dependencies]
 python = "^3.9"
-linkml = ">=1.6.7"
+linkml = "^1.7.4"
 mkdocs = ">=1.2.3"
 pandas = ">=1.3.5"
 python-dateutil = ">=2.8.2"
@@ -30,6 +30,7 @@ inflect = ">=6.0.0"
 schemasheets = ">=0.1.24"
 xmltodict = "^0.13.0"
 click-default-group = "^1.2.4"
+linkml-runtime = "^1.7.2"
 
 
 [tool.poetry.dev-dependencies]

diff --git a/schema_automator/cli.py b/schema_automator/cli.py
@@ -5,6 +5,8 @@
 """
 import logging
 import os
+from pathlib import Path
+
 import click
 
 
@@ -20,6 +22,7 @@
 from schema_automator.generalizers.csv_data_generalizer import CsvDataGeneralizer
 from schema_automator.generalizers.generalizer import DEFAULT_CLASS_NAME, DEFAULT_SCHEMA_NAME
 from schema_automator.generalizers.pandas_generalizer import PandasDataGeneralizer
+from schema_automator.importers.cadsr_import_engine import CADSRImportEngine
 from schema_automator.importers.dosdp_import_engine import DOSDPImportEngine
 from schema_automator.generalizers.json_instance_generalizer import JsonDataGeneralizer
 from schema_automator.importers.jsonschema_import_engine import JsonSchemaImportEngine
@@ -387,6 +390,27 @@ def import_frictionless(input, output, schema_name, schema_id, **kwargs):
     write_schema(schema, output)
 
 
+@main.command()
+@output_option
+@schema_name_option
+@schema_id_option
+@click.argument('input')
+def import_cadsr(input, output, schema_name, schema_id, **kwargs):
+    """
+    Imports from CADSR CDE JSON API output to LinkML
+
+    See :ref:`importers` for more on the importer framework
+
+    Example:
+
+        schemauto import-cadsr "cdes/*.json"
+    """
+    ie = CADSRImportEngine()
+    paths = [str(gf.absolute())  for gf in Path().glob(input) if gf.is_file()]
+    schema = ie.convert(paths, name=schema_name, id=schema_id)
+    write_schema(schema, output)
+
+
 @main.command()
 @click.argument('owlfile')
 @output_option

diff --git a/schema_automator/importers/cadsr_import_engine.py b/schema_automator/importers/cadsr_import_engine.py
@@ -0,0 +1,184 @@
+"""
+CADSR CDE Import Engine
+
+This ingests the output of the caDSR API https://cadsrapi.cancer.gov/rad/NCIAPI/1.0/api
+"""
+import logging
+import urllib
+from typing import Union, Dict, Tuple, List, Any, Optional, Iterable
+
+from dataclasses import dataclass
+
+from linkml.utils.schema_builder import SchemaBuilder
+from linkml_runtime.linkml_model import Annotation
+from linkml_runtime.linkml_model.meta import SchemaDefinition, SlotDefinition, EnumDefinition, \
+    PermissibleValue, UniqueKey, ClassDefinition
+from linkml_runtime.loaders import json_loader
+from linkml_runtime.utils.formatutils import camelcase, underscore
+
+from schema_automator.importers.import_engine import ImportEngine
+import schema_automator.metamodels.cadsr as cadsr
+
+
+# Maps the ``dataType`` string of a caDSR value domain to the name of the
+# LinkML type used as the slot range. Consulted only for value domains that
+# have no permissible values; datatypes missing from this table fall back to
+# "string" at the lookup site.
+TMAP = {
+    "DATE": "date",
+    "NUMBER": "float",
+    "ALPHANUMERIC": "string",
+    "CHARACTER": "string",
+    "HL7EDv3": "string",
+    "HL7CDv3": "string",
+    "java.lang.Double": "float",
+    "Numeric Alpha DVG": "float",
+    "SAS Date": "string",
+    "java.util.Date": "date",
+    "DATE/TIME": "datetime",
+    "TIME": "time",
+    "Integer": "integer",
+    "java.lang.Integer": "integer",
+    "Floating-point": "float",
+}
+
+@dataclass
+class CADSRImportEngine(ImportEngine):
+    """
+    An ImportEngine that imports NCI CADSR CDEs
+
+    Ingests the output of `caDSR API <https://cadsrapi.cancer.gov/rad/NCIAPI/1.0/api>`_.
+
+    - Each CDE becomes a unique slot
+    - the CDE is added as a lot of a context-specific class
+    - the context-specific class is a subclass of the CDE's DataElementConcept
+
+    Note that this creates a lot of 1-1 classes, as in many cases there is no
+    attempt to group concepts. However, this is not always the case.
+
+    E.g. the concept with publicId 2012668 (Access Route) is used in 5 contexts
+    (AHRQ, CCR, ...)
+
+    Each context-specific concept has its own set of CDEs
+
+    See also https://github.com/monarch-initiative/cde-harmonization
+    """
+
+    def convert(self, paths: Iterable[str], id: str=None, name: str=None, **kwargs) -> SchemaDefinition:
+        """
+        Converts one or more CDE JSON files into LinkML
+
+        :param files:
+        :param kwargs:
+        :return:
+        """
+        sb = SchemaBuilder()
+        schema = sb.schema
+        if id:
+            schema.id = id
+        if not name:
+            name = package.name
+        if name:
+            schema.name = name
+        classes = {}
+        slots = {}
+        enums = {}
+        for path in paths:
+            logging.info(f"Loading {path}")
+            with (open(path) as file):
+                container: cadsr.DataElementContainer
+                container = json_loader.load(file, target_class=cadsr.DataElementContainer)
+                cde = container.DataElement
+                ctxt = cde.context
+                source = urllib.parse.quote(ctxt)
+                source = f"cadsr:{source}"
+                slot = SlotDefinition(
+                    name=urllib.parse.quote(underscore(f"{ctxt} {cde.preferredName}")),
+                    slot_uri=f"cadsr:{cde.publicId}",
+                    title=cde.preferredName,
+                    description=cde.preferredDefinition,
+                    aliases=[cde.longName],
+                    source=source,
+                )
+                slots[slot.name] = slot
+                concept = cde.DataElementConcept
+                concept_name = urllib.parse.quote(camelcase(f"{ctxt} {concept.preferredName}"))
+                parent_concept_name = urllib.parse.quote(camelcase(concept.longName))
+                if parent_concept_name not in classes:
+                    parent_cls = ClassDefinition(
+                        name=parent_concept_name,
+                        title=concept.preferredName,
+                        description=concept.preferredDefinition,
+                        #aliases=[concept.longName],
+                        class_uri=f"cadsr:{concept.publicId}",
+                    )
+                    classes[parent_concept_name] = parent_cls
+                if concept_name not in classes:
+                    cls = ClassDefinition(
+                        name=concept_name,
+                        title=f"{concept.preferredName} ({ctxt})",
+                        description=concept.preferredDefinition,
+                        aliases=[concept.longName],
+                        class_uri=f"cadsr:{concept.publicId}",
+                        is_a=parent_concept_name,
+                    )
+                    classes[concept_name] = cls
+                else:
+                    cls = classes[concept_name]
+                cls.slots.append(slot.name)
+                objectClass = concept.ObjectClass
+                # TODO
+                valueDomain = cde.ValueDomain
+                conceptualDomain = valueDomain.ConceptualDomain
+                pvs = valueDomain.PermissibleValues
+                if pvs:
+                    enum_name = urllib.parse.quote(camelcase(valueDomain.preferredName))
+                    enum = EnumDefinition(
+                        name=enum_name,
+                        title=valueDomain.preferredName,
+                        description=valueDomain.preferredDefinition,
+                        aliases=[valueDomain.longName],
+                        # enum_uri=f"cadsr:{valueDomain.publicId}",
+                    )
+                    enums[enum_name] = enum
+                    rng = enum_name
+                    for pv in pvs:
+                        # url encode the value to escape symbols like <, >, etc.
+                        pv_value = urllib.parse.quote(pv.value)
+                        tgt_pv = PermissibleValue(
+                            text=pv_value,
+                            title=pv.value,
+                            description=pv.valueDescription,
+                        )
+                        enum.permissible_values[tgt_pv.text] = tgt_pv
+                        vm = pv.ValueMeaning
+                        tgt_pv.title = vm.preferredName
+                        if not tgt_pv.description:
+                            tgt_pv.description = vm.preferredDefinition
+                        for c in vm.Concepts:
+                            code = c.conceptCode.strip()
+                            tgt_pv.meaning = f"NCIT:{code}"
+                else:
+                    datatype = valueDomain.dataType
+                    rng = TMAP.get(datatype, "string")
+                slot.range = rng
+                anns = []
+                for rd in cde.ReferenceDocuments:
+                    rf_type = urllib.parse.quote(underscore(rd.type))
+                    anns.append(Annotation(
+                        tag=rf_type,
+                        value=rd.description,
+                    ))
+                for ann in anns:
+                    slot.annotations[ann.tag] = ann
+
+        sb.add_prefix("NCIT", "http://purl.obolibrary.org/obo/NCIT_")
+        sb.add_prefix("cadsr", "http://example.org/cadsr/")
+        sb.add_defaults()
+        for c in schema.classes.values():
+            c.from_schema = 'http://example.org/'
+        schema = sb.schema
+        schema.classes = classes
+        schema.slots = slots
+        schema.enums = enums
+        return schema
+
+
+
+
diff --git a/schema_automator/importers/rdfs_import_engine.py b/schema_automator/importers/rdfs_import_engine.py
@@ -5,6 +5,7 @@
 from collections import defaultdict
 
 from linkml.utils.schema_builder import SchemaBuilder
+from linkml_runtime import SchemaView
 from linkml_runtime.linkml_model import (
     SchemaDefinition,
     SlotDefinition,
@@ -50,9 +51,12 @@ class RdfsImportEngine(ImportEngine):
     reverse_metamodel_mappings: Dict[URIRef, List[str]] = None
     include_unmapped_annotations = False
     metamodel = None
+    metamodel_schemaview: SchemaView = None
+    classdef_slots: List[str] = None
 
     def __post_init__(self):
         sv = package_schemaview("linkml_runtime.linkml_model.meta")
+        self.metamodel_schemaview = sv
         self.metamodel = sv
         self.metamodel_mappings = defaultdict(list)
         self.reverse_metamodel_mappings = defaultdict(list)
@@ -73,6 +77,7 @@ def __post_init__(self):
                     mappings.append(uri)
                     self.reverse_metamodel_mappings[uri].append(e.name)
             self.metamodel_mappings[e.name] = mappings
+        self.defclass_slots = [s.name for s in sv.class_induced_slots(ClassDefinition.class_name)]
 
     def convert(
         self,
@@ -180,6 +185,8 @@ def _dict_for_subject(self, g: Graph, s: URIRef) -> Dict[str, Any]:
             if pp == RDF.type:
                 continue
             metaslot_name = self._element_from_iri(pp)
+            if metaslot_name not in self.defclass_slots:
+                continue
             if metaslot_name is None:
                 logging.warning(f"Not mapping {pp}")
                 continue