Skip to content

Commit

Permalink
cadsr importer (#132)
Browse files Browse the repository at this point in the history
* Adding caDSR CDE ingester

* docs

* Adding models

* pytestified test_rdfs_importer. Fixed rdfs import bug
  • Loading branch information
cmungall authored Feb 26, 2024
1 parent e9bbd52 commit 6186a49
Show file tree
Hide file tree
Showing 16 changed files with 13,302 additions and 286 deletions.
609 changes: 334 additions & 275 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ packages = [

[tool.poetry.dependencies]
python = "^3.9"
linkml = ">=1.6.7"
linkml = "^1.7.4"
mkdocs = ">=1.2.3"
pandas = ">=1.3.5"
python-dateutil = ">=2.8.2"
Expand All @@ -30,6 +30,7 @@ inflect = ">=6.0.0"
schemasheets = ">=0.1.24"
xmltodict = "^0.13.0"
click-default-group = "^1.2.4"
linkml-runtime = "^1.7.2"


[tool.poetry.dev-dependencies]
Expand Down
24 changes: 24 additions & 0 deletions schema_automator/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
"""
import logging
import os
from pathlib import Path

import click


Expand All @@ -20,6 +22,7 @@
from schema_automator.generalizers.csv_data_generalizer import CsvDataGeneralizer
from schema_automator.generalizers.generalizer import DEFAULT_CLASS_NAME, DEFAULT_SCHEMA_NAME
from schema_automator.generalizers.pandas_generalizer import PandasDataGeneralizer
from schema_automator.importers.cadsr_import_engine import CADSRImportEngine
from schema_automator.importers.dosdp_import_engine import DOSDPImportEngine
from schema_automator.generalizers.json_instance_generalizer import JsonDataGeneralizer
from schema_automator.importers.jsonschema_import_engine import JsonSchemaImportEngine
Expand Down Expand Up @@ -387,6 +390,27 @@ def import_frictionless(input, output, schema_name, schema_id, **kwargs):
write_schema(schema, output)


@main.command()
@output_option
@schema_name_option
@schema_id_option
@click.argument('input')
def import_cadsr(input, output, schema_name, schema_id, **kwargs):
    """
    Imports from CADSR CDE JSON API output to LinkML
    See :ref:`importers` for more on the importer framework
    Example:
        schemauto import-cadsr "cdes/*.json"
    """
    engine = CADSRImportEngine()
    # Expand the glob pattern relative to the CWD, keeping regular files only
    matched_files = []
    for candidate in Path().glob(input):
        if candidate.is_file():
            matched_files.append(str(candidate.absolute()))
    schema = engine.convert(matched_files, name=schema_name, id=schema_id)
    write_schema(schema, output)


@main.command()
@click.argument('owlfile')
@output_option
Expand Down
184 changes: 184 additions & 0 deletions schema_automator/importers/cadsr_import_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
"""
CADSR CDE Import Engine
This ingests the output of the caDSR API https://cadsrapi.cancer.gov/rad/NCIAPI/1.0/api
"""
import logging
import urllib
import urllib.parse
from dataclasses import dataclass
from typing import Union, Dict, Tuple, List, Any, Optional, Iterable

from linkml.utils.schema_builder import SchemaBuilder
from linkml_runtime.linkml_model import Annotation
from linkml_runtime.linkml_model.meta import SchemaDefinition, SlotDefinition, EnumDefinition, \
    PermissibleValue, UniqueKey, ClassDefinition
from linkml_runtime.loaders import json_loader
from linkml_runtime.utils.formatutils import camelcase, underscore

from schema_automator.importers.import_engine import ImportEngine
import schema_automator.metamodels.cadsr as cadsr


# Mapping from caDSR ValueDomain dataType strings to LinkML built-in range names.
# Datatypes not listed here fall back to "string" (see TMAP.get(..., "string")
# in CADSRImportEngine.convert).
TMAP: Dict[str, str] = {
    "DATE": "date",
    "NUMBER": "float",
    "ALPHANUMERIC": "string",
    "CHARACTER": "string",
    "HL7EDv3": "string",
    "HL7CDv3": "string",
    "java.lang.Double": "float",
    "Numeric Alpha DVG": "float",
    "SAS Date": "string",
    "java.util.Date": "date",
    "DATE/TIME": "datetime",
    "TIME": "time",
    "Integer": "integer",
    "java.lang.Integer": "integer",
    "Floating-point": "float",
}

@dataclass
class CADSRImportEngine(ImportEngine):
    """
    An ImportEngine that imports NCI caDSR CDEs.

    Ingests the output of `caDSR API <https://cadsrapi.cancer.gov/rad/NCIAPI/1.0/api>`_.

    - Each CDE becomes a unique slot
    - the CDE is added as a slot of a context-specific class
    - the context-specific class is a subclass of the CDE's DataElementConcept

    Note that this creates a lot of 1-1 classes, as in many cases there is no
    attempt to group concepts. However, this is not always the case.
    E.g. the concept with publicId 2012668 (Access Route) is used in 5 contexts
    (AHRQ, CCR, ...)

    Each context-specific concept has its own set of CDEs

    See also https://github.com/monarch-initiative/cde-harmonization
    """

    def convert(self, paths: Iterable[str], id: Optional[str] = None, name: Optional[str] = None,
                **kwargs) -> SchemaDefinition:
        """
        Converts one or more CDE JSON files into a LinkML schema.

        :param paths: paths to JSON files, each containing one DataElementContainer
        :param id: optional schema id (URI)
        :param name: optional schema name; defaults to "cadsr" when omitted
        :return: the assembled SchemaDefinition
        """
        sb = SchemaBuilder()
        schema = sb.schema
        if id:
            schema.id = id
        if not name:
            # Fix: the original referenced an undefined `package.name` here,
            # which raised NameError whenever no name was supplied.
            name = "cadsr"
        schema.name = name
        classes = {}
        slots = {}
        enums = {}
        for path in paths:
            logging.info(f"Loading {path}")
            # Fix: the original used `with (open(path) as file):`, a
            # parenthesized context-manager form that requires Python 3.10,
            # while the project declares support for 3.9.
            with open(path) as file:
                container: cadsr.DataElementContainer
                container = json_loader.load(file, target_class=cadsr.DataElementContainer)
                cde = container.DataElement
                ctxt = cde.context
                # URL-encode names so symbols in caDSR text are safe in element names
                source = urllib.parse.quote(ctxt)
                source = f"cadsr:{source}"
                slot = SlotDefinition(
                    name=urllib.parse.quote(underscore(f"{ctxt} {cde.preferredName}")),
                    slot_uri=f"cadsr:{cde.publicId}",
                    title=cde.preferredName,
                    description=cde.preferredDefinition,
                    aliases=[cde.longName],
                    source=source,
                )
                slots[slot.name] = slot
                concept = cde.DataElementConcept
                concept_name = urllib.parse.quote(camelcase(f"{ctxt} {concept.preferredName}"))
                parent_concept_name = urllib.parse.quote(camelcase(concept.longName))
                if parent_concept_name not in classes:
                    # context-independent parent class for the concept
                    parent_cls = ClassDefinition(
                        name=parent_concept_name,
                        title=concept.preferredName,
                        description=concept.preferredDefinition,
                        #aliases=[concept.longName],
                        class_uri=f"cadsr:{concept.publicId}",
                    )
                    classes[parent_concept_name] = parent_cls
                if concept_name not in classes:
                    # context-specific subclass; this is what carries the slots
                    cls = ClassDefinition(
                        name=concept_name,
                        title=f"{concept.preferredName} ({ctxt})",
                        description=concept.preferredDefinition,
                        aliases=[concept.longName],
                        class_uri=f"cadsr:{concept.publicId}",
                        is_a=parent_concept_name,
                    )
                    classes[concept_name] = cls
                else:
                    cls = classes[concept_name]
                cls.slots.append(slot.name)
                # TODO: concept.ObjectClass is not yet mapped to LinkML
                valueDomain = cde.ValueDomain
                # TODO: valueDomain.ConceptualDomain is not yet mapped to LinkML
                pvs = valueDomain.PermissibleValues
                if pvs:
                    # enumerated value domain: build an enum and use it as the range
                    enum_name = urllib.parse.quote(camelcase(valueDomain.preferredName))
                    enum = EnumDefinition(
                        name=enum_name,
                        title=valueDomain.preferredName,
                        description=valueDomain.preferredDefinition,
                        aliases=[valueDomain.longName],
                        # enum_uri=f"cadsr:{valueDomain.publicId}",
                    )
                    enums[enum_name] = enum
                    rng = enum_name
                    for pv in pvs:
                        # url encode the value to escape symbols like <, >, etc.
                        pv_value = urllib.parse.quote(pv.value)
                        tgt_pv = PermissibleValue(
                            text=pv_value,
                            title=pv.value,
                            description=pv.valueDescription,
                        )
                        enum.permissible_values[tgt_pv.text] = tgt_pv
                        vm = pv.ValueMeaning
                        tgt_pv.title = vm.preferredName
                        if not tgt_pv.description:
                            tgt_pv.description = vm.preferredDefinition
                        for c in vm.Concepts:
                            # NOTE(review): when a value meaning has multiple
                            # concepts, only the last one survives as `meaning`
                            code = c.conceptCode.strip()
                            tgt_pv.meaning = f"NCIT:{code}"
                else:
                    # scalar value domain: map the caDSR datatype, defaulting
                    # unknown types to string
                    datatype = valueDomain.dataType
                    rng = TMAP.get(datatype, "string")
                slot.range = rng
                # carry reference documents over as slot annotations,
                # keyed by their (encoded) document type
                anns = []
                for rd in cde.ReferenceDocuments:
                    rf_type = urllib.parse.quote(underscore(rd.type))
                    anns.append(Annotation(
                        tag=rf_type,
                        value=rd.description,
                    ))
                for ann in anns:
                    slot.annotations[ann.tag] = ann

        sb.add_prefix("NCIT", "http://purl.obolibrary.org/obo/NCIT_")
        sb.add_prefix("cadsr", "http://example.org/cadsr/")
        sb.add_defaults()
        schema = sb.schema
        schema.classes = classes
        schema.slots = slots
        schema.enums = enums
        # Fix: set provenance on the classes we actually emit. The original
        # iterated schema.classes *before* assigning the collected `classes`
        # dict, so it looped over the builder's empty class map and was a no-op.
        for c in schema.classes.values():
            c.from_schema = 'http://example.org/'
        return schema




7 changes: 7 additions & 0 deletions schema_automator/importers/rdfs_import_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from collections import defaultdict

from linkml.utils.schema_builder import SchemaBuilder
from linkml_runtime import SchemaView
from linkml_runtime.linkml_model import (
SchemaDefinition,
SlotDefinition,
Expand Down Expand Up @@ -50,9 +51,12 @@ class RdfsImportEngine(ImportEngine):
reverse_metamodel_mappings: Dict[URIRef, List[str]] = None
include_unmapped_annotations = False
metamodel = None
metamodel_schemaview: SchemaView = None
classdef_slots: List[str] = None

def __post_init__(self):
sv = package_schemaview("linkml_runtime.linkml_model.meta")
self.metamodel_schemaview = sv
self.metamodel = sv
self.metamodel_mappings = defaultdict(list)
self.reverse_metamodel_mappings = defaultdict(list)
Expand All @@ -73,6 +77,7 @@ def __post_init__(self):
mappings.append(uri)
self.reverse_metamodel_mappings[uri].append(e.name)
self.metamodel_mappings[e.name] = mappings
self.defclass_slots = [s.name for s in sv.class_induced_slots(ClassDefinition.class_name)]

def convert(
self,
Expand Down Expand Up @@ -180,6 +185,8 @@ def _dict_for_subject(self, g: Graph, s: URIRef) -> Dict[str, Any]:
if pp == RDF.type:
continue
metaslot_name = self._element_from_iri(pp)
if metaslot_name not in self.defclass_slots:
continue
if metaslot_name is None:
logging.warning(f"Not mapping {pp}")
continue
Expand Down
Loading

0 comments on commit 6186a49

Please sign in to comment.