diff --git a/data/exclusions-disease-gene.tsv b/data/exclusions-disease-gene.tsv new file mode 100644 index 0000000..3d283a0 --- /dev/null +++ b/data/exclusions-disease-gene.tsv @@ -0,0 +1,8 @@ +omim_id mondo_id mondo_label orcid exclusion_reason_comment +OMIM:603956 MONDO:0002974 cervical cancer' https://orcid.org/0000-0002-4142-7153 evidence of various genes involved +OMIM:619151 MONDO:0030894 "AMED syndrome, digenic'" https://orcid.org/0000-0002-4142-7153 digenic +OMIM:158901 MONDO:0008031 https://orcid.org/0000-0002-4142-7153 digenic +OMIM:108770 MONDO:0007171 atrial standstill 1' https://orcid.org/0000-0002-4142-7153 digenic +OMIM:620040 MONDO:0031057 "dyskeratosis congenita, digenic'" https://orcid.org/0000-0002-4142-7153 digenic +OMIM:619478 MONDO:0030355 "facioscapulohumeral muscular dystrophy 4, digenic'" https://orcid.org/0000-0002-4142-7153 digenic +OMIM:300818 MONDO:0010438 paroxysmal nocturnal hemoglobinuria 1 https://orcid.org/0000-0002-4142-7153 "disease caused by a somatic mutation, therefore a gene association stating this is due to a germline mutation should not be added" \ No newline at end of file diff --git a/omim2obo/config.py b/omim2obo/config.py index fe6d870..dd93118 100644 --- a/omim2obo/config.py +++ b/omim2obo/config.py @@ -9,6 +9,7 @@ DATA_DIR = ROOT_DIR / 'data' ENV_PATH = ROOT_DIR / '.env' REVIEW_CASES_PATH = ROOT_DIR / 'review.tsv' +DISEASE_GENE_EXCLUSIONS_PATH = DATA_DIR / 'exclusions-disease-gene.tsv' with open(DATA_DIR / 'dipper/GLOBAL_TERMS.yaml') as file: GLOBAL_TERMS = yaml.safe_load(file) diff --git a/omim2obo/main.py b/omim2obo/main.py index 23b7537..541db46 100644 --- a/omim2obo/main.py +++ b/omim2obo/main.py @@ -51,7 +51,7 @@ Assumptions 1. Mappings obtained from official OMIM files as described above are interpreted correctly (e.g. skos:exactMatch). 
""" -from typing import Set +from typing import Optional, Set import yaml from hashlib import md5 @@ -64,7 +64,7 @@ from omim2obo.parsers.omim_entry_parser import REVIEW_CASES, cleanup_title, get_alt_and_included_titles_and_symbols, \ get_pubs, get_mapped_ids, log_review_cases, recapitalize_acronyms_in_titles from omim2obo.parsers.omim_txt_parser import * # todo: change to specific imports - +from omim2obo.utils.utils import get_d2g_exclusions_by_curator # Vars OUTPATH = os.path.join(ROOT_DIR / 'omim.ttl') @@ -123,21 +123,22 @@ def add_subclassof_restriction(graph: Graph, predicate: URIRef, some_values_from return b -def add_subclassof_restriction_with_evidence( - graph: Graph, predicate: URIRef, some_values_from: URIRef, on: URIRef, evidence: Union[str, Literal] +def add_subclassof_restriction_with_evidence_and_source( + graph: Graph, predicate: URIRef, some_values_from: URIRef, on: URIRef, evidence: Union[str, Literal], + source: Optional[URIRef] = None, ): """Creates a subClassOf someValuesFrom restriction, and adds an evidence axiom to it.""" evidence = Literal(evidence) if type(evidence) is str else evidence # Add restriction on MIM class b: BNode = add_subclassof_restriction(graph, predicate, some_values_from, on) # Add axiom to restriction - b2 = BNode() - graph.add((b2, RDF['type'], OWL['Axiom'])) - graph.add((b2, OWL['annotatedSource'], on)) - graph.add((b2, OWL['annotatedProperty'], RDFS['subClassOf'])) - graph.add((b2, OWL['annotatedTarget'], b)) - graph.add((b2, BIOLINK['has_evidence'], evidence)) - graph.add((b2, RDFS['comment'], evidence)) + annotation_pred_vals = [ + (BIOLINK['has_evidence'], evidence), + (RDFS['comment'], evidence) + ] + annotation_pred_vals += [(oboInOwl.source, source)] if source else [] + + add_axiom_annotations(graph, on, RDFS['subClassOf'], b, annotation_pred_vals) # Classes @@ -200,6 +201,7 @@ def omim2obo(use_cache: bool = False): # - Non-OMIM triples graph.add((URIRef('http://purl.obolibrary.org/obo/mondo/omim.owl'), 
RDF.type, OWL.Ontology)) graph.add((URIRef(oboInOwl.hasSynonymType), RDF.type, OWL.AnnotationProperty)) + graph.add((URIRef(oboInOwl.source), RDF.type, OWL.AnnotationProperty)) graph.add((URIRef(MONDONS.omim_included), RDF.type, OWL.AnnotationProperty)) graph.add((URIRef(OMO['0003000']), RDF.type, OWL.AnnotationProperty)) graph.add((BIOLINK['has_evidence'], RDF.type, OWL.AnnotationProperty)) @@ -362,11 +364,13 @@ def omim2obo(use_cache: bool = False): 'gene_id': gene_mim, 'phenotype_label': p_lab, 'mapping_key': p_map_key, 'mapping_label': p_map_lab}) # - Add relations (subclass restrictions) + exclusions_p_mim_orcid_map = get_d2g_exclusions_by_curator() for p_mim, assocs in phenotype_genes.items(): for assoc in assocs: gene_mim, p_lab, p_map_key, p_map_lab = assoc['gene_id'], assoc['phenotype_label'], \ assoc['mapping_key'], assoc['mapping_label'] evidence = f'Evidence: ({p_map_key}) {p_map_lab}' + p_mim_excluded = p_mim in exclusions_p_mim_orcid_map # Skip: No phenotype or unknown defect # - not p_mim: Skip because not an association to another MIM (Provenance: @@ -376,26 +380,33 @@ def omim2obo(use_cache: bool = False): if not p_mim or p_map_key == '1': continue - # Add restrictions: Gene->Disease non-causal (disease-defining) relationships + # Add restrictions: Gene->Disease non-causal / non-disease-defining relationships # - RO:0003302 docs: see MORBIDMAP_PHENOTYPE_MAPPING_KEY_PREDICATES - if p_map_key != '3': # 3 = 'causal' (disease-defining). Handled separately below. - g2d_pred = MORBIDMAP_PHENOTYPE_MAPPING_KEY_PREDICATES[p_map_key] if len(assocs) == 1 else RO['0003302'] - add_subclassof_restriction_with_evidence(graph, g2d_pred, OMIM[p_mim], OMIM[gene_mim], evidence) + # - Mapping key 3 = 'causal' (disease-defining). Handled separately below. 
+ if p_map_key != '3' or p_mim_excluded: + g2d_pred = MORBIDMAP_PHENOTYPE_MAPPING_KEY_PREDICATES[p_map_key] \ + if len(assocs) == 1 and not p_mim_excluded \ + else RO['0003302'] + orcid: Optional[URIRef] = exclusions_p_mim_orcid_map[p_mim] if p_mim_excluded else None + add_subclassof_restriction_with_evidence_and_source( + graph, g2d_pred, OMIM[p_mim], OMIM[gene_mim], evidence, orcid) + continue # Skip non-causal (disease-defining) cases - if len(assocs) > 1 or p_map_key != '3' or not p2g_is_definitive(p_lab): + if len(assocs) > 1 or not p2g_is_definitive(p_lab): # or cases above: (p_map_key != '3') & p_mim_excluded continue # Log review.tsv cases log_review_cases(p_mim, p_lab, p_map_key, gene_mim, gene_phenotypes, omim_types) + # Add restrictions: Disease-defining ('causal germline mutation') # - Disease --(RO:0004003 'has material basis in germline mutation in')--> Gene # https://www.ebi.ac.uk/ols4/ontologies/ro/properties?iri=http://purl.obolibrary.org/obo/RO_0004003 - add_subclassof_restriction_with_evidence( + add_subclassof_restriction_with_evidence_and_source( graph, RO['0004003'], OMIM[gene_mim], OMIM[p_mim], evidence) # - Gene --(RO:0004013 'is causal germline mutation in')--> Disease # https://www.ebi.ac.uk/ols4/ontologies/ro/properties?iri=http://purl.obolibrary.org/obo/RO_0004013 - add_subclassof_restriction_with_evidence( + add_subclassof_restriction_with_evidence_and_source( graph, RO['0004013'], OMIM[p_mim], OMIM[gene_mim], evidence) # PUBMED, UMLS diff --git a/omim2obo/namespaces.py b/omim2obo/namespaces.py index 268f2dc..7212e41 100644 --- a/omim2obo/namespaces.py +++ b/omim2obo/namespaces.py @@ -102,6 +102,7 @@ # publication/citation/reference sources DOI = Namespace('http://dx.doi.org/') # Digital Object identifier GENEREVIEWS = Namespace('http://www.ncbi.nlm.nih.gov/books/') # NCBI gene and diseases +ORCID = Namespace('https://orcid.org/') # Open Researcher and Contributor ID # more bogus IRIs ISBN = 
Namespace('https://monarchinitiative.org/ISBN_') # International Standard Book Number
 ISBN_10 = Namespace('https://monarchinitiative.org/ISBN10_') # Same as ISBN has 10 digits pre 2007
diff --git a/omim2obo/utils/utils.py b/omim2obo/utils/utils.py
index abd119f..d6a67d8 100644
--- a/omim2obo/utils/utils.py
+++ b/omim2obo/utils/utils.py
@@ -1,5 +1,10 @@
 """Misc utilities"""
-from typing import List, Union
+from typing import Dict, List, Optional, Union
+
+import pandas as pd
+
+from omim2obo.config import DISEASE_GENE_EXCLUSIONS_PATH
+from omim2obo.namespaces import ORCID


 # todo: also in mondo-ingest. Refactor into mondolib: https://github.com/monarch-initiative/mondolib/issues/13
@@ -14,3 +19,31 @@ def remove_angle_brackets(uris: Union[str, List[str]]) -> Union[str, List[str]]:
         x = x[:-1] if x.endswith('>') else x
         uris2.append(x)
     return uris2[0] if str_input else uris2
+
+
+def get_d2g_exclusions_by_curator(path=DISEASE_GENE_EXCLUSIONS_PATH) -> Dict[str, Optional[str]]:
+    """Get disease-gene exclusions, keyed by phenotype MIM number
+
+    Reads a TSV with at least the columns `omim_id` (CURIE, e.g. `OMIM:603956`) and `orcid`. Rows with a blank
+    `omim_id` are skipped.
+
+    :param path: Path to the exclusions TSV.
+    :return: Dict[str, Optional[str]]: Phenotype MIM (the part of `omim_id` after the last colon) as keys, ORCID of
+      curator as values: a term of the `ORCID` namespace (a full IRI), or None where the `orcid` cell is blank.
+    """
+    df = pd.read_csv(path, sep='\t').fillna('')
+    # The `orcid` column holds full IRIs (e.g. https://orcid.org/0000-...). Strip a leading ORCID prefix before
+    # indexing the namespace; otherwise the prefix is repeated: https://orcid.org/https://orcid.org/0000-...
+    orcid_prefixes = (str(ORCID), 'http://orcid.org/')
+    phenotype_mim_orcid_map = {}
+    for omim_id, orcid in zip(df['omim_id'], df['orcid']):
+        phenotype_mim = str(omim_id).split(':')[-1].strip()
+        if not phenotype_mim:
+            continue
+        orcid = str(orcid).strip()
+        for prefix in orcid_prefixes:
+            if orcid.startswith(prefix):
+                orcid = orcid[len(prefix):]
+                break
+        phenotype_mim_orcid_map[phenotype_mim] = ORCID[orcid] if orcid else None
+    return phenotype_mim_orcid_map
diff --git a/sparql/disease-gene-relationships.sparql b/sparql/disease-gene-relationships.sparql
index 8895cdb..d1359bb 100644
--- a/sparql/disease-gene-relationships.sparql
+++ b/sparql/disease-gene-relationships.sparql
@@ -24,6 +24,7 @@ WHERE {
   FILTER(
     ?PredUri IN (
+      ,
       ,
       ,
       ,