-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHIPPO_distance.py
134 lines (96 loc) · 5.36 KB
/
HIPPO_distance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 7 16:59:39 2022
Compute the Resnik distances between patient cases using Human Phenotype Ontology.
Takes a hp.obo file as catalog input
Takes a tab-delimited dataframe as input file. Col 1 is ID, col 2 is a comma-delimited string of HPO IDs.
Output is the output path for a pairwise distance matrix
@author: Bradley Bowles
"""
import pandas as pd
import numpy as np
import pronto
import argparse
# parse arguments
parser = argparse.ArgumentParser(description='Compute a Resnik Distance Matrix for a Patient HPO Terms.')
parser.add_argument('-c', '--catalog', type=str, metavar='', required=True, help=r'Path to a downloaded HP.obo file. See https://hpo.jax.org/app/download/ontology.')
parser.add_argument('-i', '--input', type=str, metavar='', required=True, help=r'Tab-delimited input file. Col1="ID", Col2="HPO".')
parser.add_argument('-o', '--output', type=str, metavar='', required=True, help=r'Output file path. Will right a tab-delimited distance matrix to this path with input IDs as rows and cols.')
args = parser.parse_args()
if __name__ == "__main__":
# import HPO .obo catalog using pronto
# Docs: https://pronto.readthedocs.io/en/latest/api.html
hpo = pronto.Ontology(args.catalog)
# import labelled patient data
df = pd.read_csv(args.input, sep='\t')
# create a try/except statement to apply in lambda statements for mapping HPO terms
def map_ancs(term):
try:
return ( [i.id for i in hpo[term].rparents()] )
except KeyError:
return([])
def obsolete_terms(term_list):
if all([bool(map_ancs(i)) for i in term_list]):
#No obsolete terms detected.
return(False)
else:
# Obsolete terms detected!
return(True)
def resnik(HPO_list_1, HPO_list_2):
# this function uses list comprehension to create a Resnik semantic similarity distance matrix
# Specifically it generates the sim.max measure described in the following manuscript:
# "A Recurrent Missense Variant in AP2M1 Impairs Clathrin-Mediated Endocytosis
# and Causes Developmental and Epileptic Encephalopathy"
# compute max information score for each value in our HPO lists
resnik = pd.DataFrame(
[
[max([IC[i] for i in ( set(map_ancs(i)) & set(map_ancs(j)) )]) for j in HPO_list_2]
for i in HPO_list_1]
)
# Resnik matrix now contains max IC values for each intersection of common ancestors in input lists
# calculate SimMax:
# summing over the maximum of all rows and columns with appropriate normalization
simmax = 0.5 * ( (resnik.max(axis=0).sum()) + resnik.max(axis=1).sum() )
return simmax
# DATA IMPORTS AND PROCESSING
# drop empty ID/HPO rows
df.dropna(inplace=True)
# explode HPO terms
df.loc[:, 'HPO'] = df.HPO.str.split(',')
df = df.explode('HPO')
# map HPO terms to ancestors/superclasses
df['ancs'] = df.HPO.apply(lambda x: map_ancs(x) )
# drop HPO terms that could not be mapped - possibly a mistmatch between HPO versions
percent = round((df.loc[~df.ancs.astype(bool)].shape[0]/df.loc[df.ancs.astype(bool)].shape[0]*100),2)
if percent > 0:
print('\n', percent, '% of HPO values could not be matched to ancestors! Do your HPO versions match between your patient data file and the HPO ontology database?\n')
df = df.loc[df.ancs.astype(bool)] # drop all HPO terms that cannot be mapped
# Take the exploded data and flatten it back to a single row per ID
df = pd.concat([df.groupby('ID').ancs.sum(),
df.groupby('ID').HPO.apply(lambda x: list(x))], axis = 1).reset_index()[['ID','HPO','ancs']]
# Make sure HPO terms are included in their ancestors list
df.loc[:, 'ancs'] = df.HPO + df.ancs
# get unique HPO and ancestors in each row
df.loc[:, 'ancs'] = df.ancs.apply(lambda x: list(np.unique(x)))
df.loc[:, 'HPO'] = df.HPO.apply(lambda x: list(np.unique(x)))
# check for obsolete terms in our dataframe
if df.HPO.apply(obsolete_terms).any():
print('\nObsolete HPO terms detected in the input dataframe!\n')
# CALCULATING INFORMATION CONTENT
# create a flattened list of all HPO term ancestors, including duplicates
all_ancs = df.ancs.apply(pd.Series).stack().reset_index(drop=True)
# calculate information content for each HPO ancestor term (-log2(Freq.))
IC = (-1*(np.log2(all_ancs.value_counts(normalize=True)))).to_dict()
# CALCULATE RESNIK DISTANCE
print("Performing pairwise Resnik semantic similarity comparison for", len(df.ID), 'input IDs. This may take some time.')
# use list comprehension and our Resnik function to construct a dataframe with pairwise comparisons between cases
distance = pd.DataFrame([[resnik(j, i) for i in df.HPO] for j in df.HPO])
# set diagonal values to zero
for i in distance.columns:
distance.loc[i,i] = 0
# set ID as row index (already present in column index)
distance = distance.dropna().set_index(df.ID.values)
# save output
distance.to_csv(args.output, sep='\t', index=False)
print("Analysis completed! Data is saved to", args.output)