-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHSD_statistics.py
75 lines (66 loc) · 3.39 KB
/
HSD_statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# HSDecipher v1.0
# Copyright 2022 Zhang X., Hu Y., Zhengyu Cheng, Archibald J.M.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import sys
# HSD_statistics.py is a python script that calculates the statistics of HSDs using a variety of HSDFinder thresholds.
# The output file is written in a table with the following headers:
# File name, Candidate HSDs, Non-redundant gene copies, Gene copies, True HSDs, Space, Incomplete HSDs, Capturing value, and Performance score.
# ‘Capturing value’ and 'Performance score’ are two parameters used to evaluate the HSD results
if len(sys.argv)!=4: #if the input arguments not 4, showing the usage.
print("Usage:python3 HSD_Statistics.py <path to HSD species folder> <format of HSD file. e.g., 'txt' or 'tsv'> <output file name. e,g. species_stat.tsv>")
sys.exit()
root = sys.argv[1]
folder = os.listdir(root)
folder.sort()
result_list = []
for file in sorted(folder):
file_paths = os.path.join(root, file)
if not os.path.isdir(file_paths) and file.split('.')[-1] == sys.argv[2]:
false_positive = 0
space = 0
hsd = 0
score = 0
gene_list = []
with open(file_paths, 'r') as f:
lines = f.readlines()
for line in lines:
items = line.split('\t')
if len(items) > 4:
is_space = True
domain_list = items[4].split('; ')
cur_domain = domain_list[0].split(', ')
for domain in domain_list:
temp_domain = domain.split(', ')
if not sorted(temp_domain) == sorted(cur_domain):
false_positive += 1
break
if not domain == "":
is_space = False
if is_space:
space += 1
hsd += 1
gene_list += items[1].split('; ')
true_positive = hsd-false_positive
precision = round(true_positive*100/hsd,2)
score = round((true_positive+hsd-space)/(false_positive+1),2)
gene_list_non_repeat = list(set(gene_list))
result_list.append(".".join(file.split('.')[:-1]) + '\t' + str(hsd) + '\t' + str(len(gene_list_non_repeat)) +
'\t' + str(len(gene_list)) + '\t' + str(true_positive) + '\t' + str(space) + '\t' +
str(false_positive) + '\t' + str(precision) + '\t' + str(score))
with open(os.path.join(root, sys.argv[3]), 'w') as out:
out.write("File_name\tCandidate_HSDs#\tNon-redundant_gene_copies#\tGene_copies#\tTrue_HSDs#\tSpace#\tIncomplete_HSDs#\tCapturing_value\tPerformance_score\n")
for r in result_list:
out.write(r + '\n')
#file.split('.')[0]
#".".join(file.split('.')[:-1])