-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathworst_families.py
77 lines (61 loc) · 2.89 KB
/
worst_families.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
from Bio import SeqIO
from analyse_plot_stats import *
from consensus import *
from tabulate import tabulate
def count_number_of_gaps(sequence):
gaps = 0
for char in sequence:
if char == '-':
gaps += 1
return gaps
def main():
path = 'worst_plots/'
wf = get_plot_stat_files(path)
#accessions_wf = [i[:7] for i in wf]
wf = [i[:7] + '_plot_stats.txt' for i in wf]
tuples_for_table = []
for filename in wf:
f = 'plot_stats/' + filename
accession = filename[:7]
train_stats, test_stats = get_plot_stats_from_file(f)
ref_mean_energy = train_stats['Mean']
ref_mode_energy = train_stats['Mode']
ref_cs_energy = train_stats['Consensus energy']
hmm_mean_energy = test_stats['Mean']
hmm_mode_energy = test_stats['Mode']
hmm_cs_energy = test_stats['Consensus energy']
ref_alignment_file = 'refined_alignments/' + accession + '_refined.fasta'
consensus_file = 'all_consensus_sequences/' + accession + '_consensus.fasta'
ref_alignment_sequences, ignore = fasta_to_list(ref_alignment_file)
no_of_seqs = len(ref_alignment_sequences)
loa = len(ref_alignment_sequences[0])
consensus_sequences, ignore = fasta_to_list(consensus_file)
refined_cs = consensus_sequences[0]
hmm_cs = consensus_sequences[1]
refined_cs_gaps = count_number_of_gaps(refined_cs)
hmm_cs_gaps = count_number_of_gaps(hmm_cs)
mean_diff = ref_mean_energy - hmm_mean_energy
mode_diff = ref_mode_energy - hmm_mode_energy
cs_diff = ref_cs_energy - hmm_cs_energy
tuples_for_table.append((accession, ref_mean_energy, ref_mode_energy, ref_cs_energy, hmm_mean_energy,
hmm_mode_energy, hmm_cs_energy, mean_diff, mode_diff, cs_diff, no_of_seqs, loa, refined_cs_gaps, hmm_cs_gaps))
headers = ['Family', 'r_mean_e', 'r_mode_en', 'r_cs_e', 'h_mean_e', 'h_mode_e', 'h_cs_e'
, 'mean_diff', 'mode_diff', 'cs_diff', 'nos',
'loa', 'ngaps_r_cs', 'ngaps_h_cs']
print(tabulate(tuples_for_table, headers, tablefmt = "grid"))
print('Here: ')
print('r_mean_e: mean energy of refined alignment')
print('r_mode_e: mode energy of refined alignment')
print('r_cs_e: energy of consensus from refined alignment')
print('h_mean_e: mean energy of hmm sequences')
print('h_mode_e: mode energy of hmm sequences')
print('h_cs_e: energy of consensus from hmm sequences')
print('mean_diff: r_mean_e - h_mean_e')
print('mode_diff: r_mode_e - h_mode_e')
print('cs_diff: r_cs_e - h_cs_e')
print('nos: number of sequences in refined/hmm alignment')
print('loa: length of refined/hmm alignment')
print('ngaps_r_cs: number of gaps in consensus from refined alignment')
print('ngaps_h_cs: number of gaps in consensus from hmm sequences')
if __name__ == '__main__':
main()