-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsolution.py
133 lines (113 loc) · 3.42 KB
/
solution.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
amino_acids = 'ACDEFGHIKLMNPQRSTVWXY'
amino_acids = ['-'] + list(amino_acids)
import sys
def input_sequences(file):
sequences = []
sequences_with_name = []
with open(file) as f:
for line in f:
line = line.strip('\n')
sequences_with_name.append(line)
line = line[30: ]
#print(line + '\n', len(line))
sequences.append(line)
return sequences, sequences_with_name
def second_largest(numbers):
count = 0
m1 = m2 = float('-inf')
for x in numbers:
count += 1
if x > m2:
if x >= m1:
m1, m2 = x, m1
else:
m2 = x
return m2 if count >= 2 else None
def consensus(x, sequences, sequences_with_name, amino_acids, pseudocount = 0):
sequence_length = len(sequences[0])
profile_matrix = {}
for acid in amino_acids:
profile_matrix[acid] = [float(pseudocount) for i in range(sequence_length)]
for i in range(len(sequences)):
seq = sequences[i]
for j in range(len(seq)):
#print(i, j)
profile_matrix[seq[j]][j] += float(1)
for aa in profile_matrix:
l = profile_matrix[aa]
for i in range(len(l)):
if pseudocount > 0:
l[i] /= float((len(sequences)*2))
else:
l[i] /= float(len(sequences))
consensus_string = ''
if x == 7:
#identifying the position in the alignment which has the most dashes
max_value = max(profile_matrix['-'])
if max_value == 1:
max_value = second_largest(profile_matrix['-'])
positions = []
for i in range(len(profile_matrix['-'])):
if profile_matrix['-'][i] == max_value:
positions.append(i)
#return positions, max_value
bad_sequence_numbers = []
for i in range(len(sequences)):
for position in positions:
if sequences[i][position] != '-':
if i not in bad_sequence_numbers:
bad_sequence_numbers.append(i)
bad_sequences = []
for number in bad_sequence_numbers:
bad_sequences.append(sequences_with_name[number][0: 30])
return bad_sequences
elif x == 1 or x == 2:
for i in range(sequence_length):
l = []
for aa in profile_matrix:
l.append(profile_matrix[aa][i])
index = l.index(max(l))
consensus_string += amino_acids[index]
if x == 1:
return consensus_string
else:
return len(consensus_string)
elif x == 3 or x == 4:
for i in range(sequence_length):
l = []
for aa in profile_matrix:
l.append(profile_matrix[aa][i])
index = l.index(max(l))
if amino_acids[index] == '-':
continue
consensus_string += amino_acids[index]
if x == 3:
return consensus_string
else:
return len(consensus_string)
elif x == 5 or 6:
for i in range(sequence_length):
l = []
for aa in profile_matrix:
l.append(profile_matrix[aa][i])
index = l.index(max(l))
if amino_acids[index] == '-':
if l[index] < 0.5:
index = l.index(second_largest(l))
elif l[index] >= 0.5:
continue
consensus_string += amino_acids[index]
if x == 5:
return consensus_string
else:
return len(consensus_string)
file = 'PF00167_full.txt'
sequences, sequences_with_name = input_sequences(file)
#print(sequences[87][258: 261])
print('1a. - Enter 1, 1b. - Enter 2, 2a. - Enter 3, 2b. - Enter 4, 3a. - Enter 5, 3b. - Enter 6, 4. - Enter 7')
x = int(input())
#print(consensus_1a(x, sequences, amino_acids))
#print(consensus_2a(sequences, amino_acids))
#print(consensus_3a(sequences, amino_acids))
#print(bad_sequences(sequences, sequences_with_name, amino_acids))
print(consensus(x, sequences, sequences_with_name, amino_acids))