-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathget_end_count.py
executable file
·171 lines (134 loc) · 4.33 KB
/
get_end_count.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
from pysam import AlignmentFile
def count_end(pos_d):
    """Count how many reads terminate at each end coordinate.

    :param pos_d: dict mapping read name -> (start, end) coordinate pair
    :return: dict mapping end coordinate -> number of reads ending there
    """
    end_d = {}
    for start, end in pos_d.values():
        # dict.get with a default avoids the per-item try/except KeyError
        end_d[end] = end_d.get(end, 0) + 1
    return end_d
def count_start(pos_d):
    """Count how many reads begin at each start coordinate.

    :param pos_d: dict mapping read name -> (start, end) coordinate pair
    :return: dict mapping start coordinate -> number of reads starting there
    """
    start_d = {}
    for start, end in pos_d.values():
        # dict.get with a default avoids the per-item try/except KeyError
        start_d[start] = start_d.get(start, 0) + 1
    return start_d
def filter_end(end_d, min_coverage=2):
    """Keep only the sites whose read count reaches ``min_coverage``.

    :param end_d: dict mapping site coordinate -> read count
    :param min_coverage: minimum count a site needs to be retained
    :return: new dict containing only the qualifying sites
    """
    return {site: count for site, count in end_d.items() if count >= min_coverage}
def group_end(end_d_f, offset=5):
    """Merge nearby end sites and map each site to its group representative.

    Sites are sorted; consecutive sites closer than ``offset`` fall into the
    same group, and every member of a group is mapped to the group's largest
    (last) site.

    Fixes: the previous implementation called ``next()`` on an empty iterator
    and raised StopIteration whenever the input dict was empty (which happens
    any time ``filter_end`` removes every site); it also carried a dead
    ``except TypeError`` around integer arithmetic.

    :param end_d_f: dict mapping site coordinate -> count (only keys are used)
    :param offset: gaps of ``offset`` or more start a new group
    :return: dict mapping each site -> last site of its group
    """
    sites = sorted(end_d_f)
    groups = []
    for site in sites:
        # start a new group on the first site or when the gap is too large
        if groups and abs(site - groups[-1][-1]) < offset:
            groups[-1].append(site)
        else:
            groups.append([site])
    end_group_d = {}
    for grp in groups:
        last = grp[-1]
        for s in grp:
            end_group_d[s] = last
    return end_group_d
def is_span_site(start, end, site):
    """Return True when ``site`` lies inside the half-open read interval.

    The interval is [start, end): the start coordinate is included, the end
    coordinate is not.
    """
    return start <= site < end
def get_span_number(pos_d, site):
    """Count the reads whose [start, end) interval covers ``site``.

    :param pos_d: dict mapping read name -> (start, end) coordinate pair
    :param site: coordinate to test
    :return: number of reads spanning the site
    """
    # half-open containment test inlined from is_span_site
    return sum(1 for start, end in pos_d.values() if start <= site < end)
def scan_span_dic(pos_d, end_d, end_group_d):
    """Aggregate stop counts per merged end site, alongside spanning counts.

    :param pos_d: dict mapping read name -> (start, end) coordinate pair
    :param end_d: dict mapping raw end site -> stop-read count
    :param end_group_d: dict mapping raw site -> merged group representative
    :return: dict mapping representative site -> [total_stop_count, span_count]
    """
    ratio_d = {}
    for site, stop_count in end_d.items():
        real_site = end_group_d[site]
        if real_site not in ratio_d:
            # get_span_number is an O(len(pos_d)) scan; the span count depends
            # only on real_site, so compute it once per group instead of on
            # every member site (the old code discarded the repeat results)
            ratio_d[real_site] = [0, get_span_number(pos_d, real_site)]
        ratio_d[real_site][0] += stop_count
    return ratio_d
if __name__ == "__main__":
    import argparse

    # CLI: count where reads stop (3' ends by default, 5' starts with -m)
    # and report sites where the stop/span ratio passes a threshold.
    parser = argparse.ArgumentParser()
    parser.add_argument("-b", "--bamfile",
                        help="the sorted and indexed bam file")
    parser.add_argument("-e", "--end_coverage", type=int, default=3,
                        help="the min end coverage to be parsed")
    parser.add_argument("-r", "--region_offset", type=int, default=5,
                        help="the offset of end regions to be merged in counting spanning reads")
    parser.add_argument("-c", "--coverage_ratio", type=float, default=0.001,
                        help="the min coverage/count ratio of the end reads and spanning reads to be printed")
    parser.add_argument("-m", "--mode", type=str, default="end",
                        help="count the 3' end or the 5' start can be adjusted using the mode parameter")
    args = parser.parse_args()

    samfile = AlignmentFile(args.bamfile)
    ### get pos for all reads
    # Map read name -> (start, end). NOTE(review): a duplicate qname (e.g.
    # read pairs or secondary alignments) overwrites the earlier entry;
    # presumably the BAM holds one alignment per name — confirm.
    # NOTE(review): pysam's reference_end can be None for unmapped/unaligned
    # reads, which would break the arithmetic downstream — verify the input
    # is filtered to mapped reads.
    pos_d = {}
    for read in samfile.fetch():
        pos_d[read.qname] = (read.reference_start, read.reference_end)

    ### pipeline:
    # 1) count stop positions (ends or starts depending on --mode)
    if args.mode == "end":
        end_d = count_end(pos_d)
    else:
        end_d = count_start(pos_d)
    # 2) drop low-coverage sites, 3) merge nearby sites into groups,
    # 4) tally stop vs spanning reads per merged site
    end_d_f = filter_end(end_d, args.end_coverage)
    end_group_d = group_end(end_d_f, args.region_offset)
    ratio_d = scan_span_dic(pos_d, end_d_f, end_group_d)

    ## use py3 print to stdout
    # Emit: site, reads stopping there, reads spanning it — when the stop
    # fraction stop/(stop+span) clears the --coverage_ratio threshold.
    for k in sorted(ratio_d.keys()):
        v = ratio_d[k]
        n_stop, n_span = v
        ratio = float(n_stop) / (n_stop + n_span)
        if ratio >= args.coverage_ratio:
            print("{}\t{}\t{}".format(k, n_stop, n_span))
    samfile.close()