forked from yezhengSTAT/mHiC
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy paths2_joinEnd.py
235 lines (191 loc) · 8.74 KB
/
s2_joinEnd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
#!/usr/bin/env python
## mHiC
## Author: Ye Zheng
## Contact: [email protected]
'''
Script to join two single-end alignment with multi-mapping reads into pair-ended bam file.
Only aligned pair-end reads are kept.
Alignment statistics are summarized by demand.
Dec, 2016
'''
import sys
import os
import re
import pysam
import argparse
def get_args():
"""Get arguments"""
parser = argparse.ArgumentParser(description = '------------Usage Start------------',
epilog = '------------Usage End------------')
parser.add_argument('-r1', '--readEnd1', help = 'Path to the aligned file of read end 1.')
parser.add_argument('-r2', '--readEnd2', help = 'Path to the aligned file of read end 2.')
parser.add_argument('-o', '--output', help = 'Output file.', default = None)
parser.add_argument('-s', '--summary', help = '(Optional) Summary of alignment results. Default is true.', default = True)
parser.add_argument('-sf', '--summaryFile', help = '(Optional) Summary file path . Default is infileName.alignSummary.', default = None)
parser.add_argument('-v', '--verbose', help = '(Optional) Verbose. Default is true.', default = True)
args = parser.parse_args()
if args.readEnd1 is None or args.readEnd2 is None or args.output is None:
parser.print_help()
sys.exit()
if not os.path.exists(args.readEnd1):
print("Single-ended read 1 file does not exist!")
sys.exit()
if not os.path.exists(args.readEnd2):
print("Single-ended read 2 file does not exist!")
sys.exit()
if args.output is None:
print("Please specify the output file to store the pair-ended alignment files after join the ends.")
sys.exit()
if not os.path.exists(args.output.rsplit("/", 1)[0]):
os.makedirs(args.output.rsplit("/", 1)[0])
return args
def is_uniread(read):
if not read.is_unmapped and read.mapping_quality > 24 and not read.has_tag('XA'):
return True
else:
return False
def is_lowQualMulti(read):
if not read.is_unmapped and read.mapping_quality < 25 and not read.has_tag("XA"):
return True
else:
return False
def sam_flag(r1, r2, file1, file2):
'''
This function cites the same function from Hi-C Pro (https://github.com/nservant/HiC-Pro/blob/master/scripts/mergeSAM.py) by Nicolas Servant, Eric Viara
'''
f1 = r1.flag
f2 = r2.flag
if r1.is_unmapped == False:
r1_chrom = file1.getrname(r1.tid)
else:
r1_chrom="*"
if r2.is_unmapped == False:
r2_chrom = file2.getrname(r2.tid)
else:
r2_chrom="*"
##Relevant bitwise flags (flag in an 11-bit binary number)
##1 The read is one of a pair
##2 The alignment is one end of a proper paired-end alignment
##4 The read has no reported alignments
##8 The read is one of a pair and has no reported alignments
##16 The alignment is to the reverse reference strand
##32 The other mate in the paired-end alignment is aligned to the reverse reference strand
##64 The read is the first (#1) mate in a pair
##128 The read is the second (#2) mate in a pair
##The reads were mapped as single-end data, so should expect flags of
##0 (map to the '+' strand) or 16 (map to the '-' strand)
##Output example: a paired-end read that aligns to the reverse strand
##and is the first mate in the pair will have flag 83 (= 64 + 16 + 2 + 1)
if f1 & 0x4:
f1 = f1 | 0x8
if f2 & 0x4:
f2 = f2 | 0x8
if (not (f1 & 0x4) and not (f2 & 0x4)):
##The flag should now indicate this is paired-end data
f1 = f1 | 0x1
f1 = f1 | 0x2
f2 = f2 | 0x1
f2 = f2 | 0x2
##Indicate if the pair is on the reverse strand
if f1 & 0x10:
f2 = f2 | 0x20
if f2 & 0x10:
f1 = f1 | 0x20
##Is this first or the second pair?
f1 = f1 | 0x40
f2 = f2 | 0x80
##Insert the modified bitwise flags into the reads
r1.flag = f1
r2.flag = f2
##Determine the RNEXT and PNEXT values (i.e. the positional values of a read's pair)
#RNEXT
if r1_chrom == r2_chrom:
r1.rnext = r1.tid
r2.rnext = r1.tid
else:
r1.rnext = r2.tid
r2.rnext = r1.tid
#PNEXT
r1.pnext = r2.pos
r2.pnext = r1.pos
return(r1, r2)
if __name__ == "__main__":
args = get_args()
if args.verbose:
print("Read 1 file = ", args.readEnd1)
print("Read 2 file = ", args.readEnd2)
print("Output file = ", args.output)
print("Summary of alignment results = ", args.summary)
print("Summary file path = ", args.summaryFile)
print("Verbose = ", args.verbose)
## Initial count variable for summary
singleton_count = 0
unmapped_count = 0
mapped_count = 0
uni_count = 0
multi_count = 0
multi_lowqual_count = 0
total_count = 0
if args.verbose:
print("------------Begin joining ends from read 1 file and read 2 file------------")
with pysam.Samfile(args.readEnd1, "rb") as readEnd1, pysam.Samfile(args.readEnd2, "rb") as readEnd2:
if args.output.endswith(".bam"):
outfile = pysam.AlignmentFile(args.output, "wb", template = readEnd1)
elif args.output.endswith(".sam"):
outfile = pysam.AlignmentFile(args.output, "wh", template = readEnd1)
else:
print("Output file format should be either bam or sam.")
sys.exit()
for r1, r2 in zip(readEnd1.fetch(until_eof = True), readEnd2.fetch(until_eof = True)):
total_count += 1
if total_count % 1000000 == 0 and args.verbose:
print("# of reads processed: ", total_count)
if r1.qname == r2.qname:
if r1.is_unmapped == True and r2.is_unmapped == True:
unmapped_count += 1
continue
elif r1.is_unmapped == False and r2.is_unmapped == False:
mapped_count += 1
if is_uniread(r1) == True and is_uniread(r2) == True:
uni_count += 1
(r1, r2) = sam_flag(r1, r2, readEnd1, readEnd2)
outfile.write(r1)
outfile.write(r2)
elif is_lowQualMulti(r1) == True or is_lowQualMulti(r2) == True:
"""
Need to check if it is low quality multi-mapping read, i.e. if score = 0 but no XA tag, namely multi-read which can be aligned to too many positions beyound upper bound and they are considered as low quality multi-mapping read - they will be discarded eventually.
"""
multi_lowqual_count += 1
else:
"""
the rest aligned reads pair with at least one end having XA tag are multi-mapping read pairs.
"""
multi_count += 1
(r1, r2) = sam_flag(r1, r2, readEnd1, readEnd2)
outfile.write(r1)
outfile.write(r2)
else:
singleton_count += 1
else:
print("Read id in read 1 and read 2 file does not match. Please check the input read files and sort them correctly.")
sys.exit()
if args.summary:
if args.summaryFile is None:
summaryFile = re.sub("\.bam$|\.sam$", ".alignSummary", args.output)
statSummary = open(summaryFile, "w+")
else:
summaryFile = args.summaryFile
statSummary = open(summaryFile, "a")
statSummary.write("Step: Alignment and join both ends." + "\n" + "\n")
statSummary.write("Total number of read pairs:\t" + str(total_count) + "\n")
statSummary.write(" Total number of unmapped read pairs:\t" + str(unmapped_count) + "\n")
statSummary.write(" Total number of singleton read pairs:\t" + str(singleton_count) + "\n")
statSummary.write(" Total number of mapped read pairs:\t" + str(mapped_count) + "\n")
statSummary.write(" Total number of uniquely mapping read pair for both ends:\t" + str(uni_count) + "\n")
statSummary.write(" Total number of multi-mapping for at least one end of the read pairs:\t" + str(multi_count + multi_lowqual_count) + "\n")
statSummary.write(" Total number of multi-mapping for at least one end of the read pairs and is of high quality:\t" + str(multi_count) + "\n")
statSummary.write(" Total number of multi-mapping for at least one end of the read pairs and is of low quality:\t" + str(multi_lowqual_count) + "\n")
statSummary.close()
readEnd1.close()
readEnd2.close()
outfile.close()