-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmycosnp_bwa_pp.nf
272 lines (213 loc) · 9.15 KB
/
mycosnp_bwa_pp.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
#!/usr/bin/env nextflow
//Deshellion: Workflow that prepares a reference FASTA file for BWA alignment and
//GATK variant calling by masking repeats in the reference and generating the BWA index.
//Author: Lynn Dotrang
//eMail: [email protected]
// Before completion, need to edit for all full paths and all hard coded values (CPU usage options, etc)

// Input locations are pipeline parameters so they can be overridden on the
// command line (--reads / --reference). Defaults preserve the original
// hard-coded paths, so existing invocations behave identically.
params.reads     = "/home/ldotrang/C_auris_testing/SampleTestData/CombinedPairsTest/SRR*{1,2}.fastq.gz"
params.reference = "/home/ldotrang/C_auris_testing/mycosnp-demo/test-data/reference/*.fasta"

// Paired-end FASTQs grouped per sample: emits (sample_id, [R1, R2]) tuples.
Channel
.fromFilePairs( params.reads, size: 2)
.ifEmpty { exit 1, "Cannot find any reads matching" }
.set {raw_reads}
// Reference FASTA, duplicated into three channels because DSL1 channels are
// single-use (consumed by seqtk_downsample, qc_report, and bwa_align).
Channel
.fromPath( params.reference )
.ifEmpty { exit 1, "Cannot find any reference fasta" }
.view()
.into {reference_seq; reference_seq2; reference_seq3}
// Am I going to need this if I expect the correct file type every time?
//process concat_fastq_lanes{
// input:
// files
// output:
// file("masked_ref.bed") into masked_ref_bed
// shell:
// // Very long script goes here
// """
// #!/bin/bash -ue
// ##SCRIPT THAT DOES THE CONCAT STUFF HERE
// """
// }
// Keep only reads that still have a mate: `seqkit pair` matches R1/R2 records
// by ID. -u additionally writes unpaired reads to separate files (those are
// not captured by the output glob below); -j 4 uses 4 threads.
process seqkit_fastq_pair{
input:
tuple val(name), file(reads) from raw_reads
output:
// NOTE(review): seqkit pair names its outputs after the *input* file names;
// this glob assumes they come out as "${name}{_1,_2}.paired.fastq.gz" —
// confirm against the sample ids fromFilePairs actually produces.
tuple name, file("${name}{_1,_2}.paired.fastq.gz") into filtered_reads
shell:
"""
seqkit pair -1 ${reads[0]} -2 ${reads[1]} -u -j 4
"""
}
// Downsample each read pair to ~30x coverage of the reference using seqtk.
process seqtk_downsample{
tag "$name"
input:
tuple val(name), file(reads), file(reference) from filtered_reads.combine(reference_seq)
output:
tuple name, file("${name}{_1,_2}.fastq") into downsampled_reads, bwa_downsampled_reads, fastqc_downsampled_reads
shell:
"""
#!/bin/bash -ue
#Set some variables first
# Target depth of coverage; TODO lift into a pipeline parameter.
COVERAGE=30
# One shared seed so both seqtk calls pick the SAME records from R1 and R2,
# preserving read pairing. Note \${RANDOM} makes the subsample non-reproducible.
SEED=\${RANDOM}
# Reference length = sum of sequence-line lengths (FASTA headers excluded).
REFERENCE_LEN=\$(awk '!/^>/ {len+=length(\$0)} END {print len}' < ${reference})
echo \${REFERENCE_LEN} > seqtktestoutput.txt
# Total bases across both mates. FASTQ records are 4 lines and the sequence is
# line 2 of each record (NR%4==2). Matching /^@/ would be wrong here: '@' is a
# valid quality character, so quality lines can also start with '@'.
READS_LEN=\$(zcat ${reads[0]} ${reads[1]} | awk 'NR%4==2 {len+=length(\$0)} END {print len}')
echo \${READS_LEN} >> seqtktestoutput.txt
# Fraction of reads to keep; capped at 1 when existing coverage is below target.
SAMPLE_RATE=\$(echo "\${COVERAGE} \${READS_LEN} \${REFERENCE_LEN}" | awk '{x=\$1/(\$2/\$3); x=(1<x?1:x)} END {print x}')
echo \${SAMPLE_RATE} >> seqtktestoutput.txt
# Reads per mate file = total lines / 4 (again, do not count '@'-prefixed lines).
NUM_READS=\$(zcat ${reads[0]} | awk 'END {printf "%d", NR/4}')
echo \${NUM_READS} >> seqtktestoutput.txt
SAMPLED_NUM_READS=\$(echo "\${NUM_READS} \${SAMPLE_RATE}" | awk '{x=\$1*\$2} END {printf "%.0f", x}')
echo \${SAMPLED_NUM_READS} >> seqtktestoutput.txt
#need to run seqtk twice, once per read
seqtk sample -s \${SEED} ${reads[0]} \${SAMPLED_NUM_READS} > ${name}_1.fastq
seqtk sample -s \${SEED} ${reads[1]} \${SAMPLED_NUM_READS} > ${name}_2.fastq
"""
}
// Run FaQCs quality trimming/QC on the downsampled pair and publish the
// PDF reports; the PDFs feed both qc_report (parsing) and multiqc.
process faqcs{
publishDir "/home/ldotrang/C_auris_testing/nfBWAref/bwa_preprocess_output/qc_reports", mode: 'copy'
input:
// `set` is the older DSL1 spelling of `tuple` used elsewhere in this file.
set val(name), file(reads) from downsampled_reads
output:
file('*.pdf') into qc_printouts, mqc_faqcs
shell:
"""
FaQCs --prefix ${name} -1 ${reads[0]} -2 ${reads[1]} -d .
"""
}
// Parse each FaQCs PDF report into a one-row, tab-separated metrics file
// (before/after-trimming read counts, GC, Phred, and coverage vs reference).
process qc_report{
// NOTE(review): qc_printouts emits bare files, so combine(reference_seq2)
// yields 2-tuples, not the 3-tuple declared below. In practice `name` binds
// the PDF path (hence name.baseName / pdftotext "${name}") and `report` the
// reference — it happens to proceed, but the declaration should be cleaned up.
publishDir "/home/ldotrang/C_auris_testing/nfBWAref/bwa_preprocess_output/qc_reports", mode: 'copy'
tag "$name"
input:
tuple name, file(report), file(reference) from qc_printouts.combine(reference_seq2)
output:
file("${name.baseName}.formattedreport.csv") into qc_report
shell:
//this villain again
"""
# Reference length = sum of sequence-line lengths (headers excluded).
REFERENCE_LEN=\$(awk '!/^>/ {len+=length(\$0)} END {print len}' < ${reference})
printf \"Sample Name\\t# Reads Before Trimming\\tGC Before Trimming\\tAverage Phred Before Trimming\\tCoverage Before Trimming\\t# Reads After Trimming\\t# Paired Reads After Trimming\\t# Unpaired Reads After Trimming\\tGC After Trimming\\tAverage Phred After Trimming\\tCoverage After Trimming\\n\" > ${name.baseName}.formattedreport.csv;
# NOTE(review): `continue` outside a loop is a bash error; `|| true` is likely
# what was meant if pdftotext failures should be tolerated.
pdftotext -raw \"${name}\" \"${name}.txt\"|| continue;
printf \"${name.baseName}\" | awk -F'_qc_report' '{printf \$1\"\\t\"}' >> ${name.baseName}.formattedreport.csv
cat \"${name}.txt\" | grep 'Reads #:' | sed -z 's/\\n/|/g' | sed 's/Reads #: //g' | sed 's/Paired //g' | sed 's/Unpaired //g' | awk -F'|' '{printf \$1\"\\t\"}' >> ${name.baseName}.formattedreport.csv
cat \"${name}.txt\" | grep 'GC' | grep -v 'Reads' | sed -z 's/\\n/|/g' | sed 's/GC //g' | sed 's/ ± /|/g' | awk -F'|' '{printf \$1\"\\t\"}' >> ${name.baseName}.formattedreport.csv
cat \"${name}.txt\" | grep 'Average' | grep -v 'Reads' | sed -z 's/\\n/|/g' | sed 's/Average: //g' | awk -F'|' '{printf \$1\"\\t\"}' >> ${name.baseName}.formattedreport.csv
# Coverage = total bases / reference length. The shell variable must be passed
# into awk with -v: inside the single-quoted awk program, \$REFERENCE_LEN was an
# *unset awk* variable, so \$REFERENCE_LEN evaluated as \$0 and the coverage
# columns were wrong.
cat \"${name}.txt\" | grep 'Total bases:' | sed -z 's/\\n/|/g' | sed 's/Total bases: //g' | awk -F'|' -v ref=\${REFERENCE_LEN} '{x=\$1/ref} END {printf x\"\\t\"}' >> ${name.baseName}.formattedreport.csv
cat \"${name}.txt\" | grep 'Reads #:' | sed -z 's/\\n/|/g' | sed 's/Reads #: //g' | sed 's/Paired //g' | sed 's/Unpaired //g' | awk -F'|' '{printf \$2\"\\t\"\$3\"\\t\"\$4\"\\t\"}' >> ${name.baseName}.formattedreport.csv
cat \"${name}.txt\" | grep 'GC' | grep -v 'Reads' | sed -z 's/\\n/|/g' | sed 's/GC //g' | sed 's/ ± /|/g' | awk -F'|' '{printf \$3\"\\t\"}' >> ${name.baseName}.formattedreport.csv
cat \"${name}.txt\" | grep 'Average' | grep -v 'Reads' | sed -z 's/\\n/|/g' | sed 's/Average: //g' | awk -F'|' '{printf \$2\"\\t\"}' >> ${name.baseName}.formattedreport.csv
# Same -v fix for the after-trimming coverage column.
cat \"${name}.txt\" | grep 'Total bases:' | sed -z 's/\\n/|/g' | sed 's/Total bases: //g' | sed 's/ /|/g' | awk -F'|' -v ref=\${REFERENCE_LEN} '{x=\$2/ref} END {printf x\"\\n\"}' >> ${name.baseName}.formattedreport.csv
"""
}
// Align the downsampled read pair to the reference with BWA-MEM,
// emitting (sample_id, SAM) for downstream sorting.
// NOTE(review): `bwa index` is re-run here for every sample; building the
// index once in a dedicated upstream process would avoid the repeated work.
process bwa_align{
tag "$name"
input:
tuple name, file(reads), file(reference) from bwa_downsampled_reads.combine(reference_seq3)
output:
tuple name, file("${name}.sam") into bwa_sams
shell:
// -t 4: hard-coded thread count (see header TODO about parameterizing CPUs)
"""
bwa index ${reference}
bwa mem -t 4 ${reference} ${reads[0]} ${reads[1]} > ${name}.sam
"""
}
// Convert the BWA SAM to BAM and coordinate-sort it (samtools sort default;
// -n would sort by query name instead, which downstream picard does not want).
process bam_sort{
tag "$name"
input:
tuple name, file(sam) from bwa_sams
output:
tuple name, file("${name}_sorted.bam") into sorted_bams
shell:
"""
# Stream the SAM->BAM conversion straight into the sorter: avoids writing an
# intermediate unsorted ${name}.bam to disk, and -o names the output
# explicitly instead of relying on stdout redirection.
samtools view -b ${name}.sam | samtools sort -o ${name}_sorted.bam -
"""
}
// Post-process the sorted BAM with Picard: mark duplicates, clean the SAM
// records, fix mate information, and attach a (placeholder) read group.
// Emits the final BAM for indexing/qualimap and the duplicate-metrics file
// for MultiQC.
process picard{
publishDir "/home/ldotrang/C_auris_testing/nfBWAref/bwa_preprocess_output", mode: 'copy'
tag "$name"
//MarkDuplicates only *marks* here: REMOVE_DUPLICATES defaults to false
//assume_sort_order is default to coordinate
//validation_stringency is default to lenient in og wf, but default is strict
//original wf has all of these as changeable params
//for addorreplacerreads there are a lot of params
// library
// barcode
// platform
// and MORE
// VALIDATION_STRINGENCY="LENIENT"
// REMOVE_DUPLICATES="false"
// ASSUME_SORT_ORDER="coordinate"
// EXEC_METHOD="auto"
// EXEC_INIT=":"
input:
tuple name, file(bam) from sorted_bams
output:
tuple name, file("${name}_picard.bam") into picard_bams, qualimap_bams
file ("*-metrics.txt") into mqc_picardmd
shell:
"""
picard MarkDuplicates I=${name}_sorted.bam O=${name}_marked_dup.bam METRICS_FILE=${name}-metrics.txt \
VALIDATION_STRINGENCY="LENIENT" \
ASSUME_SORT_ORDER="coordinate"
# No space after I=/O=: "I= file" splits into two shell tokens ("I=" with an
# empty value plus a stray positional), which Picard's legacy parser mishandles.
picard CleanSam I=${name}_marked_dup.bam O=${name}_cleansam.bam VALIDATION_STRINGENCY=LENIENT
picard FixMateInformation I=${name}_cleansam.bam O=${name}_fixmate.bam
# NOTE(review): hard-coded placeholder read group (ID=1, SM=sample); should be
# derived per sample before production use.
picard AddOrReplaceReadGroups I=${name}_fixmate.bam O=${name}_picard.bam ID=1 LB=lib1 PL=ILLUMINA PU=unit1 SM=sample
"""
}
// Index the final Picard-processed BAM so downstream tools (e.g. GATK)
// can random-access it; publishes the .bai next to the published BAM.
process bam_index{
publishDir "/home/ldotrang/C_auris_testing/nfBWAref/bwa_preprocess_output", mode: 'copy'
tag "$name"
input:
tuple name, file (bam) from picard_bams
output:
// samtools index writes ${name}_picard.bam.bai alongside the input BAM.
tuple name, file("${name}_picard.bam.bai") into indexed_bam
shell:
"""
samtools index ${name}_picard.bam
"""
}
// additional QC steps
// Run FastQC on the downsampled reads (-q suppresses progress output);
// the zip/html reports are published and collected for MultiQC.
process fastqc{
//same as hickory
publishDir "/home/ldotrang/C_auris_testing/nfBWAref/bwa_preprocess_output/qc_reports", mode: 'copy'
tag "$name"
input:
tuple name, file(reads) from fastqc_downsampled_reads
output:
file("*_fastqc.{zip,html}") into fastqc_results
script:
"""
fastqc -q ${reads}
"""
}
// Run Qualimap BAM QC on the deduplicated BAM and publish its report directory.
process qualimap{
publishDir "/home/ldotrang/C_auris_testing/nfBWAref/bwa_preprocess_output/qc_reports", mode: 'copy'
tag "$name"
input:
tuple name, file (bam) from qualimap_bams
output:
// NOTE(review): the "*" glob captures everything in the work dir, which may
// include the staged input BAM — consider narrowing to qualimap's output dir.
tuple name, file ("*") into qualimap_results
script:
"""
qualimap bamqc -bam ${bam}
"""
}
// Aggregate FaQCs, Picard duplicate-metrics, FastQC, and Qualimap outputs
// into a single MultiQC report.
process multiqc{
publishDir "/home/ldotrang/C_auris_testing/nfBWAref/bwa_preprocess_output/qc_reports", mode: 'copy'
input:
file faqcs from mqc_faqcs
file picardmd from mqc_picardmd
file fastqc from fastqc_results.collect()
file qualimap from qualimap_results
output:
file("*multiqc_report.html") into multiqc_report
file("*_data")
script:
// NOTE(review): redirecting all output to /dev/null hides MultiQC errors;
// failures will only surface as a missing-output error from Nextflow.
"""
multiqc . >/dev/null 2>&1
"""
}