-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathngs_BLAST.sh
executable file
·304 lines (253 loc) · 12 KB
/
ngs_BLAST.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
#!/bin/bash
# Copyright (c) 2012,2013, Stephen Fisher and Junhyong Kim, University of
# Pennsylvania. All Rights Reserved.
#
# You may not use this file except in compliance with the Kim Lab License
# located at
#
# http://kim.bio.upenn.edu/software/LICENSE
#
# Unless required by applicable law or agreed to in writing, this
# software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License
# for the specific language governing permissions and limitations
# under the License.
##########################################################################################
# INPUT: $SAMPLE/init/unaligned_1.fq
# OUTPUT: $SAMPLE/blast/blast.txt (blast output), $SAMPLE/blast/species.txt (species hit counts)
# REQUIRES: blastn (provided with Blast version 2), randomSample.py, parseBlast.py
##########################################################################################
##########################################################################################
# USAGE
##########################################################################################
NGS_USAGE+="Usage: `basename $0` blast OPTIONS sampleID -- run blast on randomly sampled subset of reads\n"
##########################################################################################
# HELP TEXT
##########################################################################################
ngsHelp_BLAST() {
echo -e "Usage:\n\t`basename $0` blast [-r numReads] [-k kmer] [-l readLength] -p numProc -s species sampleID"
echo -e "Input:\n\tsampleID/init/unaligned_1.fq"
echo -e "Output:\n\tsampleID/blast/blast.txt (blast output)\n\tsampleID/blast/sampleID.blast.stats.txt (species hit counts)"
echo -e "Requires:\n\tblastn ( ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/ )\n\trandomSample.py ( https://github.com/safisher/ngs )\n\tparseBlast.py ( https://github.com/safisher/ngs )"
echo -e "Options:"
echo -e "\t-r numReads - number of reads to randomly select (default = 5000)"
echo -e "\t-k kmer - check for presence of k-mer in reads that failed to align"
echo -e "\t-l readLength - read length (default = 100). If paired end then this is the length of one mate. If readLength <= 50 then an e-value of 1e-8 will be used for Blast. If readLength > 50 then an e-value of 1e-15 will be used."
echo -e "\t-p numProc - number of cpu to use"
echo -e "\t-s species - expected species\n"
echo -e "Run blast on 5000 reads randomly sampled from init/unaligned_1.fq. Blast paramters used are 'num_descriptions: 10 num_alignments: 10 word_size: 15 gapopen: 3 gapextend: 1 evalue: 1e-15 (> 50bp reads) or 1e-8 (<= 50bp reads)'. The output is put in a directory called 'blast'. The species.txt file contains number of reads mapping to each species (mouse, rat, human, bacteria).\n"
echo -e "The blast mappings are split into two files: reads.counted.txt and reads.notCounted.txt. Reads.notCounted.txt contains all reads that mapped to something that wasn’t one of the species we track (i.e. mouse, rat, human, bacteria, fly, zebra fish, yeast, ERCC). Browsing this file should elucidate the species of the uncounted mappings."
}
##########################################################################################
# LOCAL VARIABLES WITH DEFAULT VALUES. Using the naming convention to
# make sure these variables don't collide with the other modules.
##########################################################################################
# number of reads to randomly select
ngsLocal_BLAST_NUM_READS=5000
ngsLocal_BLAST_KMER=""
ngsLocal_BLAST_EVALUE=1e-15
##########################################################################################
# PROCESSING COMMAND LINE ARGUMENTS
# BLAST args: -p value, sampleID
##########################################################################################
ngsArgs_BLAST() {
if [ $# -lt 5 ]; then printHelp "BLAST"; fi
# getopts doesn't allow for optional arguments so handle them manually
while true; do
case $1 in
-p) NUMCPU=$2
shift; shift;
;;
-s) SPECIES=$2
shift; shift;
;;
-k) ngsLocal_BLAST_KMER=$2
shift; shift;
;;
-l) READ_LENGTH=$2
shift; shift;
;;
-r) ngsLocal_BLAST_NUM_READS=$2
shift; shift;
;;
-*) printf "Illegal option: '%s'\n" "$1"
printHelp $COMMAND
exit 0
;;
*) break ;;
esac
done
SAMPLE=$1
}
##########################################################################################
# RUNNING COMMAND ACTION
# This will do a BLAST search on 5,000 untrimmed reads, using the nt database.
##########################################################################################
ngsCmd_BLAST() {
prnCmd "# BEGIN: BLAST"
# make relevant directory
if [ ! -d $SAMPLE/blast ]; then
prnCmd "mkdir $SAMPLE/blast"
if ! $DEBUG; then mkdir $SAMPLE/blast; fi
fi
# Get ngsLocal_BLAST_NUM_READS (5,000) randomly sampled reads
# Usage: randomSample.py <num lines> <lines grouped> <input> <output>
prnCmd "randomSample.py $ngsLocal_BLAST_NUM_READS 4 $SAMPLE/init/unaligned_1.fq $SAMPLE/blast/raw.fq > $SAMPLE/blast/sampling.out.txt"
if ! $DEBUG; then
randomSample.py $ngsLocal_BLAST_NUM_READS 4 $SAMPLE/init/unaligned_1.fq $SAMPLE/blast/raw.fq > $SAMPLE/blast/sampling.out.txt
fi
# Convert fastq file to fasta file
prnCmd "awk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,\">\");print}; if(P==4)P=0; P++}' $SAMPLE/blast/raw.fq > $SAMPLE/blast/raw.fa"
if ! $DEBUG; then
awk 'BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}' $SAMPLE/blast/raw.fq > $SAMPLE/blast/raw.fa
fi
# use a less stringent e-value for shorter reads
if [ $READ_LENGTH -le 50 ]; then
ngsLocal_BLAST_EVALUE=1e-8
fi
# Run BLAST. Output file should end with ".txt"
prnCmd "blastn -query $SAMPLE/blast/raw.fa -db nt -num_descriptions 10 -num_alignments 10 -word_size 15 -gapopen 3 -gapextend 1 -evalue $ngsLocal_BLAST_EVALUE -num_threads $NUMCPU -out $SAMPLE/blast/blast.txt"
if ! $DEBUG; then
blastn -query $SAMPLE/blast/raw.fa -db nt -num_descriptions 10 -num_alignments 10 -word_size 15 -gapopen 3 -gapextend 1 -evalue $ngsLocal_BLAST_EVALUE -num_threads $NUMCPU -out $SAMPLE/blast/blast.txt
fi
# Parse BLAST output. Will generate *.cvs and *.hits files.
# Usage: parseBlast.py targetSpecies readsFastaFile blastFile
prnCmd "parseBlast.py $SPECIES $SAMPLE/blast/raw.fa $SAMPLE/blast/blast.txt"
if ! $DEBUG; then
parseBlast.py $SPECIES $SAMPLE/blast/raw.fa $SAMPLE/blast/blast.txt
# test for kmer in reads that didn't map with BLAST. These reads
# are put into the file "failed.fa" by parseBlast.py.
if [[ ! -z "$ngsLocal_BLAST_KMER" ]]; then
# $ngsLocal_BLAST_KMER has a non-zero length, so user gave us
# a k-mer to test
local count=$(grep -c "$ngsLocal_BLAST_KMER" $SAMPLE/blast/failed.fa)
# We add the count to THE TOP of the speciesCounts.txt file
# that parseBlast.py generates. We use the top of the file
# because the last line in the file contains a tab-delimited
# string of counts generated by parseBlast.py.
local line="k-mer count: ${count}"
if [[ ${OS_VERSION} == "Darwin" ]]; then
# Sed on the Mac requires a zero-length extension when
# using the -i option. This is not a requirement in linux.
sed -i "" '1 i\
'"$line"'
' $SAMPLE/blast/speciesCounts.txt
else
sed -i "1 i${line}" $SAMPLE/blast/speciesCounts.txt
fi
line="k-mer sequence: ${ngsLocal_BLAST_KMER}"
if [[ ${OS_VERSION} == "Darwin" ]]; then
sed -i "" '1 i\
'"$line"'
' $SAMPLE/blast/speciesCounts.txt
else
sed -i "1 i${line}" $SAMPLE/blast/speciesCounts.txt
fi
fi
fi
# rename output stats file to conform to other modules
prnCmd "mv $SAMPLE/blast/speciesCounts.txt $SAMPLE/blast/$SAMPLE.blast.stats.txt"
if ! $DEBUG; then
mv $SAMPLE/blast/speciesCounts.txt $SAMPLE/blast/$SAMPLE.blast.stats.txt
fi
# run error checking
if ! $DEBUG; then ngsErrorChk_BLAST $@; fi
# print version info in $SAMPLE directory. We do this AFTER
# parseBlast.py has run because we need to get the version number
# from the output file.
prnCmd "# blastn version: blastn -version | tail -1 | awk '{print \$3}' | sed s/,//"
prnCmd "# parseBlast.py version: grep parseBlast $SAMPLE/blast/$SAMPLE.blast.stats.txt | awk -F: '{print \$2}'"
if ! $DEBUG; then
# gets this: "Package: blast 2.2.28, build Mar 12 2013 16:52:31"
# returns this: "2.2.28"
ver=$(blastn -version | tail -1 | awk '{print $3}' | sed s/,//)
ver1=$(grep parseBlast $SAMPLE/blast/$SAMPLE.blast.stats.txt | awk -F: '{print $2}')
local kmer=$ngsLocal_BLAST_KMER
if [[ $ngsLocal_BLAST_KMER == "" ]]; then kmer="none"; fi
prnVersion "blast" \
"program\tversion\tprogram\tversion\tspecies\treadLength\tnumReads\tkmer" \
"blastn\t$ver\tparseBlast.py\t$ver1\t$SPECIES\t$READ_LENGTH\t$ngsLocal_BLAST_NUM_READS\t$ngsLocal_BLAST_KMER"
fi
prnCmd "# FINISHED: BLAST"
}
##########################################################################################
# ERROR CHECKING. Make sure output file exists and contains species counts.
##########################################################################################
ngsErrorChk_BLAST() {
prnCmd "# BLAST ERROR CHECKING: RUNNING"
inputFile="$SAMPLE/init/unaligned_1.fq"
outputFile="$SAMPLE/blast/$SAMPLE.blast.stats.txt"
# make sure expected output file exists
if [ ! -f $outputFile ]; then
errorMsg="Expected BLAST output file does not exist.\n"
errorMsg+="\tinput file: $inputFile\n"
errorMsg+="\toutput file: $outputFile\n"
prnError "$errorMsg"
fi
# compute number of lines in speciesCounts.txt
counts=`wc -l $outputFile | awk '{print $1}'`
# if counts file has less than 3 lines, then BLAST didn't work
if [ "$counts" -lt "3" ]; then
errorMsg="BLAST failed to run properly and there are no species counts.\n"
errorMsg+="\tinput file: $inputFile\n"
errorMsg+="\toutput file: $outputFile\n"
prnError "$errorMsg"
fi
prnCmd "# BLAST ERROR CHECKING: DONE"
}
##########################################################################################
# PRINT STATS. Prints a tab-delimited list stats of interest.
##########################################################################################
ngsStats_BLAST() {
if [ $# -ne 1 ]; then
prnError "Incorrect number of parameters for ngsStats_BLAST()."
fi
local statsFile="$SAMPLE/blast/$SAMPLE.blast.stats.txt"
# the second to the last line of the stats file is a tab-delimited lists of headers
# Total Hits Hits Not Counted Bacteria Fish Fly Human Mouse Rat Yeast
local header=$(tail -2 $statsFile | head -1)
# the last line of the stats file is a tab-delimited lists of values
local values=$(tail -1 $statsFile | tr -d '%')
# test if k-mer search results exist in stats file. If so the
# include in output.
local kmerSeq=$(grep "k-mer sequence" $statsFile | awk '{print $3}')
# if not empty, then k-mer results exist
if [[ ! -z "$kmerSeq" ]]; then
# append sequence to header/values list
header+="\t$kmerSeq"
# add the count to the values list
local kmerCount=$(grep "k-mer count" $statsFile | awk '{print $3}')
# append sequence to header/values list
values+="\t$kmerCount"
fi
pCounted=$(grep 'Number hits:' $statsFile | awk -vRS="% )" -vFS="(" '{print $2}')
header="$header\tPerc Counted"
values="$values\t$pCounted"
case $1 in
header)
echo $header
;;
values)
echo $values
;;
keyvalue)
# output key:value pair of stats
# the bash IFS variable dictates the word delimiting which is " \t\n"
# by default. We want to only delimite by tabs for the case here.
local IFS=$'\t'
# convert tab-delimited header/values variables to array
declare -a headerArray=($header)
declare -a valuesArray=($values)
# output a tab-delimited, key:value list
numFields=${#headerArray[@]}
for ((i=0; i<$numFields-1; ++i)); do
echo -en "${headerArray[$i]}:${valuesArray[$i]}\t"
done
echo "${headerArray[$numFields-1]}:${valuesArray[$numFields-1]}"
;;
*)
# incorrect argument
prnError "Invalid parameter for ngsStats_BLAST() (got $1, expected: 'header|values')."
;;
esac
}