Skip to content

Commit

Permalink
validation scripts added
Browse files Browse the repository at this point in the history
  • Loading branch information
sejmodha committed Jul 20, 2017
1 parent 19cd7f7 commit 2428578
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 0 deletions.
26 changes: 26 additions & 0 deletions shuffle.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash

mkdir $2
taxid="txid40120"
outfile=`echo $3`
touch $2/$outfile
printf "RunNo\tNoOfSeeds\tMinLen\tMinCov\n" > $2/$outfile
esearch -db protein -query "txid40120[Organism]" |efetch -format fasta > ${taxid}.fa
mv ${taxid}.fa $2/${taxid}.fa
awk 'BEGIN {RS = ">" ; FS = "\n" ; ORS = ""} $2 {print ">"$0}' $2/${taxid}.fa > $2/${taxid}_checked.fa
makeblastdb -in $2/${taxid}_checked.fa -dbtype 'prot'
for i in `seq 1 100`;
do

cat $1|awk '/^>/ { if(i>0) printf("\n"); i++; printf("%s\t",$0); next;} {printf("%s",$0);} END { printf("\n");}' |shuf |head -n $2 |awk '{printf("%s\n%s\n",$1,$2)}' > $2/${i}.fa
echo $i;
makeblastdb -in $2/${i}.fa -dbtype 'prot'
blastp -query $2/${i}.fa -db $2/${i}.fa -outfmt '6 qseqid qlen sseqid stitle sacc evalue length qcovs' -out $2/${i}_blastp_output -num_threads 12
minlen=`cat $2/${i}_blastp_output|sort -k7 -n|head -1|cut -f7`;
mincov=`cat $2/${i}_blastp_output|sort -k8 -n|head -1|cut -f8`;
echo "Minimum length is $minlen"
echo "Minimum coverage is $mincov"
printf $i"\t"$2"\t"$minlen"\t"$mincov"\n" >>$2/$outfile
blastp -query $2/${i}.fa -db $2/${taxid}_checked.fa -outfmt '6 qseqid qlen sseqid stitle sacc evalue length qcovs' -out $2/${i}_${taxid}_blastout -num_threads 12
awk -F"\t" '{if($7>='$minlen' && $8>='$mincov') print}' $2/${i}_${taxid}_blastout > $2/${i}_filtered
done;
26 changes: 26 additions & 0 deletions validate.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#! /bin/bash

#Script to calculate number of false positive and false negatives
# This script uses a file with list of currently classified sequences and *_filtered files generated from shuffle.sh
# $1 -> txid40120_current_ns1_accession
# $2 -> OutputfileName
# $3 -> Number of Seeds
printf "NoSeeds\tRunNo\tLength\tCoverage\tSeqFound\tFalsePos\tFalseNeg\n" >$2

for file in `ls -tr *_combination`
do
echo "Processing $file"
RunNo=`echo "$file"|cut -f1 -d "_"`
Length=`echo "$file"|cut -f4 -d "_"`
Coverage=`echo "$file"|cut -f5 -d "_"`
cut -f3 $file |cut -f1 -d "."|sort|uniq > ${file}_acc
#number of sequences identified
found=`grep -c -wf $1 ${file}_acc`
#number of false positive
fp=`grep -c -v -wf $1 ${file}_acc`
#number false negative
fn=`grep -c -v -wf ${file}_acc $1`

printf "$3\t$RunNo\t$Length\t$Coverage\t$found\t$fp\t$fn\n" >>$2

done

0 comments on commit 2428578

Please sign in to comment.