From 242857870203536fe379ecdd4aab0e10e9861210 Mon Sep 17 00:00:00 2001 From: sejmodha Date: Thu, 20 Jul 2017 12:02:30 +0100 Subject: [PATCH] validation scripts added --- shuffle.sh | 26 ++++++++++++++++++++++++++ validate.sh | 26 ++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 shuffle.sh create mode 100644 validate.sh diff --git a/shuffle.sh b/shuffle.sh new file mode 100644 index 0000000..0e80ac1 --- /dev/null +++ b/shuffle.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +mkdir $2 +taxid="txid40120" +outfile=`echo $3` +touch $2/$outfile +printf "RunNo\tNoOfSeeds\tMinLen\tMinCov\n" > $2/$outfile +esearch -db protein -query "txid40120[Organism]" |efetch -format fasta > ${taxid}.fa +mv ${taxid}.fa $2/${taxid}.fa +awk 'BEGIN {RS = ">" ; FS = "\n" ; ORS = ""} $2 {print ">"$0}' $2/${taxid}.fa > $2/${taxid}_checked.fa +makeblastdb -in $2/${taxid}_checked.fa -dbtype 'prot' +for i in `seq 1 100`; +do + + cat $1|awk '/^>/ { if(i>0) printf("\n"); i++; printf("%s\t",$0); next;} {printf("%s",$0);} END { printf("\n");}' |shuf |head -n $2 |awk '{printf("%s\n%s\n",$1,$2)}' > $2/${i}.fa + echo $i; + makeblastdb -in $2/${i}.fa -dbtype 'prot' + blastp -query $2/${i}.fa -db $2/${i}.fa -outfmt '6 qseqid qlen sseqid stitle sacc evalue length qcovs' -out $2/${i}_blastp_output -num_threads 12 + minlen=`cat $2/${i}_blastp_output|sort -k7 -n|head -1|cut -f7`; + mincov=`cat $2/${i}_blastp_output|sort -k8 -n|head -1|cut -f8`; + echo "Minimum length is $minlen" + echo "Minimum coverage is $mincov" + printf $i"\t"$2"\t"$minlen"\t"$mincov"\n" >>$2/$outfile + blastp -query $2/${i}.fa -db $2/${taxid}_checked.fa -outfmt '6 qseqid qlen sseqid stitle sacc evalue length qcovs' -out $2/${i}_${taxid}_blastout -num_threads 12 + awk -F"\t" '{if($7>='$minlen' && $8>='$mincov') print}' $2/${i}_${taxid}_blastout > $2/${i}_filtered +done; diff --git a/validate.sh b/validate.sh new file mode 100644 index 0000000..a39f0b4 --- /dev/null +++ b/validate.sh @@ -0,0 +1,26 @@ +#! /bin/bash + +#Script to calculate number of false positive and false negatives +# This script uses a file with list of currently classified sequences and *_filtered files generated from shuffle.sh +# $1 -> txid40120_current_ns1_accession +# $2 -> OutputfileName +# $3 -> Number of Seeds +printf "NoSeeds\tRunNo\tLength\tCoverage\tSeqFound\tFalsePos\tFalseNeg\n" >$2 + +for file in `ls -tr *_combination` +do + echo "Processing $file" + RunNo=`echo "$file"|cut -f1 -d "_"` + Length=`echo "$file"|cut -f4 -d "_"` + Coverage=`echo "$file"|cut -f5 -d "_"` + cut -f3 $file |cut -f1 -d "."|sort|uniq > ${file}_acc + #number of sequences identified + found=`grep -c -wf $1 ${file}_acc` + #number of false positive + fp=`grep -c -v -wf $1 ${file}_acc` + #number false negative + fn=`grep -c -v -wf ${file}_acc $1` + + printf "$3\t$RunNo\t$Length\t$Coverage\t$found\t$fp\t$fn\n" >>$2 + +done