-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy path05gene_annotation
123 lines (101 loc) · 13 KB
/
05gene_annotation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
##########################################################################################################################################################################################################################################################################################################################################
##########################################################################################################################################################################################################################################################################################################################################
######
#Annoate Genes, use Db-1 as an example
#Augustus
cd ../genome_assembly/Db-1/07Gene_anno/01RepeatMask_out
ln -sf ../../06TE_anno_pan/genome.fa.out .
ln -sf ../../06TE_anno_pan/genome.fa.out.gff .
ln -sf ../../06TE_anno_pan/genome.fa.mod.MAKER.masked .
seqkit split -i ./genome.fa.mod.MAKER.masked
cd ../02Augustus_out
rm ./Db-1.temp.gff
find ../01RepeatMask_out/genome.fa.mod.MAKER.masked.split/ -type f -name "*.masked" | awk '{print "augustus --species=arabidopsis --UTR=on --print_utr=on --exonnames=on --codingseq=on --genemodel=complete --alternatives-from-evidence=true --gff3=on --uniqueGeneId=true "$1" >> ./Db-1.temp.gff"}' > ./run_augustus
./run_augustus
augustus_GFF3_to_EVM_GFF3.pl ./Db-1.temp.gff > ./Db-1.augustus.gff3
#GeneMark
cd /xxx/genome_assembly/Db-1/07Gene_anno/03GeneMark_out
gmes_petap.pl --ES --sequence ../../04quickmerge_out000/ragoo_output_t2t_corr_gapclosing_polish/Db-1.ragoo.gapclosing.polished.fasta --cores 5
gtf2gff3.pl ./genemark.gtf | bedtools sort -i - > ./Db-1.genemark.gff3
#GlimmerHMM
seqkit split -f -i ../genome_assembly/Db-1/04quickmerge_out000/ragoo_output_t2t_corr_gapclosing_polish/Db-1.ragoo.gapclosing.polished.fasta
find ../genome_assembly/Db-1/04quickmerge_out000/ragoo_output_t2t_corr_gapclosing_polish/Db-1.ragoo.gapclosing.polished.fasta.split/ -type f -name "*.fasta" | awk '{split($1,a,".part_"); print "glimmerhmm_linux_x86_64 "$1" /xxx/GlimmerHMM/trained_dir/arabidopsis -g -f -n 1 > ../genome_assembly/Db-1/07Gene_anno/04GlimmerHMM_out/Db-1."a[2]".gff"}' > ../genome_assembly/Db-1/07Gene_anno/04GlimmerHMM_out/run_glimmerhmm
../genome_assembly/Db-1/07Gene_anno/04GlimmerHMM_out/run_glimmerhmm
gff3_merge -o ../genome_assembly/Db-1/07Gene_anno/04GlimmerHMM_out/Db-1.glimmerhmm.gff ../genome_assembly/Db-1/07Gene_anno/04GlimmerHMM_out/Db-1.*fasta.gff
glimmerHMM_to_GFF3.pl ../genome_assembly/Db-1/07Gene_anno/04GlimmerHMM_out/Db-1.glimmerhmm.gff > ../genome_assembly/Db-1/07Gene_anno/04GlimmerHMM_out/Db-1.glimmerhmm.gff3
#SNAP
cd ../genome_assembly/Db-1/07Gene_anno/06SNAP_out
ln -sf /xxx/genome_assembly/Db-1/04quickmerge_out000/ragoo_output_t2t_corr_gapclosing_polish/Db-1.ragoo.gapclosing.polished.fasta genome.fa
snap /xxx/snap/HMM/A.thaliana.hmm genome.fa -gff > Db-1.snap.gff
#RNA-seq, Trinity + TransDecoder + CD-HIT, and then send to Exonerate
/xxx/trinityrnaseq-2.14.0/bin/Trinity --seqType fq --max_memory 50G --CPU 8 --left ../raw_data/RNA_publicData/ERR1588809_1.trim.paired.fastq.gz,../raw_data/RNA_publicData/ERR1588810_1.trim.paired.fastq.gz --right ../raw_data/RNA_publicData/ERR1588809_2.trim.paired.fastq.gz,../raw_data/RNA_publicData/ERR1588810_2.trim.paired.fastq.gz --output ../transcriptome_assembly/Can-0_pe_trinity/ --full_cleanup --no_version_check
/xxx/trinityrnaseq-2.14.0/bin/Trinity --seqType fq --max_memory 50G --CPU 8 --single ../raw_data/RNA_publicData/SRR1046859.trim.fastq.gz,../raw_data/RNA_publicData/SRR1046860.trim.fastq.gz,../raw_data/RNA_publicData/SRR1046861.trim.fastq.gz --output ../transcriptome_assembly/Kn-0_se_trinity/ --full_cleanup --no_version_check
TransDecoder.LongOrfs -t ../transcriptome_assembly/Bay0_se_trinity.Trinity.fasta --output_dir ../transcriptome_assembly/Bay0_se_transdecoder
/opt/share/software/packages/diamond-v2.0.4/bin/diamond blastp -q ../transcriptome_assembly/Bay0_se_transdecoder/longest_orfs.pep -d /xxx/Databases/UniProt_2022/uniprot_sprot.fasta -k 1 -f 6 -e 1e-5 --ultra-sensitive -p 4 -o ../transcriptome_assembly/Bay0_se_transdecoder/blastp.outfmt6
TransDecoder.Predict -t ../transcriptome_assembly/Bay0_se_trinity.Trinity.fasta --retain_pfam_hits ../transcriptome_assembly/Bay0_se_transdecoder/pfam.domtblout --retain_blastp_hits ../transcriptome_assembly/Bay0_se_transdecoder/blastp.outfmt6 --output_dir ../transcriptome_assembly/Bay0_se_transdecoder
mkdir ../pan_genome/panPep_lib
cat ../AMPRILS/*protein-coding.genes.v2.5.protein.fasta ../transcriptome_assembly/*_transdecoder/*_trinity.Trinity.fasta.transdecoder.pep > ../pan_genome/panPep_lib/PEPlib.list.panPEP.fa
nohup /opt/share/software/packages/cdhit-4.6.8/bin/cd-hit -i ../pan_genome/panPep_lib/PEPlib.list.panPEP.fa -o ../pan_genome/panPep_lib/PEPlib.list.panPEP.c98.fa -c 0.98 -M 80000 -T 48
#Exonerate
cd ../genome_assembly/Db-1/07Gene_anno/05Exonerate_out/
ln -sf /xxx/ref_genome/Athaliana_447_Araport11.protein.fa .
ln -sf /xxx/genome_assembly/Db-1/04quickmerge_out000/ragoo_output_t2t_corr_gapclosing_polish/Db-1.ragoo.gapclosing.polished.fasta genome.fa
echo | awk '{for(i=1;i<=20;i++){print "exonerate -q ./Athaliana_447_Araport11.protein.fa --querychunkid "i" --querychunktotal 20 -t ./genome.fa -m protein2genome --percent 70 --minintron 10 --maxintron 60000 --showvulgar no --showalignment no --showquerygff no --showtargetgff yes --ryo \"AveragePercentIdentity: %pi\\n\" > ./Db-1.Ath.chunk-"i".gff "}}' > ./run_exonerate_ath
./run_exonerate_ath
echo | awk '{for(i=1;i<=20;i++){print "exonerate_gff_to_alignment_gff3.pl ./Db-1.Ath.chunk-"i".gff prot > ./Db-1.Ath.chunk-"i".gff3"}}' > ./run_exonerate_gff_to_alignment_gff3_ath
./run_exonerate_gff_to_alignment_gff3_ath
cat ./Db-1.Ath.chunk-*.gff3 | bedtools sort -i - > ./Db-1.Ath.exonerate.gff3
cd ../genome_assembly/Db-1/07Gene_anno/05Exonerate_out/
ln -sf /xxx/ref_genome/Alyrata_384_v2.1.protein.fa .
ln -sf /xxx/genome_assembly/Db-1/04quickmerge_out000/ragoo_output_t2t_corr_gapclosing_polish/Db-1.ragoo.gapclosing.polished.fasta genome.fa
echo | awk '{for(i=1;i<=20;i++){print "exonerate -q ./Alyrata_384_v2.1.protein.fa --querychunkid "i" --querychunktotal 20 -t ./genome.fa -m protein2genome --percent 70 --minintron 10 --maxintron 60000 --showvulgar no --showalignment no --showquerygff no --showtargetgff yes --ryo \"AveragePercentIdentity: %pi\\n\" > ./Db-1.Aly.chunk-"i".gff "}}' > ./run_exonerate_aly
./run_exonerate_aly
echo | awk '{for(i=1;i<=20;i++){print "exonerate_gff_to_alignment_gff3.pl ./Db-1.Aly.chunk-"i".gff prot > ./Db-1.Aly.chunk-"i".gff3"}}' > ./run_exonerate_gff_to_alignment_gff3_aly
./run_exonerate_gff_to_alignment_gff3_aly
cat ./Db-1.Aly.chunk-*.gff3 | bedtools sort -i - > ./Db-1.Aly.exonerate.gff3
cd ../genome_assembly/Db-1/07Gene_anno/05Exonerate_out/
ln -sf /xxx/ref_genome/Osativa_323_v7.0.protein.fa .
ln -sf /xxx/genome_assembly/Db-1/04quickmerge_out000/ragoo_output_t2t_corr_gapclosing_polish/Db-1.ragoo.gapclosing.polished.fasta genome.fa
echo | awk '{for(i=1;i<=20;i++){print "exonerate -q ./Osativa_323_v7.0.protein.fa --querychunkid "i" --querychunktotal 20 -t ./genome.fa -m protein2genome --percent 70 --minintron 10 --maxintron 60000 --showvulgar no --showalignment no --showquerygff no --showtargetgff yes --ryo \"AveragePercentIdentity: %pi\\n\" > ./Db-1.Osa.chunk-"i".gff "}}' > ./run_exonerate_osa
./run_exonerate_osa
echo | awk '{for(i=1;i<=20;i++){print "exonerate_gff_to_alignment_gff3.pl ./Db-1.Osa.chunk-"i".gff prot > ./Db-1.Osa.chunk-"i".gff3"}}' > ./run_exonerate_gff_to_alignment_gff3_osa
./run_exonerate_gff_to_alignment_gff3_osa
cat ./Db-1.Osa.chunk-*.gff3 | bedtools sort -i - > ./Db-1.Osa.exonerate.gff3
cd ../genome_assembly/Db-1/07Gene_anno/05Exonerate_out/
ln -sf /xxx/ref_genome/Slycopersicum_514_ITAG3.2.protein.fa .
ln -sf /xxx/genome_assembly/Db-1/04quickmerge_out000/ragoo_output_t2t_corr_gapclosing_polish/Db-1.ragoo.gapclosing.polished.fasta genome.fa
echo | awk '{for(i=1;i<=20;i++){print "exonerate -q ./Slycopersicum_514_ITAG3.2.protein.fa --querychunkid "i" --querychunktotal 20 -t ./genome.fa -m protein2genome --percent 70 --minintron 10 --maxintron 60000 --showvulgar no --showalignment no --showquerygff no --showtargetgff yes --ryo \"AveragePercentIdentity: %pi\\n\" > ./Db-1.Sly.chunk-"i".gff "}}' > ./run_exonerate_sly
./run_exonerate_sly
echo | awk '{for(i=1;i<=20;i++){print "exonerate_gff_to_alignment_gff3.pl ./Db-1.Sly.chunk-"i".gff prot > ./Db-1.Sly.chunk-"i".gff3"}}' > ./run_exonerate_gff_to_alignment_gff3_sly
./run_exonerate_gff_to_alignment_gff3_sly
cat ./Db-1.Sly.chunk-*.gff3 | bedtools sort -i - > ./Db-1.Sly.exonerate.gff3
cd ../genome_assembly/Db-1/07Gene_anno/05Exonerate_out/
ln -sf /xxx/pan_genome/panPep_lib/PEPlib.list.panPEP.c98.fa .
ln -sf /xxx/genome_assembly/Db-1/04quickmerge_out000/ragoo_output_t2t_corr_gapclosing_polish/Db-1.ragoo.gapclosing.polished.fasta genome.fa
echo | awk '{for(i=1;i<=20;i++){print "exonerate -q ./PEPlib.list.panPEP.c98.fa --querychunkid "i" --querychunktotal 20 -t ./genome.fa -m protein2genome --percent 70 --minintron 10 --maxintron 60000 --showvulgar no --showalignment no --showquerygff no --showtargetgff yes --ryo \"AveragePercentIdentity: %pi\\n\" > ./Db-1.panPEP.chunk-"i".gff "}}' > ./run_exonerate_panPEP
./run_exonerate_panPEP
echo | awk '{for(i=1;i<=20;i++){print "exonerate_gff_to_alignment_gff3.pl ./Db-1.panPEP.chunk-"i".gff prot > ./Db-1.panPEP.chunk-"i".gff3"}}' > ./run_exonerate_gff_to_alignment_gff3_panPEP
./run_exonerate_gff_to_alignment_gff3_panPEP
cat ./Db-1.panPEP.chunk-*.gff3 | bedtools sort -i - > ./Db-1.panPEP.exonerate.gff3
#EVidenceModeler
cd ../genome_assembly/Db-1/07Gene_anno/07EVM_out/
ln -sf /xxx/genome_assembly/Db-1/04quickmerge_out000/ragoo_output_t2t_corr_gapclosing_polish/Db-1.ragoo.gapclosing.polished.fasta genome.fa
cat ../02Augustus_out/Db-1.augustus.gff3 ../03GeneMark_out/Db-1.genemark.gff3 ../04GlimmerHMM_out/Db-1.glimmerhmm.gff3 ../06SNAP_out/Db-1.snap.gff3 > denovo.gff3
cat ../05Exonerate_out/Db-1.Ath.exonerate.gff3 | sed "s/exonerate/exonerate.Ath/g" > exonerate.v2.gff3
cat ../05Exonerate_out/Db-1.Aly.exonerate.gff3 | sed "s/exonerate/exonerate.Aly/g" >> exonerate.v2.gff3
cat ../05Exonerate_out/Db-1.Sly.exonerate.gff3 | sed "s/exonerate/exonerate.Sly/g" >> exonerate.v2.gff3
cat ../05Exonerate_out/Db-1.Osa.exonerate.gff3 | sed "s/exonerate/exonerate.Osa/g" >> exonerate.v2.gff3
cat ../05Exonerate_out/Db-1.panPEP.exonerate.gff3 | sed "s/exonerate/exonerate.panPEP/g" >> exonerate.v2.gff3
partition_EVM_inputs.pl --genome genome.fa --gene_predictions denovo.gff3 --protein_alignments exonerate.v2.gff3 --segmentSize 100000 --overlapSize 10000 --partition_listing partitions_list.out
write_EVM_commands.pl --genome genome.fa --weights /xxx/run_scripts/evidencemodeler_weights.v2.txt --gene_predictions denovo.gff3 --protein_alignments exonerate.v2.gff3 --output_file_name evm.out.v2 --partitions partitions_list.out > commands.list.v2
./commands.list.v2
recombine_EVM_partial_outputs.pl --partitions partitions_list.out --output_file_name evm.out.v2
convert_EVM_outputs_to_GFF3.pl --partitions partitions_list.out --output evm.out.v2 --genome genome.fa
find . -regex ".*evm.out.v2.gff3" | sort -V | xargs cat | grep EVM > Db-1.EVM.v2.gff
cat Db-1.EVM.v2.gff | awk 'BEGIN{OFS="\t";acc="Db1";print "##gff-version 3"}{if($1 in chrs){chrs[$1]=1}else{chrs[$1]=1;if($1~/Chr/){split($1,a,"Chr");chr_id=a[2];idx=10000}else{if(chr_id!="U"){idx=10000};chr_id="U"}};if($3=="gene"){idx+=10;gene_id="AT"acc"-"chr_id"G"idx; print $1,$2,$3,$4,$5,$6,$7,$8,"ID="gene_id;exon_id=0;cds_id=0};if($3=="mRNA"){mrna_id="AT"acc"-"chr_id"G"idx".1"; print $1,$2,$3,$4,$5,$6,$7,$8,"ID="mrna_id";Parent="gene_id};if($3=="exon"){exon_id+=1; print $1,$2,$3,$4,$5,$6,$7,$8,"ID="mrna_id".exon"exon_id";Parent="mrna_id};if($3=="CDS"){cds_id+=1; print $1,$2,$3,$4,$5,$6,$7,$8,"ID="mrna_id".cds"cds_id";Parent="mrna_id}}' > Db-1.EVM.v2.5.gff
gff3_file_to_proteins.pl Db-1.EVM.v2.5.gff genome.fa prot | sed "s/\.1.*//g" 1> Db-1.EVM.v2.5.protein.fasta 2> nohup.gff3_file_to_proteins.prot.v2.5.log
gff3_file_to_proteins.pl Db-1.EVM.v2.5.gff genome.fa CDS | sed "s/\.1.*//g" 1> Db-1.EVM.v2.5.CDS.fasta 2> nohup.gff3_file_to_proteins.cds.v2.5.log
gff3_file_to_proteins.pl Db-1.EVM.v2.5.gff genome.fa cDNA | sed "s/\.1.*//g" 1> Db-1.EVM.v2.5.cDNA.fasta 2> nohup.gff3_file_to_proteins.cdna.v2.5.log
gff3_file_to_proteins.pl Db-1.EVM.v2.5.gff genome.fa gene | sed "s/\.1.*//g" 1> Db-1.EVM.v2.5.gene.fasta 2> nohup.gff3_file_to_proteins.gene.v2.5.log
##########################################################################################################################################################################################################################################################################################################################################
##########################################################################################################################################################################################################################################################################################################################################