-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy path02genome_annotation
78 lines (60 loc) · 9.79 KB
/
02genome_annotation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
##########################################################################################################################################################################################################################################################################################################################################
##########################################################################################################################################################################################################################################################################################################################################
######
#Annoate mitochondria and chloroplast sequence for Oxford Nanopore accessions, use Are-1 as an example
#polished flye assembly
cat ../genome_assembly/AtAre1/02Flye_out00/racon_out/nextpolish_out/genome.nextpolish.fa | awk '{print $1}' > ../genome_assembly/AtAre1/02Flye_out00/racon_out/nextpolish_out/Mt_Ct_LASTZ_out/Are-1.flye_ass_polished.fa
grep ">" ../genome_assembly/AtAre1/02Flye_out00/racon_out/nextpolish_out/genome.nextpolish.fa | sed -e "s/>//g" -e "s/ /\t/g" > ../genome_assembly/AtAre1/02Flye_out00/racon_out/nextpolish_out/Mt_Ct_LASTZ_out/Are-1.flye_ass_polished.len
cut -f1 ../genome_assembly/AtAre1/02Flye_out00/racon_out/nextpolish_out/Mt_Ct_LASTZ_out/Are-1.flye_ass_polished.len > ../genome_assembly/AtAre1/02Flye_out00/racon_out/nextpolish_out/Mt_Ct_LASTZ_out/Are-1.flye_ass_polished.contig_list
cd ../genome_assembly/AtAre1/02Flye_out00/racon_out/nextpolish_out/Mt_Ct_LASTZ_out
fasta_separator ./Are-1.flye_ass_polished.fa
while read ctg; do echo "lastz /xxx/ref_genome/t2t-col.20210610.ChrM.fasta chr${ctg}.fa --identity=80 --coverage=90 --format=general:score,name1,strand1,size1,start1,end1,name2,strand2,identity,length1,align1 | sort -k5,5n > ./Are-1.flye_ass_polished.${ctg}.ChrM.lastz.out.txt"; echo "lastz /xxx/ref_genome/t2t-col.20210610.ChrC.fasta chr${ctg}.fa --identity=80 --coverage=90 --format=general:score,name1,strand1,size1,start1,end1,name2,strand2,identity,length1,align1 | sort -k5,5n > ./Are-1.flye_ass_polished.${ctg}.ChrC.lastz.out.txt"; done < ./Are-1.flye_ass_polished.contig_list > run_lastz_mt_ct
./run_lastz_mt_ct
#quickmerge assembly
grep ">" ../genome_assembly/AtAre1/04quickmerge_out00/Are-1.smartdenovo_flye_merge.purged.fa | sed -e "s/>//g" -e "s/ /\t/g" > ../genome_assembly/AtAre1/04quickmerge_out00/Mt_Ct_LASTZ_out/Are-1.smartdenovo_flye_merge.purged.len
cut -f1 ../genome_assembly/AtAre1/04quickmerge_out00/Mt_Ct_LASTZ_out/Are-1.smartdenovo_flye_merge.purged.len > ../genome_assembly/AtAre1/04quickmerge_out00/Mt_Ct_LASTZ_out/Are-1.smartdenovo_flye_merge.purged.contig_list
cd ../genome_assembly/AtAre1/04quickmerge_out00/Mt_Ct_LASTZ_out
fasta_separator ../Are-1.smartdenovo_flye_merge.purged.fa
while read ctg; do echo "lastz /xxx/ref_genome/t2t-col.20210610.ChrM.fasta chr${ctg}.fa --identity=80 --coverage=90 --format=general:score,name1,strand1,size1,start1,end1,name2,strand2,identity,length1,align1 | sort -k5,5n > ./Are-1.smartdenovo_flye_merge.purged.${ctg}.ChrM.lastz.out.txt"; echo "lastz /xxx/ref_genome/t2t-col.20210610.ChrC.fasta chr${ctg}.fa --identity=80 --coverage=90 --format=general:score,name1,strand1,size1,start1,end1,name2,strand2,identity,length1,align1 | sort -k5,5n > ./Are-1.smartdenovo_flye_merge.purged.${ctg}.ChrC.lastz.out.txt"; done < ./Are-1.smartdenovo_flye_merge.purged.contig_list > run_lastz_mt_ct
./run_lastz_mt_ct
######
#Annoate mitochondria and chloroplast sequence for Pacbio HiFi accessions, use Db-1 as an example
#quickmerge assembly
grep ">" ../genome_assembly/Db-1/04quickmerge_out000/Db-1.canu_flye_hifiasm.merged.fasta | sed -e "s/>//g" -e "s/ /\t/g" > ../genome_assembly/Db-1/04quickmerge_out000/Mt_Ct_LASTZ_out/Db-1.canu_flye_hifiasm.merged.contig_list
cd ../genome_assembly/Db-1/04quickmerge_out000/Mt_Ct_LASTZ_out
ln -s ../Db-1.canu_flye_hifiasm.merged.fasta Db-1.canu_flye_hifiasm.merged.fasta
fasta_separator ./Db-1.canu_flye_hifiasm.merged.fasta
while read ctg; do echo "lastz chr${ctg}.fa /xxx/ref_genome/t2t-col.20210610.ChrM.fasta --identity=80 --coverage=5 --format=general:score,name1,strand1,size1,start1,end1,name2,strand2,identity,length1,align1 | sort -k5,5n > ./Db-1.canu_flye_hifiasm.${ctg}.ChrM.lastz.out.txt"; echo "lastz chr${ctg}.fa /xxx/ref_genome/t2t-col.20210610.ChrC.fasta --identity=80 --coverage=5 --format=general:score,name1,strand1,size1,start1,end1,name2,strand2,identity,length1,align1 | sort -k5,5n > ./Db-1.canu_flye_hifiasm.${ctg}.ChrC.lastz.out.txt"; done < ./Db-1.canu_flye_hifiasm.merged.contig_list > run_lastz_mt_ct
./run_lastz_mt_ct
#scaffold assembly
grep ">" ../genome_assembly/Db-1/04quickmerge_out000/ragoo_output_t2t_corr/Db-1.ragoo.fasta | sed -e "s/>//g" -e "s/ /\t/g" > ../genome_assembly/Db-1/04quickmerge_out000/ragoo_output_t2t_corr/Mt_Ct_LASTZ_out/Db-1.ragoo.contig_list
cd ../genome_assembly/Db-1/04quickmerge_out000/ragoo_output_t2t_corr/Mt_Ct_LASTZ_out
ln -s ../Db-1.ragoo.fasta Db-1.ragoo.fasta
fasta_separator ./Db-1.ragoo.fasta
while read ctg; do echo "lastz chr${ctg}.fa /xxx/ref_genome/t2t-col.20210610.ChrM.fasta --identity=80 --coverage=5 --format=general:score,name1,strand1,size1,start1,end1,name2,strand2,identity,length1,align1 | sort -k5,5n > ./Db-1.ragoo.${ctg}.ChrM.lastz.out.txt"; echo "lastz chr${ctg}.fa /xxx/ref_genome/t2t-col.20210610.ChrC.fasta --identity=80 --coverage=5 --format=general:score,name1,strand1,size1,start1,end1,name2,strand2,identity,length1,align1 | sort -k5,5n > ./Db-1.ragoo.${ctg}.ChrC.lastz.out.txt"; done < ./Db-1.ragoo.contig_list > run_lastz_mt_ct
./run_lastz_mt_ct
#scaffold-corrected-gapclosed-polished assembly
grep ">" ../genome_assembly/Db-1/04quickmerge_out000/ragoo_output_t2t_corr_gapclosing_polish/Db-1.ragoo.gapclosing.polished.fasta | sed -e "s/>//g" -e "s/ /\t/g" > ../genome_assembly/Db-1/04quickmerge_out000/ragoo_output_t2t_corr_gapclosing_polish/Mt_Ct_LASTZ_out/Db-1.ragoo.gapclosing.polished.contig_list
cd ../genome_assembly/Db-1/04quickmerge_out000/ragoo_output_t2t_corr_gapclosing_polish/Mt_Ct_LASTZ_out
ln -sf ../Db-1.ragoo.gapclosing.polished.fasta Db-1.ragoo.gapclosing.polished.fasta
fasta_separator ./Db-1.ragoo.gapclosing.polished.fasta
while read ctg; do echo "lastz chr${ctg}.fa /xxx/ref_genome/t2t-col.20210610.ChrM.fasta --identity=80 --coverage=5 --format=general:score,name1,strand1,size1,start1,end1,name2,strand2,identity,length1,align1 | sort -k5,5n > ./Db-1.ragoo.gapclosing.polished.${ctg}.ChrM.lastz.out.txt"; echo "lastz chr${ctg}.fa /xxx/ref_genome/t2t-col.20210610.ChrC.fasta --identity=80 --coverage=5 --format=general:score,name1,strand1,size1,start1,end1,name2,strand2,identity,length1,align1 | sort -k5,5n > ./Db-1.ragoo.gapclosing.polished.${ctg}.ChrC.lastz.out.txt"; done < ./Db-1.ragoo.gapclosing.polished.contig_list > run_lastz_mt_ct
./run_lastz_mt_ct
##########################################################################################################################################################################################################################################################################################################################################
##########################################################################################################################################################################################################################################################################################################################################
#Annoate telomeric and centromeric repeats
######
vim ../ref_genome/telomeric_repeat_motif.fa
>telo_repeat
CCCTAAA
vim ../ref_genome/centromeric_repeat_motif.fa
>centro_repeat
AGTATAAGAACTTAAACCGCAACCCGATCTTAAAAGCCTAAGTAGTGTTTCCTTGTTAGA
AGACACAAAGCCAAAGACTCATATGGACTTTGGCTACACCATGAAAGCTTTGAGAAGCAA
GAAGAAGGTTGGTTAGTGTTTTGGAGTCGAATATGACTTGATGTCATGTGTATGATTG
######
bowtie2-build --threads 4 ../genome_assembly/Db-1/04quickmerge_out000/ragoo_output_t2t_corr_gapclosing_polish/Db-1.ragoo.gapclosing.polished.fasta ../genome_assembly/Db-1/04quickmerge_out000/ragoo_output_t2t_corr_gapclosing_polish/Db-1.ragoo.gapclosing.polished.fasta
bowtie2 -f -a --very-sensitive -p 4 -x ../genome_assembly/Db-1/04quickmerge_out000/ragoo_output_t2t_corr_gapclosing_polish/Db-1.ragoo.gapclosing.polished.fasta -U ../ref_genome/telomeric_repeat_motif.fa -S ../genome_evaluation/Db-1/00Bowtie2_out/telomeric_repeat_motif.Db-1.corr.t2t.gapclosing.polished.sam && samtools view -bS ../genome_evaluation/Db-1/00Bowtie2_out/telomeric_repeat_motif.Db-1.corr.t2t.gapclosing.polished.sam | samtools sort | samtools view -h > ../genome_evaluation/Db-1/00Bowtie2_out/telomeric_repeat_motif.Db-1.corr.t2t.gapclosing.polished.srt.sam
bowtie2 -f -a --very-sensitive -p 4 -x ../genome_assembly/Db-1/04quickmerge_out000/ragoo_output_t2t_corr_gapclosing_polish/Db-1.ragoo.gapclosing.polished.fasta -U ../ref_genome/centromeric_repeat_motif.fa -S ../genome_evaluation/Db-1/00Bowtie2_out/centromeric_repeat_motif.Db-1.corr.t2t.gapclosing.polished.sam && samtools view -bS ../genome_evaluation/Db-1/00Bowtie2_out/centromeric_repeat_motif.Db-1.corr.t2t.gapclosing.polished.sam | samtools sort | samtools view -h > ../genome_evaluation/Db-1/00Bowtie2_out/centromeric_repeat_motif.Db-1.corr.t2t.gapclosing.polished.srt.sam
##########################################################################################################################################################################################################################################################################################################################################
##########################################################################################################################################################################################################################################################################################################################################