forked from davekk/Scripts
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdiff_expre.bash
260 lines (210 loc) · 10.2 KB
/
diff_expre.bash
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
# trim reads
#!/bin/bash
#specify paths to your working directory and your reads directory
wok_dir='/home/dkiambi/New_diff_exx' # the working directory with relevant files and folders
read_dir='/home/agisel/Course/' # folder with reads
cd $wok_dir
#create an overall output directory
out_dir=$wok_dir/trim # location of output directory, created before
#make separate output directories within the overall output directories; -p indicates to create the whole path
mkdir -p $out_dir/run_logs
mkdir -p $out_dir/slurm_scripts
mkdir -p $out_dir/trimming_output
#the reads are in pairs in one folder, iterate over each pair
for READ in A1 A2 A3 E1 E2 E3 #pattern of files
do
#specify the input reads and output names
R1=$READ'-R1.fastq.gz' # expanded to E2-R1.fastq.gz
R2=$READ'-R2.fastq.gz'
R1_trimmed=$READ'_1_trimmed.fastq.gz'
R1_unpaired=$READ'_1_unpaired.fastq.gz'
R2_trimmed=$READ'_2_trimmed.fastq.gz'
R2_unpaired=$READ'_2_unpaired.fastq.gz'
#generate a unique slurm script for this pair
echo \
"#!/bin/bash -e
#SBATCH -p batch
#SBATCH -n 4
#SBATCH [email protected]
#SBATCH -o ./trim.%N.%j.out
#SBATCH -e ./trim.%N.%j.err
#SBATCH --mail-type=END,FAIL
#automatically loads 0.38
module load trimmomatic
#PE for paired end
trimmomatic PE -threads 4 $read_dir/$R1 $read_dir/$R2 $out_dir/trimming_output/$R1_trimmed \
$out_dir/trimming_output/$R1_unpaired $out_dir/trimming_output/$R2_trimmed $out_dir/trimming_output/$R2_unpaired
ILLUMINACLIP:/home/emurungi/TruSeq3-PE-2.fa:2:30:10 HEADCROP:10 TRAILING:3 SLIDINGWINDOW:4:20 MINLEN:60 AVGQUAL:20 \
" > $out_dir/slurm_scripts/$READ'_trimmomatic_slurm.sh'
sbatch $out_dir/slurm_scripts/$READ'_trimmomatic_slurm.sh'
done
##
# copy the reference files
cp /home/agisel/Course/Mesculenta_* ./HISAT2_ref/
# index the refernce with hisat
hisat2-build -p 10 HISAT2_ref/Mesculenta_Chr12.fasta HISAT2_ref/Mesculenta_Chr12_indx
module load cufflinks/2.2.1
gffread HISAT2_ref/Mesculenta_305_v6.1.gene_exons.gff3 -T -o HISAT2_ref/Mesculenta_305_v6.1.gene_exons.gtf
#extract splice sites
hisat2_extract_splice_sites.py HISAT2_ref/Mesculenta_305_v6.1.gene_exons.gtf > HISAT2_ref/Mesculenta_splices2.txt
# run hisat map
#!/bin/bash
#specify paths to your working directory and your reads directory
wok_dir='/home/dkiambi/New_diff_exx'
read_dir='/home/dkiambi/New_diff_exx/trim/trimming_output'
cd $wok_dir
#create an overall output directory
out_dir=$wok_dir/HISAT_out
#make separate output directories within the overall output directories; -p indicates to create the whole path
mkdir -p $out_dir/run_logs
mkdir -p $out_dir/HISAT_scripts
#the reads are in pairs in folders, iterate over each pair
for READ in A1 A2 A3 E1 E2 E3
do
#specify the input reads and output names
R1=$READ'_1_trimmed.fastq.gz'
R2=$READ'_2_trimmed.fastq.gz'
R1_map=$READ'.bam'
#generate a unique slurm script for this pair
echo \
"#!/bin/bash -e
#SBATCH -p batch
#SBATCH -n 4
#SBATCH -w mammoth
#SBATCH [email protected]
#SBATCH -o ./map.%N.%j.out
#SBATCH -e ./map.%N.%j.err
#SBATCH --mail-type=END,FAIL
#automatically loads hisat2 and samtools
module load hisat2/2.0.5 samtools/1.8
#create bam
hisat2 -x HISAT2_ref/Mesculenta_Chr12_indx -p 10 --dta --known-splicesite-infile HISAT2_ref/Mesculenta_splices2.txt \
-1 $read_dir/$R1 -2 $read_dir/$R2 | samtools view -bS - -@ 10 > $out_dir/$R1_map " > $out_dir/HISAT_scripts/$READ'_hisat_slurm.sh'
sbatch $out_dir/HISAT_scripts/$READ'_hisat_slurm.sh'
done
# sort with samtools
for c in *.bam; do echo "processing ${c}"; samtools sort -@ 10 ${c} -o 'sorted_'${c};done
# stringtie
for v in sorted_*; do echo "processing ${v}"; module load stringtie;stringtie ${v} -p 12 -G ../HISAT2_ref/Mesculenta_305_v6.1.gene_exons.gff3 -l strg -o ../Stingtie/${v}'_string.gtf';done
# create list of gtf file created by stringtie
for f in Stingtie/*; do echo ${f} > list_gtf.txt ;done
# merge the files
stringtie --merge -p 12 -G HISAT2_ref/Mesculenta_305_v6.1.gene_exons.gff3 -l mrg -o Stingtie/merge_stringtie_gtf list_gtf.txt
# re-estimate transcript abundances and create tables of counts for Ballgown
for x in HISAT_out/sorted_* ; do echo "processing ${x}";stringtie -e -B -G Stingtie/merge_stringtie_gtf -o \
ballgown_str_mrg/'1_'${x:17:-4}/'1_'${x:17:-4}'_vs_mrg_gtf.gtf' -p 10 ${x};done
## take care to string_subset based on your filing system #${x:17:-4}# count the cnumber of charaters to statrt position
# clean the substring with a basic
for x in HISAT_out/sorted_* ; do echo "processing ${x:17:-4}"; done
# transfer the cleaned substring to th ecommand only when sure it is whta you want
# working with full genome
wget #relevant fa.gz and assocated gff3 files
gunzip #ref #gff3
# index the ref
module load hisat2
hisat2-build -p 10 Manihot_esculenta.Manihot_esculenta_v6.dna.toplevel.fa Manihot_esculenta_index
# create intermediate gtf file
module load cufflinks/2.2.1
gffread Manihot_esculenta.Manihot_esculenta_v6.39.gff3 -T -o Manihot_esculenta.Manihot_esculenta_v6.39.gtf
#extract splice sites
hisat2_extract_splice_sites.py Manihot_esculenta.Manihot_esculenta_v6.39.gtf > Manihot_esculenta_splice.txt
#!/bin/bash
#specify paths to your working directory and your reads directory
wok_dir='/home/dkiambi/New_diff_exx'
read_dir='/home/dkiambi/New_diff_exx/trim/trimming_output'
cd $wok_dir
#create an overall output directory
out_dir=$wok_dir/hisat_full_genome
#make separate output directories within the overall output directories; -p indicates to create the whole path
mkdir -p $out_dir/run_logs
mkdir -p $out_dir/HISAT_scripts
#the reads are in pairs in folders, iterate over each pair
for READ in A1 A2 A3 E1 E2 E3
do
#specify the input reads and output names
R1=$READ'_1_trimmed.fastq.gz'
R2=$READ'_2_trimmed.fastq.gz'
R1_map=$READ'.bam'
#generate a unique slurm script for this pair
echo \
"#!/bin/bash -e
#SBATCH -p batch
#SBATCH -n 4
#SBATCH [email protected]
#SBATCH -o ./map.%N.%j.out
#SBATCH -e ./map.%N.%j.err
#SBATCH --mail-type=END,FAIL
#automatically loads hisat2 and samtools
module load hisat2/2.0.5 samtools/1.8
#create bam
hisat2 -x Cassava_geno/Manihot_esculenta_index -p 10 --dta --known-splicesite-infile Cassava_geno/Manihot_esculenta_splice.txt \
-1 $read_dir/$R1 -2 $read_dir/$R2 | samtools view -bS - -@ 10 > $out_dir/$R1_map " > $out_dir/HISAT_scripts/$READ'_hisat_slurm.sh'
sbatch $out_dir/HISAT_scripts/$READ'_hisat_slurm.sh'
done
#
for c in *.bam; do echo "processing ${c}"; samtools sort -@ 10 ${c} -o 'sorted_'${c};done
#
for v in sorted_*; do echo "processing ${v}"; module load stringtie;stringtie ${v} -p 12 -G \
../Cassava_geno/Manihot_esculenta.Manihot_esculenta_v6.39.gff3 -l strg -o ../Stingtie/${v}'_string.gtf';done
# create list of gtf file created by stringtie
for f in STRINGTIE_full_genome/*gtf; do echo ${f} >> list_gtf.txt ;done
# merge the files
stringtie --merge -p 12 -G Cassava_geno/Manihot_esculenta.Manihot_esculenta_v6.39.gff3 -l mrg -o STRINGTIE_full_genome/merge_stringtie_gtf list_gtf_2.txt
# re-estimate transcript abundances and create tables of counts for Ballgown
for x in hisat_full_genome/sorted_* ; do echo "processing ${x}";stringtie -e -B -G \
STRINGTIE_full_genome/merge_stringtie_gtf -o Ballgown_str_full_genome/'1_'${x:25:-4}/'1_'${x:25:-4}'_vs_mrg_gtf.gtf' -p 10 ${x};done
##
#star
### thursday arabidopsis
module load star samtools
STAR --runThreadN 10 --runMode genomeGenerate --genomeDir ./ --genomeFastaFiles Mesculenta_Chr12.fasta \
--sjdbGTFfile Mesculenta_305_v6.1.gene_exons.gtf --sjdbOverhang 140
#star alligner
STAR --runThreadN 4 --runMode alignReads --genomeDir ./HISAT2_ref/ --readFilesCommand zcat --readFilesIn {$At_accn}_R1_trimmomatic.fq.gz --outSAMtype BAM SortedByCoordinate --outSAMattrRGline ID:$At_accn SM:$At_accn --outFileNamePrefix ${At_accn}
# loop
for fil in trim/trimming_output/*trimmed*.gz; do echo "processing ${fil}"; STAR --runThreadN 16 --runMode alignReads --genomeDir ./HISAT2_ref/ --readFilesCommand zcat --readFilesIn ${fil} --outSAMtype BAM SortedByCoordinate --outSAMattrRGline ID:${fil:21:2} SM:${fil:21:2} --outFileNamePrefix ${fil:21:2};done
#####
#!/bin/bash
#specify paths to your working directory and your reads directory
wok_dir='/home/dkiambi/New_diff_exx'
read_dir='/home/dkiambi/New_diff_exx/trim/trimming_output'
cd $wok_dir
#create an overall output directory
out_dir=$wok_dir/star_out
#make separate output directories within the overall output directories; -p indicates to create the whole path
mkdir -p $out_dir/run_logs
mkdir -p $out_dir/star_scripts
#the reads are in pairs in folders, iterate over each pair
for READ in A1 A2 A3 E1 E2 E3
do
#specify the input reads and output names
R1=$READ'_1_trimmed.fastq.gz'
R2=$READ'_2_trimmed.fastq.gz'
R1_map=$READ'.bam'
#generate a unique slurm script for this pair
echo \
"#!/bin/bash -e
#SBATCH -p batch
#SBATCH -n 4
#SBATCH [email protected]
#SBATCH -o ./run_logs/star.%N.%j.out
#SBATCH -e ./run_logs/star.%N.%j.err
#SBATCH --mail-type=END,FAIL
#automatically loads star and samtools
module load module load star samtools
#create bam
STAR --runThreadN 8 --runMode alignReads --genomeDir ./HISAT2_ref/ --readFilesCommand zcat --readFilesIn $read_dir/$R1 $read_dir/$R2 --outSAMtype BAM SortedByCoordinate --outSAMattrRGline ID:${READ} SM:${READ} --outFileNamePrefix ${READ} " > $out_dir/star_scripts/$READ'_star_slurm.sh'
sbatch $out_dir/star_scripts/$READ'_star_slurm.sh'
done
####
#samtools
samtools sort -n -o *_st_namesort.bam *.bam
for file in star_out/*.bam ; do echo "processing ${file}"; module load samtools; samtools sort -@ 10 -n -o ${file:9:5}'_st_namesort.bam' ${file}; done
samtools fixmate -m *_st_namesort.bam *_st_fixmate.bam
for file in *namesort.bam; do echo "processing ${file}"; samtools fixmate -@ 10 -m ${file} ${file:0:9}'fixmate.bam';done
samtools sort -o *_st_positionsort.bam *_st_fixmate.bam
for file in *fixmate.bam ; do echo "processing ${file}"; samtools sort -@ 10 -o ${file:0:9}'positionsort.bam' ${file}; done
samtools markdup -s -r *_st_positionsort.bam *_st_rmdup.bam
for fil in *positionsort.bam; do echo "processing ${fil}"; samtools markdup -s -r ${fil} ${fil:0:9}'rmdup.bam' -@ 10; done
samtools merge -r merged.bam *_rmdup.bam -@ 10
samtools index -@ 10 Col_0_Ler_0_merged.bam