-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathFirstExon.R
60 lines (53 loc) · 2.62 KB
/
FirstExon.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
##### FirstExon.R
##Collecting all first exons from transcripts annotation
##Last command: Linux, cut -d" " -f1-6 transcripts.gtf | sed 's/ /\t/g' |sed 's/;//g' |sed 's/"//g'| cut -f1,3,4,5,7,10,12,14 >transcripts_cut8.gtf
argv <- commandArgs(TRUE)
path <- paste(argv[1],"",sep='')
#path <-"E:\\Placenta\\Data"
gtf <- as.matrix(read.table(paste(path,"/transcripts_cut8.gtf",sep=''),header = FALSE, sep = "\t",quote = ""))
iso <- as.matrix(read.table(paste(path,"/isoforms.fpkm_tracking",sep=''),header = TRUE, sep = "\t",quote = ""))
iso <- iso[,1:8]
tmp <- rep(0,nrow(gtf))
for (i in nrow(gtf):1) {
if (gtf[i,5]=="-") {
if (i==nrow(gtf) || (i<nrow(gtf) && gtf[i+1,2]=="transcript")) { tmp[i] <- 1 }
} else {
if (as.numeric(gtf[i,8])==1) { tmp[i] <- 1 }
}
}
ant <- gtf[tmp==1,] ##all first exons of each transcript
nontss <- gtf[((tmp==0)+(gtf[,2]=="exon"))==2,] ##all other exons
ant <- ant[order(ant[,7]),]
nontss <- nontss[order(nontss[,7]),]
iso <- iso[order(iso[,1]),] ##all isoforms information
#ant[,6]!=iso[,4]
ant <- cbind(iso,ant[,c(1,3,4,5,8,8,8)])
colnames(ant) <- c(colnames(ant)[1:8],"tss_chr","tss_ss","tss_es","strand","exon_num","in/out","over/non")
## annotate out=1,in=0
ant <- ant[order(ant[,4],as.numeric(ant[,10])),]
for (i in 1:nrow(ant)) {
if (ant[i,12]=="+"){
if (i==1 || (i!=1 && ant[i,4]!=ant[i-1,4])){ first <- as.numeric(ant[i,10]) }
if (as.numeric(ant[i,10])==first) { ant[i,14] <- 1
} else { ant[i,14] <- 0 }
}
}
ant <- ant[order(ant[,4],-as.numeric(ant[,11])),]
for (i in 1:nrow(ant)) {
if (ant[i,12]=="-"){
if (i==1 || (i!=1 && ant[i,4]!=ant[i-1,4])){ first <- as.numeric(ant[i,11]) }
if (as.numeric(ant[i,11])==first) { ant[i,14] <- 1
} else { ant[i,14] <- 0 }
}
}
ant <- ant[order(ant[,1]),]
colnames(nontss) <- c("chr","type","ss","es","strand","gene_id","tracking_id","exon_num")
write.table(ant, paste(path,"/ant.annotation",sep=''),sep="\t", row.names=FALSE, col.name=TRUE, quote = FALSE)
#ant is all first exons based on transcripts record,
ant <- gsub(pattern=" ", replacement="", ant)
nontss <- gsub(pattern=" ", replacement="", nontss)
write.table(ant, paste(path,"/ant.annotation",sep=''),sep="\t", row.names=FALSE, col.name=TRUE, quote = FALSE)
write.table(ant[,c(9:11,1,14,12)], paste(path,"/ant.bed",sep=''),sep="\t", row.names=FALSE, col.name=FALSE, quote = FALSE)
write.table(nontss[,c(1,3,4,7,8,5)], paste(path,"/nontss.bed",sep=''),sep="\t", row.names=FALSE, col.name=FALSE, quote = FALSE)
#ant.annotation: All the first exons abstracted from transcript annotation
#ant.bed and nontss.bed are used for detecting whether the first exons are overlapped with other non-first exons