-
Notifications
You must be signed in to change notification settings - Fork 43
/
Copy pathTCGA_correlations.Rmd
367 lines (326 loc) · 17.2 KB
/
TCGA_correlations.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
---
title: "Genes best correlating with the selected gene"
author: "Mikhail Dozmorov"
date: "`r Sys.Date()`"
always_allow_html: yes
output:
pdf_document:
toc: no
html_document:
theme: united
toc: yes
---
```{r setup, echo=FALSE, message=FALSE, warning=FALSE}
# Set up the environment
library(knitr)
opts_chunk$set(cache.path='cache/', fig.path='img/', cache=F, tidy=T, fig.keep='high', echo=F, dpi=100, warnings=F, message=F, comment=NA, warning=F, results='as.is', fig.width = 10, fig.height = 6) #out.width=700,
library(pander)
panderOptions('table.split.table', Inf)
set.seed(1)
library(dplyr)
options(stringsAsFactors = FALSE)
```
```{r libraries, include=FALSE}
library(openxlsx)
library(MDmisc)
library(org.Hs.eg.db)
library(KEGG.db)
library(TCGA2STAT)
library(dplyr)
library(knitr)
library(sva)
# library(clusterProfiler)
# library(pathview)
# devtools::install_github("mdozmorov/enrichR")
# library(enrichR)
source("https://raw.githubusercontent.com/mdozmorov/enrichR/master/R/api_wrapper.R")
source("https://raw.githubusercontent.com/mdozmorov/RNA-seq/master/calcTPM.R")
library(enrichR)
library(annotables)
# Append gene length
grch37 <- grch37 %>% mutate(Length = abs(end - start))
# Remove non-canonical chromosome names
grch37 <- grch37[ !(grepl("_", grch37$chr) | grepl("GL", grch37$chr)), ] %>% as.data.frame()
grch37 <- grch37[, c("symbol", "description", "Length")]
# grch37 <- grch37[ complete.cases(grch37) , ]
grch37 <- grch37[ !duplicated(grch37), ]
```
```{r functions}
# A function to load TCGA data, from remote repository, or a local R object
load_data <- function(disease = cancer, data.type = data.type, type = type, data_dir = data_dir, force_reload = FALSE) {
FILE = paste0(data_dir, "/mtx_", disease, "_", data.type, "_", type, ".rda") # R object with data
if (all(file.exists(FILE), !(force_reload))) {
# If the data has been previously saved, load it
load(file = FILE)
} else {
# If no saved data exists, get it from the remote source
mtx <- getTCGA(disease = disease, data.type = data.type, type = type, clinical = TRUE)
save(file = FILE, list = c("mtx")) # Save it
}
return(mtx)
}
# A wrapper function to perform all functional enrichment analyses.
```
```{r settings}
system("mkdir -p data")
system("mkdir -p results")
# Path where the downloaded data is stored
data_dir = "/Users/mdozmorov/Documents/Data/GenomeRunner/TCGAsurvival/data" # Mac
# data_dir = "F:/Data/GenomeRunner/TCGAsurvival/data" # Windows
# Selected genes
precalculated <- FALSE
selected_genes <- c("SPHK2") # If nothing precalculated - use one of the genes
method <- "" # If correlation with the selected_gene is measured, method is empty
# If precalculated, use precalculated values
# precalculated <- TRUE
# selected_genes <- "interferon_signature"
# method <- "NMF" # Which dimensionaliry reduction results to use, from NMF, PCA, FA
# Data type
data.type = "RNASeq2" ; type = ""
# data.type = "2018_pub"; type = "mrna" # Neuroblastoma
# Expression cutoff to select a particular range of expression of the selected gene.
# To use all expression, use "0" expression cutoff and "TRUE" top_expression (Default)
expression_cutoff <- 0 # From 0 to 1, percent cutoff of expression of the selected gene
top_expression <- TRUE # Whether to take top (TRUE) of bottom (FALSE) expression
# All cancers with RNASeq2 data
# cancer = c("ACC", "BLCA", "HNSC" , "CESC", "CHOL", "COAD", "COADREAD", "DLBC", "ESCA", "GBM", "GBMLGG", "HNSC", "KICH", "KIPAN", "KIRC", "KIRP", "LGG", "LIHC", "LUAD", "LUSC", "MESO", "OV", "PAAD", "PCPG", "PRAD", "READ", "SARC", "SKCM", "STAD", "TGCT", "THCA", "THYM", "UCEC", "UCS")
# fileNameIn <- (paste0("data/All_expression_", data.type, "_", type, ".Rda")) # Save expression data
# fileNameOut <- paste0("results/All_correlation_", selected_genes, "_", data.type, "_", type, ".Rda") # Save correlation data
# fileNameRes <- paste0("results/All_results_", selected_genes, "_", data.type, "_", type, ".xlsx") # Save results
# Or, several cancers
cancer = c("BRCA")
# cancer = "nbl_target" # Neuroblastoma
# Correlation type
corr_type <- "pearson"
# Correlation cutoffs
corr_cutoff <- 0.2
p_val_cutoff <- 0.05 # Regular p-value cutoff
p_adj_cutoff <- 0.3 # FDR cutoff
min_kegg_genes <- 20 # Minimum number of genes to run enrichment analysis on
max_kegg_genes <- 2000 # Maximum number of genes to run enrichment analysis on
up_dn_separate <- FALSE # Whether to run KEGG separately on up- and downregulated genes. FALSE - do not distinguish directionality
ntable <- 15 # Number of genes to output in a DEG table
# Save results
fileNameIn <- (paste0("data/Expression_", paste(cancer, collapse = "_"), ".Rda")) # Save expression data
fileNameOut <- paste0("data/Correlation_", selected_genes, "_", paste(cancer, collapse = "_"), ".Rda") # Save correlation data
fileNameRes <- paste0("results/Results_", selected_genes, "_", paste(cancer, collapse = "_"), ".xlsx")
```
```{r loadExpressionData}
if (!file.exists(fileNameIn)) {
all_exprs <- list() # List to store cancer-specific expression matrixes
Group <- c() # Vector to keep cancer assignment
# Get correlation matrixes for the gene of interest in each cancer
for (cancer_type in cancer) {
# print(paste0("Processing cancer ", cancer_type))
# Prepare expression data
mtx <- load_data(disease = cancer_type, data.type = data.type, type = type, data_dir = data_dir, force_reload = FALSE)
expr <- mtx$merged.dat[ , 4:ncol(mtx$merged.dat)] %>% as.matrix
# Filter by percentile of expression of the selected gene
selected_expression <- expr[, selected_genes]
index_to_keep <- if (top_expression) {selected_expression >= quantile(selected_expression, p = expression_cutoff)} else {selected_expression <= quantile(selected_expression, p = expression_cutoff)}
expr <- expr[index_to_keep, ] # Subset expression
# Save group assignment
Group <- c(Group, rep(cancer_type, nrow(expr)))
# Add gene name
expr <- data.frame(hgnc = colnames(expr), t(expr))
# Save to list
all_exprs[length(all_exprs) + 1] <- list(expr)
}
all_expression <- Reduce(function(...) inner_join(..., by = "hgnc"), all_exprs) # Combine all expression matrixes
rownames(all_expression) <- all_expression$hgnc
all_expression$hgnc <- NULL # Remove hgnc column
all_expression <- as.matrix(all_expression)
# Convert to TPM
common_genes <- intersect(rownames(all_expression), grch37$symbol) %>% unique # Common gene names
all_expression <- all_expression[rownames(all_expression) %in% common_genes, ] # Subset expression matrix
feature_length <- data.frame(Geneid = grch37$symbol[grch37$symbol %in% common_genes],
Length = grch37$Length[grch37$symbol %in% common_genes]) # Make subsetted feature length matrix
feature_length <- aggregate(feature_length$Length, list(feature_length$Geneid), max) # Get the maximum length for duplicate gene symbols
colnames(feature_length) <- c("Geneid", "Length")
feature_length$Geneid <- as.character(feature_length$Geneid)
feature_length <- feature_length[match(rownames(all_expression), feature_length$Geneid), ] # Match row order
all.equal(rownames(all_expression), feature_length$Geneid) # Must be TRUE
all_TPM <- calcTPM(all_expression, feature_length) # Actual TPM calculation
all_TPM <- as.matrix(all_TPM)
# Remove low-expressed genes
ff <- genefilter::pOverA(p = 0.5, A = 0, na.rm = TRUE) # Should be more than 90% of non-zero values
all_TPM <- all_TPM[apply(all_TPM, 1, ff), ]
# boxplot(all_expression[1:1000, 2:100])
# Batch removal, if necessary
if(length(all_exprs) > 1) {
modcombat <- model.matrix(~1, data = as.data.frame(Group))
combat_edata = ComBat(dat=all_TPM, batch=as.factor(Group), mod=modcombat, par.prior=TRUE, prior.plots = FALSE)
combat_edata[combat_edata < 0 ] <- 0 # Set negative values to zeros
all_TPM <- combat_edata
}
# Log-transform
all_TPM <- log2(all_TPM + 1)
# all_expression <- limma::normalizeQuantiles(all_expression)
# sd_cutoff <- quantile(apply(all_expression, 1, sd), 0.10)
# all_expression <- all_expression[ apply(all_expression, 1, sd) > sd_cutoff, ]
# save(all_expression, file = (paste0("data/all_expression_", data.type, "_", type, ".Rda"))) # All cancers
save(all_expression, file = fileNameIn) # Select cancers
} else {
load(file = fileNameIn)
}
```
```{r correlations}
if (!file.exists(fileNameOut)) {
all_corrs <- vector(mode = "numeric", length = nrow(all_expression))
all_pvals <- vector(mode = "numeric", length = nrow(all_expression))
if (precalculated) {
load(paste0("data/", cancer, "_", selected_genes, "_", method, ".Rda"))
}
for (i in 1:nrow(all_expression)) {
# Depending on the existence of precalculated value, calculate the correlation
cors <- Hmisc::rcorr(if(precalculated) {mtx_reduced[, 1]} else {all_expression[ rownames(all_expression) == selected_genes, ]},
all_expression[ i, ], type = corr_type)
all_corrs[i] <- cors[[1]][1, 2]
all_pvals[i] <- cors[[3]][1, 2]
}
# all_corrs <- apply(all_expression, 1, function(x) Hmisc::rcorr(all_expression[ rownames(all_expression) == selected_genes], x)[[1]][1, 2])
# all_pvals <- apply(all_expression, 1, function(x) Hmisc::rcorr(all_expression[ rownames(all_expression) == selected_genes], x)[[3]][1, 2])
correlations <- data.frame(hgnc = rownames(all_expression), corr = all_corrs, pval = all_pvals)
correlations <- right_join(grch37, correlations, by = c("symbol" = "hgnc"))
# correlations <- correlations[ !(is.na(correlations$description) | correlations$description == ""), ]
# Remove genes for which correlation cannot be calculated
correlations <- correlations[complete.cases(correlations), ]
# Sort in decreasing order
correlations <- correlations[ order(correlations$corr, decreasing = TRUE), ]
# Save correlation results
save(correlations, file = fileNameOut)
} else {
load(file = fileNameOut)
}
```
# Correlation analysis
```{r}
# Save correlation results
# Create (or, load) Excel file
unlink(fileNameRes)
wb <- openxlsx::createWorkbook(fileNameRes) # loadWorkbook(fileNameRes) #
save_res(correlations, fileName = fileNameRes, wb = wb, sheetName = "CORR")
```
```{r}
# Select max_kegg_genes for up and dn genes
correlations.up <- correlations[ correlations$pval < p_val_cutoff & correlations$corr > corr_cutoff, ]
if (nrow(correlations.up) > max_kegg_genes) {
correlations.up <- correlations.up[1:max_kegg_genes, ]
}
correlations.dn <- correlations[ correlations$pval < p_val_cutoff & correlations$corr < -corr_cutoff, ]
if (nrow(correlations.dn) > max_kegg_genes) {
correlations.dn <- correlations.dn[(nrow(correlations.dn) - max_kegg_genes):nrow(correlations.dn), ]
}
# Select only significantly correlated genes, positive and negative separately
up.genes <- sort(unique(correlations.up$symbol))
dn.genes <- sort(unique(correlations.dn$symbol))
```
Top `r ntable` genes positively correlated with `r selected_genes`
```{r}
correlations.up$corr <- signif(correlations.up$corr)
correlations.up$pval <- signif(correlations.up$pval)
kable(correlations.up[1:min(nrow(correlations.up), ntable), ])
```
Top `r ntable` genes negatively correlated with `r selected_genes`
```{r}
correlations.dn$corr <- signif(correlations.dn$corr)
correlations.dn$pval <- signif(correlations.dn$pval)
kable(correlations.dn[nrow(correlations.dn):min((nrow(correlations.dn) - ntable), nrow(correlations.dn)), ])
```
Genes positively (n = `r length(up.genes)`) and negatively (n = `r length(dn.genes)`) correlating with the selected gene `r selected_genes` at p < `r p_val_cutoff` cutoff and `r corr_type` correlation coefficient cutoff: >`r corr_cutoff`. Legend:
- `symbol`, `description` - gene symbols/description
- `cor`, `pval - Pearson correlation coefficient, and p-value of correlation significance
Full correlation results are saved in `r fileNameRes` file.
# Functional enrichment analysis
## KEGG canonical pathway enrichment analysis
- Genes positively and negatively correlated with the `r selected_genes` are tested for pathway enrichment separately.
- Each table has enrichment results for both positively/negatively correlated genes. The "direction" column indicate which pathways are enriched in "UP"- or "DN"-regulated genes for positively/negatively correlated genes, respectively.
- Use the "Search" box for each table, to filter the results for "UP" or "DN" only. Search is global within the table, case insensitive.
- FDR cutoff of the significant enrichments - `r p_adj_cutoff`.
**Legend:** "database" - source of functional annotations, "category" - name of functional annotation, "pval" - unadjusted enrichment p-value, "qval" - FDR-adjusted p-value, "genes" - comma-separated differentially expressed genes enriched in a corresponding functional category, "direction" - UP/DN, an indicator whether genes are up- or downregulated.
```{r}
# Run KEGG
if (up_dn_separate) {
# Analyze up- and downregulated genes separately
print(paste0("KEGG pathway run on ", length(up.genes), " upregulated and ", length(dn.genes), " downregulated genes."))
# res.kegg <- save_enrichr(up.genes = up.genes, dn.genes = dn.genes, databases = "KEGG_2019_Human", fdr.cutoff = p_adj_cutoff, fileName = fileNameRes, wb = wb)
res.kegg <- NULL # Initially, empty value
res.kegg.up <- enrichr(up.genes, databases = "KEGG_2019_Human")
res.kegg.dn <- enrichr(dn.genes, databases = "KEGG_2019_Human")
# If significant results are present, save them
if(nrow(res.kegg.up[["KEGG_2019_Human"]]) > 0 & sum(res.kegg.up[["KEGG_2019_Human"]]$Adjusted.P.value < p_adj_cutoff) > 0) {
res.kegg.up <- as.data.frame(res.kegg.up[["KEGG_2019_Human"]])
res.kegg.up <- res.kegg.up[res.kegg.up$Adjusted.P.value < p_adj_cutoff, , drop = FALSE]
res.kegg.up <- res.kegg.up %>% mutate(Direction = "UP")
res.kegg <- rbind(res.kegg, res.kegg.up)
}
if(nrow(res.kegg.dn[["KEGG_2019_Human"]]) > 0 & sum(res.kegg.dn[["KEGG_2019_Human"]]$Adjusted.P.value < p_adj_cutoff) > 0) {
res.kegg.dn <- as.data.frame(res.kegg.dn[["KEGG_2019_Human"]])
res.kegg.dn <- res.kegg.dn[res.kegg.dn$Adjusted.P.value < p_adj_cutoff, , drop = FALSE]
res.kegg.dn <- res.kegg.dn %>% mutate(Direction = "DN")
res.kegg <- rbind(res.kegg, res.kegg.dn)
}
} else {
# Analyze up- and downregulated genes together
print(paste0("KEGG pathway run on ", length(unique(c(up.genes, dn.genes))), " genes without distinguishing them by directionality."))
# res.kegg <- save_enrichr(up.genes = unique(c(up.genes, dn.genes)), databases = "KEGG_2019_Human", fdr.cutoff = p_adj_cutoff, fileName = fileNameRes, wb = wb)
res.kegg <- enrichr(unique(c(up.genes, dn.genes)), databases = "KEGG_2019_Human") # KEGG results only
# If significant results are present, save them
if(nrow(res.kegg[["KEGG_2019_Human"]]) > 0 & sum(res.kegg[["KEGG_2019_Human"]]$Adjusted.P.value < p_adj_cutoff) > 0) {
res.kegg <- as.data.frame(res.kegg[["KEGG_2019_Human"]])
res.kegg <- res.kegg[res.kegg$Adjusted.P.value < p_adj_cutoff, , drop = FALSE]
}
}
# Finally, if something is significant, save that
if (class(res.kegg) == "data.frame") {
res.kegg <- res.kegg[, !grepl("Old", colnames(res.kegg))] # Remove columns having "Old" prefix
save_res(res.kegg[res.kegg$Adjusted.P.value < p_adj_cutoff, , drop = FALSE], fileName = fileNameRes, wb = wb, sheetName = "KEGG")
}
# Display the results
# DT::datatable(res.kegg)
if (nrow(res.kegg) > 0 ) {
kable(res.kegg[1:min(ntable, nrow(res.kegg)), ])
}
```
```{r eval = FALSE}
# For the genes best correlating with the selected gene `r selected_genes` across all cancers. Legend:
#
# - `ID` - unique identifier of functional category
# - `Pvalue` - non-adjusted p-value
# - `OddsRatio` - enrichment odds ratio
# - `ExpCount` - number of genes expected to be selected in a category
# - `Count` - number of genes observed in the current list
# - `Size` - total number of genes in a category
# - `Term` - category description
# - `p.adj` - false discovery rate
# - `SYMBOL`, `ENTREZ` - genes observed in the current list as annotated with a category
res <- gene_enrichment(selected = correlations$symbol, id="symbol", use="KEGG")
res$Pvalue <- signif(res$Pvalue)
res$OddsRatio <- signif(res$OddsRatio)
res$ExpCount <- signif(res$ExpCount)
DT::datatable(res)
```
```{r eval = FALSE}
eg = bitr(correlations$symbol, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
correlations <- left_join(correlations, eg, by = c("symbol" = "SYMBOL"))
geneList <- correlations$corr
names(geneList) <- correlations$ENTREZID
geneList <- geneList[ order(geneList, decreasing = TRUE) ]
kk2 <- gseKEGG(geneList = geneList,
organism = 'hsa',
nPerm = 1000,
minGSSize = 10,
pvalueCutoff = 1,
verbose = TRUE)
head(summary(kk2))
```
```{r eval = FALSE}
degs <- read.xlsx(fileNameRes, cols = c(1, 3), sheet = "CORR") # Read in two columns, gene symbol and fold change
degs.genes <- degs$corr # A vector of numeric log fold changes
names(degs.genes) <- degs$symbol # Give this vector names
# Adjust as needed
pv.out <- pathview(gene.data = degs.genes, pathway.id = "hsa05217", species = "hsa", gene.idtype = "SYMBOL", gene.annotpkg = "org.Hs.eg.db", out.suffix = paste(selected_genes, collapse = "-"))
```
```{r echo=FALSE, out.height='300px', eval=FALSE}
knitr::include_graphics('hsa05217.SLC40A1.png')
```