-
Notifications
You must be signed in to change notification settings - Fork 43
/
Copy pathTCGA_stemness.Rmd
206 lines (179 loc) · 8.58 KB
/
TCGA_stemness.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
---
title: "Correlation of selected genes with stemness index"
author: "Mikhail Dozmorov"
date: "`r Sys.Date()`"
always_allow_html: yes
output:
pdf_document:
toc: no
html_document:
theme: united
toc: yes
---
```{r setup, echo=FALSE, message=FALSE, warning=FALSE}
# Set up the environment
library(knitr)
opts_chunk$set(cache.path='cache/', fig.path='img/', cache=F, tidy=T, fig.keep='high', echo=F, dpi=100, warnings=F, message=F, comment=NA, warning=F, results='as.is', fig.width = 10, fig.height = 6) #out.width=700,
library(pander)
panderOptions('table.split.table', Inf)
set.seed(1)
library(dplyr)
options(stringsAsFactors = FALSE)
```
```{r libraries, include=FALSE}
library(openxlsx)
library(MDmisc)
library(org.Hs.eg.db)
library(KEGG.db)
library(TCGA2STAT)
library(dplyr)
library(knitr)
# library(clusterProfiler)
library(pathview)
library(enrichR)
library(annotables)
# Remove non-canonical chromosome names
grch38 <- grch38[ !(grepl("_", grch38$chr) | grepl("GL", grch38$chr)), ]
grch38 <- grch38[, c("symbol", "description")]
grch38 <- grch38[ complete.cases(grch38) , ]
grch38 <- grch38[ !duplicated(grch38), ]
```
```{r functions}
# A function to load TCGA data, from remote repository, or a local R object
load_data <- function(disease = cancer, data.type = data.type, type = type, data_dir = data_dir, force_reload = FALSE) {
FILE = paste0(data_dir, "/mtx_", disease, "_", data.type, "_", type, ".rda") # R object with data
if (all(file.exists(FILE), !(force_reload))) {
# If the data has been previously saved, load it
load(file = FILE)
} else {
# If no saved data exists, get it from the remote source
mtx <- getTCGA(disease = disease, data.type = data.type, type = type, clinical = TRUE)
save(file = FILE, list = c("mtx")) # Save it
}
return(mtx)
}
# A wrapper function to perform all functional enrichment analyses.
# Helper function to save non-empty results
save_res <- function(res, fileName = fileName, wb = wb, sheetName = "KEGG") {
if (nrow(res) > 0) {
openxlsx::addWorksheet(wb = wb, sheetName = sheetName)
openxlsx::writeData(wb, res, sheet = sheetName)
openxlsx::saveWorkbook(wb, fileName, overwrite = TRUE)
}
}
# A wrapper to save the results
save_enrichr <- function(up.genes = up.genes, dn.genes = NULL, databases = "KEGG_2016", fdr.cutoff = 1, fileNameOut = NULL, wb = NULL) {
print(paste("Running", databases, "analysis", sep = " "))
if (is.null(dn.genes)) {
res.kegg <- enrichGeneList(up.genes, databases = databases, fdr.cutoff = 1)
} else {
res.kegg <- enrichFullGeneList(up.genes, dn.genes, databases = databases, fdr.cutoff = 1)
}
res.kegg$pval <- formatC(res.kegg$pval, digits = 3, format = "e")
res.kegg$qval <- formatC(res.kegg$qval, digits = 3, format = "e")
if (!is.null(fileNameOut)) {
if (nchar(databases) > 30) databases <- paste0(substr(databases, 1, 20), "_", substr(databases, nchar(databases) - 8, nchar(databases))) # If a database is longer that 30 characters, keep first 20 and last 10 characters
save_res(res.kegg, fileNameOut, wb = wb, sheetName = databases)
}
# Pause for a few seconds
pause_sec <- round(runif(1, min = 1, max = 10))
Sys.sleep(pause_sec)
return(res.kegg)
}
```
```{r settings}
system("mkdir -p data")
system("mkdir -p results")
# Path where the downloaded data is stored
data_dir = "/Users/mdozmorov/Documents/Data/GenomeRunner/TCGAsurvival/data" # Mac
# data_dir = "F:/Data/GenomeRunner/TCGAsurvival/data" # Windows
# Selected genes
selected_genes <- c("STARD5")
# Data type
data.type = "RNASeq2" ; type = ""
# All cancers with RNASeq2 data
cancer = c("ACC", "BLCA", "BRCA" , "CESC", "CHOL", "COAD", "COADREAD", "DLBC", "ESCA", "GBM", "GBMLGG", "HNSC", "KICH", "KIPAN", "KIRC", "KIRP", "LGG", "LIHC", "LUAD", "LUSC", "MESO", "OV", "PAAD", "PCPG", "PRAD", "READ", "SARC", "SKCM", "STAD", "TGCT", "THCA", "THYM", "UCEC", "UCS")
# Or, one cancer
# cancer = c("COAD")
# Correlation type
corr_type <- "pearson"
```
```{r loadExpressionData}
# Combines expression across all selected cancers into one gigantic matrix
all_exprs <- list() # List to store cancer-specific expression matrixes
# Get correlation matrixes for the gene of interest in each cancer
for (cancer_type in cancer) {
# print(paste0("Processing cancer ", cancer_type))
# Prepare expression data
mtx <- load_data(disease = cancer_type, data.type = data.type, type = type, data_dir = data_dir, force_reload = FALSE)
expr <- mtx$merged.dat[ , 4:ncol(mtx$merged.dat)] %>% as.matrix
rownames(expr) <- mtx$merged.dat[, "bcr"] # Add row TCGA names
expr <- t(log2(expr + 1))
all_exprs[length(all_exprs) + 1] <- list(expr)
}
```
```{r loadStemnessData}
mtx_RNA <- read.xlsx("data.TCGA/TCGA_stemness.xlsx", sheet = 1) # mRNA-based stemness indices
# Remove NA cancers
mtx_RNA <- mtx_RNA[complete.cases(mtx_RNA), ]
# Subset to primary solid tumors, "01A" at positions 14-16
index <- sapply(mtx_RNA$TCGAlong.id, function(x) make.names(substr(x, 14, 16)))
mtx_RNA <- mtx_RNA[index == "X01A", ]
# Make names 12-characters long
mtx_RNA$TCGAlong.id <- sapply(mtx_RNA$TCGAlong.id, function(x) make.names(substr(x, 1, 12)))
# Subset by cancer type
# mtx_RNA <- mtx_RNA[mtx_RNA$cancer.type == cancer, ]
mtx_DNA <- read.xlsx("data.TCGA/TCGA_stemness.xlsx", sheet = 2) # DNAm-based stemness indices
# Remove NA cancers
mtx_DNA <- mtx_DNA[complete.cases(mtx_DNA), ]
# Subset to primary solid tumors, "01A" at positions 14-16
index <- sapply(mtx_DNA$TCGAlong.id, function(x) make.names(substr(x, 14, 16)))
mtx_DNA <- mtx_DNA[index == "X01A", ]
# Make names 12-characters long
mtx_DNA$TCGAlong.id <- sapply(mtx_DNA$TCGAlong.id, function(x) make.names(substr(x, 1, 12)))
# Subset by cancer type
# mtx_DNA <- mtx_DNA[mtx_DNA$cancer.type == cancer, ]
# Names of stemness indices
si_names <- c(colnames(mtx_RNA)[grepl("si", colnames(mtx_RNA))],
colnames(mtx_DNA)[grepl("si", colnames(mtx_DNA))])
# Full matrix of stemness indices
si_matrix <- full_join(mtx_RNA, mtx_DNA, by = "TCGAlong.id")
```
```{r correlations}
correlations <- rbind()
for (i in 1:length(cancer)) {
# Expression of the selected gene in the selected cancer
selected_expression <- data.frame(sample.id = make.names(colnames(all_exprs[[i]])), expr = all_exprs[[i]][ rownames(all_exprs[[i]]) == selected_genes, ])
# For each stemness indes si
# Vectors to store correlation values
all_corrs <- c() # vector(mode = "numeric", length = length(si_names))
all_pvals <- c() # vector(mode = "numeric", length = length(si_names))
all_nums <- c()
for (j in 1:length(si_names)) {
si_matrix_selected <- si_matrix[, c("TCGAlong.id", si_names[j])] # Select current si
selected_expression_si <- inner_join(selected_expression, si_matrix_selected, by = c("sample.id" = "TCGAlong.id")) # Join it with expression
selected_expression_si <- selected_expression_si[complete.cases(selected_expression_si), ] # Drop NAs
cors <- Hmisc::rcorr(selected_expression_si[, "expr"], selected_expression_si[, si_names[j]], type = corr_type) # Run correlation
all_corrs[j] <- cors[[1]][1, 2] # Save the results
all_pvals[j] <- cors[[3]][1, 2]
all_nums[j] <- nrow(selected_expression_si)
}
# Organize them into data frame
correlations <- rbind(correlations,
data_frame(cancer = rep(cancer[i], length(si_names)), num.samples = all_nums, si= si_names, corr = all_corrs, pval = all_pvals))
}
# Sort from largest to smallest correlation
correlations <- correlations[order(correlations$corr, decreasing = TRUE), ]
```
# Correlation analysis
**Question:** In which cancers expression of gene `r selected_genes` significantly correlates with stemness index, what type of stemness index?
- **Legend:**
- "cancer" - cancer type. See abbreviations of cancer types at [http://www.liuzlab.org/TCGA2STAT/CancerDataChecklist.pdf](http://www.liuzlab.org/TCGA2STAT/CancerDataChecklist.pdf)
- "num.samples" - number of samples used to run correlation analysis
- "si" - stemness index. mRNAsi - gene expression-based, EREG-miRNAsi - epigenomic- and gene expression-baset, mDNAsi, EREG-mDNAsi - same but methylation-based, DMPsi - differentially methylated probes-based, ENHsi - enhancer-based. Sorted by in decreasing order
- "corr" - `r corr_type` correlation coefficient
- "pval" - significance of correlation
For details on stemness indices, see Malta, Tathiane M., Artem Sokolov, Andrew J. Gentles, Tomasz Burzykowski, Laila Poisson, John N. Weinstein, Bożena Kamińska, et al. “Machine Learning Identifies Stemness Features Associated with Oncogenic Dedifferentiation.” Cell 173, no. 2 (April 2018): 338-354.e15. https://doi.org/10.1016/j.cell.2018.03.034
```{r}
pander(correlations)
```