-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcalc_gene_tn.r
80 lines (67 loc) · 2.85 KB
/
calc_gene_tn.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# This is fast, and can be done without parallelization (but should)
# original results were from Xinpei and did not have statistics
# rerun for all cohorts after update of LUAD data
library(jsonlite)
library(mongolite)
source("./src/source_func.R")
source("./src/config.R")
db <- mongo(collection = "gene_tn", db = "pget", url = db_url)
db$drop()
# add gene symbol for searching
manifest <- read.delim(manifest_path, sep = "\t", stringsAsFactors = FALSE, colClasses = "character")
manifest <- manifest[manifest$Type== "protein_coding", c("gene", "gene_name_BCM_refined")]
manifest <- unique(manifest)
rna_t_data <- read_all_data(cohorts, "_RNAseq_gene_RSEM_coding_UQ_1500_log2_Tumor.cct", F)
rna_n_data <- read_all_data(cohorts, "_RNAseq_gene_RSEM_coding_UQ_1500_log2_Normal.cct", F)
pro_t_data <- read_all_data(cohorts, "_proteomics_gene_abundance_log2_reference_intensity_normalized_Tumor.cct", F)
pro_n_data <- read_all_data(cohorts, "_proteomics_gene_abundance_log2_reference_intensity_normalized_Normal.cct", F)
calc_tn <- function(ensg_full, t_data, n_data, datatype) {
tmp <- strsplit(ensg_full, ".", fixed = T)[[1]]
ensg <- tmp[1]
ensg_ver <- tmp[2]
sym <- manifest[manifest$gene == ensg_full, "gene_name_BCM_refined"]
res <- list(
ensembl = ensg, ensembl_ver = ensg_ver,
symbol = sym,
datatype = datatype
)
for (cohort in cohorts) {
t_d <- t_data[[cohort]]
t_row <- NULL
if (!is.null(t_d)) {
t_row <- unlist(t_d[ensg_full, ])
}
n_d <- n_data[[cohort]]
n_row <- NULL
if (!is.null(n_d)) {
n_row <- unlist(n_d[ensg_full, ])
}
if (!is.null(t_row) && !is.null(n_row) &&
length(na.omit(t_row)) >= 20 && length(na.omit(n_row)) >= 10 &&
!all(t_row == 0) && !all(n_row == 0)) {
# T/N analysis
wx_res <- wilcox.test(t_row[!is.na(t_row) & !is.infinite(t_row)], n_row[!is.na(n_row) & !is.infinite(n_row)], conf.int = T)
sign <- unname(ifelse(wx_res$estimate >= 0, 1, -1))
res[[cohort]] <- list(
val = unname(wx_res$estimate),
pval = wx_res$p.value * sign
)
}
}
res[['metap']] <- calc_metap_data(res)
# limit significant number for cohort result in function below
res <- add_sorter(res)
#print(toJSON(res, auto_unbox = TRUE, null = "null", digits = NA))
db$insert(toJSON(res, auto_unbox = TRUE, null = "null", digits = NA))
}
for (ensg_full in manifest$gene) {
calc_tn(ensg_full, rna_t_data, rna_n_data, "RNA")
calc_tn(ensg_full, pro_t_data, pro_n_data, "protein")
}
db$index(add = '{"symbol" : 1}')
# mongoDB 3.4 does not have wildcard index
db$index(add = paste0('{"sorter.', 'metap', '" : 1}'))
for (coh in cohorts) {
db$index(add = paste0('{"sorter.', coh, '" : 1}'))
}
db$disconnect()