-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathDataPreparation.R
123 lines (74 loc) · 3.04 KB
/
DataPreparation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
library(ggplot2)
library(RANN)
library(ggpubr)
library(rstatix)
library(dplyr); library(stringr); library(data.table)
library(readxl)
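###########################################
# Expr codes (NOTE(review): presumably the values of the ExprPhenotype
# column used further down - confirm):
#    0: DAPI
#    4: PDL1
#   64: PD1 (High - overexhausted - Opal 650)
#   68: PDL1 + PD1 (64 + 4)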
# working directory
# Move to the parent of the folder that contains this script, so that the
# relative paths used below (e.g. './Tables') are resolved from there.
# The script location is read from the RStudio editor; the path is empty when
# the active document has no file behind it, and setwd() would then fail with
# an unhelpful "cannot change working directory" error.
current_path <- rstudioapi::getActiveDocumentContext()$path
if (length(current_path) != 1L || !nzchar(current_path)) {
  stop(
    'Cannot determine the location of this script: ',
    'open the saved file in the RStudio editor and run it from there.',
    call. = FALSE
  )
}
setwd(dirname(current_path))
setwd('..')
# Read the cell table of every core on the 13 x 16 array, tag each row with
# its core label (row letter + column number, e.g. 'C1') and stack everything
# into coreData_all, which is also written to 'NSCLCdataset.rds'.
allTables <- list.files('./Tables')
# str_replace() only touches the FIRST match, so only the first '[' and the
# first ']' of each file name become commas. NOTE(review): this assumes the
# table names embed the core as '[1,<row>,<col>]' (as the polygon file names
# do), which turns that part into ',1,<row>,<col>,' - confirm.
allTables_tr <- str_replace(allTables, '\\[', ',')
allTables_tr <- str_replace(allTables_tr, ']',',')

n_core_rows <- 13L
n_core_cols <- 16L
# Collect the per-core tables in a pre-sized list and bind once at the end;
# growing a data frame with rbind() inside the loop copies it on every pass.
core_tables <- vector('list', n_core_rows * n_core_cols)
slot <- 0L
for (id1 in seq_len(n_core_rows)) {
  for (id2 in seq_len(n_core_cols)) {
    # file id
    pattern <- paste0(',1,', id1, ',', id2, ',')
    core <- paste0(LETTERS[id1], id2)
    # The pattern is a literal string, so match it as such.
    matched <- allTables[grepl(pattern, allTables_tr, fixed = TRUE)]
    # Zero or several matches used to surface as a cryptic read.csv() error.
    if (length(matched) != 1L) {
      stop(
        'Expected exactly one file in ./Tables for core ', core,
        ' (pattern "', pattern, '"), found ', length(matched), '.',
        call. = FALSE
      )
    }
    # read file data
    coreData <- read.csv(paste0('./Tables/', matched))
    # keep the table together with its core label (column 'core')
    slot <- slot + 1L
    core_tables[[slot]] <- cbind(core, coreData)
  }
}
# combine (the list is unnamed, so row names stay automatic)
coreData_all <- do.call(rbind, core_tables)
saveRDS(coreData_all, 'NSCLCdataset.rds')
#---------------- Nucleus density per type per core
# For every core listed in the tissue-area table, count the cells of each
# Phenotype and divide the counts by that core's area_sum. The result has one
# row per (core, phenotype) with columns ctype, number, density and Core, and
# is written to 'Nucleus_Density_Core.rds'.
Region.area <- read.csv('./Files_Archive/tissue_area_edge_versus_core.csv') %>%
  select(c(Core, area_sum))
# compute density
# Per-core results are collected in a pre-sized list and bound once at the
# end instead of growing a data frame inside the loop.
core_profiles <- vector('list', length(Region.area$Core))
slot <- 0L
for (Core in Region.area$Core) {
  # core data
  # Base subsetting is used on purpose: inside dplyr::filter() the expression
  # `core == Core` would compare two COLUMNS if the cell table ever carried a
  # 'Core' column, silently ignoring the loop variable.
  core_data <- coreData_all[which(coreData_all$core == Core), , drop = FALSE]
  # tissue area
  tissue_area <- Region.area[Region.area$Core == Core, 'area_sum']
  # get the number/density profile for the core
  quant_profiles <- table(core_data$Phenotype) %>%
    data.frame() %>%
    mutate(density = Freq / tissue_area) %>%
    mutate(Core = Core) %>%
    `colnames<-`(c('ctype', 'number', 'density', 'Core'))
  slot <- slot + 1L
  core_profiles[[slot]] <- quant_profiles
}
# Start from the same empty data frame as before so that an empty area table
# still yields a (0 x 0) data frame.
quant_profiles_all <- do.call(
  rbind.data.frame,
  c(list(data.frame(matrix(nrow = 0, ncol = 0))), core_profiles)
)
saveRDS(quant_profiles_all, 'Nucleus_Density_Core.rds')
# Reload the saved density table and look at the distribution of the rows
# whose cell type is 'Other'.
# NOTE(review): the object is called CD163 but the filter keeps ctype 'Other';
# confirm which of the two is intended.
quant_profiles_all <- readRDS('Nucleus_Density_Core.rds')
CD163 <- dplyr::filter(quant_profiles_all, ctype == 'Other')
# Summary statistics of every column, then the spread of the density values.
summary(CD163)
sd(CD163[['density']])
#---------------- Nucleus Polygon save all to one
# For every polygon table in ./polygonList, attach the per-cell annotations
# (position, phenotype) of the matching core by CellID, and stack the merged
# tables of all cores into one data frame, cellBounds_all (NULL when the
# folder is empty). Writing cellBounds_all to disk is left to the caller.
#
# Fixes relative to the previous version:
#   * the cell table is coreData_all; 'NSCLCdata' was never defined,
#   * cells are restricted to the core of the current file (the old
#     `filter(Core == Core)` compared a value with itself and kept all rows),
#   * each iteration reads its own polygon file instead of one hard-coded
#     file name, and the loop variable is no longer overwritten,
#   * the per-file results are kept instead of being overwritten each pass.
polygon_files <- list.files('./polygonList')
cellBounds_list <- vector('list', length(polygon_files))
for (i in seq_along(polygon_files)) {
  file <- polygon_files[[i]]
  # name to Letter - Number combination
  # e.g. 'TMA_1314_Core[1,3,1]_[66305,17745]_polygons.csv' split on ']' and
  # ',' gives '3' as piece 2 (row -> letter C) and '1' as piece 3 -> 'C1'.
  spName <- strsplit(file, '\\]|,') # regex to extract cores
  core_id <- paste0(LETTERS[as.numeric(spName[[1]][2])], spName[[1]][3]) # pieces to cores
  # index the combined cell table to retrieve the data of this core only
  # (base subsetting, so the comparison cannot be captured by a column name)
  cellPos <- coreData_all[which(coreData_all$core == core_id), , drop = FALSE] %>%
    #filter(Phenotype != 'Other') %>%
    select('CellXPos', 'CellYPos', 'Phenotype', 'ExprPhenotype', 'CellID')
  # polygons of this core, joined with the annotations of the same cells
  cellBounds <- read.csv(paste0('./polygonList/', file)) %>%
    merge(cellPos, by = 'CellID')
  # keep the core label with the rows so they stay identifiable once stacked
  cellBounds_list[[i]] <- cbind(
    core = rep(core_id, nrow(cellBounds)),
    cellBounds
  )
}
# all cores in one table
cellBounds_all <- do.call(rbind, cellBounds_list)