-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy path2_SC_SCT_normalization.R
executable file
·197 lines (153 loc) · 7.65 KB
/
2_SC_SCT_normalization.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# Script to normalize and perform PCA
# |__Understand normalizing counts is necessary for accurate comparison between cells
# |__Understand how similarity in cellular gene expression between cells can be evaluated by Principal Components Analysis (PCA)
# |__ Executing the normalization, variance estimation, and identification of the most variable genes for each sample
# Goals:
# To accurately normalize and scale the gene expression values to account for differences in sequencing depth and overdispersed count values.
# To identify the most variant genes likely to be indicative of the different cell types present.
# setwd("~/KP/singleCellProjects/multiomeProject")
library(Seurat)
library(ggplot2)
library(harmony) # 1.0
library(Rcpp)
library(openxlsx)
library(here)
library(dplyr)
library(stringr)
library(tibble)
library(markdown)
library(XML)
library(RCurl)
library(AnnotationHub) # BiocManager::install("AnnotationHub")
set.seed(1234)
#load('data/seurat_filtered.RData')
str(filtered_seurat)
# Exploring Sources of unwanted variation
# 1. Normalize the counts -------------------------
seurat_phase <- NormalizeData(filtered_seurat)
str(seurat_phase)
# 2. Evaluating effects of cell cycle --------------
load('~/KP/supporting_files/singleCell/cell_cycle_markers.RData')
# Acquire the S phase genes
s_genes <- cell_cycle_markers %>%
dplyr::filter(phase == "S") %>%
pull("gene_name")
# Acquire the G2M phase genes
g2m_genes <- cell_cycle_markers %>%
dplyr::filter(phase == "G2/M") %>%
pull("gene_name")
# Score cells for cell cycle
# Assign each cell a score based on its expression og G2/M and S phase markers. The function calculates cell cycle phase scores based on canonical markers
seurat_phase <- CellCycleScoring(seurat_phase,
g2m.features = g2m_genes,
s.features = s_genes)
# View cell cycle scores and phases assigned to cells
View([email protected])
# Next we would like to determine whether cell cycle is a major source of variation in our dataset using PCA
# To perform PCA: Find most variable features > scale data > runPCA
# Identify the most variable genes
seurat_phase <- FindVariableFeatures(seurat_phase,
selection.method = "vst",
nfeatures = 2000,
verbose = FALSE)
# Scale the counts
seurat_phase <- ScaleData(seurat_phase)
# Perform PCA to evaulate similarities/differences between cell cycle phase
seurat_phase <- RunPCA(seurat_phase)
# Plot the PCA colored by cell cycle phase
cc1 <- DimPlot(seurat_phase,
reduction = "pca",
group.by= "Phase")
cc2 <- DimPlot(seurat_phase,
reduction = "pca",
group.by= "Phase",
split.by = "Phase")
cc3 <- DimPlot(seurat_phase,
reduction = "pca",
group.by= "Phase",
split.by = "sample")
plot <- plot_grid(cc1, cc2, cc3, labels=c("Grouped all cells together", "Split by Cell Cycle Phase", "Split by Source"), ncol = 2, nrow = 2)
ggsave(plot, filename = 'figures/dimplot_cellCycle_before_regressing.pdf', width = 20, height = 10)
# Since cells separate entirely by phase, we need to regress out cell cycle scores
# regressing it out (Takes a long time!)
# seurat_phase <- ScaleData(seurat_phase, vars.to.regress = c("S.Score", "G2M.Score"), features = rownames(seurat_phase))
# regressed_cc_seurat_phase <- seurat_phase
# Now, a PCA on the variable genes no longer returns components associated with cell cycle
# regressed_cc_seurat_phase <- RunPCA(regressed_cc_seurat_phase, features = VariableFeatures(regressed_cc_seurat_phase), nfeatures.print = 10)
# Plot the PCA colored by cell cycle phase post-regression
# plot2 <- DimPlot(regressed_cc_seurat_phase,
# reduction = "pca",
# group.by= "Phase")
# ggsave(plot2, filename = 'figures/dimplot_cellCycle_post_regressing.pdf', width = 10, height = 10)
#
# saveRDS(regressed_cc_seurat_phase, "data/regressed_cc_seurat_phase.rds")
# 3. Evaluating effects of mitochondrial gene expression --------------
# Next, we want to evaluate expression of mitochondrial genes' expression
# Check quartile values
summary([email protected]$mitoRatio)
# Turn mitoRatio into categorical factor vector based on quartile values
[email protected]$mitoFr <- cut([email protected]$mitoRatio,
breaks=c(-Inf, 0.003867, 0.011655, 0.013933, Inf),
labels=c("Low","Medium","Medium high", "High"))
# Plot the PCA colored by mitoFr
# Plot the PCA colored by cell cycle phase
m1 <- DimPlot(seurat_phase,
reduction = "pca",
group.by= "mitoFr")
m2 <- DimPlot(seurat_phase,
reduction = "pca",
group.by= "mitoFr",
split.by = "mitoFr")
m3 <- DimPlot(seurat_phase,
reduction = "pca",
group.by= "mitoFr",
split.by = "sample")
plot <- plot_grid(m1, m2, m3, labels=c("Grouped all cells together", "Split by Mito Fractions", "Split by Source"), ncol = 2, nrow = 2)
ggsave(plot, filename = 'figures/dimplot_mitoFr_before_regressing.pdf', width = 20, height = 10)
# since they show different pattern of scatter, mitoRatio will be required to be regressed out
# 4. SCTransform --------------
# SCTransform automatically accounts for cellular sequencing depth by regressing out sequencing depth (nUMIs)
# Split seurat object by condition to regress cell cycle scoring, mitoRatio and SCT on all samples
split_seurat <- SplitObject(seurat_phase, split.by = "seq_folder")
options(future.globals.maxSize = 4000 * 1024^2)
for (i in 1:length(split_seurat)) {
split_seurat[[i]] <- SCTransform(split_seurat[[i]], vars.to.regress = c("S.Score","G2M.Score","mitoRatio"))
}
# Save the split seurat object
saveRDS(split_seurat, "data/split_seurat.rds")
# Split seurat object by condition to perform cell cycle scoring and SCT on conditions (PDX & Parental)
# Important NOTE: By default, after normalizing, adjusting the variance, and regressing out uninteresting sources of variation,
# SCTransform will rank the genes by residual variance and output the 3000 most variant genes. If the dataset has larger cell numbers,
# then it may be beneficial to adjust this parameter higher using the variable.features.n argument.
# split_seurat <- SplitObject(seurat_phase, split.by = "sample")
# options(future.globals.maxSize = 4000 * 1024^2)
#
#
# for (i in 1:length(split_seurat)) {
# split_seurat[[i]] <- SCTransform(split_seurat[[i]], vars.to.regress = c("S.Score","G2M.Score"))
# }
#
# # finding variable features after SCTransform
# # split_seurat$PDX@[email protected][1:20]
# # split_seurat$Parental@[email protected][1:20]
#
# # Save the split seurat object
# saveRDS(split_seurat, "data/split_seurat.rds")
#' #' 4a. SCTransform for regressed_cc_seurat_phase (using seurat obj which is manually regressed for cc genes)--------------
#' regressed_cc_seurat_phase <- readRDS("data/regressed_cc_seurat_phase.rds")
#' # split by conditions (PDX & Parental)
#' split_regressed_cc_seurat_phase <- SplitObject(regressed_cc_seurat_phase, split.by = "sample")
#' options(future.globals.maxSize = 4000 * 1024^2)
#'
#'
#' for (i in 1:length(split_regressed_cc_seurat_phase)) {
#' split_regressed_cc_seurat_phase[[i]] <- SCTransform(split_regressed_cc_seurat_phase[[i]])
#' }
#'
#' # finding variable features after SCTransform
#' #split_regressed_cc_seurat_phase$PDX@[email protected][1:20]
#' #split_regressed_cc_seurat_phase$Parental@[email protected][1:20]
#'
#' # Save the split seurat object
#' saveRDS(split_regressed_cc_seurat_phase, "data/split_regressed_cc_seurat_phase.rds")
#'