ComputationalCytometry.Rmd

---
title: "ComputationalCytometry"
author: "Katrien Quintelier"
date: "5/10/2022"
output:
  pdf_document: default
  html_document: default
editor_options:
  chunk_output_type: inline
---

FlowSOM protocol: https://doi.org/10.1038/s41596-021-00550-0

# Install and load the libraries
## Install the libraries
```{r}
# One-time installation of the packages used in this notebook.
# Uncomment and run the lines for packages that are not installed yet.
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install("flowCore")
# BiocManager::install("ggplot2")
# BiocManager::install("ggpubr")
# BiocManager::install("pheatmap")
# BiocManager::install("tidyr")
# BiocManager::install("CytoML")
# The notebook also calls readxl::read_xlsx() and umap::umap(), and the
# GitHub installs below need devtools:
# install.packages(c("readxl", "umap", "devtools"))
# devtools::install_github("LKremer/ggpointdensity")
# devtools::install_github("saeyslab/FlowSOM")
# devtools::install_github("saeyslab/PeacoQC")
# devtools::install_github("saeyslab/CytoNorm")
```

## Load the libraries
```{r}
library("flowCore") # For basic cytometry operations
library("ggplot2") # For nice plots
library("ggpubr")
library("pheatmap") # For pretty heatmap plots
library("tidyr")
library("FlowSOM") # For FlowSOM clustering and functions
library("PeacoQC") # For quality control
library("ggpointdensity") # For nice density plots
library("CytoNorm") # For normalization

source("Extra_functions.R") # To load in additional functions
```


# Organization
## File organization
File organization should be:
ComputationalCytometry (main folder)
- Background_material
- Data
  + AdditionalExercise
  + exampleMatrix.RDS
  + 1_raw
    ~ Day1_WT_Control1
    ~ Day1_WT_Sample1
    ~ Day1_WT_Sample3
    ~ Day1_KO_Control2
    ~ Day1_KO_Sample2
    ~ Day1_KO_Sample4
    ~ Day2_WT_Control1
    ~ Day2_WT_Sample5
    ~ Day2_WT_Sample7
    ~ Day2_KO_Control2
    ~ Day2_KO_Sample6
    ~ Day2_KO_Sample8
    ~ ManualLabeling.rds
    ~ COMP240110_day1.csv
    ~ COMP240110_day2.csv
  + meta.xlsx
 - Extra_functions.R
 - ComputationalCytometry.Rmd
 - FlowSOM_Cheatsheet.pdf
 - AdditionalExercise_ComputationalCytometry.pdf
 - AdditionalExercise.R

## Directories
```{r}
# Set directories. All paths are relative to the working directory and end in
# "/" because later chunks build file names from them with paste0().
dir_prepr <- "Data/2_preprocessed/" # where the preprocessed data will be stored
dir_QC <- "Data/2_preprocessed/QC/" # where the data QC results will be stored
dir_RDS <- "RDS/" # where the R objects will be stored
dir_results <- "Results/" # where the results will be stored
dir_raw <- "Data/1_raw/" # where the raw data is located

# Create the output directories. `recursive = TRUE` also creates missing
# parent folders; `showWarnings = FALSE` keeps the chunk quiet when it is
# re-run and the directories already exist.
for (path in c(dir_prepr, dir_QC, dir_RDS, dir_results)) {
  dir.create(path, recursive = TRUE, showWarnings = FALSE)
}
```

# Exploration
## Reading in the FCS file
```{r}
# Read one raw FCS file; `ff` is the example object used by the exploration
# and single-file preprocessing chunks that follow
ff <- read.FCS("Data/1_raw/Day1_KO_Control2.fcs")
```

## Structure of the flowframe
```{r}
# Look at the individual slots of the object returned by read.FCS()
head(ff@exprs) # First rows of the `exprs` slot
ff@parameters # The `parameters` slot
head(ff@description) # First entries of the `description` slot
ff@description$SPILL # The SPILL entry of the `description` slot
```

## Exercises
- Find the number of cells in the file
- Find the acquisition date
```{r}

```


# Preprocessing
## Read in metadata
```{r}
# Read the sample metadata from the Excel sheet
meta <- readxl::read_xlsx("Data/meta.xlsx")
# Convert to a plain data.frame without altering the column names
meta <- data.frame(meta, check.names = FALSE)
meta # Print the table to inspect it
```

## List files
```{r}
getwd() # What is the current working directory?
# setwd() # To set the correct working directory

dir <- "Data/1_raw" # Point to directory where the data is located
# `pattern` is a regular expression: escape the dot and anchor the extension
# so that only file names ending in ".fcs" are returned
files <- list.files(dir, pattern = "\\.fcs$", full.names = TRUE)
files
```

## Set some variables
```{r}
# List markers of interest
# (apart from the two scatter parameters, the entries appear to be written as
# "marker#fluorochrome" -- verify against the panel)
markers_of_interest <- c("FSC-A", "SSC-A", "CD103#BUV395", "CD4#BUV496", 
                         "CD8a#BUV615", "CD86#BUV735", "CD45#BUV805", 
                         "CD62L#BV421", "XCR1#BV510", "Ly6C#BV570", 
                         "CD161#BV605", "Ly6G#BV650", "CD64#BV711", 
                         "CD11c#BV750", "F4/80#BV786", "MHCII#FITC", 
                         "CD69#BB700", "CD3e#PE", "CD11b#PE-CF594", 
                         "CD19#PE-Cy5", "PD1#PE-Cy7", "CD63#AF647", 
                         "CD44#RedFluor710", "LD#APC-eFluor780")

# Look up the channels for these markers in the example file `ff`;
# exact = FALSE asks for non-exact matching of the marker names
channels_of_interest <- GetChannels(object = ff,
                                    markers = markers_of_interest, 
                                    exact = FALSE)
channels_of_interest
```


Plot two markers against each other in the traditional dot plot
```{r}
# plotScatter() is not defined in this file (presumably it comes from
# Extra_functions.R). It is given the example file and the two channels that
# GetChannels() finds for "CD3" and "CD19".
plotScatter(ff_after = ff, 
            channels = GetChannels(ff, c("CD3", "CD19"), exact = FALSE),
            adjust = 200)
```

## Remove margin events
### Remove margin events
```{r}
# Apply PeacoQC::RemoveMargins to the channels of interest; the result is kept
# in a new object (`ff_m`) so that it can be compared with the unfiltered `ff`
ff_m <- PeacoQC::RemoveMargins(ff, channels = channels_of_interest)
```

### Visualize
```{r}
# Same CD3 / CD19 plot as before, now passing the data before (`ff`) and
# after (`ff_m`) margin removal
plotScatter(ff_before = ff, 
            ff_after = ff_m,
            channels = GetChannels(ff, c("CD3", "CD19"), exact = FALSE),
            adjust = 500)
```


## Compensate the data with the spillover matrix that was exported from FlowJo
```{r}
# Load the day-1 spillover matrix; the first CSV column provides the row names
# comp <- ff@description$SPILL # To use original spillover matrix
comp <- read.csv("Data/1_raw/COMP240110_day1.csv",
                 row.names = 1,
                 check.names = FALSE)
comp

# Strip everything from " ::" onwards in both the row and the column names
dimnames(comp) <- lapply(dimnames(comp),
                         function(nm) gsub(" ::.*", "", nm))
comp

# Compensate the margin-filtered data with this matrix
ff_c <- flowCore::compensate(ff_m, comp)

# CD3 / CD19 plot of the compensated data
plotScatter(ff_after = ff_c,
            channels = GetChannels(ff, c("CD3", "CD19"), exact = FALSE),
            adjust = 200)
```


## Transform the data using an estimateLogicle on all files
```{r}
# Aggregate all the files of day one
set.seed(2024) #Make sure that everyone ends up with the same thing
agg <- FlowSOM::AggregateFlowFrames(grep("Day1", files, value = TRUE), 
                                    cTotal = 1000000) 

# Perform all the preprocessing steps on the aggregate
# (same margin removal and the same day-1 matrix `comp` as for the single file)
agg_m <- PeacoQC::RemoveMargins(agg, channels = channels_of_interest)
agg_c <- compensate(agg_m, comp)

# Calculate the estimateLogicle on the aggregate
# (only for the channels that occur in the compensation matrix)
tf <- estimateLogicle(agg_c, colnames(comp)) # Non-scatter channels
agg_t <- transform(agg_c, tf)

# Add linear transform for scatters
# The 5% and 95% quantiles of the transformed reference channel are the goal;
# for each scatter channel a slope `a` and intercept `b` are derived from its
# own 5% / 95% quantiles and the goal quantiles, and the resulting linear
# transform is appended to `tf`.
reference_marker <- "UV515-A"
q5_goal <- quantile(exprs(agg_t)[,reference_marker], 0.05)
q95_goal <- quantile(exprs(agg_t)[,reference_marker], 0.95)
for(ch in c("FSC-A", "SSC-A", "FSC-H")){
  q5 <- quantile(exprs(agg_t)[,ch], 0.05)
  q95 <- quantile(exprs(agg_t)[,ch], 0.95)
  a <- (q95_goal - q5_goal) / (q95 - q5) # goal range divided by channel range
  b <- q5_goal - q5 * (q95_goal - q5_goal) / (q95 - q5) # equals q5_goal - q5 * a
  tf <- c(tf,
          transformList(ch, flowCore::linearTransform(a = a,
                                                      b = b)))
}

# Transform
# (apply the combined transformation list to the compensated example file)
ff_t <- transform(ff_c, tf)

plotScatter(ff_after = ff_t, 
            channels = GetChannels(ff, c("CD3", "CD19"), exact = FALSE),
            adjust = 1)
```

## Quality control
PeacoQC: https://doi.org/10.1002/cyto.a.24501
```{r}
# Run PeacoQC on the transformed example file; output is written to dir_QC
ff_QC <- PeacoQC::PeacoQC(ff = ff_t, 
                          channels = channels_of_interest,
                          output_directory = dir_QC,
                          plot = TRUE)

# Compare the data before QC (`ff_t`) with the `FinalFF` element of the
# PeacoQC result, plotting Time against CD3
plotScatter(ff_after = ff_QC$FinalFF, 
            ff_before = ff_t,
            channels = GetChannels(ff, c("Time", "CD3"), exact = FALSE),
            adjust = 1)
```

## Data filtering (can include removal of debris, doublets, dead cells, etc.)
```{r}
# Remove doublets
ff_d <- PeacoQC::RemoveDoublets(ff_QC$FinalFF, nmad = 6)
plotScatter(ff_after = ff_d, 
            ff_before = ff_QC$FinalFF,
            channels = c("FSC-A", "FSC-H"),
            adjust = 1)

# Remove dead cells
# Polygon gate on the (transformed) V810-A and R780-A values; the two vectors
# are read position by position, each position being one vertex of the polygon
live_gate <- flowCore::polygonGate(filterId = "Live", 
                                   "V810-A" = c( 0. , 4  ,4,3.5,2.2,0.  ),
                                   "R780-A" = c(-0.5,-0.5,3,3  ,1.5,1.25))
selected_live <- filter(ff_d, live_gate) 
# The `subSet` slot of the filter result is used as a row index
ff_l <- ff_d[selected_live@subSet,]

plotScatter(ff_after = ff_l, 
            ff_before = ff_d,
            channels = GetChannels(ff, c("F4/80", "LD"), exact = FALSE),
            adjust = 1)

# Remove CD45- events
# estimateGate() is not defined in this file (presumably it comes from
# Extra_functions.R); its result is used below as a row index.
# NOTE(review): the meaning of `min` and `range_min` should be checked in the
# helper's definition.
CD45_gate <- estimateGate(ff_l, 
                          marker_values = list("FSC-A" = c(min = 1, 
                                                           range_min = 0.2),
                                               "UV810-A" = c(min = 2.25,
                                                             range_min = 0.5)))

ff_CD45pos <- ff_l[CD45_gate,]


plotScatter(ff_before = ff_l, 
            ff_after = ff_CD45pos,
            channels = GetChannels(ff, c("FSC-A", "CD45"), exact = FALSE))

```

## Preprocessing all files
### Prepare variables
```{r}
# Print the raw files that will be processed
files

# One spillover matrix per acquisition day; in both, everything from " ::"
# onwards is stripped from the row and column names.
# NOTE(review): the file-organization section of this document lists
# "COMP240110_day2.csv" for day 2, whereas "COMP240220_day2.csv" is read
# here -- confirm which file name is correct.
spill <- read.csv("Data/1_raw/COMP240110_day1.csv",
                  row.names = 1,
                  check.names = FALSE)
dimnames(spill) <- lapply(dimnames(spill),
                          function(nm) gsub(" ::.*", "", nm))

spill_d2 <- read.csv("Data/1_raw/COMP240220_day2.csv",
                     row.names = 1,
                     check.names = FALSE)
dimnames(spill_d2) <- lapply(dimnames(spill_d2),
                             function(nm) gsub(" ::.*", "", nm))

# The transformation list estimated earlier is reused: show which channels it
# covers
names(tf@transforms)

# The live gate defined earlier is reused as well
live_gate
```


### Preprocess in a loop
```{r}
# Run the single-file preprocessing steps on every raw file. Per file, the
# cleaned FCS file and a one-row overview figure of the filtering steps are
# written to dir_prepr.
for (file in files){
  
  print(file)
  
  ff <- read.FCS(file)
  
  # Remove margins
  ff_m <- PeacoQC::RemoveMargins(ff, channels = channels_of_interest)

  # Compensate with the correct compensation matrix
  # (chosen from the "Day1" prefix of the file name; anything else gets the
  # day-2 matrix)
  if(startsWith(basename(file), "Day1")){
    ff_c <- compensate(ff_m, spill)
  } else {
    ff_c <- compensate(ff_m, spill_d2)
  }

  # Transform
  ff_t <- transform(ff_c, tf)
  
  # Quality control
  ff_QC <- PeacoQC::PeacoQC(ff = ff_t, 
                            channels = channels_of_interest, 
                            output_directory = dir_QC, 
                            save_fcs = FALSE)
  ff_QC <- ff_QC$FinalFF
  
  # Data filtering: doublets, dead cells (live gate), CD45- events
  ff_d <- PeacoQC::RemoveDoublets(ff_QC, nmad = 6)

  selected_live <- filter(ff_d, live_gate) 
  ff_l <- ff_d[selected_live@subSet,]

  CD45_gate <- estimateGate(ff_l, 
                            marker_values = list("FSC-A" = c(min = 1, 
                                                             range_min = 0.2),
                                                 "UV810-A" = c(min = 2.25,
                                                               range_min = 0.5)))

  ff_CD45pos <- ff_l[CD45_gate,]
  
  # Save preprocessed fcs file
  write.FCS(ff_CD45pos, file.path(dir_prepr, basename(file)))
  
  # Make an overview figure of all the different steps
  preprocessing_plots <- list(filter_plot(ff, ff_m, "Non-margins", "FSC-A", "SSC-A"),
                              filter_plot(ff_t, ff_QC, "Stable signal", "Time", "FSC-A"),
                              filter_plot(ff_QC, ff_d, "Singlets", "FSC-A", "FSC-H"),
                              filter_plot(ff_d, ff_l, "Live", "V810-A", "R780-A"),
                              filter_plot(ff_l, ff_CD45pos, "CD45+", "FSC-A", "UV810-A"),
                              filter_plot(ff_t, ff_CD45pos, "Final selection", "YG586-A", "YG670-A"))
  # Replace only the literal ".fcs" extension at the end of the name (the
  # pattern is a regular expression, so the dot is escaped and anchored)
  figure_name <- sub("\\.fcs$", "_preprocessing.png", basename(file))
  ggsave(filename = file.path(dir_prepr, figure_name),
         plot = ggpubr::ggarrange(plotlist = preprocessing_plots, nrow = 1),
         width = 25, height = 5)
}
```

## Exercise
Plot Ly6G against Ly6C for the preprocessed sample 1
```{r}

```

# Check for batch effects
## List possible batches
```{r}
# From here on `files` refers to the preprocessed FCS files
files <- list.files(path = "Data/2_preprocessed", 
                    full.names = TRUE, 
                    pattern = "Day.*fcs")

# Candidate batch variables, both parsed from the file names:
# - genotype: the two characters between "Day1_"/"Day2_" and the next "_"
# - day: the "Day1"/"Day2" token directly after the last "/"
batchLevels <- list("genotype" = sub(".*Day[1-2]_(..)_.*", "\\1", files),
                    "day" = sub(".*/(Day[1-2])_.*", "\\1", files))
batchLevels
```

## Markers and channels of interest
```{r}
# One preprocessed file is read to look up the channel names
ff <- read.FCS("Data/2_preprocessed/Day1_WT_Control1.fcs")
# Marker names are matched non-exactly (exact = FALSE below), so short names
# are sufficient here
markersOfInterest <- c("FSC-A", "SSC-A", "CD103", "CD4#", "CD8a", "CD86", 
                       "CD62L", "XCR1", "Ly6C", "CD161", "Ly6G", 
                       "CD64", "CD11c", "F4/80", "MHCII", "CD69", "CD3e", 
                       "CD11b", "CD19", "PD1", "CD63", "CD44") #All markers except CD45 and LD
channelsOfInterest <- FlowSOM::GetChannels(object = ff,
                                           markers = markersOfInterest,
                                           exact = FALSE)
channelsOfInterest
```

## Scatter and density plots
```{r}
# Output folders for the three kinds of batch-effect plots
scatterDir <- "Data/3_normalized/scatterPlots/"
densityDir <- "Data/3_normalized/densityPlots/"
UMAPDir <- "Data/3_normalized/UMAPPlots/"
# Create them (including missing parents) without warnings when the chunk is
# re-run and they already exist
for (plotDir in c(scatterDir, densityDir, UMAPDir)) {
  dir.create(plotDir, recursive = TRUE, showWarnings = FALSE)
}

# Aggregate of the preprocessed files (500 cells per file in total)
set.seed(2024)
agg <- FlowSOM::AggregateFlowFrames(fileNames = files, 
                                    cTotal = length(files)*500) 
df <- data.frame(exprs(agg), check.names = FALSE)
# The File column of the aggregate is used as an index into the per-file
# batch labels
df$genotype <- batchLevels$genotype[df$File]
df$day <- batchLevels$day[df$File]

# UMAP on the channels of interest
dimred <- umap::umap(df[,channelsOfInterest]) 
df$UMAP1 <- dimred$layout[,1]
df$UMAP2 <- dimred$layout[,2]

# plotDens() and plotUMAP() are not defined in this file (presumably they come
# from Extra_functions.R)
for (level in names(batchLevels)){
  # Order the files by the current batch variable so groups are plotted together
  ord <- order(batchLevels[[level]])

  # Scatter plots
  FlowSOM::PlotFileScatters(input = files[ord],
                            names = sub(".*/", "", files)[ord],
                            groups = batchLevels[[level]][ord], 
                            channels = channelsOfInterest,
                            plotFile = paste0(scatterDir, "scatters_", level, ".png"), 
                            ncol = 5)
  
  # Density plots
  plotDens(input = df,
           channels = channelsOfInterest,
           level = level,
           plotFile = paste0(densityDir, "densities_", level, ".pdf"))
    
  # Color UMAP plot
  plotUMAP(input = df,
           level = level,
           plotFile = paste0(UMAPDir, "UMAP_", level, ".pdf"))
}
```

# Normalisation
## Minmax normalization
Most simple approach: align the min and max of all files individually.  
x' = (x - min(x)) / (max(x) - min(x)) 

- Upside: easy to understand  
- Downside: often too simplistic, sensitive to outliers, file-approach might 
remove biological variation 

### Normalization
```{r}
# Min-max normalization per file and per channel:
# x' = (x - min(x)) / (max(x) - min(x))
dir.create("Data/3_normalized/minMax/files", recursive = TRUE,
           showWarnings = FALSE)
for (file in files){
  ff <- read.FCS(file)
  for (channel in channelsOfInterest){
    values <- ff@exprs[,channel]
    # Local names that do not shadow base::min() / base::max()
    channel_range <- range(values)
    span <- channel_range[2] - channel_range[1]
    if (is.finite(span) && span > 0){
      ff@exprs[,channel] <- (values - channel_range[1]) / span
    } else {
      # A constant (or non-finite) channel would give 0/0; leave it untouched
      warning("Channel ", channel, " of ", basename(file),
              " has no finite, non-zero range and was left unchanged",
              call. = FALSE)
    }
  }
  write.FCS(ff, paste0("Data/3_normalized/minMax/files/", basename(file)))
}
```

### Evaluate the normalization quality
#### Scatter and density plots
```{r}
# Min-max normalized files; the pattern is a regular expression, so the dot is
# escaped and the extension anchored to the end of the name
files_norm <- list.files("Data/3_normalized/minMax/files", pattern = "\\.fcs$",
                         full.names = TRUE)

# Aggregate of the normalized files (500 cells per file in total)
set.seed(2024)
agg <- FlowSOM::AggregateFlowFrames(fileNames = files_norm, 
                                    cTotal = length(files_norm)*500)
df <- data.frame(exprs(agg), check.names = FALSE)
# The File column of the aggregate is used as an index into the per-file batch
# labels; this relies on files_norm being in the same order as the files that
# batchLevels was derived from
df$genotype <- batchLevels$genotype[df$File]
df$day <- batchLevels$day[df$File]

# UMAP on the channels of interest
dimred <- umap::umap(df[,channelsOfInterest]) 
df$UMAP1 <- dimred$layout[,1]
df$UMAP2 <- dimred$layout[,2]

for (level in names(batchLevels)){
  # Order the files by the current batch variable so groups are plotted together
  ord <- order(batchLevels[[level]])

  # Scatter plots
  FlowSOM::PlotFileScatters(input = files_norm[ord],
                            names = sub(".*/", "", files_norm)[ord],
                            groups = batchLevels[[level]][ord], 
                            channels = channelsOfInterest,
                            plotFile = paste0(scatterDir, "scatters_", level, "_minMax.png"), ncol = 5)
  
  # Density plots
  plotDens(input = df,
           channels = channelsOfInterest,
           level = level,
           plotFile = paste0(densityDir, "densities_", level, "_minMax.pdf"))
    
  # Color UMAP plot
  plotUMAP(input = df,
           level = level,
           plotFile = paste0(UMAPDir, "UMAP_", level, "_minMax.pdf"))
}
```

## Quantile batch normalization
Find the e.g. 0.01 and 0.99 quantiles of all files, calculate the medians, 
compute the parameters to align the quantiles of the second batch to the first 
one and apply this to all batch 2 files.  
x' = goal_min + ((x- min(x)) * (goal_max - goal_min)) / (max(x) - min(x)) 

- Upside: batch approach preserves biological variation, less sensitive to outliers 
- Downside: might still be too simplistic, cannot resolve population-specific batch effects

- Added with respect to previous approach:
   - batch approach instead of single file approach
   - 1st and 99th quantile instead of absolute min/max
   
The quantiles can be changed to 2.5 and 97.5 or any other value, depending on your data.

### Normalization
```{r}
# Quantile batch normalization: day-2 files are linearly rescaled so that the
# median 1% / 99% quantiles of the day-2 batch land on those of the day-1 batch
dir.create("Data/3_normalized/quantile/files", recursive = TRUE,
           showWarnings = FALSE)
files_Day1 <- grep("Day1", files, value = TRUE)
files_Day2 <- grep("Day2", files, value = TRUE)
files_both <- c(files_Day1, files_Day2)

# Make list of matrices to store quantiles
# (one matrix per channel: rows = files, columns = q01 / q99)
quantiles <- list()
for (channel in channelsOfInterest){
  quantiles[[channel]] <- matrix(data = NA,
                                 nrow = length(files_both), 
                                 ncol = 2,
                                 dimnames = list(files_both,
                                                 c("q01", "q99"))) 
}

# Loop over files to get quantiles
for (file in files_both){
  ff <- read.FCS(file)
  for (channel in channelsOfInterest){
    quantiles[[channel]][file,] <- quantile(ff@exprs[,channel], c(0.01, 0.99))
  }
}

# Batch-level medians per channel. They do not depend on the file being
# normalized, so they are computed once here instead of inside the file loop.
batch_medians <- lapply(quantiles, function(q) {
  list(batch1_q01 = median(q[files_Day1, "q01"]), # Goal quantiles (day 1)
       batch1_q99 = median(q[files_Day1, "q99"]),
       batch2_q01 = median(q[files_Day2, "q01"]), # Quantiles of whole day 2 batch
       batch2_q99 = median(q[files_Day2, "q99"]))
})

# Normalize files
## Copy day 1 files without normalizing
for (file in files_Day1){ 
  ff <- read.FCS(file)
  write.FCS(ff, paste0("Data/3_normalized/quantile/files/", basename(file)))
}

## Quantile normalization for Day 2 files
for (file in files_Day2){ 
  ff <- read.FCS(file)
  for (channel in channelsOfInterest){
    m <- batch_medians[[channel]]
    # x' = goal_q01 + (x - batch_q01) * (goal_q99 - goal_q01) / (batch_q99 - batch_q01)
    ff@exprs[,channel] <- m$batch1_q01 +
      ((ff@exprs[,channel] - m$batch2_q01) * (m$batch1_q99 - m$batch1_q01)) /
      (m$batch2_q99 - m$batch2_q01)
  }
  write.FCS(ff, paste0("Data/3_normalized/quantile/files/", basename(file)))
}
```

### Evaluate the normalization quality
#### Scatter and density plots
```{r}
# Quantile-normalized files; the pattern is a regular expression, so the dot is
# escaped and the extension anchored to the end of the name
files_norm <- list.files("Data/3_normalized/quantile/files", pattern = "\\.fcs$",
                         full.names = TRUE)

# Aggregate of the normalized files (500 cells per file in total)
set.seed(2024)
agg <- FlowSOM::AggregateFlowFrames(fileNames = files_norm, 
                                    cTotal = length(files_norm)*500)
df <- data.frame(exprs(agg), check.names = FALSE)
# The File column of the aggregate is used as an index into the per-file batch
# labels; this relies on files_norm being in the same order as the files that
# batchLevels was derived from
df$genotype <- batchLevels$genotype[df$File]
df$day <- batchLevels$day[df$File]

# UMAP on the channels of interest
dimred <- umap::umap(df[,channelsOfInterest]) 
df$UMAP1 <- dimred$layout[,1]
df$UMAP2 <- dimred$layout[,2]

for (level in names(batchLevels)){
  # Order the files by the current batch variable so groups are plotted together
  ord <- order(batchLevels[[level]])

  # Scatter plots
  FlowSOM::PlotFileScatters(input = files_norm[ord],
                            names = sub(".*/", "", files_norm)[ord],
                            groups = batchLevels[[level]][ord], 
                            channels = channelsOfInterest,
                            plotFile = paste0(scatterDir, "scatters_", level, "_quantile.png"), ncol = 5)
  
  # Density plots
  plotDens(input = df,
           channels = channelsOfInterest,
           level = level,
           plotFile = paste0(densityDir, "densities_", level, "_quantile.pdf"))
    
  # Color UMAP plot
  plotUMAP(input = df,
           level = level,
           plotFile = paste0(UMAPDir, "UMAP_", level, "_quantile.pdf"))
}
```

## CytoNorm normalization
FlowSOM clustering followed by a spline normalization
https://doi.org/10.1002/cyto.a.23904

- Added with respect to previous approach:
   - clustering before normalization
   - 101 quantiles instead of 2 quantiles

### Train model
```{r}
# Create the output folder (with parents, so this chunk also works when the
# earlier normalization chunks were skipped; silent if it already exists)
dir.create("Data/3_normalized/CytoNorm", recursive = TRUE,
           showWarnings = FALSE)

# Specify channels for clustering
markersToCluster <- c("FSC-A", "SSC-A", "CD4#", "CD8a", 
                      "CD62L", "XCR1", "Ly6C", "CD161", "Ly6G", "CD64", 
                      "CD11c", "F4/80", "MHCII", "CD3e", "CD11b", 
                      "CD19", "CD63", "CD44") #Not all markers contribute to clustering
channelsToCluster <- FlowSOM::GetChannels(object = ff,
                                          markers = markersToCluster,
                                          exact = FALSE)

# Train on the two control files that were measured on both days; day 1 is the
# goal batch. The files are already transformed, hence transformList = NULL.
model <- CytoNorm::CytoNorm.train(files = c("Data/2_preprocessed/Day1_WT_Control1.fcs",
                                            "Data/2_preprocessed/Day2_WT_Control1.fcs"), 
                                  labels = c("Day1", "Day2"), 
                                  channels = channelsOfInterest, seed = 2024, 
                                  plot = TRUE, normParams = list("goal" = "Day1"), 
                                  transformList = NULL,
                                  FlowSOM.params = list(nCells = 200000, 
                                                        nClus = 10, 
                                                        channels = channelsToCluster),
                                  outputDir = "Data/3_normalized/CytoNorm")
saveRDS(model, "Data/3_normalized/CytoNorm/model.rds")
```

### Evaluate the CytoNorm assumption: check the CVs
```{r}
# Run CytoNorm::testCV on the FlowSOM model stored in the CytoNorm model, for
# 5 to 15 (meta)clusters, without plotting
CVs <- CytoNorm::testCV(fsom = model$fsom,
                        cluster_values = 5:15,
                        plot = FALSE)

# Write the CV overview to a PDF file; dev.off() closes the file again
pdf("Data/3_normalized/CytoNorm/CVs.pdf")
CytoNorm::PlotOverviewCV(fsom = model$fsom, cv_res = CVs,
                         show_cv = 1, max_cv = 1.5)
dev.off()
```

### Apply model
```{r}
# Apply the trained model to all preprocessed files. The batch label of each
# file ("Day1"/"Day2") is parsed from its name; with an empty prefix the
# output files are written to outputDir under their original names (the later
# chunks read them back under those names).
CytoNorm::CytoNorm.normalize(model = model,
                             files = files,
                             labels = sub(".*/(Day[1-2])_.*", "\\1", files),
                             transformList = NULL,
                             prefix = "",
                             transformList.reverse = NULL,
                             outputDir = "Data/3_normalized/CytoNorm/files")
```

### Evaluate the normalization quality
#### Density plots
```{r}
# Files before and after normalization. Both lists use the same anchored
# pattern so that they contain the same set of files:
# - the original "WT.*fcs" restricted the non-normalized list to WT files,
#   although it is compared with all normalized files under "Show all files"
# - "^Day" keeps aggregate.fcs (written into the normalized folder by a later
#   chunk) out of files_norm when this chunk is re-run
files_notnorm <- list.files("Data/2_preprocessed", pattern = "^Day.*\\.fcs$", 
                            full.names = TRUE)
files_norm <- list.files("Data/3_normalized/CytoNorm/files", pattern = "^Day.*\\.fcs$",
                         full.names = TRUE)
# Show only control files
p <- CytoNorm::plotDensities(input = list("Day1" = "Data/2_preprocessed/Day1_WT_Control1.fcs",
                                          "Day2" = "Data/2_preprocessed/Day2_WT_Control1.fcs",
                                          "Day1_norm" = "Data/3_normalized/CytoNorm/files/Day1_WT_Control1.fcs",
                                          "Day2_norm" = "Data/3_normalized/CytoNorm/files/Day2_WT_Control1.fcs"),
                             channels = channelsOfInterest)
# Show all files
p2 <- CytoNorm::plotDensities(input = list("Day1" = files_notnorm[startsWith(sub(".*/", "", files_notnorm), "Day1")],
                                           "Day2" = files_notnorm[startsWith(sub(".*/", "", files_notnorm), "Day2")],
                                           "Day1_norm" = files_norm[startsWith(sub(".*/", "", files_norm), "Day1")],
                                           "Day2_norm" = files_norm[startsWith(sub(".*/", "", files_norm), "Day2")]),
                              channels = channelsOfInterest)

# One PDF with two pages: control files first, then all files.
# head(x, -1) drops the last list element (the legend, which is added
# separately); the original `p[1:length(p)-1]` only gave this result because a
# zero index is silently ignored. print() makes sure each page is drawn on the
# PDF device.
pdf("Data/3_normalized/CytoNorm/densities_control.pdf", height = 30, width = 5)
print(ggarrange(ggarrange(plotlist = head(p, -1), 
                          ncol = 2, 
                          nrow = length(channelsOfInterest)),
                p$legend, nrow = 2, heights = c(20,1)))
print(ggarrange(ggarrange(plotlist = head(p2, -1), 
                          ncol = 2, 
                          nrow = length(channelsOfInterest)),
                p2$legend, nrow = 2, heights = c(20,1)))
dev.off()
```

#### Spline plots
```{r}
# Write the spline plots to a PDF file; the first two channels of interest
# (the scatter parameters FSC-A and SSC-A) are left out
pdf("Data/3_normalized/CytoNorm/splines.pdf", height = 20, width = 20)
# Splines of all cluster in single plot
CytoNorm::plotSplines(model = model, channels = channelsOfInterest[-c(1,2)], 
            groupClusters = TRUE)
# Single plot per spline
CytoNorm::plotSplines(model = model, channels = channelsOfInterest[-c(1,2)])
dev.off()
```

#### Scatter and density plots
```{r}
# Draw the same number of cells per CytoNorm-normalized file into one aggregate
set.seed(2024)
agg <- FlowSOM::AggregateFlowFrames(
  fileNames = files_norm,
  cTotal = length(files_norm) * 500
)

# Expression values as a data frame; every cell gets the batch labels of its
# source file (the File column is used as an index into batchLevels)
df <- data.frame(exprs(agg), check.names = FALSE)
df$genotype <- batchLevels$genotype[df$File]
df$day <- batchLevels$day[df$File]

# Two-dimensional UMAP on the channels of interest
dimred <- umap::umap(df[, channelsOfInterest])
df$UMAP1 <- dimred$layout[, 1]
df$UMAP2 <- dimred$layout[, 2]

# One set of plots per batch variable
for (level in names(batchLevels)) {
  # File order that groups the files by the current batch variable
  ord <- order(batchLevels[[level]])

  # Per-file scatter plots
  FlowSOM::PlotFileScatters(
    input = files_norm[ord],
    names = sub(".*/", "", files_norm)[ord],
    groups = batchLevels[[level]][ord],
    channels = channelsOfInterest,
    plotFile = paste0(scatterDir, "scatters_", level, "_CytoNorm.png"),
    ncol = 5
  )

  # Density plots per channel
  plotDens(
    input = df,
    channels = channelsOfInterest,
    level = level,
    plotFile = paste0(densityDir, "densities_", level, "_CytoNorm.pdf")
  )

  # UMAP colored by the batch variable
  plotUMAP(
    input = df,
    level = level,
    plotFile = paste0(UMAPDir, "UMAP_", level, "_CytoNorm.pdf")
  )
}
```

# Make aggregate file
```{r}
# From here on `files` holds only the normalized *sample* files. The pattern is
# a regular expression: the dot is escaped and the name anchored, so only
# "Day...Sample....fcs" files are selected (aggregate.fcs, written into the
# same folder below, never matches).
files <- list.files(path = "Data/3_normalized/CytoNorm/files", #dir_prepr if no normalization was needed
                    pattern = "^Day.*Sample.*\\.fcs$",
                    full.names = TRUE)
files

# Aggregate with 10000 cells per file in total; the aggregate is also written
# to disk
set.seed(2024)
agg <- AggregateFlowFrames(fileNames = files,
                           cTotal = length(files)*10000,
                           writeOutput = TRUE,
                           outputFile = "Data/3_normalized/CytoNorm/files/aggregate.fcs")
```

# Make FlowSOM model 

Markers are divided into two distinct categories:
- Lineage (type) markers:
  + Their level of expression is assumed stable across cell populations
  + They are used by FlowSOM to define the clustering of data
- State (activation) markers:
  + More transient than lineage, their expression levels can differ within populations
  + They are not used in clustering, but their MFIs (median fluorescence intensity) *within* FlowSOM-generated meta-clusters can be analysed

## Markers and channels of interest
```{r}
# A normalized file is read only to look up channel names
ff <- read.FCS("Data/3_normalized/CytoNorm/files/Day1_WT_Control1.fcs")

# Marker lists (these could also be read from an Excel sheet)
# Lineage markers drive the clustering
lineageMarkers <- c(
  "FSC-A", "SSC-A", "CD4#", "CD8a", "CD62L", "XCR1", "Ly6C", "CD161", "Ly6G",
  "CD64", "CD11c", "F4/80", "MHCII", "CD3e", "CD11b", "CD19", "CD63", "CD44"
)
# State markers are only summarised as MFIs
stateMarkers <- c("CD103", "CD86", "CD69", "PD1")

# Channels that belong to both marker sets (non-exact name matching)
channelsForClustering <- GetChannels(
  object = ff,
  markers = lineageMarkers,
  exact = FALSE
)
channelsForMFI <- GetChannels(
  object = ff,
  markers = stateMarkers,
  exact = FALSE
)

# Overview table: one row per channel with the marker name it was looked up
# by and its role ("type" for clustering channels, "state" for MFI channels)
markerRole <- rep(
  c("type", "state"),
  times = c(length(channelsForClustering), length(channelsForMFI))
)
markerInfo <- data.frame(
  "Channel" = c(channelsForClustering, channelsForMFI),
  "Marker"  = c(lineageMarkers, stateMarkers),
  "Type"    = factor(markerRole)
)
head(markerInfo)
saveRDS(markerInfo, paste0(dir_RDS, "MarkerInfo.RDS"))
```

## Train the FlowSOM model
```{r}
# Train FlowSOM on the aggregate: a 15 x 15 grid of clusters, grouped into 25
# metaclusters, using only the lineage channels. The data are not rescaled
# (scale = FALSE) and the seed is fixed.
fsom <- FlowSOM(input = agg, 
                scale = FALSE,
                colsToUse = channelsForClustering,
                seed = 2024,
                nClus = 25,
                xdim = 15, ydim = 15)

# Store the model in the RDS folder defined at the top of the notebook
# (same location as before, but no longer hard-coded)
saveRDS(fsom, paste0(dir_RDS, "fsom.RDS"))

# FlowSOM tree with the metaclustering as background color
PlotStars(fsom = fsom,
          maxNodeSize = 4,
          backgroundValues = fsom$metaclustering)
```

# Test FlowSOM quality
## Make 2D scatter plots
```{r}
# Marker pairs to show as 2D scatter plots
channel_pairs <- list(c("Ly6C", "CD11b"),
                      c("CD19", "CD3e"),
                      c("CD8a", "CD4"),
                      c("CD11b", "XCR1"))

# One row of scatter plots for cluster 1 and for every metacluster, written to
# a PNG file in the results folder
Plot2DScatters(fsom = fsom,
               channelpairs = channel_pairs,
               metaclusters = seq_len(NMetaclusters(fsom)),
               clusters = 1,
               plotFile = paste0(dir_results, "fsom_2D_scatters.png"))
```

## Check consistency with manual gating
```{r}
# Manual labels: a list with one element per file name, holding one label per
# cell of that file
manual <- readRDS("Data/1_raw/ManualLabeling.rds")

# Look up the manual label of every cell in the aggregate: the File column
# gives the index of the source file in `files`, the Original_ID column the
# position of the cell within that file
agg_labels <- rep(NA_character_, nrow(agg))
for (file_nr in unique(agg@exprs[,"File"])){
  man <- as.character(manual[[basename(files[file_nr])]])
  row_ids <- agg@exprs[,"File"] == file_nr
  agg_labels[row_ids] <- man[agg@exprs[,"Original_ID"][row_ids]]
}

# Order of the manual populations, shared by both plots below
manual_levels <- c("Unlabeled", "Neutrophils", "Monocytes", 
                   "B cells", "NK cells", "CD4 T cells", 
                   "CD8 T cells", "DCs")

# On the FlowSOM tree
manual_tree <- PlotPies(fsom = fsom,
                        cellTypes = factor(agg_labels,
                                           levels = manual_levels),
                        maxNodeSize = 3)
print(manual_tree)

# Save exactly this plot rather than whatever plot was shown last
ggsave(file.path(dir_results, "fsom_manualTree.pdf"), plot = manual_tree)

# In a bar plot
pdf(file.path(dir_results, "fsom_manualBar.pdf"), width = 10)
PlotManualBars(fsom = fsom,
               manualVector = agg_labels, 
               manualOrder = manual_levels,
               colors = c("#E5E5E5", "#070089", "#141EF5", "#4EACF8", 
                          "#81FBC3", "#D8FE5D", "#F1A23A", "#EA3724"))
dev.off()
```

## Calculate the purity of the FlowSOM clustering
Returns the mean purity score, worst purity score and number of clusters with score <0.75
```{r}
# Open the help page of Purity() (intended for interactive use)
?Purity
# Compare the manual label of each cell with the FlowSOM cluster it was
# assigned to
Purity(realClusters = agg_labels,
       predictedClusters = GetClusters(fsom))
```

## Inspect the file contribution per cluster
```{r}
# Shades per group: reds for one group of files, blues for the other
day1_shades <- c("#6a040f", "#9d0208", "#d00000", "#dc2f02", "#e85d04", "#F4AE82")
day2_shades <- c("#03045e", "#023e8a", "#0077b6", "#0096c7", "#00b4d8", "#80daec")

# `files` holds fewer files here than there are colors (only the sample
# files), so handing over all twelve colors in order would give some day-2
# files a red shade. Pick the colors per file instead.
# NOTE(review): this assumes the two color groups stand for the acquisition
# days -- confirm.
file_names <- basename(files)
is_day1 <- startsWith(file_names, "Day1")
file_colors <- character(length(file_names))
file_colors[is_day1] <- rep_len(day1_shades, sum(is_day1))
file_colors[!is_day1] <- rep_len(day2_shades, sum(!is_day1))

# Pie per cluster showing which files its cells come from
file_pies <- PlotPies(fsom = fsom,
                      cellTypes = factor(file_names[fsom$data[,"File"]],
                                         levels = file_names),
                      colorPalette = file_colors,
                      maxNodeSize = 3)
print(file_pies)
# Save exactly this plot rather than whatever plot was shown last
ggsave(paste0(dir_results, "fsom_filecontribution.pdf"), plot = file_pies)
```

# Discovery and downstream analysis
## Explore the FlowSOM result, make the FlowSOMmary
```{r}
# Write the FlowSOM summary report of the model to a PDF in the results folder
FlowSOMmary(fsom = fsom,
            plotFile = paste0(dir_results, "fsom_summary.pdf"))
```

## Exercise
Plot a FlowSOM tree with the nodes colored according to CD19 expression. Hint: check the PlotMarker function in the FlowSOM_Cheatsheet.
```{r}

```


## Look for nodes with a specific pattern
```{r}
# Expected marker pattern ("high" / "low") for each cell type to look for
query <- list("B cells" = c("CD19" = "high", "CD3e" = "low"),
              "NK cells" = c("CD19" = "low", "CD161" = "high", 
                             "MHCII" = "low", "CD3e" = "low"),
              "CD4+ T cells" = c("CD3e" = "high", "MHCII" = "low", 
                                 "Ly6G" = "low", "CD4" = "high"),
              "CD8+ T cells" = c("CD3e" = "high", "MHCII" = "low", 
                                 "Ly6G" = "low", "CD8a" = "high"),
              "Dendritic cells" = c("CD11c" = "high", "MHCII" = "high", 
                                    "CD11b" = "high"),
              "Neutrophils" = c("Ly6G" = "high", "CD11b" = "high", 
                                "CD3e" = "low"))

# Run all queries on the FlowSOM model; the star plots of the queries are
# written to a PDF, the returned labels are plotted on the tree below
labels <- QueryMultiple(fsom = fsom,
                        cellTypes = query,
                        plotFile = paste0(dir_results, "fsom_QueryStarPlot.pdf"))

query_tree <- PlotVariable(fsom = fsom,
                           variable = labels,
                           maxNodeSize = 3)
print(query_tree)
# Save exactly this plot rather than whatever plot was shown last
ggsave(paste0(dir_results, "fsom_query.pdf"), plot = query_tree)
```

## Exercise
Annotate the meta-clusters and plot the annotated FlowSOM tree
```{r}
# Give every metacluster a descriptive label: complete the text after each
# number (metacluster 1 is filled in as an example)
fsom_annotated <- UpdateMetaclusters(fsom,
                                     newLabels = c("1" = "1: B cells",
                                                   "2" = "2: ",
                                                   "3" = "3: ",
                                                   "4" = "4: ",
                                                   "5" = "5: ",
                                                   "6" = "6: ",
                                                   "7" = "7: ",
                                                   "8" = "8: ",
                                                   "9" = "9: ",
                                                   "10" = "10: ",
                                                   "11" = "11: ",
                                                   "12" = "12: ",
                                                   "13" = "13: ",
                                                   "14" = "14: ",
                                                   "15" = "15: ",
                                                   "16" = "16: ",
                                                   "17" = "17: ",
                                                   "18" = "18: ",
                                                   "19" = "19: ",
                                                   "20" = "20: ",
                                                   "21" = "21: ",
                                                   "22" = "22: ",
                                                   "23" = "23: ",
                                                   "24" = "24: ",
                                                   "25" = "25: "))

saveRDS(fsom_annotated, file.path(dir_RDS, "fsom_annotated.RDS"))

# Plot the annotated tree. PlotStars() cannot be called without a FlowSOM
# object, so the annotated model is passed in, with its (relabeled)
# metaclustering as background, as was done for the unannotated tree.
PlotStars(fsom = fsom_annotated,
          maxNodeSize = 4,
          backgroundValues = fsom_annotated$metaclustering)

#PlotDimRed(fsom_annotated, cTotal = 10000)
```

## Get features per fcs file
```{r}
# Map every file onto the annotated FlowSOM model and collect, per file,
# counts, percentages and MFIs (for the state-marker channels) at both the
# cluster and the metacluster level
features <- GetFeatures(fsom = fsom_annotated,
                        files = files,
                        filenames = basename(files),
                        level = c("clusters", "metaclusters"),
                        type = c("counts", "MFI", "percentages"),
                        MFI = channelsForMFI)

# Store the feature matrices and list which ones were produced
saveRDS(features, file.path(dir_RDS, "fsom_features.RDS"))
names(features)
```

## Visualize features
### Abundance boxplots
```{r}
# Metacluster percentages of the sample files, rescaled to 0-100.
# drop = FALSE keeps the matrix shape even if only one row is selected.
sample_rows <- grep("Sample", rownames(features$metacluster_percentages))
df <- data.frame(features$metacluster_percentages[sample_rows, , drop = FALSE] * 100,
                 check.names = FALSE)
# Genotype of each file, looked up in the metadata by file name
df$Group <- meta$Genotype[match(rownames(df), meta$FileName)]
# Long format: one row per file and metacluster (pivot_longer() replaces the
# superseded gather())
df_g <- tidyr::pivot_longer(df,
                            cols = -Group,
                            names_to = "Feature",
                            values_to = "Percentage")
df_g$Feature <- factor(x = df_g$Feature)
# One panel per metacluster, comparing the groups
ggplot(data = df_g, aes(x = Group, y = Percentage)) +
  geom_boxplot() +
  geom_jitter(aes(color = Group)) +
  facet_wrap(~Feature, scales = "free") +
  theme_minimal()
```

### MFI boxplots
```{r}
# Metacluster MFIs of CD103 for the sample files.
# drop = FALSE keeps the matrix shape even if only one row or column matches.
sample_rows <- grep("Sample", rownames(features$metacluster_MFIs))
cd103_cols <- grep("CD103", colnames(features$metacluster_MFIs))
df <- data.frame(features$metacluster_MFIs[sample_rows, cd103_cols, drop = FALSE],
                 check.names = FALSE)
# Genotype of each file, looked up in the metadata by file name
df$Group <- meta$Genotype[match(rownames(df), meta$FileName)]
# Long format: one row per file and feature (pivot_longer() replaces the
# superseded gather())
df_g <- tidyr::pivot_longer(df,
                            cols = -Group,
                            names_to = "Feature",
                            values_to = "MFI")
df_g$Feature <- factor(x = df_g$Feature)
# One panel per feature, comparing the groups
ggplot(data = df_g, aes(x = Group, y = MFI)) +
  geom_boxplot() +
  geom_jitter(aes(color = Group)) +
  facet_wrap(~Feature, scales = "free") +
  theme_minimal()
```