From b5d936a511e2c9e9c6ffec73a9f03554e7264ef6 Mon Sep 17 00:00:00 2001
From: JohannesGawron <johannes.gawron@bsse.ethz.ch>
Date: Wed, 3 Jul 2024 18:17:40 +0200
Subject: [PATCH] redone the statistical testing for validation experiment

---
 Rcode/validation_statistical_test.R           | 199 ------------------
 .../data/markdowns/Br11_topSeparators.Rmd     |  21 +-
 .../data/markdowns/Br16_AC_topSeparators.Rmd  |   6 +-
 .../data/markdowns/Br23_topSeparators.Rmd     |   6 +-
 .../data/markdowns/Br26_topSeparators.Rmd     |   6 +-
 .../data/markdowns/Br38_topSeparators.Rmd     |   6 +-
 .../data/markdowns/Br7_topSeparators.Rmd      |  49 ++---
 .../data/markdowns/Pr9_topSeparators.Rmd      |   6 +-
 8 files changed, 48 insertions(+), 251 deletions(-)
 delete mode 100755 Rcode/validation_statistical_test.R

diff --git a/Rcode/validation_statistical_test.R b/Rcode/validation_statistical_test.R
deleted file mode 100755
index 5383661..0000000
--- a/Rcode/validation_statistical_test.R
+++ /dev/null
@@ -1,199 +0,0 @@
-source("statistical_test_source.R")
-
-#comp_1 <- prop.test(x = c(y[1], y[2]), n = c(z[1], z[2]))
-#comp_2 <- prop.test(x = c(y[2], y[3]), n = c(z[2], z[3]))
-#comp_3 <- prop.test(x = c(y[3], y[4]), n = c(z[3], z[4]))
-
-#p_val <- c(comp_1$p.value, comp_2$p.value, comp_3$p.value)
-
-# Group the rows by the "value" column only
-summary_df_filter %>%
-  group_by(value) %>%
-  summarize(prop = mean(more_than_one == "Yes" & !is.na(more_than_one))) %>%
-  ungroup() %>% 
-  
-  ggplot(aes(x = as.factor(value), y = prop)) + 
-  geom_bar(stat = "identity", fill = colors[1:4], width = 0.8, position = position_dodge(width=0.6)) +
-  labs(x = "Primary Tumor complexity", y = "Proportion of oligoclonal Clusters")+
-  theme_classic()#+
-  #geom_signif(comparisons = list(c("100", "1000"), c("1000", "10000"), c("10000", "50000")), y_position = c(1, 1, 1), tip_length = 0.01, textsize = 3, 
-              annotations = ifelse(p_val < 0.001, "***", 
-                                   ifelse(p_val < 0.01, "**", 
-                                          ifelse(p_val < 0.05, "*", "ns"))))
-
-
-#Check oligoclonality of Clusters for the lowest complexity by number of cells in cluster
-
-centi_df <- summary_df_filter[summary_df_filter$value==100, ]
-
-#centi_df %>%
-#  group_by(cluster_category_no_WBCs) %>%
-#  summarize(prop = mean(more_than_one == "Yes" & !is.na(more_than_one))) %>%
-#  ungroup() %>%
-  
-#  ggplot(aes(x = reorder(cluster_category_no_WBCs, +prop), y = prop)) +
-#  geom_bar(stat = "identity", position = "dodge", fill = colors[1:4]) +
-#  labs(x = "Cluster size", y = "Proportion oligoclonal Clusters")+
-#  theme_classic()
-
-
-centi_df %>%
-  group_by(cluster_category_no_WBCs) %>%
-  summarize(prop = mean(more_than_one == "Yes" & !is.na(more_than_one))) %>%
-  ungroup() %>%
-  mutate(cluster_category_no_WBCs = as.numeric(cluster_category_no_WBCs)) %>%
-  ggplot(aes(x = cluster_category_no_WBCs, y = prop)) +
-  geom_line() +
-  labs(x = "Cluster size", y = "Proportion oligoclonal Clusters")+
-  theme_classic()
-
-
-centi_df$cluster_category_no_WBCs <- factor(centi_df$cluster_category_no_WBCs, levels = c("1","2","3","4","5","6","7","8","9","10","11","12","13","14","15"))
-
-
-centi_df %>%
-  group_by(cluster_category_no_WBCs) %>%
-  summarize(prop = mean(more_than_one == "Yes" & !is.na(more_than_one))) %>%
-  ungroup() %>%
-  
-  filter(cluster_category_no_WBCs %in% c("1","2","3","4","5","6","7","8")) %>%
-  ggplot(aes(x = cluster_category_no_WBCs, y = prop, fill = cluster_category_no_WBCs)) +
-  geom_bar(stat = "identity") +
-  labs(x = "Cluster size", y = "Proportion oligoclonal Clusters")+
-  theme_classic()
-
-### Now take the fraction of each of the clones in the tumor and compute the 
-# theoretical probability to obtain an oligoclonal CTC-cluster to 
-# under a well-mixed model
-
-primary_904A <- read_delim("../../validation_experiment/Primary_tumor_csv_files/20220812.B-904A_R2_stats.csv",
-                           col_names = F, delim = "\t")
-primary_904B <- read_delim("../../validation_experiment/Primary_tumor_csv_files/20220812.B-904B_R2_stats.csv",
-                           col_names = F, delim = "\t" )
-
-colnames(primary_904A)[1] <- "barcodes"
-colnames(primary_904A)[3] <- "counts"
-
-colnames(primary_904B)[1] <- "barcodes"
-colnames(primary_904B)[3] <- "counts"
-
-
-primary_904A_counts <-  primary_904A %>%
-  filter(counts !=0)
-primary_904B_counts <-  primary_904B %>%
-  filter(counts !=0) 
- 
-length(intersect(primary_904A_counts$barcodes,primary_904B_counts$barcodes))
-
-primary_904_counts <- rbind(primary_904A_counts,primary_904B_counts)
-primary_904_counts <- primary_904_counts %>%
-  filter(barcodes %in% intersect(primary_904A_counts$barcodes,primary_904B_counts$barcodes))
-
-primary_904_counts <- primary_904_counts %>%
-  group_by(barcodes) %>%
-  summarize(total_count = sum(counts))
-
-primary_904_counts<- primary_904_counts %>%
-  mutate(relative_count = total_count/sum(total_count))
-
-
-primary_904_counts$relative_count_squared <- primary_904_counts$relative_count^2
-
-probability_of_monoclonality <- c()
-for (i in 2:9){
-  probability_of_monoclonality <- c(probability_of_monoclonality, sum(primary_904_counts$relative_count^i))
-}
-
-theoretical_result <- data.frame(cluster_size = 2:9,
-                                 monoclonality = probability_of_monoclonality,
-                                 oligoclonality = 1-probability_of_monoclonality)
-  
-ggplot(theoretical_result,aes(x = cluster_size, y = oligoclonality)) +
-  geom_line()
- 
-
-centi_df %>%
-  filter(grepl("_904_", centi_df$basename)) %>%
-  group_by(cluster_category_no_WBCs) %>%
-  filter(n() > 1) %>%
-  summarize(prop = mean(more_than_one == "Yes" & !is.na(more_than_one))) %>%
-  ungroup() %>%
-  mutate(cluster_category_no_WBCs = as.numeric(cluster_category_no_WBCs)) %>%
-  ggplot(aes(x = cluster_category_no_WBCs, y = prop)) +
-  geom_point() +
-  labs(x = "Cluster size", y = "Proportion oligoclonal Clusters")+
-  theme_classic() +
-  geom_line(theoretical_result[1:5,], mapping = aes(x = cluster_size, y = oligoclonality))
-
-ggsave("../../validation_experiment/output/observed_oligoclonality_vs_predicted_904.png", plot = last_plot(), dpi = 300)
-
-
-
-
-###### Do the same with the next tumor
-primary_905A <- read_delim("../../validation_experiment/Primary_tumor_csv_files/20220812.B-905A_R2_stats.csv",
-                           col_names = F, delim = "\t")
-primary_905B <- read_delim("../../validation_experiment/Primary_tumor_csv_files/20220812.B-905B_R2_stats.csv",
-                           col_names = F, delim = "\t" )
-
-colnames(primary_905A)[1] <- "barcodes"
-colnames(primary_905A)[3] <- "counts"
-
-colnames(primary_905B)[1] <- "barcodes"
-colnames(primary_905B)[3] <- "counts"
-
-
-primary_905A_counts <-  primary_905A %>%
-  filter(counts !=0)
-primary_905B_counts <-  primary_905B %>%
-  filter(counts !=0) 
-
-length(intersect(primary_905A_counts$barcodes,primary_905B_counts$barcodes))
-
-primary_905_counts <- rbind(primary_905A_counts,primary_905B_counts)
-primary_905_counts <- primary_905_counts %>%
-  filter(barcodes %in% intersect(primary_905A_counts$barcodes,primary_905B_counts$barcodes))
-
-primary_905_counts <- primary_905_counts %>%
-  group_by(barcodes) %>%
-  summarize(total_count = sum(counts))
-
-primary_905_counts<- primary_905_counts %>%
-  mutate(relative_count = total_count/sum(total_count))
-
-summary(primary_905_counts)
-
-
-theoretical_result_905 <- data.frame(cluster_size = 2:9,
-                                 monoclonality = probability_of_monoclonality,
-                                 oligoclonality = 1-probability_of_monoclonality)
-
-
-centi_df %>%
-  filter(grepl("_905_", centi_df$basename)) %>%
-  group_by(cluster_category_no_WBCs) %>%
-#  filter(n() > 1) %>%
-  summarize(prop = mean(more_than_one == "Yes" & !is.na(more_than_one))) %>%
-  ungroup() %>%
-  mutate(cluster_category_no_WBCs = as.numeric(cluster_category_no_WBCs)) %>%
-  ggplot(aes(x = cluster_category_no_WBCs, y = prop)) +
-  geom_point() +
-  labs(x = "Cluster size", y = "Proportion oligoclonal Clusters")+
-  theme_classic() +
-  geom_line(theoretical_result_905, mapping = aes(x = cluster_size, y = oligoclonality))
-
-
-
-
-ggplot(theoretical_result,aes(x = cluster_size, y = oligoclonality)) +
-  geom_line()
-
-
-
-
-
-
-#save(summary_df, file="summary_df_nine_ninefive.rds")
-
-
-
diff --git a/experiments/data/markdowns/Br11_topSeparators.Rmd b/experiments/data/markdowns/Br11_topSeparators.Rmd
index 169a17d..0a1e4ea 100755
--- a/experiments/data/markdowns/Br11_topSeparators.Rmd
+++ b/experiments/data/markdowns/Br11_topSeparators.Rmd
@@ -17,9 +17,9 @@ output:
 ## Data 
 
 ```{r initialization}
-source('../../workflow/resources/annotateVariants.R')
-sampleName <- 'Br11'
-inputFolder <- '/cluster/work/bewi/members/jgawron/projects/CTC/input_folder'
+source("../../workflow/resources/annotateVariants.R")
+sampleName <- "Br11"
+inputFolder <- "/cluster/work/bewi/members/jgawron/projects/CTC/input_folder"
 ```
 
 #### Mutation distance matrix
@@ -33,10 +33,10 @@ A **private branch** is defined as the path from a leaf to the node just below t
 This is a generalization of the earlier method to find the top seperating mutations of pairs of leafs. The generalization was necessary to handle the larger clusters that were broken in more than 2 pieces.
 
 ```{r}
-clusterName <- 'lightcoral'
+clusterName <- "lightcoral"
 
-d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1)
-mat<-as.matrix(d)
+d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1)
+mat <- as.matrix(d)
 mat[1:4, 1:4]
 ```
 
@@ -44,13 +44,12 @@ mat[1:4, 1:4]
 For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants.
 
 ```{r message=FALSE}
-coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1)
+coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1)
 coverage$variantName <- rownames(coverage)
 head(coverage)
 annotations <- annotate_variants(sampleName, inputFolder)
 
 coverage <- inner_join(coverage, annotations, by = "variantName")
-
 ```
 
 ## Method
@@ -79,7 +78,7 @@ heatmaply(mat)
 mat2 <- mat
 diag(mat2) <- 1
 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations
-selected_muts <- which(min_dist<0.9) # select those below 0.5 say
+selected_muts <- which(min_dist < 0.9) # select those below 0.5 say
 mat2 <- mat[selected_muts, selected_muts]
 ```
 
@@ -100,8 +99,8 @@ coverage %>% filter(variantName %in% colnames(mat2))
 To cluster mutations, we create a dendrogram based on the pairwise distances:
 ```{r}
 d_mat <- as.dist(mat)
-hc <- hclust(d_mat, "average")                   ## hierarchical clustering of mutations based on distance matrix
-par(cex=0.6)
+hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix
+par(cex = 0.6)
 plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations")
 ```
 
diff --git a/experiments/data/markdowns/Br16_AC_topSeparators.Rmd b/experiments/data/markdowns/Br16_AC_topSeparators.Rmd
index e6def00..f52339f 100755
--- a/experiments/data/markdowns/Br16_AC_topSeparators.Rmd
+++ b/experiments/data/markdowns/Br16_AC_topSeparators.Rmd
@@ -18,9 +18,9 @@ output:
 ## Data 
 
 ```{r initialization, message = FALSE}
-source('../../workflow/resources/annotateVariants.R')
-sampleName <- 'Br16_AC'
-inputFolder <- '/cluster/work/bewi/members/jgawron/projects/CTC/input_folder'
+source("../../workflow/resources/annotateVariants.R")
+sampleName <- "Br16_AC"
+inputFolder <- "/cluster/work/bewi/members/jgawron/projects/CTC/input_folder"
 annotations <- annotate_variants(sampleName, inputFolder)
 ```
 
diff --git a/experiments/data/markdowns/Br23_topSeparators.Rmd b/experiments/data/markdowns/Br23_topSeparators.Rmd
index 9dffb80..d56f2cf 100755
--- a/experiments/data/markdowns/Br23_topSeparators.Rmd
+++ b/experiments/data/markdowns/Br23_topSeparators.Rmd
@@ -17,9 +17,9 @@ output:
 ## Data 
 
 ```{r initialization}
-source('../../workflow/resources/annotateVariants.R')
-sampleName <- 'Br23'
-inputFolder <- '/cluster/work/bewi/members/jgawron/projects/CTC/input_folder'
+source("../../workflow/resources/annotateVariants.R")
+sampleName <- "Br23"
+inputFolder <- "/cluster/work/bewi/members/jgawron/projects/CTC/input_folder"
 ```
 
 #### Mutation distance matrix
diff --git a/experiments/data/markdowns/Br26_topSeparators.Rmd b/experiments/data/markdowns/Br26_topSeparators.Rmd
index 5810861..22e53aa 100755
--- a/experiments/data/markdowns/Br26_topSeparators.Rmd
+++ b/experiments/data/markdowns/Br26_topSeparators.Rmd
@@ -17,9 +17,9 @@ output:
 ## Data 
 
 ```{r initialization, message = FALSE}
-source('../../workflow/resources/annotateVariants.R')
-sampleName <- 'Br26'
-inputFolder <- '/cluster/work/bewi/members/jgawron/projects/CTC/input_folder'
+source("../../workflow/resources/annotateVariants.R")
+sampleName <- "Br26"
+inputFolder <- "/cluster/work/bewi/members/jgawron/projects/CTC/input_folder"
 
 annotations <- annotate_variants(sampleName, inputFolder)
 ```
diff --git a/experiments/data/markdowns/Br38_topSeparators.Rmd b/experiments/data/markdowns/Br38_topSeparators.Rmd
index 109013b..27231e9 100755
--- a/experiments/data/markdowns/Br38_topSeparators.Rmd
+++ b/experiments/data/markdowns/Br38_topSeparators.Rmd
@@ -17,9 +17,9 @@ output:
 ## Data 
 
 ```{r initialization, message = FALSE}
-source('../../workflow/resources/annotateVariants.R')
-sampleName <- 'Br38'
-inputFolder <- '/cluster/work/bewi/members/jgawron/projects/CTC/input_folder'
+source("../../workflow/resources/annotateVariants.R")
+sampleName <- "Br38"
+inputFolder <- "/cluster/work/bewi/members/jgawron/projects/CTC/input_folder"
 
 annotations <- annotate_variants(sampleName, inputFolder)
 ```
diff --git a/experiments/data/markdowns/Br7_topSeparators.Rmd b/experiments/data/markdowns/Br7_topSeparators.Rmd
index b8a925c..902a6c7 100755
--- a/experiments/data/markdowns/Br7_topSeparators.Rmd
+++ b/experiments/data/markdowns/Br7_topSeparators.Rmd
@@ -17,9 +17,9 @@ output:
 ## Data 
 
 ```{r initialization, message = FALSE}
-source('../../workflow/resources/annotateVariants.R')
-sampleName <- 'Br7'
-inputFolder <- '/cluster/work/bewi/members/jgawron/projects/CTC/input_folder'
+source("../../workflow/resources/annotateVariants.R")
+sampleName <- "Br7"
+inputFolder <- "/cluster/work/bewi/members/jgawron/projects/CTC/input_folder"
 
 annotations <- annotate_variants(sampleName, inputFolder)
 ```
@@ -38,10 +38,10 @@ This is a generalization of the earlier method to find the top seperating mutati
 # lightcoral 
 
 ```{r}
-clusterName <- 'lightcoral'
+clusterName <- "lightcoral"
 
-d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1)
-mat<-as.matrix(d)
+d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1)
+mat <- as.matrix(d)
 mat[1:4, 1:4]
 ```
 
@@ -49,12 +49,11 @@ mat[1:4, 1:4]
 For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants.
 
 ```{r message=FALSE}
-coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1)
+coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1)
 coverage$variantName <- rownames(coverage)
 head(coverage)
 
 coverage <- inner_join(coverage, annotations, by = "variantName")
-
 ```
 
 ## Method
@@ -81,7 +80,7 @@ heatmaply(mat)
 mat2 <- mat
 diag(mat2) <- 1
 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations
-selected_muts <- which(min_dist<0.95) # select those below 0.5 say
+selected_muts <- which(min_dist < 0.95) # select those below 0.5 say
 mat3 <- mat[selected_muts, selected_muts]
 ```
 
@@ -102,8 +101,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances:
 ```{r}
 mat <- mat3
 d_mat <- as.dist(mat)
-hc <- hclust(d_mat, "average")                   ## hierarchical clustering of mutations based on distance matrix
-par(cex=0.6)
+hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix
+par(cex = 0.6)
 plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations")
 ```
 No apparent clustering visible.
@@ -113,10 +112,10 @@ No apparent clustering visible.
 # sandybrown 
 
 ```{r}
-clusterName <- 'sandybrown'
+clusterName <- "sandybrown"
 
-d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1)
-mat<-as.matrix(d)
+d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1)
+mat <- as.matrix(d)
 mat[1:4, 1:4]
 ```
 
@@ -124,12 +123,11 @@ mat[1:4, 1:4]
 For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants.
 
 ```{r message=FALSE}
-coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1)
+coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1)
 coverage$variantName <- rownames(coverage)
 head(coverage)
 
 coverage <- inner_join(coverage, annotations, by = "variantName")
-
 ```
 
 ## Method
@@ -161,8 +159,8 @@ coverage %>% filter(variantName %in% colnames(mat2))
 To cluster mutations, we create a dendrogram based on the pairwise distances:
 ```{r}
 d_mat <- as.dist(mat)
-hc <- hclust(d_mat, "average")                   ## hierarchical clustering of mutations based on distance matrix
-par(cex=0.6)
+hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix
+par(cex = 0.6)
 plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations")
 ```
 There seems to be no signal at all, so I stop here.
@@ -174,10 +172,10 @@ There seems to be no signal at all, so I stop here.
 # skyblue3 
 
 ```{r}
-clusterName <- 'skyblue3'
+clusterName <- "skyblue3"
 
-d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1)
-mat<-as.matrix(d)
+d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1)
+mat <- as.matrix(d)
 mat[1:4, 1:4]
 ```
 
@@ -185,12 +183,11 @@ mat[1:4, 1:4]
 For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants.
 
 ```{r message=FALSE}
-coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1)
+coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1)
 coverage$variantName <- rownames(coverage)
 head(coverage)
 
 coverage <- inner_join(coverage, annotations, by = "variantName")
-
 ```
 
 ## Method
@@ -217,7 +214,7 @@ heatmaply(mat)
 mat2 <- mat
 diag(mat2) <- 1
 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations
-selected_muts <- which(min_dist<0.6) # select those below 0.5 say
+selected_muts <- which(min_dist < 0.6) # select those below 0.5 say
 mat3 <- mat[selected_muts, selected_muts]
 ```
 
@@ -239,8 +236,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances:
 ```{r}
 mat <- mat3
 d_mat <- as.dist(mat)
-hc <- hclust(d_mat, "average")                   ## hierarchical clustering of mutations based on distance matrix
-par(cex=0.6)
+hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix
+par(cex = 0.6)
 plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations")
 ```
 No apparent clustering visible.
diff --git a/experiments/data/markdowns/Pr9_topSeparators.Rmd b/experiments/data/markdowns/Pr9_topSeparators.Rmd
index 32782e8..577c5ac 100755
--- a/experiments/data/markdowns/Pr9_topSeparators.Rmd
+++ b/experiments/data/markdowns/Pr9_topSeparators.Rmd
@@ -17,9 +17,9 @@ output:
 ## Data 
 
 ```{r initialization, message = FALSE}
-source('../../workflow/resources/annotateVariants.R')
-sampleName <- 'Pr9'
-inputFolder <- '/cluster/work/bewi/members/jgawron/projects/CTC/input_folder'
+source("../../workflow/resources/annotateVariants.R")
+sampleName <- "Pr9"
+inputFolder <- "/cluster/work/bewi/members/jgawron/projects/CTC/input_folder"
 
 annotations <- annotate_variants(sampleName, inputFolder)
 ```