From b5d936a511e2c9e9c6ffec73a9f03554e7264ef6 Mon Sep 17 00:00:00 2001 From: JohannesGawron Date: Wed, 3 Jul 2024 18:17:40 +0200 Subject: [PATCH] redone the statistical testing for validation experiment --- Rcode/validation_statistical_test.R | 199 ------------------ .../data/markdowns/Br11_topSeparators.Rmd | 21 +- .../data/markdowns/Br16_AC_topSeparators.Rmd | 6 +- .../data/markdowns/Br23_topSeparators.Rmd | 6 +- .../data/markdowns/Br26_topSeparators.Rmd | 6 +- .../data/markdowns/Br38_topSeparators.Rmd | 6 +- .../data/markdowns/Br7_topSeparators.Rmd | 49 ++--- .../data/markdowns/Pr9_topSeparators.Rmd | 6 +- 8 files changed, 48 insertions(+), 251 deletions(-) delete mode 100755 Rcode/validation_statistical_test.R diff --git a/Rcode/validation_statistical_test.R b/Rcode/validation_statistical_test.R deleted file mode 100755 index 5383661..0000000 --- a/Rcode/validation_statistical_test.R +++ /dev/null @@ -1,199 +0,0 @@ -source("statistical_test_source.R") - -#comp_1 <- prop.test(x = c(y[1], y[2]), n = c(z[1], z[2])) -#comp_2 <- prop.test(x = c(y[2], y[3]), n = c(z[2], z[3])) -#comp_3 <- prop.test(x = c(y[3], y[4]), n = c(z[3], z[4])) - -#p_val <- c(comp_1$p.value, comp_2$p.value, comp_3$p.value) - -# Group the rows by the "value" column only -summary_df_filter %>% - group_by(value) %>% - summarize(prop = mean(more_than_one == "Yes" & !is.na(more_than_one))) %>% - ungroup() %>% - - ggplot(aes(x = as.factor(value), y = prop)) + - geom_bar(stat = "identity", fill = colors[1:4], width = 0.8, position = position_dodge(width=0.6)) + - labs(x = "Primary Tumor complexity", y = "Proportion of oligoclonal Clusters")+ - theme_classic()#+ - #geom_signif(comparisons = list(c("100", "1000"), c("1000", "10000"), c("10000", "50000")), y_position = c(1, 1, 1), tip_length = 0.01, textsize = 3, - annotations = ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", "ns")))) - - -#Check oligoclonality of Clusters for the lowest complexity by number of cells in cluster - -centi_df <- summary_df_filter[summary_df_filter$value==100, ] - -#centi_df %>% -# group_by(cluster_category_no_WBCs) %>% -# summarize(prop = mean(more_than_one == "Yes" & !is.na(more_than_one))) %>% -# ungroup() %>% - -# ggplot(aes(x = reorder(cluster_category_no_WBCs, +prop), y = prop)) + -# geom_bar(stat = "identity", position = "dodge", fill = colors[1:4]) + -# labs(x = "Cluster size", y = "Proportion oligoclonal Clusters")+ -# theme_classic() - - -centi_df %>% - group_by(cluster_category_no_WBCs) %>% - summarize(prop = mean(more_than_one == "Yes" & !is.na(more_than_one))) %>% - ungroup() %>% - mutate(cluster_category_no_WBCs = as.numeric(cluster_category_no_WBCs)) %>% - ggplot(aes(x = cluster_category_no_WBCs, y = prop)) + - geom_line() + - labs(x = "Cluster size", y = "Proportion oligoclonal Clusters")+ - theme_classic() - - -centi_df$cluster_category_no_WBCs <- factor(centi_df$cluster_category_no_WBCs, levels = c("1","2","3","4","5","6","7","8","9","10","11","12","13","14","15")) - - -centi_df %>% - group_by(cluster_category_no_WBCs) %>% - summarize(prop = mean(more_than_one == "Yes" & !is.na(more_than_one))) %>% - ungroup() %>% - - filter(cluster_category_no_WBCs %in% c("1","2","3","4","5","6","7","8")) %>% - ggplot(aes(x = cluster_category_no_WBCs, y = prop, fill = cluster_category_no_WBCs)) + - geom_bar(stat = "identity") + - labs(x = "Cluster size", y = "Proportion oligoclonal Clusters")+ - theme_classic() - -### Now take the fraction of each of the clones in the tumor and compute the -# theoretical probability to obtain an oligoclonal CTC-cluster to -# under a well-mixed model - -primary_904A <- read_delim("../../validation_experiment/Primary_tumor_csv_files/20220812.B-904A_R2_stats.csv", - col_names = F, delim = "\t") -primary_904B <- read_delim("../../validation_experiment/Primary_tumor_csv_files/20220812.B-904B_R2_stats.csv", - col_names = F, delim = "\t" ) - -colnames(primary_904A)[1] <- "barcodes" -colnames(primary_904A)[3] <- "counts" - -colnames(primary_904B)[1] <- "barcodes" -colnames(primary_904B)[3] <- "counts" - - -primary_904A_counts <- primary_904A %>% - filter(counts !=0) -primary_904B_counts <- primary_904B %>% - filter(counts !=0) - -length(intersect(primary_904A_counts$barcodes,primary_904B_counts$barcodes)) - -primary_904_counts <- rbind(primary_904A_counts,primary_904B_counts) -primary_904_counts <- primary_904_counts %>% - filter(barcodes %in% intersect(primary_904A_counts$barcodes,primary_904B_counts$barcodes)) - -primary_904_counts <- primary_904_counts %>% - group_by(barcodes) %>% - summarize(total_count = sum(counts)) - -primary_904_counts<- primary_904_counts %>% - mutate(relative_count = total_count/sum(total_count)) - - -primary_904_counts$relative_count_squared <- primary_904_counts$relative_count^2 - -probability_of_monoclonality <- c() -for (i in 2:9){ - probability_of_monoclonality <- c(probability_of_monoclonality, sum(primary_904_counts$relative_count^i)) -} - -theoretical_result <- data.frame(cluster_size = 2:9, - monoclonality = probability_of_monoclonality, - oligoclonality = 1-probability_of_monoclonality) - -ggplot(theoretical_result,aes(x = cluster_size, y = oligoclonality)) + - geom_line() - - -centi_df %>% - filter(grepl("_904_", centi_df$basename)) %>% - group_by(cluster_category_no_WBCs) %>% - filter(n() > 1) %>% - summarize(prop = mean(more_than_one == "Yes" & !is.na(more_than_one))) %>% - ungroup() %>% - mutate(cluster_category_no_WBCs = as.numeric(cluster_category_no_WBCs)) %>% - ggplot(aes(x = cluster_category_no_WBCs, y = prop)) + - geom_point() + - labs(x = "Cluster size", y = "Proportion oligoclonal Clusters")+ - theme_classic() + - geom_line(theoretical_result[1:5,], mapping = aes(x = cluster_size, y = oligoclonality)) - -ggsave("../../validation_experiment/output/observed_oligoclonality_vs_predicted_904.png", plot = last_plot(), dpi = 300) - - - - -###### Do the same with the next tumor -primary_905A <- read_delim("../../validation_experiment/Primary_tumor_csv_files/20220812.B-905A_R2_stats.csv", - col_names = F, delim = "\t") -primary_905B <- read_delim("../../validation_experiment/Primary_tumor_csv_files/20220812.B-905B_R2_stats.csv", - col_names = F, delim = "\t" ) - -colnames(primary_905A)[1] <- "barcodes" -colnames(primary_905A)[3] <- "counts" - -colnames(primary_905B)[1] <- "barcodes" -colnames(primary_905B)[3] <- "counts" - - -primary_905A_counts <- primary_905A %>% - filter(counts !=0) -primary_905B_counts <- primary_905B %>% - filter(counts !=0) - -length(intersect(primary_905A_counts$barcodes,primary_905B_counts$barcodes)) - -primary_905_counts <- rbind(primary_905A_counts,primary_905B_counts) -primary_905_counts <- primary_905_counts %>% - filter(barcodes %in% intersect(primary_905A_counts$barcodes,primary_905B_counts$barcodes)) - -primary_905_counts <- primary_905_counts %>% - group_by(barcodes) %>% - summarize(total_count = sum(counts)) - -primary_905_counts<- primary_905_counts %>% - mutate(relative_count = total_count/sum(total_count)) - -summary(primary_905_counts) - - -theoretical_result_905 <- data.frame(cluster_size = 2:9, - monoclonality = probability_of_monoclonality, - oligoclonality = 1-probability_of_monoclonality) - - -centi_df %>% - filter(grepl("_905_", centi_df$basename)) %>% - group_by(cluster_category_no_WBCs) %>% -# filter(n() > 1) %>% - summarize(prop = mean(more_than_one == "Yes" & !is.na(more_than_one))) %>% - ungroup() %>% - mutate(cluster_category_no_WBCs = as.numeric(cluster_category_no_WBCs)) %>% - ggplot(aes(x = cluster_category_no_WBCs, y = prop)) + - geom_point() + - labs(x = "Cluster size", y = "Proportion oligoclonal Clusters")+ - theme_classic() + - geom_line(theoretical_result_905, mapping = aes(x = cluster_size, y = oligoclonality)) - - - - -ggplot(theoretical_result,aes(x = cluster_size, y = oligoclonality)) + - geom_line() - - - - - - -#save(summary_df, file="summary_df_nine_ninefive.rds") - - - diff --git a/experiments/data/markdowns/Br11_topSeparators.Rmd b/experiments/data/markdowns/Br11_topSeparators.Rmd index 169a17d..0a1e4ea 100755 --- a/experiments/data/markdowns/Br11_topSeparators.Rmd +++ b/experiments/data/markdowns/Br11_topSeparators.Rmd @@ -17,9 +17,9 @@ output: ## Data ```{r initialization} -source('../../workflow/resources/annotateVariants.R') -sampleName <- 'Br11' -inputFolder <- '/cluster/work/bewi/members/jgawron/projects/CTC/input_folder' +source("../../workflow/resources/annotateVariants.R") +sampleName <- "Br11" +inputFolder <- "/cluster/work/bewi/members/jgawron/projects/CTC/input_folder" ``` #### Mutation distance matrix @@ -33,10 +33,10 @@ A **private branch** is defined as the path from a leaf to the node just below t This is a generalization of the earlier method to find the top seperating mutations of pairs of leafs. The generalization was necessary to handle the larger clusters that were broken in more than 2 pieces. ```{r} -clusterName <- 'lightcoral' +clusterName <- "lightcoral" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -44,13 +44,12 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. ```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) annotations <- annotate_variants(sampleName, inputFolder) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -79,7 +78,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.9) # select those below 0.5 say +selected_muts <- which(min_dist < 0.9) # select those below 0.5 say mat2 <- mat[selected_muts, selected_muts] ``` @@ -100,8 +99,8 @@ coverage %>% filter(variantName %in% colnames(mat2)) To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` diff --git a/experiments/data/markdowns/Br16_AC_topSeparators.Rmd b/experiments/data/markdowns/Br16_AC_topSeparators.Rmd index e6def00..f52339f 100755 --- a/experiments/data/markdowns/Br16_AC_topSeparators.Rmd +++ b/experiments/data/markdowns/Br16_AC_topSeparators.Rmd @@ -18,9 +18,9 @@ output: ## Data ```{r initialization, message = FALSE} -source('../../workflow/resources/annotateVariants.R') -sampleName <- 'Br16_AC' -inputFolder <- '/cluster/work/bewi/members/jgawron/projects/CTC/input_folder' +source("../../workflow/resources/annotateVariants.R") +sampleName <- "Br16_AC" +inputFolder <- "/cluster/work/bewi/members/jgawron/projects/CTC/input_folder" annotations <- annotate_variants(sampleName, inputFolder) ``` diff --git a/experiments/data/markdowns/Br23_topSeparators.Rmd b/experiments/data/markdowns/Br23_topSeparators.Rmd index 9dffb80..d56f2cf 100755 --- a/experiments/data/markdowns/Br23_topSeparators.Rmd +++ b/experiments/data/markdowns/Br23_topSeparators.Rmd @@ -17,9 +17,9 @@ output: ## Data ```{r initialization} -source('../../workflow/resources/annotateVariants.R') -sampleName <- 'Br23' -inputFolder <- '/cluster/work/bewi/members/jgawron/projects/CTC/input_folder' +source("../../workflow/resources/annotateVariants.R") +sampleName <- "Br23" +inputFolder <- "/cluster/work/bewi/members/jgawron/projects/CTC/input_folder" ``` #### Mutation distance matrix diff --git a/experiments/data/markdowns/Br26_topSeparators.Rmd b/experiments/data/markdowns/Br26_topSeparators.Rmd index 5810861..22e53aa 100755 --- a/experiments/data/markdowns/Br26_topSeparators.Rmd +++ b/experiments/data/markdowns/Br26_topSeparators.Rmd @@ -17,9 +17,9 @@ output: ## Data ```{r initialization, message = FALSE} -source('../../workflow/resources/annotateVariants.R') -sampleName <- 'Br26' -inputFolder <- '/cluster/work/bewi/members/jgawron/projects/CTC/input_folder' +source("../../workflow/resources/annotateVariants.R") +sampleName <- "Br26" +inputFolder <- "/cluster/work/bewi/members/jgawron/projects/CTC/input_folder" annotations <- annotate_variants(sampleName, inputFolder) ``` diff --git a/experiments/data/markdowns/Br38_topSeparators.Rmd b/experiments/data/markdowns/Br38_topSeparators.Rmd index 109013b..27231e9 100755 --- a/experiments/data/markdowns/Br38_topSeparators.Rmd +++ b/experiments/data/markdowns/Br38_topSeparators.Rmd @@ -17,9 +17,9 @@ output: ## Data ```{r initialization, message = FALSE} -source('../../workflow/resources/annotateVariants.R') -sampleName <- 'Br38' -inputFolder <- '/cluster/work/bewi/members/jgawron/projects/CTC/input_folder' +source("../../workflow/resources/annotateVariants.R") +sampleName <- "Br38" +inputFolder <- "/cluster/work/bewi/members/jgawron/projects/CTC/input_folder" annotations <- annotate_variants(sampleName, inputFolder) ``` diff --git a/experiments/data/markdowns/Br7_topSeparators.Rmd b/experiments/data/markdowns/Br7_topSeparators.Rmd index b8a925c..902a6c7 100755 --- a/experiments/data/markdowns/Br7_topSeparators.Rmd +++ b/experiments/data/markdowns/Br7_topSeparators.Rmd @@ -17,9 +17,9 @@ output: ## Data ```{r initialization, message = FALSE} -source('../../workflow/resources/annotateVariants.R') -sampleName <- 'Br7' -inputFolder <- '/cluster/work/bewi/members/jgawron/projects/CTC/input_folder' +source("../../workflow/resources/annotateVariants.R") +sampleName <- "Br7" +inputFolder <- "/cluster/work/bewi/members/jgawron/projects/CTC/input_folder" annotations <- annotate_variants(sampleName, inputFolder) ``` @@ -38,10 +38,10 @@ This is a generalization of the earlier method to find the top seperating mutati # lightcoral ```{r} -clusterName <- 'lightcoral' +clusterName <- "lightcoral" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -49,12 +49,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. ```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -81,7 +80,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.95) # select those below 0.5 say +selected_muts <- which(min_dist < 0.95) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -102,8 +101,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` No apparent clustering visible. @@ -113,10 +112,10 @@ No apparent clustering visible. # sandybrown ```{r} -clusterName <- 'sandybrown' +clusterName <- "sandybrown" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -124,12 +123,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. ```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -161,8 +159,8 @@ coverage %>% filter(variantName %in% colnames(mat2)) To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` There seems to be no signal at all, so I stop here. @@ -174,10 +172,10 @@ There seems to be no signal at all, so I stop here. # skyblue3 ```{r} -clusterName <- 'skyblue3' +clusterName <- "skyblue3" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -185,12 +183,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. ```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -217,7 +214,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.6) # select those below 0.5 say +selected_muts <- which(min_dist < 0.6) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -239,8 +236,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` No apparent clustering visible. diff --git a/experiments/data/markdowns/Pr9_topSeparators.Rmd b/experiments/data/markdowns/Pr9_topSeparators.Rmd index 32782e8..577c5ac 100755 --- a/experiments/data/markdowns/Pr9_topSeparators.Rmd +++ b/experiments/data/markdowns/Pr9_topSeparators.Rmd @@ -17,9 +17,9 @@ output: ## Data ```{r initialization, message = FALSE} -source('../../workflow/resources/annotateVariants.R') -sampleName <- 'Pr9' -inputFolder <- '/cluster/work/bewi/members/jgawron/projects/CTC/input_folder' +source("../../workflow/resources/annotateVariants.R") +sampleName <- "Pr9" +inputFolder <- "/cluster/work/bewi/members/jgawron/projects/CTC/input_folder" annotations <- annotate_variants(sampleName, inputFolder) ```