From 6876c30b1656e6b526300f3ca802e83a08a419b8 Mon Sep 17 00:00:00 2001
From: JohannesGawron <johannes.gawron@bsse.ethz.ch>
Date: Wed, 14 Aug 2024 02:10:37 +0200
Subject: [PATCH] bugfix in extended_data_figure_7b.R and redoing
 extended_data_figure_7a.R

---
 ...g_the_clonality_of_barcoded_CTC_clusters.R |   8 +-
 .../extended_data_figure_7a.R                 | 177 ++++++++++++------
 .../extended_data_figure_7b.R                 |  52 +----
 3 files changed, 133 insertions(+), 104 deletions(-)

diff --git a/experiments/barcoding_experiment/Inferring_the_clonality_of_barcoded_CTC_clusters.R b/experiments/barcoding_experiment/Inferring_the_clonality_of_barcoded_CTC_clusters.R
index 050acb7..6ddfc0e 100644
--- a/experiments/barcoding_experiment/Inferring_the_clonality_of_barcoded_CTC_clusters.R
+++ b/experiments/barcoding_experiment/Inferring_the_clonality_of_barcoded_CTC_clusters.R
@@ -165,10 +165,10 @@ medium <- data.frame(Clonality = c("Monoclonal", "Oligoclonal"), n = c(52, 21))
 high <- data.frame(Clonality = c("Monoclonal", "Oligoclonal"), n = c(14, 40))
 
 
-# Generate a donut plot illustrating mono- and oligoclonal CTC cluster counts for each complexity level  
-PieDonut(low, aes(Clonality, count=n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE)
-PieDonut(medium, aes(Clonality, count=n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE)
-PieDonut(high, aes(Clonality, count=n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE)
+# Generate a donut plot illustrating mono- and oligoclonal CTC cluster counts for each complexity level
+PieDonut(low, aes(Clonality, count = n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE)
+PieDonut(medium, aes(Clonality, count = n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE)
+PieDonut(high, aes(Clonality, count = n), showPieName = FALSE, donutLabelSize = 0, labelpositionThreshold = 0.01, explode = 1, r0 = 0.5, r1 = 0.95, pieLabelSize = 0, showRatioDonut = FALSE)
 
 
 # Create a data frame specifying for each tumor sample the proportion of oligoclonal CTC clusters in categories "2" and "3+"
diff --git a/experiments/barcoding_experiment/extended_data_figure_7a.R b/experiments/barcoding_experiment/extended_data_figure_7a.R
index 71df106..90fd92f 100644
--- a/experiments/barcoding_experiment/extended_data_figure_7a.R
+++ b/experiments/barcoding_experiment/extended_data_figure_7a.R
@@ -1,77 +1,148 @@
-library(tidyverse)
+library(ggplot2)
+library(boot)
+library(smoothr)
 
-data <- readRDS("combi_df.rds")
-View(data)
+setwd("~/Downloads")
 
+tables <- list()
+filelist <- c("combined_140_order_filter_merge.rds", "combined_141_order_filter_merge.rds", "combined_902_order_filter_merge.rds", "combined_903_order_filter_merge.rds", "combined_904_order_filter_merge.rds", "combined_905_order_filter_merge.rds", "combined_910_order_filter_merge.rds")
+for (file in filelist) {
+  tables[[file]] <- readRDS(file)
+  tables[[file]]$observations <- paste0(file, tables[[file]]$observations)
+  first_row_with_nonzero_counts <- tables[[file]][tables[[file]]$freq != 0, ][1, ]
+  total_counts <- first_row_with_nonzero_counts$counts / first_row_with_nonzero_counts$freq
+  tables[[file]]$total_counts <- total_counts
+  cutoff <- quantile(tables[[file]]$prop_av, prob = 0.01)
+  tables[[file]] <- tables[[file]] %>%
+    filter(prop_av > cutoff) %>%
+    filter(prop_av != 0)
+}
 
-data <- data %>% mutate(counts / freq, complexity = as.numeric(complexity))
 
-summary(data)
+# Merge all tables
+merged_table <- do.call(rbind, tables)
+unique(merged_table$total_counts)
 
-data2 <- data %>%
-  group_by(observations) %>%
-  slice(rep(1:n(), counts / freq)) %>%
-  mutate(present = row_number() <= first(counts)) %>%
-  ungroup()
+merged_table <- merged_table %>% mutate(complexity = as.numeric(complexity))
 
-fit <- glm(present ~ log(prop_av), data = data2, family = binomial(link = "log"))
+summary(merged_table)
 
-summary(fit)
 
-confint(fit)
-coef(fit)
 
-data %>%
-  ggplot(aes(y = log(freq), x = log(prop_av))) +
-  geom_point() +
-  labs(x = "log(tumorVAF)", y = "log(fraction among CTC clusters)") +
-  geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2])
+merged_table2 <- merged_table %>%
+  group_by(observations) %>%
+  slice(rep(1:n(), total_counts)) %>%
+  mutate(present = as.integer(row_number() <= first(counts))) %>%
+  ungroup()
 
 
-data %>%
-  ggplot(aes(y = log(ratio), x = log(prop_av))) +
+fit <-
+  glm(
+    present ~ log(prop_av),
+    data = merged_table2, family = binomial(link = "logit")
+  )
+fit1 <-
+  glm(
+    present ~ prop_av,
+    data = merged_table2, family = binomial(link = "logit")
+  )
+fit2 <-
+  glm(
+    present ~ log(prop_av),
+    data = merged_table2, family = binomial(link = "log")
+  )
+fit3 <-
+  glm(present ~ logit(prop_av),
+    data = merged_table2, family = binomial(link = "logit")
+  )
+fit4 <-
+  glm(present ~ logit(prop_av),
+    data = merged_table2, family = binomial(link = "log")
+  )
+
+AIC(fit)
+AIC(fit1)
+AIC(fit2)
+AIC(fit3)
+AIC(fit4)
+
+### fit3 and fit4 have approximately the same AIC, but fit 3 transforms the x
+## coordinates to a well defined range, so I will use fit3.
+fit_linear_model <- lm(freq ~ log(prop_av), data = merged_table)
+AIC(fit_linear_model)
+
+merged_table %>%
+  ggplot(aes(y = freq, x = prop_av)) +
   geom_point() +
-  geom_abline(intercept = coef(fit)[1], slope = coef(fit)[2] - 1) +
-  labs(x = "log(VAF in primary tumor)", y = "log(fold-change in prevalence)") +
-  theme_minimal()
+  labs(x = "tumor VAF", y = "fraction among CTC clusters") +
+  geom_smooth(method = "lm", formula = y ~ x)
 
+merged_table %>%
+  ggplot(aes(y = freq, x = log(prop_av))) +
+  geom_point() +
+  labs(x = "tumor VAF", y = "fraction among CTC clusters") +
+  geom_smooth(method = "lm")
 
 
+merged_table %>%
+  ggplot(aes(y = logit(freq), x = logit(prop_av))) +
+  geom_point() +
+  labs(x = "logit(tumor VAF)", y = "logit(fraction among CTC clusters)") +
+  geom_smooth(method = "lm", formula = y ~ x)
 
-pdf("~/Desktop/clonal_prevalence.pdf", width = "3.6", height = "3.6")
-par(bty = "l")
-plot(
-  x = log(data$prop_av),
-  y = log(data$ratio),
-  cex = 0.8,
-  pch = 19,
-  col = "#81A9A9",
-  xlab = "log( VAF in primary tumor )",
-  ylab = "log( fold-change in prevalence in CTC clusters )"
-)
-abline(b = coef(fit)[2] - 1, a = coef(fit)[1], col = "#3C8181", lwd = 2.2)
-text(x = max(log(data$prop_av)) - 5, y = max(log(data$ratio)) - 3.5, labels = sprintf("slope = %f***", coef(fit)[2] - 1), pos = 4, col = "black")
-dev.off()
 
-summary(fit)
 
-#### The p-value is computed manually. Compare (Regression, Modelle, Methoden
-## und Anwendungen, second edition, page 204f; https://www.uni-goettingen.de/de/551357.html)
+fit_logit_logit_linear <-
+  lm(asin(sqrt(freq)) ~ asin(sqrt(prop_av)), data = merged_table)
+plot(fit_logit_logit_linear)
+primary_VAF <- seq(0.01, 0.55, 0.01)
 
-### The coefficient scaled by the standard error and then squared is
-## approximately xi-square (1) distributed with one degree of freedom.
+predicted <- predict(fit4,
+  newdata = data.frame(prop_av = primary_VAF),
+  type = "response"
+)
 
-### Also note that the number of data points 1798 is large enough (>50) for a
-## xi-square approximation to work, according to rule of thumb.
+color <- "#3C8181"
 
-pseudo_t_value <- as.numeric(coef(fit)["log(prop_av)"] - 1) / as.numeric(summary(fit)$coefficients["log(prop_av)", "Std. Error"])
+ggplot(merged_table2, aes(x = prop_av, y = present)) +
+  geom_jitter(width = 0, height = 0.1, color = color) +
+  geom_smooth(
+    method = "glm",
+    method.args = list(family = "binomial"), formula = y ~ logit(x), color = color
+  ) +
+  labs(
+    x = "Clonal frequency in primary tumor",
+    y = "P(barcode is present in CTC cluster)"
+  ) +
+  geom_abline(linetype = "dotted", slope = 1, intercept = 0)
 
-wald_statistics <- pseudo_t_value^2 ### xi^2_1 distributed
 
-p.value <- 1 - pchisq(wald_statistics, 1)
-print(p.value)
-cor(log(data$prop_av), log(data$ratio), method = "pearson")
-### The p-value is smaller that measurable by machine precision, so I suggest to report
-### the smallest p-value that the generic glm summary can output:
+merged_table %>%
+  ggplot(aes(y = freq, x = prop_av)) +
+  geom_point() +
+  labs(x = "tumor VAF", y = "fraction among CTC clusters") +
+  geom_line(data = data.frame(prop_av = primary_VAF, freq = predicted))
+
+means <- rep(NA, 100)
+for (i in 1:25) {
+  window_start <- (i - 1) / 50
+  window_end <- i / 50
+  means[i] <- merged_table2 %>%
+    dplyr::filter(prop_av > window_start) %>%
+    dplyr::filter(prop_av <= window_end) %>%
+    pull(present) %>%
+    mean()
+}
+
+ggplot(data.frame(y = means, x = 0:99), aes(x = x, y = y)) +
+  geom_point()
+
+ggsave(
+  "~/Desktop/clonal_prevalence2.pdf",
+  width = 3.6, height = 3.6, units = "in"
+)
 
-### p-value < 2.2e-16
+cor(x = merged_table2$prop_av, y = merged_table2$present, method = "spearman")
+cor.test(
+  x = merged_table2$prop_av, y = merged_table2$present, method = "spearman"
+)
diff --git a/experiments/barcoding_experiment/extended_data_figure_7b.R b/experiments/barcoding_experiment/extended_data_figure_7b.R
index 8761154..96ebd23 100644
--- a/experiments/barcoding_experiment/extended_data_figure_7b.R
+++ b/experiments/barcoding_experiment/extended_data_figure_7b.R
@@ -1,3 +1,6 @@
+## Author: Johannes Gawron
+## Colors and labels in the final figure may deviate through manual manipulation in Adobe Illustrator
+
 library(MCMCprecision)
 library(ggplot2)
 library(tidyverse)
@@ -54,17 +57,13 @@ for (mouse_model in mouse_models) {
   print(files)
   summary_temp <- summary_df_filter_combined[paste0(summary_df_filter_combined$basename, ".csv") %in% files, ]
 
-  summary_df_filter_combined <- summary_temp %>%
-    mutate(group = ifelse(value %in% c(100, 1000), "100-1000",
-      ifelse(value == 10000, "10000", "50000")
-    ))
 
-  total_number_of_clusters_by_size <- summary_df_filter_combined %>%
+  total_number_of_clusters_by_size <- summary_temp %>%
     group_by(cluster_size) %>%
     summarize(all_clusters = n())
 
 
-  number_of_monoclonal_cluster_by_size <- summary_df_filter_combined %>%
+  number_of_monoclonal_cluster_by_size <- summary_temp %>%
     filter(clonality == "mono") %>%
     group_by(cluster_size) %>%
     summarize(monoclonal_clusters = n())
@@ -161,50 +160,9 @@ for (mouse_model in mouse_models) {
 
 
 frequency_table <- mono_clusters / all_clusters
-rownames(all_clusters) <- 2:25
-rownames(mono_clusters) <- 2:25
 rownames(frequency_table) <- 2:25
 rownames(p_values_table) <- 2:25
 rownames(fold_change_table) <- 2:25
-rownames(expected_values_null_table) <- 2:25
-
-
-
-expected_values_null_table[is.na(expected_values_null_table)] <- 0
-
-expected_values_null_per_sample <- expected_values_null_table %>% apply(MARGIN = 2, FUN = sum)
-
-expected_proportion_null_per_sample <- expected_values_null_per_sample / all_clusters %>% apply(MARGIN = 2, FUN = sum)
-
-frequencies <- mono_clusters %>% apply(MARGIN = 2, FUN = sum) / all_clusters %>% apply(MARGIN = 2, FUN = sum)
-
-
-annotations <- ifelse(p_values_columns < 0.001, "***",
-  ifelse(p_values_columns < 0.01, "**",
-    ifelse(p_values_columns < 0.05, "*", "ns")
-  )
-)
-annotations <- paste0(mono_clusters %>% apply(MARGIN = 2, FUN = sum), "/", all_clusters %>% apply(MARGIN = 2, FUN = sum), "\n", annotations)
-annotation_data <- data.frame(SampleID = rownames(data), annotations = annotations, frequencies = frequencies, expected = expected_proportion_null_per_sample)
-
-annotation_data$SampleID <- factor(annotation_data$SampleID, levels = annotation_data$SampleID[c(5, 6, 2, 1, 3, 4, 7)])
-
-
-ggplot(annotation_data, aes(x = SampleID, y = frequencies)) +
-  ylim(0, 1.1) +
-  geom_bar(stat = "identity", fill = "#598F8F", alpha = 0.8, width = 0.8, position = position_dodge(width = 0.6)) +
-  geom_text(
-    data = annotation_data, aes(x = SampleID, y = frequencies, label = annotations),
-    position = position_dodge(width = 0.6), vjust = -0.5, size = 4
-  ) +
-  labs(x = "Sample ID", y = "Proportion of monoclonal Clusters") +
-  theme_classic() +
-  scale_x_discrete(limits = levels(annotation_data$SampleID)) +
-  theme(
-    axis.text = element_text(size = 14),
-    axis.title = element_text(size = 14)
-  ) +
-  geom_point(aes(y = expected), shape = 23, fill = "black", size = 4)