From 1e1015d50875eb8779293295790ba654c01cdc6d Mon Sep 17 00:00:00 2001 From: JohannesGawron Date: Mon, 10 Jun 2024 17:22:17 +0200 Subject: [PATCH] cleanup: remove Rcode/markdowns --- Rcode/markdowns/Br11.Rmd | 181 ------------------------------- Rcode/markdowns/Br16.Rmd | 181 ------------------------------- Rcode/markdowns/Br16_AC.Rmd | 181 ------------------------------- Rcode/markdowns/Br16_AC_max1.Rmd | 172 ----------------------------- Rcode/markdowns/Br16_AC_max2.Rmd | 180 ------------------------------ Rcode/markdowns/Br16_AC_max3.Rmd | 180 ------------------------------ Rcode/markdowns/Br16_AC_max4.Rmd | 180 ------------------------------ Rcode/markdowns/Br16_B.Rmd | 181 ------------------------------- Rcode/markdowns/Br16_B_max1.Rmd | 172 ----------------------------- Rcode/markdowns/Br16_B_max2.Rmd | 180 ------------------------------ Rcode/markdowns/Br16_B_max3.Rmd | 180 ------------------------------ Rcode/markdowns/Br16_B_max4.Rmd | 180 ------------------------------ Rcode/markdowns/Br16_C.Rmd | 181 ------------------------------- Rcode/markdowns/Br16_C_max1.Rmd | 172 ----------------------------- Rcode/markdowns/Br16_C_max2.Rmd | 180 ------------------------------ Rcode/markdowns/Br16_C_max3.Rmd | 180 ------------------------------ Rcode/markdowns/Br23.Rmd | 181 ------------------------------- Rcode/markdowns/Br26.Rmd | 181 ------------------------------- Rcode/markdowns/Br30.Rmd | 181 ------------------------------- Rcode/markdowns/Br37.Rmd | 181 ------------------------------- Rcode/markdowns/Br38.Rmd | 181 ------------------------------- Rcode/markdowns/Br39.Rmd | 181 ------------------------------- Rcode/markdowns/Br44.Rmd | 181 ------------------------------- Rcode/markdowns/Br45.Rmd | 181 ------------------------------- Rcode/markdowns/Br46.Rmd | 181 ------------------------------- Rcode/markdowns/Br53.Rmd | 181 ------------------------------- Rcode/markdowns/Br57.Rmd | 181 ------------------------------- Rcode/markdowns/Br61.Rmd | 181 ------------------------------- Rcode/markdowns/Br7.Rmd | 181 ------------------------------- Rcode/markdowns/Brx50.Rmd | 181 ------------------------------- Rcode/markdowns/LM2.Rmd | 181 ------------------------------- Rcode/markdowns/Lu2.Rmd | 181 ------------------------------- Rcode/markdowns/Lu7.Rmd | 181 ------------------------------- Rcode/markdowns/Ov8.Rmd | 181 ------------------------------- Rcode/markdowns/Pr6.Rmd | 181 ------------------------------- Rcode/markdowns/Pr9.Rmd | 181 ------------------------------- Rcode/markdowns/template.Rmd | 181 ------------------------------- 37 files changed, 6662 deletions(-) delete mode 100644 Rcode/markdowns/Br11.Rmd delete mode 100644 Rcode/markdowns/Br16.Rmd delete mode 100644 Rcode/markdowns/Br16_AC.Rmd delete mode 100644 Rcode/markdowns/Br16_AC_max1.Rmd delete mode 100644 Rcode/markdowns/Br16_AC_max2.Rmd delete mode 100644 Rcode/markdowns/Br16_AC_max3.Rmd delete mode 100644 Rcode/markdowns/Br16_AC_max4.Rmd delete mode 100644 Rcode/markdowns/Br16_B.Rmd delete mode 100644 Rcode/markdowns/Br16_B_max1.Rmd delete mode 100644 Rcode/markdowns/Br16_B_max2.Rmd delete mode 100644 Rcode/markdowns/Br16_B_max3.Rmd delete mode 100644 Rcode/markdowns/Br16_B_max4.Rmd delete mode 100644 Rcode/markdowns/Br16_C.Rmd delete mode 100644 Rcode/markdowns/Br16_C_max1.Rmd delete mode 100644 Rcode/markdowns/Br16_C_max2.Rmd delete mode 100644 Rcode/markdowns/Br16_C_max3.Rmd delete mode 100644 Rcode/markdowns/Br23.Rmd delete mode 100644 Rcode/markdowns/Br26.Rmd delete mode 100644 Rcode/markdowns/Br30.Rmd delete mode 100644 Rcode/markdowns/Br37.Rmd delete mode 100644 Rcode/markdowns/Br38.Rmd delete mode 100644 Rcode/markdowns/Br39.Rmd delete mode 100644 Rcode/markdowns/Br44.Rmd delete mode 100644 Rcode/markdowns/Br45.Rmd delete mode 100644 Rcode/markdowns/Br46.Rmd delete mode 100644 Rcode/markdowns/Br53.Rmd delete mode 100644 Rcode/markdowns/Br57.Rmd delete mode 100644 Rcode/markdowns/Br61.Rmd delete mode 100644 Rcode/markdowns/Br7.Rmd delete mode 100644 Rcode/markdowns/Brx50.Rmd delete mode 100644 Rcode/markdowns/LM2.Rmd delete mode 100644 Rcode/markdowns/Lu2.Rmd delete mode 100644 Rcode/markdowns/Lu7.Rmd delete mode 100644 Rcode/markdowns/Ov8.Rmd delete mode 100644 Rcode/markdowns/Pr6.Rmd delete mode 100644 Rcode/markdowns/Pr9.Rmd delete mode 100755 Rcode/markdowns/template.Rmd diff --git a/Rcode/markdowns/Br11.Rmd b/Rcode/markdowns/Br11.Rmd deleted file mode 100644 index 9d4db6d..0000000 --- a/Rcode/markdowns/Br11.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Br11" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br11" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br16.Rmd b/Rcode/markdowns/Br16.Rmd deleted file mode 100644 index e4bcf7f..0000000 --- a/Rcode/markdowns/Br16.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Br16" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br16" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br16_AC.Rmd b/Rcode/markdowns/Br16_AC.Rmd deleted file mode 100644 index a828586..0000000 --- a/Rcode/markdowns/Br16_AC.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Br16_AC" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br16_AC" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br16_AC_max1.Rmd b/Rcode/markdowns/Br16_AC_max1.Rmd deleted file mode 100644 index 35c4fce..0000000 --- a/Rcode/markdowns/Br16_AC_max1.Rmd +++ /dev/null @@ -1,172 +0,0 @@ ---- -title: "Br16_AC_max1" -author: "Johannes Gawron" -date: "2024-023-04" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br16_AC_max1" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal)) - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal)) - } - }) -} - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, nClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, nClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br16_AC_max2.Rmd b/Rcode/markdowns/Br16_AC_max2.Rmd deleted file mode 100644 index 329c7f2..0000000 --- a/Rcode/markdowns/Br16_AC_max2.Rmd +++ /dev/null @@ -1,180 +0,0 @@ ---- -title: "Br16_AC_max2" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br16_AC_max2" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br16_AC_max3.Rmd b/Rcode/markdowns/Br16_AC_max3.Rmd deleted file mode 100644 index 3ddb701..0000000 --- a/Rcode/markdowns/Br16_AC_max3.Rmd +++ /dev/null @@ -1,180 +0,0 @@ ---- -title: "Br16_AC_max3" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br16_AC_max3" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br16_AC_max4.Rmd b/Rcode/markdowns/Br16_AC_max4.Rmd deleted file mode 100644 index 5103cd3..0000000 --- a/Rcode/markdowns/Br16_AC_max4.Rmd +++ /dev/null @@ -1,180 +0,0 @@ ---- -title: "Br16_AC_max4" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br16_AC_max4" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br16_B.Rmd b/Rcode/markdowns/Br16_B.Rmd deleted file mode 100644 index 7923d92..0000000 --- a/Rcode/markdowns/Br16_B.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Br16_B" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br16_B" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br16_B_max1.Rmd b/Rcode/markdowns/Br16_B_max1.Rmd deleted file mode 100644 index 7172010..0000000 --- a/Rcode/markdowns/Br16_B_max1.Rmd +++ /dev/null @@ -1,172 +0,0 @@ ---- -title: "Br16_B_max1" -author: "Johannes Gawron" -date: "2024-023-04" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br16_B_max1" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal)) - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal)) - } - }) -} - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, nClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, nClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br16_B_max2.Rmd b/Rcode/markdowns/Br16_B_max2.Rmd deleted file mode 100644 index 9000389..0000000 --- a/Rcode/markdowns/Br16_B_max2.Rmd +++ /dev/null @@ -1,180 +0,0 @@ ---- -title: "Br16_B_max2" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br16_B_max2" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br16_B_max3.Rmd b/Rcode/markdowns/Br16_B_max3.Rmd deleted file mode 100644 index ee68ad7..0000000 --- a/Rcode/markdowns/Br16_B_max3.Rmd +++ /dev/null @@ -1,180 +0,0 @@ ---- -title: "Br16_B_max3" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br16_B_max3" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br16_B_max4.Rmd b/Rcode/markdowns/Br16_B_max4.Rmd deleted file mode 100644 index 429498f..0000000 --- a/Rcode/markdowns/Br16_B_max4.Rmd +++ /dev/null @@ -1,180 +0,0 @@ ---- -title: "Br16_B_max4" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br16_B_max4" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br16_C.Rmd b/Rcode/markdowns/Br16_C.Rmd deleted file mode 100644 index f878018..0000000 --- a/Rcode/markdowns/Br16_C.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Br16_C" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br16_C" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br16_C_max1.Rmd b/Rcode/markdowns/Br16_C_max1.Rmd deleted file mode 100644 index c43890b..0000000 --- a/Rcode/markdowns/Br16_C_max1.Rmd +++ /dev/null @@ -1,172 +0,0 @@ ---- -title: "Br16_C_max1" -author: "Johannes Gawron" -date: "2024-023-04" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br16_C_max1" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal)) - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal)) - } - }) -} - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, nClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, nClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br16_C_max2.Rmd b/Rcode/markdowns/Br16_C_max2.Rmd deleted file mode 100644 index a91a921..0000000 --- a/Rcode/markdowns/Br16_C_max2.Rmd +++ /dev/null @@ -1,180 +0,0 @@ ---- -title: "Br16_C_max2" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br16_C_max2" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br16_C_max3.Rmd b/Rcode/markdowns/Br16_C_max3.Rmd deleted file mode 100644 index d122632..0000000 --- a/Rcode/markdowns/Br16_C_max3.Rmd +++ /dev/null @@ -1,180 +0,0 @@ ---- -title: "Br16_C_max3" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br16_C_max3" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br23.Rmd b/Rcode/markdowns/Br23.Rmd deleted file mode 100644 index 565ed4d..0000000 --- a/Rcode/markdowns/Br23.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Br23" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br23" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br26.Rmd b/Rcode/markdowns/Br26.Rmd deleted file mode 100644 index 0e03473..0000000 --- a/Rcode/markdowns/Br26.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Br26" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br26" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br30.Rmd b/Rcode/markdowns/Br30.Rmd deleted file mode 100644 index 6a62b5d..0000000 --- a/Rcode/markdowns/Br30.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Br30" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br30" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br37.Rmd b/Rcode/markdowns/Br37.Rmd deleted file mode 100644 index 41878cf..0000000 --- a/Rcode/markdowns/Br37.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Br37" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br37" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br38.Rmd b/Rcode/markdowns/Br38.Rmd deleted file mode 100644 index 41908d3..0000000 --- a/Rcode/markdowns/Br38.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Br38" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br38" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br39.Rmd b/Rcode/markdowns/Br39.Rmd deleted file mode 100644 index b3cd1ad..0000000 --- a/Rcode/markdowns/Br39.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Br39" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br39" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br44.Rmd b/Rcode/markdowns/Br44.Rmd deleted file mode 100644 index d3b7201..0000000 --- a/Rcode/markdowns/Br44.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Br44" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br44" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br45.Rmd b/Rcode/markdowns/Br45.Rmd deleted file mode 100644 index a7f440b..0000000 --- a/Rcode/markdowns/Br45.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Br45" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br45" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br46.Rmd b/Rcode/markdowns/Br46.Rmd deleted file mode 100644 index b334cd6..0000000 --- a/Rcode/markdowns/Br46.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Br46" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br46" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br53.Rmd b/Rcode/markdowns/Br53.Rmd deleted file mode 100644 index 59a2d09..0000000 --- a/Rcode/markdowns/Br53.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Br53" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br53" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br57.Rmd b/Rcode/markdowns/Br57.Rmd deleted file mode 100644 index 6bd3fab..0000000 --- a/Rcode/markdowns/Br57.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Br57" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br57" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br61.Rmd b/Rcode/markdowns/Br61.Rmd deleted file mode 100644 index f91766d..0000000 --- a/Rcode/markdowns/Br61.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Br61" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br61" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Br7.Rmd b/Rcode/markdowns/Br7.Rmd deleted file mode 100644 index 9fcaa9c..0000000 --- a/Rcode/markdowns/Br7.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Br7" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Br7" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Brx50.Rmd b/Rcode/markdowns/Brx50.Rmd deleted file mode 100644 index 625c68f..0000000 --- a/Rcode/markdowns/Brx50.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Brx50" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Brx50" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/LM2.Rmd b/Rcode/markdowns/LM2.Rmd deleted file mode 100644 index 1c3842a..0000000 --- a/Rcode/markdowns/LM2.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "LM2" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "LM2" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Lu2.Rmd b/Rcode/markdowns/Lu2.Rmd deleted file mode 100644 index 5949e05..0000000 --- a/Rcode/markdowns/Lu2.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Lu2" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Lu2" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Lu7.Rmd b/Rcode/markdowns/Lu7.Rmd deleted file mode 100644 index b8940f1..0000000 --- a/Rcode/markdowns/Lu7.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Lu7" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Lu7" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Ov8.Rmd b/Rcode/markdowns/Ov8.Rmd deleted file mode 100644 index 2ed18ec..0000000 --- a/Rcode/markdowns/Ov8.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Ov8" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Ov8" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Pr6.Rmd b/Rcode/markdowns/Pr6.Rmd deleted file mode 100644 index 0aea058..0000000 --- a/Rcode/markdowns/Pr6.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Pr6" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Pr6" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/Pr9.Rmd b/Rcode/markdowns/Pr9.Rmd deleted file mode 100644 index 05a0f34..0000000 --- a/Rcode/markdowns/Pr9.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "Pr9" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "Pr9" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - - diff --git a/Rcode/markdowns/template.Rmd b/Rcode/markdowns/template.Rmd deleted file mode 100755 index a9aa374..0000000 --- a/Rcode/markdowns/template.Rmd +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: "__tree__" -author: "Johannes Gawron" -date: "2024-03-07" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Splitting statistics - -This code analyses splitting statistics for CTC-clusters. - -The analysis takes a list of trees sampled from its posterior distribution as input and samples mutations placements for each of the trees. - - -## Configure the script -```{r config} -inputFolder <- "../../../input_folder" -simulationInputFolder <- "../../../simulations/simulations2" -treeName <- "__tree__" -nTreeSamplingEvents <- 100 -nMutationSamplingEvents <- 100 -``` - -## Loading data -```{r load} -source("../functions.R") - - -input <- load_data(inputFolder, treeName) - -postSampling <- input$postSampling -nClusters <- input$nClusters -ClusterID <- input$clusterID -nCells <- input$nCells -nMutations <- input$nMutations -nClusters <- input$nClusters -alleleCount <- input$alleleCount -mutatedReadCounts <- input$mutatedReadCounts -totalReadCounts <- input$totalReadCounts -sampleDescription <- input$sample_description -``` - - -## Sample description - -Each row corresponds to a cell. -Column description: - - Cluster: An number indicating which sample the cell belongs to. - - ClusterName: The name of the sample in the nodeDescription.tsv file - - WBC: a binary vector indicating whether the cell is a white blood cell (1) or not (0). - - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv - file. - -```{r Describe samples} -print(sampleDescription) -``` - - - -Get null distributions of relevant statistics, stratified by sample: -```{r} -cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector()) -cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector()) - -for (clusterSize in 2:5){ - try( - {treeNameSimulated <- paste(treeName, clusterSize, sep = '_') - - - inputSimulated <- load_data(simulationInputFolder, treeNameSimulated) - - postSamplingSimulated <- inputSimulated$postSampling - nClustersSimulated <- inputSimulated$nClusters - ClusterIDSimulated <- inputSimulated$clusterID - nCellsSimulated <- inputSimulated$nCells - nMutationsSimulated <- inputSimulated$nMutations - nClustersSimulated <- inputSimulated$nClusters - alleleCountSimulated <- inputSimulated$alleleCount - mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts - totalReadCountsSimulated <- inputSimulated$totalReadCounts - sampleDescriptionSimulated <- inputSimulated$sample_description - - distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated, - nMutationsSimulated, nClustersSimulated, - alleleCountSimulated, - mutatedReadCountsSimulated, totalReadCountsSimulated, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c("orchid", "orchid1", "orchid2", - "orchid3", "orchid4", "darkorchid", - "darkorchid1","darkorchid2", "darkorchid3", - "darkorchid4", "purple", "purple1", - "purple2", "purple3", "purple4")) - - - - plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) + - geom_boxplot()) - cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) )) - - ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are - ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more - ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters - ## of the same size, it is however the potential source of a future bug!! - - plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) + - geom_histogram(binwidth = 0.01)) - print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] )) - }) -} -``` - - - -Get the relevant statistics for each of the clusters of a dataset and output numbers of oligoclonal clusters: -```{r} -nTumorClusters <- 0 -nOligoclonalClusters1 <- 0 -nOligoclonalClusters2 <- 0 -splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) -splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector()) - -for(clusterSize in 2:5){ - try({ - clusterColor <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() == clusterSize) %>% - pull(color) %>% - unique() - - for(color in clusterColor){ - distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, totalReadCounts, - nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents, - cellPairSelection = c(color)) - - splittingProbs <- mean(distance$splittingProbs$Splitting_probability) - branchingProbs <- mean(distance$aggregatedBranchingProbabilities) - - nTumorClusters <- nTumorClusters + 1 - oligoclonal <- FALSE - print(clusterSize) - print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2]) - if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){ - nOligoclonalClusters1 <- nOligoclonalClusters1 + 1 - oligoclonal <- TRUE - } - splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - oligoclonal <- FALSE - if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){ - nOligoclonalClusters2 <- nOligoclonalClusters2 + 1 - oligoclonal <- TRUE - } - splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize)) - } - }) -} - - -numberOfCancerClusters <- sampleDescription %>% - filter(WBC ==0 & color != 'gray93') %>% - group_by(color) %>% - filter(n() > 1) %>% - pull(color) %>% - unique() %>% length() - -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName)) -print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName)) -print(splittingSummary1) -print(splittingSummary2) -``` - - -