From 95d6057207d68b5b3d17b0776cb8bfec30eb9d5b Mon Sep 17 00:00:00 2001 From: JohannesGawron Date: Mon, 10 Jun 2024 17:59:10 +0200 Subject: [PATCH] pre-commit run --- Rcode/.Rproj.user/shared/notebooks/paths | 1 + Rcode/ComputeSplittingStatistics.R | 159 ++-- Rcode/Summary_plots.R | 96 +- Rcode/annotateVariants.R | 69 +- Rcode/functions.R | 748 ++++++++-------- Rcode/simulateCTCclusters.R | 326 +++---- Rcode/statistical_test_source.R | 210 +++-- Rcode/validation_cluster_data.R | 21 +- experiments/data/htmls/Br30.html | 615 ------------- experiments/data/htmls/Br44.html | 605 ------------- experiments/data/htmls/Br46.html | 606 ------------- experiments/data/htmls/Br61.html | 823 ------------------ experiments/data/htmls/Br7.html | 766 ---------------- experiments/data/htmls/Lu2.html | 705 --------------- experiments/data/htmls/Lu7.html | 621 ------------- experiments/data/htmls/Pr9.html | 765 ---------------- .../data/markdowns/Br16_AC_topSeparators.Rmd | 64 +- .../data/markdowns/Br16_B_topSeparators.Rmd | 272 +++--- .../data/markdowns/Br23_topSeparators.Rmd | 38 +- .../data/markdowns/Br26_topSeparators.Rmd | 37 +- .../data/markdowns/Br38_topSeparators.Rmd | 23 +- .../data/markdowns/Br61_topSeparators.Rmd | 232 +++-- .../data/markdowns/Brx50_topSeparators.Rmd | 38 +- .../data/markdowns/LM2_topSeparators.Rmd | 309 ++++--- .../data/markdowns/Pr9_topSeparators.Rmd | 74 +- experiments/workflow/Snakefile | 24 +- experiments/workflow/resources/UnitTests.R | 470 +++++----- experiments/workflow/resources/functions.R | 752 ++++++++-------- experiments/workflow/rules/base.smk | 39 +- 29 files changed, 2044 insertions(+), 7464 deletions(-) delete mode 100644 experiments/data/htmls/Br30.html delete mode 100644 experiments/data/htmls/Br44.html delete mode 100644 experiments/data/htmls/Br46.html delete mode 100644 experiments/data/htmls/Br61.html delete mode 100644 experiments/data/htmls/Br7.html delete mode 100644 experiments/data/htmls/Lu2.html delete mode 100644 
experiments/data/htmls/Lu7.html delete mode 100644 experiments/data/htmls/Pr9.html diff --git a/Rcode/.Rproj.user/shared/notebooks/paths b/Rcode/.Rproj.user/shared/notebooks/paths index d54f340..51b1f34 100644 --- a/Rcode/.Rproj.user/shared/notebooks/paths +++ b/Rcode/.Rproj.user/shared/notebooks/paths @@ -1 +1,2 @@ +/Users/jgawron/Documents/projects/CTC-SCITE/CTC-SCITE/Rcode/renv/activate.R="42F1FF6D" /Users/jgawron/Documents/projects/CTC_backup/validation_experiment/foldchange_in_barcode_prevalence.R="097865BC" diff --git a/Rcode/ComputeSplittingStatistics.R b/Rcode/ComputeSplittingStatistics.R index 6c3b356..2e2dcd0 100755 --- a/Rcode/ComputeSplittingStatistics.R +++ b/Rcode/ComputeSplittingStatistics.R @@ -1,7 +1,7 @@ source("functions.R") ############ -#Config +# Config ############ inputFolder <- "../../input_folder" treeName <- "LM2" @@ -10,14 +10,14 @@ treeName <- "LM2" ############ -#Data preprocessing +# Data preprocessing ############ input <- load_data(inputFolder, treeName) postSampling <- input$postSampling nClusters <- input$nClusters ClusterID <- input$clusterID -nCells <- input$nCells +nCells <- input$nCells nMutations <- input$nMutations nClusters <- input$nClusters alleleCount <- input$alleleCount @@ -31,32 +31,33 @@ annotations <- input$annotations totalReadCountVector <- totalReadCounts %>% unlist() -fit1 <- glm(totalReadCountVector ~ 1, family = poisson(link = 'log')) +fit1 <- glm(totalReadCountVector ~ 1, family = poisson(link = "log")) fit2 <- glm.nb(totalReadCountVector ~ 1) -fit3 <- zeroinfl(totalReadCountVector ~1, dist = 'negbin') -fit4 <- zeroinfl(totalReadCountVector ~1, dist = 'poisson') +fit3 <- zeroinfl(totalReadCountVector ~ 1, dist = "negbin") +fit4 <- zeroinfl(totalReadCountVector ~ 1, dist = "poisson") summary(fit1) summary(fit2) exp(coef(fit)) coef(fit2) -coeficients <- exp(summary(fit3)$coefficients$count[,1]) +coeficients <- exp(summary(fit3)$coefficients$count[, 1]) exp(coef(fit3)) summary(fit4) simNew <- 
ifelse(rbinom(length(totalReadCountVector), size = 1, prob = exp(coef(fit3))[2]) > 0, - 0, rnegbin(length(totalReadCountVector), exp(coef(fit3))[1], theta = exp(-0.76961))) + 0, rnegbin(length(totalReadCountVector), exp(coef(fit3))[1], theta = exp(-0.76961)) +) sim <- data.frame(sim = vector(), run = vector()) -for(i in 1:100){ - +for (i in 1:100) { simNew <- ifelse(rbinom(length(totalReadCountVector), size = 1, prob = exp(coef(fit3))[2]) > 0, - 0, rnegbin(length(totalReadCountVector), exp(coef(fit3))[1], theta = exp(-0.76961))) - + 0, rnegbin(length(totalReadCountVector), exp(coef(fit3))[1], theta = exp(-0.76961)) + ) + sim <- rbind(sim, data.frame(sim = simNew, run = i)) } @@ -64,15 +65,15 @@ for(i in 1:100){ sim <- rbind(sim, data.frame(sim = totalReadCountVector, run = 0)) sim %>% - ggplot(aes(x = sim, group = run)) + - geom_histogram(data = sim[sim$run == 0,], alpha = 0.4, color = 'darkseagreen', fill = 'darkseagreen') + - geom_freqpoly(data = sim[sim$run != 0,], aes(x = sim), color = 'red', position = 'identity', alpha = 0.4) + ggplot(aes(x = sim, group = run)) + + geom_histogram(data = sim[sim$run == 0, ], alpha = 0.4, color = "darkseagreen", fill = "darkseagreen") + + geom_freqpoly(data = sim[sim$run != 0, ], aes(x = sim), color = "red", position = "identity", alpha = 0.4) ############ -#Unit testing +# Unit testing ############ @@ -93,7 +94,7 @@ test_ComputePerMutationProbabilityOfPolyclonality() ############ -#Main Analysis +# Main Analysis ############ @@ -103,7 +104,7 @@ mutationFilter <- apply(mutationDescription, 1, FUN = IsDriver, annotations) -readCounts <- read_delim('../../input_folder//LM2/LM2.txt', delim = '\t', col_names = FALSE) +readCounts <- read_delim("../../input_folder//LM2/LM2.txt", delim = "\t", col_names = FALSE) @@ -122,49 +123,53 @@ print(candidate_pairs$full_distance_matrix) splittingProbs <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount, - mutatedReadCounts, 
totalReadCounts, - nMutationSamplingEvents = 20, nTreeSamplingEvents = 20) + nMutations, nClusters, + alleleCount, + mutatedReadCounts, totalReadCounts, + nMutationSamplingEvents = 20, nTreeSamplingEvents = 20 +) -splittingProbs %>% group_by(Cluster) %>% summarize(meanSplittingProbability = mean(Splitting_probability)) +splittingProbs %>% + group_by(Cluster) %>% + summarize(meanSplittingProbability = mean(Splitting_probability)) ## Go through all clusters and compare all pairs of cells within each cluster with ## each other. Note that the cells from the clusters are adjacent to each other by ## design, so incrementing the index j by 1 makes sense distance <- vector() clusterIdentityofdistance <- vector() -system.time(for (c in 1:nClusters){ - cellsInCluster <- which(sampleDescription$Cluster == (c-1))-1 ## Make sure array indication is +system.time(for (c in 1:nClusters) { + cellsInCluster <- which(sampleDescription$Cluster == (c - 1)) - 1 ## Make sure array indication is ## compatible with cpp cluster_done <- 0 - for(i in cellsInCluster){ - if(cluster_done == 1){ + for (i in cellsInCluster) { + if (cluster_done == 1) { cluster_done <- 0 break } - if(sampleDescription$WBC[i+1] == 1) next + if (sampleDescription$WBC[i + 1] == 1) next j <- cellsInCluster[1] - while(j < i){ - if(cluster_done == 1){ + while (j < i) { + if (cluster_done == 1) { break } - if(sampleDescription$WBC[j+1] == 1){ + if (sampleDescription$WBC[j + 1] == 1) { j <- j + 1 next } print(paste(paste("Computing genomic distances of leaves:", i, sep = " "), j, sep = " ")) - distance <- c(distance, produce_Distance_Posterior(i,j,postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount,sampleDescription$Cluster, - mutatedReadCounts, totalReadCounts,sampleDescription$WBC, nSamplingEvents = 1000)) - clusterIdentityofdistance <- c(clusterIdentityofdistance, c-1) + distance <- c(distance, produce_Distance_Posterior(i, j, postSampling, treeName, nCells, + nMutations, nClusters, + alleleCount, 
sampleDescription$Cluster, + mutatedReadCounts, totalReadCounts, sampleDescription$WBC, + nSamplingEvents = 1000 + )) + clusterIdentityofdistance <- c(clusterIdentityofdistance, c - 1) j <- j + 1 cluster_done <- 1 } } - }) ######### @@ -187,51 +192,53 @@ system.time(for (c in 1:nClusters){ ## design, so incrementing the index j by 1 makes sense distance <- vector() clusterIdentityofdistance <- vector() -system.time(for (c in 1:nClusters){ - cellsInCluster <- which(sampleDescription$Cluster == (c-1))-1 ## Make sure array indication is - ## compatible with cpp +system.time(for (c in 1:nClusters) { + cellsInCluster <- which(sampleDescription$Cluster == (c - 1)) - 1 ## Make sure array indication is + ## compatible with cpp cluster_done <- 0 - for(i in cellsInCluster){ - if(cluster_done == 1){ + for (i in cellsInCluster) { + if (cluster_done == 1) { cluster_done <- 0 break } - if(sampleDescription$WBC[i+1] == 1) next + if (sampleDescription$WBC[i + 1] == 1) next j <- cellsInCluster[1] - while(j < i){ - if(cluster_done == 1){ + while (j < i) { + if (cluster_done == 1) { break } - if(sampleDescription$WBC[j+1] == 1){ + if (sampleDescription$WBC[j + 1] == 1) { j <- j + 1 next } print(paste(paste("Computing genomic distances of leaves:", i, sep = " "), j, sep = " ")) - distance <- c(distance, produce_Distance_Posterior(i,j,postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount,sampleDescription$Cluster, - mutatedReadCounts, totalReadCounts,sampleDescription$WBC, nSamplingEvents = 1000)) - clusterIdentityofdistance <- c(clusterIdentityofdistance, c-1) + distance <- c(distance, produce_Distance_Posterior(i, j, postSampling, treeName, nCells, + nMutations, nClusters, + alleleCount, sampleDescription$Cluster, + mutatedReadCounts, totalReadCounts, sampleDescription$WBC, + nSamplingEvents = 1000 + )) + clusterIdentityofdistance <- c(clusterIdentityofdistance, c - 1) j <- j + 1 cluster_done <- 1 } } - }) ######### intraClusterSplitMedianPlot <- 
ggplot(data.frame(Median_Distance = distance), aes(x = Median_Distance)) + - geom_histogram(binwidth = 0.5, fill = "skyblue", color = "black", alpha = 0.7)+ - xlab("Median distance between of leaves within the same cluster") + ylab("total count") + + geom_histogram(binwidth = 0.5, fill = "skyblue", color = "black", alpha = 0.7) + + xlab("Median distance between of leaves within the same cluster") + + ylab("total count") + ggtitle(treeName) + - labs(subtitle = "Histogram of similarities of cells within cluster",caption = "dashed red line indicates cutoff for oligoclonality") + + labs(subtitle = "Histogram of similarities of cells within cluster", caption = "dashed red line indicates cutoff for oligoclonality") + theme_minimal() + theme( plot.title = element_text(size = 24, face = "bold"), axis.title.x = element_text(size = 20), axis.title.y = element_text(size = 20), - plot.subtitle = element_text(size= 20), + plot.subtitle = element_text(size = 20), axis.text = element_text(size = 16), plot.caption = element_text(size = 14) ) @@ -240,47 +247,43 @@ plot(intraClusterSplitMedianPlot) summary(distance) print(intraClusterSplitMedianPlot) -#####Manually adapt +##### Manually adapt cutoffForOligoclonality <- 400 -intraClusterSplitMedianPlot + geom_vline(xintercept = cutoffForOligoclonality,color = "red", linetype = "dashed", size = 1) +intraClusterSplitMedianPlot + geom_vline(xintercept = cutoffForOligoclonality, color = "red", linetype = "dashed", size = 1) ### Now look at each cluster and determine whether at least one pair of cells split clusterSplits <- vector() -for (c in 1:nClusters){ - cellPairsInCluster <- which(clusterIdentityofdistance == (c-1)) +for (c in 1:nClusters) { + cellPairsInCluster <- which(clusterIdentityofdistance == (c - 1)) print(cellPairsInCluster) print("Distances:") print(distance[cellPairsInCluster]) - - if(length(cellPairsInCluster) == 0) next - else if (max(distance[cellPairsInCluster])>cutoffForOligoclonality){ - clusterSplits <- 
c(clusterSplits,1) - } - else { - clusterSplits <- c(clusterSplits,0) + + if (length(cellPairsInCluster) == 0) { + next + } else if (max(distance[cellPairsInCluster]) > cutoffForOligoclonality) { + clusterSplits <- c(clusterSplits, 1) + } else { + clusterSplits <- c(clusterSplits, 0) } } which(ClusterID == 25) -produce_Distance_Posterior(35,36, postSampling, "LM2") +produce_Distance_Posterior(35, 36, postSampling, "LM2") -produce_Distance_Posterior(35,36, postSampling, "LM2") +produce_Distance_Posterior(35, 36, postSampling, "LM2") -#compute_hamming_distance_distr <- function(leaf1, leaf2, postSampling){ - +# compute_hamming_distance_distr <- function(leaf1, leaf2, postSampling){ + # for (i in nrow(postSampling)){ # tree <- postSampling$Tree[i] - #tree <- postSampling$Tree[3200] ##Debugging - -# } -#} - - - +# tree <- postSampling$Tree[3200] ##Debugging +# } +# } diff --git a/Rcode/Summary_plots.R b/Rcode/Summary_plots.R index d9933d6..cb1636c 100755 --- a/Rcode/Summary_plots.R +++ b/Rcode/Summary_plots.R @@ -1,9 +1,15 @@ -data <- data.frame(sample = c('Br11','Br11','Br11','Br23','Br23','Br23','Br38','Br38','Br38','Br39','Br39','Br39','Br61','Br61','Br61', 'Brx50','Brx50','Brx50', 'LM2','LM2','LM2','Lu2','Lu2','Lu2', - 'Pr6','Pr6','Pr6','Pr9','Pr9','Pr9'), - Category = c('Oligoclonal', 'Monoclonal', 'Likely oligoclonal','Oligoclonal', 'Monoclonal', 'Likely oligoclonal','Oligoclonal', 'Monoclonal', 'Likely oligoclonal','Oligoclonal', 'Monoclonal', 'Likely oligoclonal', - 'Oligoclonal', 'Monoclonal', 'Likely oligoclonal','Oligoclonal', 'Monoclonal', 'Likely oligoclonal','Oligoclonal', 'Monoclonal', 'Likely oligoclonal','Oligoclonal', 'Monoclonal', 'Likely oligoclonal', - 'Oligoclonal', 'Monoclonal', 'Likely oligoclonal','Oligoclonal', 'Monoclonal', 'Likely oligoclonal'), - Counts = c(0,0,1,0,4,0,0,0,1,0,0,1, 3,3,2,0,2,0,2,7,5,0,1,0,0,1,0,0,0,2)) +data <- data.frame( + sample = c( + "Br11", "Br11", "Br11", "Br23", "Br23", "Br23", "Br38", "Br38", "Br38", "Br39", 
"Br39", "Br39", "Br61", "Br61", "Br61", "Brx50", "Brx50", "Brx50", "LM2", "LM2", "LM2", "Lu2", "Lu2", "Lu2", + "Pr6", "Pr6", "Pr6", "Pr9", "Pr9", "Pr9" + ), + Category = c( + "Oligoclonal", "Monoclonal", "Likely oligoclonal", "Oligoclonal", "Monoclonal", "Likely oligoclonal", "Oligoclonal", "Monoclonal", "Likely oligoclonal", "Oligoclonal", "Monoclonal", "Likely oligoclonal", + "Oligoclonal", "Monoclonal", "Likely oligoclonal", "Oligoclonal", "Monoclonal", "Likely oligoclonal", "Oligoclonal", "Monoclonal", "Likely oligoclonal", "Oligoclonal", "Monoclonal", "Likely oligoclonal", + "Oligoclonal", "Monoclonal", "Likely oligoclonal", "Oligoclonal", "Monoclonal", "Likely oligoclonal" + ), + Counts = c(0, 0, 1, 0, 4, 0, 0, 0, 1, 0, 0, 1, 3, 3, 2, 0, 2, 0, 2, 7, 5, 0, 1, 0, 0, 1, 0, 0, 0, 2) +) @@ -16,64 +22,74 @@ data %>% filter(Counts != 0) %>% ggplot(aes(x = "", y = Counts, fill = Category)) + geom_bar(stat = "identity") + -# coord_polar("y", start = 0) + - facet_wrap(~ sample, nrow = 2) + + # coord_polar("y", start = 0) + + facet_wrap(~sample, nrow = 2) + theme_void() + - theme(legend.position = "bottom", - legend.title = element_blank(), - legend.text = element_text(size = 20), - strip.text.x = element_text(size =20)) + + theme( + legend.position = "bottom", + legend.title = element_blank(), + legend.text = element_text(size = 20), + strip.text.x = element_text(size = 20) + ) + geom_text(aes(label = Counts), position = position_stack(vjust = 0.5), size = 7) + - scale_fill_manual(values=cbPalette) + scale_fill_manual(values = cbPalette) -data %>% group_by(Category) %>% dplyr::summarize(TotalCounts = sum(Counts)) %>% +data %>% + group_by(Category) %>% + dplyr::summarize(TotalCounts = sum(Counts)) %>% ggplot(aes(x = "", y = TotalCounts, fill = Category)) + geom_bar(stat = "identity") + -# coord_polar("y", start = 0) + + # coord_polar("y", start = 0) + theme_void() + - theme(legend.position = "bottom", - legend.title = element_blank(), - legend.text = element_text(size = 
20), - strip.text.x = element_text(size =20)) + + theme( + legend.position = "bottom", + legend.title = element_blank(), + legend.text = element_text(size = 20), + strip.text.x = element_text(size = 20) + ) + geom_text(aes(label = TotalCounts), position = position_stack(vjust = 0.5), size = 10) + - scale_fill_manual(values=cbPalette) + scale_fill_manual(values = cbPalette) -summary_data <- data.frame(Sample = c('Br7', 'Br11', 'Br23', 'Br26', 'Br38', 'Br39', 'Br61', 'LM2', 'Lu2', 'Pr6', 'Pr9', 'Br16_B', 'Br16_C', 'Br16_AC'), Oligoclonal_moderate_functional_impact = c(0,0,0,0,0,0,1,1,0,0,0,1,0,1), Oligoclonal_high_functional_impact = c(0,0,0,0,0,0,3,3,0,0,1,5,0,0), Likely_oligoclonal = c(3,1,2,1,1,0,3,7,0,0,1,0,0,1), No_oligoclonality_detected = c(0,0,0,0,0,1,1,3,1,1,0,20,8,20)) +summary_data <- data.frame(Sample = c("Br7", "Br11", "Br23", "Br26", "Br38", "Br39", "Br61", "LM2", "Lu2", "Pr6", "Pr9", "Br16_B", "Br16_C", "Br16_AC"), Oligoclonal_moderate_functional_impact = c(0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1), Oligoclonal_high_functional_impact = c(0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 1, 5, 0, 0), Likely_oligoclonal = c(3, 1, 2, 1, 1, 0, 3, 7, 0, 0, 1, 0, 0, 1), No_oligoclonality_detected = c(0, 0, 0, 0, 0, 1, 1, 3, 1, 1, 0, 20, 8, 20)) -save.image(file = '~/Documents/CTC_backup/topSeparatingMutations/result_summary.RData') +save.image(file = "~/Documents/CTC_backup/topSeparatingMutations/result_summary.RData") library(tidyverse) -summary_data_long <- pivot_longer(summary_data, cols = c('Oligoclonal_moderate_functional_impact', 'Oligoclonal_high_functional_impact', 'Likely_oligoclonal', 'No_oligoclonality_detected'), names_to = 'Status') +summary_data_long <- pivot_longer(summary_data, cols = c("Oligoclonal_moderate_functional_impact", "Oligoclonal_high_functional_impact", "Likely_oligoclonal", "No_oligoclonality_detected"), names_to = "Status") -summary_data_long %>% filter(value != 0) %>% - ggplot(aes(x = '', y = value, fill = Status)) + +summary_data_long %>% + 
filter(value != 0) %>% + ggplot(aes(x = "", y = value, fill = Status)) + geom_bar(stat = "identity") + # coord_polar("y", start = 0) + - facet_wrap(~ Sample, nrow = 2) + + facet_wrap(~Sample, nrow = 2) + theme_void() + - theme(legend.position = "bottom", - legend.title = element_blank(), - legend.text = element_text(size = 12), - strip.text.x = element_text(size =20)) + + theme( + legend.position = "bottom", + legend.title = element_blank(), + legend.text = element_text(size = 12), + strip.text.x = element_text(size = 20) + ) + geom_text(aes(label = value), position = position_stack(vjust = 0.5), size = 5) + - scale_fill_manual(values=cbPalette) + scale_fill_manual(values = cbPalette) -summary_data_long %>% group_by(Status) %>% +summary_data_long %>% + group_by(Status) %>% summarize(totalCounts = sum(value)) %>% filter(totalCounts != 0) %>% - ggplot(aes(x = '', y = totalCounts, fill = Status)) + + ggplot(aes(x = "", y = totalCounts, fill = Status)) + geom_bar(stat = "identity") + # coord_polar("y", start = 0) + theme_void() + - theme(legend.position = "right", - legend.title = element_blank(), - legend.text = element_text(size = 12), - strip.text.x = element_text(size =20)) + + theme( + legend.position = "right", + legend.title = element_blank(), + legend.text = element_text(size = 12), + strip.text.x = element_text(size = 20) + ) + geom_text(aes(label = totalCounts), position = position_stack(vjust = 0.5), size = 7) + - scale_fill_manual(values=cbPalette) - - + scale_fill_manual(values = cbPalette) diff --git a/Rcode/annotateVariants.R b/Rcode/annotateVariants.R index 172d132..c77849e 100755 --- a/Rcode/annotateVariants.R +++ b/Rcode/annotateVariants.R @@ -1,59 +1,58 @@ library(tidyverse) -annotate_variants <- function(sampleName, inputFolder, variantList){ - - - data <- read_tsv(file.path(inputFolder,sampleName, paste0(sampleName,'.txt')), col_names = FALSE) - colnames(data)[1] <- 'CHROM' - colnames(data)[2] <- 'POS' - colnames(data)[3] <- 'REF' - 
colnames(data)[4] <- 'ALT' - - - +annotate_variants <- function(sampleName, inputFolder, variantList) { + data <- read_tsv(file.path(inputFolder, sampleName, paste0(sampleName, ".txt")), col_names = FALSE) + colnames(data)[1] <- "CHROM" + colnames(data)[2] <- "POS" + colnames(data)[3] <- "REF" + colnames(data)[4] <- "ALT" + + + # Read VCF file to extract column names - file <- file.path(inputFolder, 'filtered', 'vcf_files_annotated', paste0(sampleName, '.ann.vcf')) + file <- file.path(inputFolder, "filtered", "vcf_files_annotated", paste0(sampleName, ".ann.vcf")) lines <- readLines(file, warn = FALSE) vcf_names <- strsplit(lines[grep("^#CHROM", lines)], "\t")[[1]] - + # Read VCF file into a data frame - vcf <- read.table(file.path(inputFolder, 'filtered', 'vcf_files_annotated', paste0(sampleName, '.ann.vcf')), - comment.char = '#', sep = "\t", header = FALSE, col.names = vcf_names) - colnames(vcf)[1] <- '#CHROM' + vcf <- read.table(file.path(inputFolder, "filtered", "vcf_files_annotated", paste0(sampleName, ".ann.vcf")), + comment.char = "#", sep = "\t", header = FALSE, col.names = vcf_names + ) + colnames(vcf)[1] <- "#CHROM" # Extract functional annotations - include <- rep('NONE',nrow(vcf)) + include <- rep("NONE", nrow(vcf)) for (i in seq_along(vcf$INFO)) { - functionalAnnotation <- unlist(strsplit(strsplit(vcf$INFO[i], ';')[[1]][2], ',')) - if(any(sapply(strsplit(functionalAnnotation, '\\|'), "[[", 3) == 'MODERATE')){ - impact <- 'MODERATE' + functionalAnnotation <- unlist(strsplit(strsplit(vcf$INFO[i], ";")[[1]][2], ",")) + if (any(sapply(strsplit(functionalAnnotation, "\\|"), "[[", 3) == "MODERATE")) { + impact <- "MODERATE" include[i] <- impact } - if(any(sapply(strsplit(functionalAnnotation, '\\|'), "[[", 3) == 'HIGH')){ - impact <- 'HIGH' + if (any(sapply(strsplit(functionalAnnotation, "\\|"), "[[", 3) == "HIGH")) { + impact <- "HIGH" include[i] <- impact } } - - - + + + # Check and filter based on functional annotation includeFunctionalAnnotation <- 
logical(nrow(data)) for (i in seq_len(nrow(data))) { - subset_rows <- vcf[vcf$'#CHROM' == data$'CHROM'[i] & vcf$POS == data$POS[i], ] - + subset_rows <- vcf[vcf$"#CHROM" == data$"CHROM"[i] & vcf$POS == data$POS[i], ] + if (nrow(subset_rows) != 1) { print(subset_rows) - stop('More than one hit in the annotation file. ERROR') + stop("More than one hit in the annotation file. ERROR") } else { - includeFunctionalAnnotation[i] <- include[which(vcf$'#CHROM' == data$'CHROM'[i] & vcf$POS == data$POS[i])] + includeFunctionalAnnotation[i] <- include[which(vcf$"#CHROM" == data$"CHROM"[i] & vcf$POS == data$POS[i])] } } - + data$relevant <- includeFunctionalAnnotation - sum(data$relevant == 'FALSE') - - data <- data %>% mutate(variantName = paste(CHROM, POS, sep = '_')) - - return(data[, c('variantName', 'REF', 'ALT', 'relevant')]) + sum(data$relevant == "FALSE") + + data <- data %>% mutate(variantName = paste(CHROM, POS, sep = "_")) + + return(data[, c("variantName", "REF", "ALT", "relevant")]) } diff --git a/Rcode/functions.R b/Rcode/functions.R index 1df551d..e926029 100755 --- a/Rcode/functions.R +++ b/Rcode/functions.R @@ -1,6 +1,5 @@ - ############ -#Function Definitions +# Function Definitions ############ library(Rcpp) @@ -9,16 +8,16 @@ library(tidyverse) sourceCpp("mutations_placement.cpp") -source('UnitTests.R') +source("UnitTests.R") #' Takes a list of mutations and outputs which one of these is a driver. -#' +#' #' #' @param mutations a names vector containing chromosomes in the format "chrN" in the first -#' column and an integer chromosomal position on the second column +#' column and an integer chromosomal position on the second column #' @param annotations an annotation data frame. 
Must contain the columns #' - 'CGI-Oncogenic Summary': entry can be 'driver (oncodriveMUT)' or somerthing else #' - 'CGI-Oncogenic Prediction': entry can be 'oncogenic (predicted)' or something ele @@ -28,17 +27,17 @@ source('UnitTests.R') #' @export #' #' @examples -IsDriver <- function(mutations, annotations){ +IsDriver <- function(mutations, annotations) { annotated_mutations <- annotations %>% - filter(annotations$'#CHROM' == as.character(mutations[1]) & annotations$POS == as.numeric(mutations[2])) - + filter(annotations$"#CHROM" == as.character(mutations[1]) & annotations$POS == as.numeric(mutations[2])) + check <- annotated_mutations %>% - select(c('CGI-Oncogenic Summary','CGI-Oncogenic Prediction', 'CGI-External oncogenic annotation')) %in% - c('oncogenic (predicted)', 'driver (oncodriveMUT)') %>% + select(c("CGI-Oncogenic Summary", "CGI-Oncogenic Prediction", "CGI-External oncogenic annotation")) %in% + c("oncogenic (predicted)", "driver (oncodriveMUT)") %>% sum() - + driver <- FALSE - if(check > 0){ + if (check > 0) { driver <- TRUE } return(driver) @@ -50,43 +49,42 @@ IsDriver <- function(mutations, annotations){ -#Legacy -#Input: a tree in parent vector format, meaning that the i'th entry of the vector -# is te parent node of the entry i. Nodes are counted from zero and the root is +# Legacy +# Input: a tree in parent vector format, meaning that the i'th entry of the vector +# is te parent node of the entry i. Nodes are counted from zero and the root is # length(Tree) -#Output: A list with three entries: +# Output: A list with three entries: # - the first entry is a vector of nodes tracing back leaf 1 to the root # - the second entry is a vector of nodes tracing back leaf2 to the descendant # of the MRCA in the lineage # - the thirs entry is the MRCA -find_most_recent_common_ancestor <- function(treeParentVectorFormat, leaf1, leaf2){ - ##Trace back the lineage of the tree for one leaf. 
- ##Then trace back the lineage of the tree for the other leaf and for every node - ##whether is lies in the lineage of the first leaf. - ##The first node that does is the most recent common ancestor node. - ##Concatenating these two will form the shortest path through the tree. +find_most_recent_common_ancestor <- function(treeParentVectorFormat, leaf1, leaf2) { + ## Trace back the lineage of the tree for one leaf. + ## Then trace back the lineage of the tree for the other leaf and for every node + ## whether is lies in the lineage of the first leaf. + ## The first node that does is the most recent common ancestor node. + ## Concatenating these two will form the shortest path through the tree. ## If there is a mutation on the tree, then this means that the cells are - ##split by the tree, if there is none, then they aren't. - - ##Note that the nodes and leaves of the tree are encoded from 0 to the number of nodes minus 1 + ## split by the tree, if there is none, then they aren't. + + ## Note that the nodes and leaves of the tree are encoded from 0 to the number of nodes minus 1 ## Therefore, I add 1 to the indices to be compatible with R indication starting at 1 lineage1 <- leaf1 - repeat { - #print(treeParentVectorFormat[lineage1[length(lineage1)] + 1]) + repeat { + # print(treeParentVectorFormat[lineage1[length(lineage1)] + 1]) lineage1 <- c(lineage1, treeParentVectorFormat[lineage1[length(lineage1)] + 1]) - if(lineage1[length(lineage1)] == length(treeParentVectorFormat)) break + if (lineage1[length(lineage1)] == length(treeParentVectorFormat)) break } lineage2 <- leaf2 nextParent <- treeParentVectorFormat[leaf2 + 1] - while(!(nextParent %in% lineage1)) { + while (!(nextParent %in% lineage1)) { lineage2 <- c(lineage2, nextParent) nextParent <- treeParentVectorFormat[nextParent + 1] - #print(nextParent) - #print(!(nextParent %in% lineage1)) - + # print(nextParent) + # print(!(nextParent %in% lineage1)) } MRCA <- nextParent - return(list(lineage1,lineage2, MRCA)) 
+ return(list(lineage1, lineage2, MRCA)) } @@ -97,45 +95,49 @@ find_most_recent_common_ancestor <- function(treeParentVectorFormat, leaf1, leaf # Take a tree and a pair of mutations and do the following: compute_pairwise_distance_of_leaves <- function(treeData, leaf1, leaf2, nCells, nMutations, nClusters, - alleleCount,ClusterID, - mutatedReadCounts, totalReadCounts,wbcStatus){ + alleleCount, ClusterID, + mutatedReadCounts, totalReadCounts, wbcStatus) { tree <- treeData$Tree treeParentVectorFormat <- as.numeric(unlist(strsplit(tree, " "))) dropoutRate <- treeData$DropoutRate seqErrRate <- treeData$SequencingErrorRate - + ### Now I need to compute the best mutation placement on the tree. This is done - ##using the scoreTree C++ function (taken from CTC_treeScoring.cpp). - - #print("Preprocess tree") - ancestorMatrix <- parentVector2ancMatrix(treeParentVectorFormat, - length(treeParentVectorFormat)) - - - #print("Find best Mutation placement") - bestMutationPlacement <- getMutationPlacement (nCells, nMutations, nClusters, - ancestorMatrix, alleleCount, - ClusterID,mutatedReadCounts, - totalReadCounts, - dropoutRate, seqErrRate, 1, - wbcStatus)## This crashes when executed on posterior sampling of Br61 - #print("Finding most recent common ancestor") - pairwiseGenealogy <- findMostRecentCommonAncestor(treeParentVectorFormat, leaf1,leaf2) - + ## using the scoreTree C++ function (taken from CTC_treeScoring.cpp). 
+ + # print("Preprocess tree") + ancestorMatrix <- parentVector2ancMatrix( + treeParentVectorFormat, + length(treeParentVectorFormat) + ) + + + # print("Find best Mutation placement") + bestMutationPlacement <- getMutationPlacement( + nCells, nMutations, nClusters, + ancestorMatrix, alleleCount, + ClusterID, mutatedReadCounts, + totalReadCounts, + dropoutRate, seqErrRate, 1, + wbcStatus + ) ## This crashes when executed on posterior sampling of Br61 + # print("Finding most recent common ancestor") + pairwiseGenealogy <- findMostRecentCommonAncestor(treeParentVectorFormat, leaf1, leaf2) + positionOfMRCA <- which(pairwiseGenealogy[[1]] == pairwiseGenealogy[[3]]) firstLeafToMRCA <- pairwiseGenealogy[[1]][1:(positionOfMRCA)] - + secondLeafToMRCA <- pairwiseGenealogy[[2]] - - #print(firstLeafToMRCA) - #print(secondLeafToMRCA) - pathBetweenLeaves <- c(firstLeafToMRCA,rev(secondLeafToMRCA)) - #print(pathBetweenLeaves) - + + # print(firstLeafToMRCA) + # print(secondLeafToMRCA) + pathBetweenLeaves <- c(firstLeafToMRCA, rev(secondLeafToMRCA)) + # print(pathBetweenLeaves) + ## Now count an output how many of the mutations lie in the shortest path between the leaves. 
- ##This equals the Hamming distance between the inferred Genotypes of two leaves - ##Need to exclude the MRCA for this - + ## This equals the Hamming distance between the inferred Genotypes of two leaves + ## Need to exclude the MRCA for this + return(sum(bestMutationPlacement %in% pathBetweenLeaves[pathBetweenLeaves != firstLeafToMRCA[positionOfMRCA]])) } @@ -161,127 +163,131 @@ compute_pairwise_distance_of_leaves <- function(treeData, leaf1, leaf2, nCells, #' #' @return splittingFraction: The fraction of sampling events for which the pair of cells #' shows branching evolution -#' +#' #' @export #' #' @examples -produce_Distance_Posterior <- function(leaf1, leaf2,postSampling, treeName,nCells, +produce_Distance_Posterior <- function(leaf1, leaf2, postSampling, treeName, nCells, nMutations, nClusters, - alleleCount,ClusterID, - mutatedReadCounts, totalReadCounts,wbcStatus, nSamplingEvents = 20, clusterName = ""){ - + alleleCount, ClusterID, + mutatedReadCounts, totalReadCounts, wbcStatus, nSamplingEvents = 20, clusterName = "") { ## For each row in the posterior Sampling file, the distance of two leaves is computed - + print("Computing the posterior distribution") - + distance_statistics <- parallel::mclapply(postSampling, - FUN = computePairwiseDistanceOfLeavesGivenTree, leaf1,leaf2, - nCells, nMutations,nClusters, alleleCount, - ClusterID, mutatedReadCounts, totalReadCounts, wbcStatus, - nSamplingEvents) - - - dist_histogram <- lapply(distance_statistics, FUN = function(input_list_elements){ + FUN = computePairwiseDistanceOfLeavesGivenTree, leaf1, leaf2, + nCells, nMutations, nClusters, alleleCount, + ClusterID, mutatedReadCounts, totalReadCounts, wbcStatus, + nSamplingEvents + ) + + + dist_histogram <- lapply(distance_statistics, FUN = function(input_list_elements) { return(input_list_elements[1]) }) %>% unlist() - - totalNumberOfSplits <- lapply(distance_statistics, FUN = function(input_list_elements){ + + totalNumberOfSplits <- lapply(distance_statistics, FUN 
= function(input_list_elements) { return(input_list_elements[2]) - }) %>% unlist %>% sum() - - StatisticsOfMutationPlacement <- lapply(distance_statistics, FUN = function(input_list_elements){ + }) %>% + unlist() %>% + sum() + + StatisticsOfMutationPlacement <- lapply(distance_statistics, FUN = function(input_list_elements) { return(input_list_elements[3]) - }) %>% unlist - - + }) %>% unlist() + + totalNumberOfSamplingEvents <- nSamplingEvents * length(postSampling) - - - -# median_dist <- median(dist_histogram) - - - - -# plot( -# ggplot(data.frame(HammingDistance = dist_histogram), aes(x = HammingDistance)) + -# geom_histogram(binwidth = 1, fill = "skyblue", color = "skyblue", alpha = 0.7)+ -# xlab(sprintf("genetic distance between leaf %d and leaf %d", leaf1, leaf2)) + ylab("total count") + -# ggtitle("Posterior sampling of genetic distances") + -# geom_vline(xintercept = median_dist,color = "red", linetype = "dashed", linewidth = 1) + -# labs(subtitle = sprintf("Tree %s - %s", treeName, clusterName),caption = "median indicated by dashed red line") + -# theme_minimal() + -# theme( -# plot.title = element_text(size = 20, face = "bold"), -# axis.title.x = element_text(size = 18), -# axis.title.y = element_text(size = 18), -# plot.subtitle = element_text(size= 18), -# axis.text = element_text(size = 16) -# ) -# ) - - + + + + # median_dist <- median(dist_histogram) + + + + + # plot( + # ggplot(data.frame(HammingDistance = dist_histogram), aes(x = HammingDistance)) + + # geom_histogram(binwidth = 1, fill = "skyblue", color = "skyblue", alpha = 0.7)+ + # xlab(sprintf("genetic distance between leaf %d and leaf %d", leaf1, leaf2)) + ylab("total count") + + # ggtitle("Posterior sampling of genetic distances") + + # geom_vline(xintercept = median_dist,color = "red", linetype = "dashed", linewidth = 1) + + # labs(subtitle = sprintf("Tree %s - %s", treeName, clusterName),caption = "median indicated by dashed red line") + + # theme_minimal() + + # theme( + # plot.title = 
element_text(size = 20, face = "bold"), + # axis.title.x = element_text(size = 18), + # axis.title.y = element_text(size = 18), + # plot.subtitle = element_text(size= 18), + # axis.text = element_text(size = 16) + # ) + # ) + + data <- data.frame(StatisticsOfMutationPlacement = StatisticsOfMutationPlacement) - - + + sum(is.na(data$StatisticsOfMutationPlacement)) class(data$StatisticsOfMutationPlacement) - - + + ggplot(data = data, aes(x = StatisticsOfMutationPlacement, y = 1)) + geom_point() - - + + tryCatch( expr = { plot( ggplot(data, aes(x = StatisticsOfMutationPlacement)) + - geom_histogram(bins = 10, fill = "skyblue", color = "skyblue", alpha = 0.7)+ - xlab("S") + ylab("total count") + + geom_histogram(bins = 10, fill = "skyblue", color = "skyblue", alpha = 0.7) + + xlab("S") + + ylab("total count") + ggtitle("Posterior sampling of branching probabilites") + - geom_vline(xintercept = mean(StatisticsOfMutationPlacement),color = "blue", linetype = "dashed", linewidth = 1) + - labs(subtitle = sprintf("Tree %s - %s", treeName, clusterName),caption = "mean indicated by dashed blue line") + + geom_vline(xintercept = mean(StatisticsOfMutationPlacement), color = "blue", linetype = "dashed", linewidth = 1) + + labs(subtitle = sprintf("Tree %s - %s", treeName, clusterName), caption = "mean indicated by dashed blue line") + theme_minimal() + theme( plot.title = element_text(size = 20, face = "bold"), axis.title.x = element_text(size = 18), axis.title.y = element_text(size = 18), - plot.subtitle = element_text(size= 18), - axis.text = element_text(size = 16) + plot.subtitle = element_text(size = 18), + axis.text = element_text(size = 16) ) - ) + ) }, - error = function(e){ + error = function(e) { plot( ggplot(data, aes(x = log(StatisticsOfMutationPlacement))) + - geom_histogram(bins = 10, fill = "skyblue", color = "skyblue", alpha = 0.7)+ - xlab("Maximal probability of branching evolution") + ylab("total count") + + geom_histogram(bins = 10, fill = "skyblue", color = 
"skyblue", alpha = 0.7) + + xlab("Maximal probability of branching evolution") + + ylab("total count") + ggtitle("Posterior sampling of branching probabilites - Logarithmic Scale") + - geom_vline(xintercept = log(mean(StatisticsOfMutationPlacement)),color = "blue", linetype = "dashed", linewidth = 1) + - labs(subtitle = sprintf("Tree %s - %s", treeName, clusterName),caption = "mean indicated by dashed red line") + + geom_vline(xintercept = log(mean(StatisticsOfMutationPlacement)), color = "blue", linetype = "dashed", linewidth = 1) + + labs(subtitle = sprintf("Tree %s - %s", treeName, clusterName), caption = "mean indicated by dashed red line") + theme_minimal() + theme( plot.title = element_text(size = 20, face = "bold"), axis.title.x = element_text(size = 18), axis.title.y = element_text(size = 18), - plot.subtitle = element_text(size= 18), - axis.text = element_text(size = 16) + plot.subtitle = element_text(size = 18), + axis.text = element_text(size = 16) ) ) } ) - - - - - return(list(splittingFraction = totalNumberOfSplits/totalNumberOfSamplingEvents, branchingStatistics = StatisticsOfMutationPlacement)) + + + + + return(list(splittingFraction = totalNumberOfSplits / totalNumberOfSamplingEvents, branchingStatistics = StatisticsOfMutationPlacement)) } -#' This function identifies cells that belong to the same CTC cluster - also -#' those which have been physically split. For each pair of tumour cells from the +#' This function identifies cells that belong to the same CTC cluster - also +#' those which have been physically split. For each pair of tumour cells from the #' same CTC cluster, the distnace postior is computed. #' #' @param sampleDescription A data frame with the description of each sample. @@ -300,14 +306,14 @@ produce_Distance_Posterior <- function(leaf1, leaf2,postSampling, treeName,nCell #' @param totalReadCounts A tibble containing the total read counts. #' @param nMutationSamplingEvents The number of mutation that should be sampled #' per tree. 
-#' @param nTreeSamplingEvents The number of trees that should be sampled. +#' @param nTreeSamplingEvents The number of trees that should be sampled. #' @param cellPairSelection An optional parameter that takes a list of -#' pairs of strings-valued names of cells that should be analysed (the names as in the +#' pairs of strings-valued names of cells that should be analysed (the names as in the #' samples_nodeDescription.tsv file). #' It can also take a character vector, in which case the entries should be the color coded #' names of the clusters. #' -#' @return splittinProbs a vector that gives for each pair of cells the fraction of +#' @return splittinProbs a vector that gives for each pair of cells the fraction of #' trees for which they split #' aggregatedBranchingProbabilities: a vector of aggregated probabilities for all considered #' pairs of leaves and all sampled trees. At the moment only implement if cellPairSelection @@ -320,117 +326,115 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName, nCel alleleCount, mutatedReadCounts, totalReadCounts, nMutationSamplingEvents = 1000, nTreeSamplingEvents = 500, - cellPairSelection = NA){ - + cellPairSelection = NA) { desired_values <- sample(1:length(postSampling), size = nTreeSamplingEvents, replace = FALSE) %>% sort() postSampling <- postSampling[desired_values] splittingProbs <- matrix(0, nrow = 0, ncol = 2) %>% as.data.frame() colnames(splittingProbs) <- c("Cluster", "Splitting_probability") aggregatedProbabilities <- vector() - if(class(cellPairSelection) == "list"){ + if (class(cellPairSelection) == "list") { counter <- 1 system.time( - for (it in cellPairSelection){ - leaf1 <- which(sampleDescription$ClusterName == it[1])-1 - leaf2 <- which(sampleDescription$ClusterName == it[2])-1 - - print(paste(paste("Computing genomic distances of leaves:", leaf1, sep = " "), leaf2, sep = " ")) - posterior <- produce_Distance_Posterior(leaf1,leaf2,postSampling, treeName, nCells, - nMutations, 
nClusters, - alleleCount,sampleDescription$Cluster, - mutatedReadCounts, totalReadCounts,sampleDescription$WBC, nSamplingEvents = nMutationSamplingEvents) - splittingProbs <- rbind(splittingProbs, data.frame(Cluster = as.character(counter), Splitting_probability = posterior$splittingFraction)) - aggregatedProbabilities <- c(aggregatedProbabilities, posterior$branchingStatistics) - counter <- counter + 1 - } + for (it in cellPairSelection) { + leaf1 <- which(sampleDescription$ClusterName == it[1]) - 1 + leaf2 <- which(sampleDescription$ClusterName == it[2]) - 1 + + print(paste(paste("Computing genomic distances of leaves:", leaf1, sep = " "), leaf2, sep = " ")) + posterior <- produce_Distance_Posterior(leaf1, leaf2, postSampling, treeName, nCells, + nMutations, nClusters, + alleleCount, sampleDescription$Cluster, + mutatedReadCounts, totalReadCounts, sampleDescription$WBC, + nSamplingEvents = nMutationSamplingEvents + ) + splittingProbs <- rbind(splittingProbs, data.frame(Cluster = as.character(counter), Splitting_probability = posterior$splittingFraction)) + aggregatedProbabilities <- c(aggregatedProbabilities, posterior$branchingStatistics) + counter <- counter + 1 + } ) - } - - else if(class(cellPairSelection) == 'character'){ - CTCclusters <- unique(cellPairSelection) - CTCclusters <- CTCclusters[!(CTCclusters %in% c("ghostwhite","gray93"))] - system.time( - for(it in CTCclusters){ - cellsInCluster <- which(sampleDescription$color %in% it)-1 ## Make sure array indication is - ## compatible with cpp - cluster_done <- 0 - for(i in cellsInCluster){ - if(cluster_done == 1){ - cluster_done <- 0 + } else if (class(cellPairSelection) == "character") { + CTCclusters <- unique(cellPairSelection) + CTCclusters <- CTCclusters[!(CTCclusters %in% c("ghostwhite", "gray93"))] + system.time( + for (it in CTCclusters) { + cellsInCluster <- which(sampleDescription$color %in% it) - 1 ## Make sure array indication is + ## compatible with cpp + cluster_done <- 0 + for (i in 
cellsInCluster) { + if (cluster_done == 1) { + cluster_done <- 0 + break + } + if (sampleDescription$WBC[i + 1] == 1) next + j <- cellsInCluster[1] + while (j < i) { + if (cluster_done == 1) { break } - if(sampleDescription$WBC[i+1] == 1) next - j <- cellsInCluster[1] - while(j < i){ - if(cluster_done == 1){ - break - } - if(sampleDescription$WBC[j+1] == 1){ - j <- j + 1 - next - } - print(paste(paste("Computing genomic distances of leaves:", i, sep = " "), j, sep = " ")) - posterior <- produce_Distance_Posterior(i,j,postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount,sampleDescription$Cluster, - mutatedReadCounts, totalReadCounts,sampleDescription$WBC, nSamplingEvents = nMutationSamplingEvents, clusterName = it) - splittingProbs <- rbind(splittingProbs, data.frame(Cluster = it,Splitting_probability = posterior$splittingFraction)) - aggregatedProbabilities <- c(aggregatedProbabilities, posterior$branchingStatistics) + if (sampleDescription$WBC[j + 1] == 1) { j <- j + 1 - cluster_done <- 1 + next } + print(paste(paste("Computing genomic distances of leaves:", i, sep = " "), j, sep = " ")) + posterior <- produce_Distance_Posterior(i, j, postSampling, treeName, nCells, + nMutations, nClusters, + alleleCount, sampleDescription$Cluster, + mutatedReadCounts, totalReadCounts, sampleDescription$WBC, + nSamplingEvents = nMutationSamplingEvents, clusterName = it + ) + splittingProbs <- rbind(splittingProbs, data.frame(Cluster = it, Splitting_probability = posterior$splittingFraction)) + aggregatedProbabilities <- c(aggregatedProbabilities, posterior$branchingStatistics) + j <- j + 1 + cluster_done <- 1 } - } - ) - } - - - else{ + } + ) + } else { CTCclusters <- unique(sampleDescription$color) - CTCclusters <- CTCclusters[!(CTCclusters %in% c("ghostwhite","gray93"))] + CTCclusters <- CTCclusters[!(CTCclusters %in% c("ghostwhite", "gray93"))] system.time( - for(it in CTCclusters){ - cellsInCluster <- which(sampleDescription$color %in% it)-1 ## Make sure 
array indication is + for (it in CTCclusters) { + cellsInCluster <- which(sampleDescription$color %in% it) - 1 ## Make sure array indication is ## compatible with cpp - #cluster_done <- 0 - for(i in cellsInCluster){ + # cluster_done <- 0 + for (i in cellsInCluster) { # if(cluster_done == 1){ # cluster_done <- 0 # break - #} - if(sampleDescription$WBC[i+1] == 1) next + # } + if (sampleDescription$WBC[i + 1] == 1) next j <- cellsInCluster[1] - while(j < i){ - #if(cluster_done == 1){ + while (j < i) { + # if(cluster_done == 1){ # break - #} - if(sampleDescription$WBC[j+1] == 1){ + # } + if (sampleDescription$WBC[j + 1] == 1) { j <- j + 1 next } print(paste(paste("Computing genomic distances of leaves:", i, sep = " "), j, sep = " ")) - posterior <- produce_Distance_Posterior(i,j,postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount,sampleDescription$Cluster, - mutatedReadCounts, totalReadCounts,sampleDescription$WBC, nSamplingEvents = nMutationSamplingEvents, clusterName = it) - splittingProbs <- rbind(splittingProbs, data.frame(Cluster = it,Splitting_probability = posterior$splittingFraction)) + posterior <- produce_Distance_Posterior(i, j, postSampling, treeName, nCells, + nMutations, nClusters, + alleleCount, sampleDescription$Cluster, + mutatedReadCounts, totalReadCounts, sampleDescription$WBC, + nSamplingEvents = nMutationSamplingEvents, clusterName = it + ) + splittingProbs <- rbind(splittingProbs, data.frame(Cluster = it, Splitting_probability = posterior$splittingFraction)) j <- j + 1 - #cluster_done <- 1 + # cluster_done <- 1 } } - } ) } plot( - splittingProbs %>% group_by(Cluster) %>% summarize(meanSplittingProbability = mean(Splitting_probability)) %>% - ggplot(aes(x = Cluster, y = meanSplittingProbability)) + - geom_col() + - theme_minimal() + splittingProbs %>% group_by(Cluster) %>% summarize(meanSplittingProbability = mean(Splitting_probability)) %>% + ggplot(aes(x = Cluster, y = meanSplittingProbability)) + + geom_col() + + 
theme_minimal() ) - + return(list(splittingProbs = splittingProbs, aggregatedBranchingProbabilities = aggregatedProbabilities)) } @@ -443,124 +447,140 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName, nCel # Loads all necessary data for the CTC-project. # Specifically it return a named list as follows: -# postSampling: Loads the posterior sampling tsv as a list of named vectors with the +# postSampling: Loads the posterior sampling tsv as a list of named vectors with the # following columns: # the (unnormalised) LogScore, estimated sequencing error rate, -# the estimated dropout rate, logTau and the Tree in parent vector +# the estimated dropout rate, logTau and the Tree in parent vector # format meaning that the i'th entry of the vector -# is te parent node of the entry i. Nodes are counted from zero and the root is +# is te parent node of the entry i. Nodes are counted from zero and the root is # length(Tree) # nClusters: The total number of CTC-clusters # ClusterID: The Cluster-ID maps cells identities to the cell-clusters they belong to. 
-# The i-th entry having value x means that +# The i-th entry having value x means that # Cells i is in the j-th cluster (7th row in the description file) # nCells: total number of Cells in the experiment # nMutations: Total number of considered mutations in the dataset # nClusters: Total number of CTC-clusters in the experiment # alleleCount: Total number of cells per Cluster*2 -# ClusterID: Number +# ClusterID: Number # mutatedReadCounts: A vector containing the number of of mutated reads for each # cluster # total Read Counts: vector containingtotal read count for each cluster -# wbcStatus: A vector that has value 1 if cell i is a white blood cells and +# wbcStatus: A vector that has value 1 if cell i is a white blood cells and # 0 else -load_data <- function(inputFolder, treeName){ - ##Define paths - - posteriorSamplingFile <- sprintf("%s/%s/%s_postSampling.tsv", inputFolder, treeName,treeName) - - countFile <- sprintf("%s/%s/%s.txt", inputFolder, treeName,treeName) +load_data <- function(inputFolder, treeName) { + ## Define paths + + posteriorSamplingFile <- sprintf("%s/%s/%s_postSampling.tsv", inputFolder, treeName, treeName) + + countFile <- sprintf("%s/%s/%s.txt", inputFolder, treeName, treeName) descriptionFile <- sprintf("%s/%s/%s_samples_nodeDescription.tsv", inputFolder, treeName, treeName) - - - ##Load data - + + + ## Load data + postSampling <- read_delim(posteriorSamplingFile, - delim = "\t", col_names = c("LogScore", "SequencingErrorRate","DropoutRate", "LogTau", "Tree")) + delim = "\t", col_names = c("LogScore", "SequencingErrorRate", "DropoutRate", "LogTau", "Tree") + ) postSampling <- split(postSampling, seq(nrow(postSampling))) - - + + counts <- read_delim(countFile, - delim = "\t", col_names = FALSE) + delim = "\t", col_names = FALSE + ) description <- read_delim(descriptionFile, - delim = "\t", col_names = c("Cluster", "CellCount", "TCs", "WBCs", "Description")) + delim = "\t", col_names = c("Cluster", "CellCount", "TCs", "WBCs", 
"Description") + ) nCells <- sum(description$CellCount) - nClusters <- nrow(description) + nClusters <- nrow(description) nMutations <- nrow(counts) - alleleCount <- description$CellCount*2 - - + alleleCount <- description$CellCount * 2 + + description <- description %>% mutate(color = regmatches(Description, regexpr("color=([a-zA-Z]+[0-9]*)", Description)) %>% substr(start = 7, stop = (nchar(.)))) - - - + + + ClusterID <- vector() - for(i in 1:nClusters) ClusterID <- c(ClusterID, rep.int(i-1,description$CellCount[i])) + for (i in 1:nClusters) ClusterID <- c(ClusterID, rep.int(i - 1, description$CellCount[i])) ## Note that Cpp counts arrays from zero, so the cluster IDs are counted likewise ## in order to be compatible with Cpp code. - - ##Pull apart the count file into counts for mutated read and total counts respectively - mutatedReadCounts <- matrix(0,nrow = nMutations, ncol = 0) - for (j in 1:nClusters){ - mutatedReadCounts <- cbind(mutatedReadCounts,counts[,4+2*j]) + + ## Pull apart the count file into counts for mutated read and total counts respectively + mutatedReadCounts <- matrix(0, nrow = nMutations, ncol = 0) + for (j in 1:nClusters) { + mutatedReadCounts <- cbind(mutatedReadCounts, counts[, 4 + 2 * j]) } - - totalReadCounts <- matrix(0,nrow = nMutations, ncol = 0) - for (j in 1:nClusters){ - totalReadCounts <- cbind(totalReadCounts,counts[,4+2*j-1]) + + totalReadCounts <- matrix(0, nrow = nMutations, ncol = 0) + for (j in 1:nClusters) { + totalReadCounts <- cbind(totalReadCounts, counts[, 4 + 2 * j - 1]) } - - - wildtypeReadCounts <- totalReadCounts - mutatedReadCounts - - - mutatedReadCounts <- mutatedReadCounts %>% t() %>% as.data.frame() %>% as.list() - wildtypeReadCounts <- wildtypeReadCounts %>% t() %>% as.data.frame() %>% as.list() - totalReadCounts <- totalReadCounts %>% t() %>% as.data.frame() %>% as.list() - - - mutationDescription <- counts[,1:4] - - ##wbc status indicates which of the cells is a white blood cells and which one isn't. 
- ##So far, the cells are arbitrary, and I will assign the fist cells from a cluster to be WBCs. + + + wildtypeReadCounts <- totalReadCounts - mutatedReadCounts + + + mutatedReadCounts <- mutatedReadCounts %>% + t() %>% + as.data.frame() %>% + as.list() + wildtypeReadCounts <- wildtypeReadCounts %>% + t() %>% + as.data.frame() %>% + as.list() + totalReadCounts <- totalReadCounts %>% + t() %>% + as.data.frame() %>% + as.list() + + + mutationDescription <- counts[, 1:4] + + ## wbc status indicates which of the cells is a white blood cells and which one isn't. + ## So far, the cells are arbitrary, and I will assign the fist cells from a cluster to be WBCs. wbcStatus <- rep(0, nCells) - - for(i in 1:nClusters){ + + for (i in 1:nClusters) { j <- 1 - while(j <= description$WBCs[i]){ #Iterating over the number of White blood cells of a cluster - wbcStatus[which(ClusterID == i-1)[1] + j-1] <- 1 # and identifying the first cell + while (j <= description$WBCs[i]) { # Iterating over the number of White blood cells of a cluster + wbcStatus[which(ClusterID == i - 1)[1] + j - 1] <- 1 # and identifying the first cell # that belongs to a cluster and counting from then on - ## Note: The cluster IDs are counted from zero! - j<- j+1 + ## Note: The cluster IDs are counted from zero! 
+ j <- j + 1 } } - - sample_description <- data.frame(Cluster = ClusterID, - ClusterName = description$Cluster[ClusterID + 1], - WBC = wbcStatus, - color = description$color[ClusterID + 1]) - + + sample_description <- data.frame( + Cluster = ClusterID, + ClusterName = description$Cluster[ClusterID + 1], + WBC = wbcStatus, + color = description$color[ClusterID + 1] + ) + sample_description <- sample_description %>% mutate(single_cell = !(duplicated(Cluster)) & !(duplicated(Cluster, fromLast = TRUE))) - - -# annotations <- read_delim('../../input_folder/filtered/CGI/LM2_cgi/alterations.tsv', delim = '\t') - - return(list("postSampling" = postSampling, "nClusters" = nClusters, - "clusterID" = ClusterID, "nCells" = nCells, - "nMutations" = nMutations, "nClusters" = nClusters, - "alleleCount" = alleleCount, - "mutatedReadCounts" = mutatedReadCounts, - "totalReadCounts" = totalReadCounts, "wbcStatus" = wbcStatus, - "sample_description" = sample_description, - "mutationDescription" = mutationDescription, -# "annotations" = annotations, - "sampleName" = treeName, "directory" = inputFolder)) + + + # annotations <- read_delim('../../input_folder/filtered/CGI/LM2_cgi/alterations.tsv', delim = '\t') + + return(list( + "postSampling" = postSampling, "nClusters" = nClusters, + "clusterID" = ClusterID, "nCells" = nCells, + "nMutations" = nMutations, "nClusters" = nClusters, + "alleleCount" = alleleCount, + "mutatedReadCounts" = mutatedReadCounts, + "totalReadCounts" = totalReadCounts, "wbcStatus" = wbcStatus, + "sample_description" = sample_description, + "mutationDescription" = mutationDescription, + # "annotations" = annotations, + "sampleName" = treeName, "directory" = inputFolder + )) } @@ -575,103 +595,105 @@ load_data <- function(inputFolder, treeName){ #' lies in the 1% quantile of the set of all pairwise genetic distances. #' As the distance the Hamming distance is chosen. 
#' -#' @param inputFolder -#' @param treeName +#' @param inputFolder +#' @param treeName #' #' @return #' monoclonal_pairs: A list of pairs of cell names that are similar to each other. #' distance_matrix: A matrix indicates all pairwise distnaces of suggested genotypes. #' full_distance_matrix: The full pairwise distance matrix of all genotypes. -#' -#' +#' +#' #' @export #' #' @examples -load_monoclonal_pairs <- function(inputFolder, treeName, cutoff = ""){ - data_file <- sprintf("%s/%s/%s_genotypes.ped", inputFolder, treeName,treeName) - - data <- read_delim(data_file, delim = '\t',col_names = FALSE) +load_monoclonal_pairs <- function(inputFolder, treeName, cutoff = "") { + data_file <- sprintf("%s/%s/%s_genotypes.ped", inputFolder, treeName, treeName) + + data <- read_delim(data_file, delim = "\t", col_names = FALSE) data2 <- data %>% select(!2:6) - + distance_matrix <- matrix(0, nrow = nrow(data2), ncol = nrow(data2)) - - - for(i in 1:nrow(data2)){ + + + for (i in 1:nrow(data2)) { j <- 1 - while(j < i){ - row_i <- data2 %>% select(!1) %>% slice(i) - row_j <- data2 %>% select(!1) %>% slice(j) - - distance_matrix[i,j] <- sum(!(row_i == row_j)) - j <- j+1 + while (j < i) { + row_i <- data2 %>% + select(!1) %>% + slice(i) + row_j <- data2 %>% + select(!1) %>% + slice(j) + + distance_matrix[i, j] <- sum(!(row_i == row_j)) + j <- j + 1 } } - - + + distance_vector <- as.vector(distance_matrix[lower.tri(distance_matrix)]) - - - - if(class(cutoff) == "numeric"){ - monoclonal_candidate_cutoff <- cutoff - } - else{ + + + + if (class(cutoff) == "numeric") { + monoclonal_candidate_cutoff <- cutoff + } else { monoclonal_candidate_cutoff <- quantile(distance_vector, probs = 0.01) } - - + + sum(distance_vector <= monoclonal_candidate_cutoff) which(distance_vector <= monoclonal_candidate_cutoff) - + print("1% quantile of genetic distances:") print(monoclonal_candidate_cutoff) - + plot( - ggplot(data.frame(x = distance_vector),aes(x = x))+ - geom_histogram(binwidth = 2) + - 
geom_vline(xintercept = monoclonal_candidate_cutoff, linetype = "dashed", color = "red") + ggplot(data.frame(x = distance_vector), aes(x = x)) + + geom_histogram(binwidth = 2) + + geom_vline(xintercept = monoclonal_candidate_cutoff, linetype = "dashed", color = "red") ) - - + + candidates <- list() candidate_index <- vector() iterator <- 0 number_of_output_pairs <- 15 - for (count in 0:monoclonal_candidate_cutoff){ + for (count in 0:monoclonal_candidate_cutoff) { all_elements <- which(distance_matrix == count) all_elements_list <- list() - for (it in all_elements){ - coordinates1 <- ((it-1) %% dim(distance_matrix)[2]) + 1 - coordinates2 <- ((it-1) %/% dim(distance_matrix)[2]) + 1 + for (it in all_elements) { + coordinates1 <- ((it - 1) %% dim(distance_matrix)[2]) + 1 + coordinates2 <- ((it - 1) %/% dim(distance_matrix)[2]) + 1 all_elements_list <- append(all_elements_list, list(c(coordinates1, coordinates2))) } - - for (it in all_elements_list){ - if(it[1] <= it[2]) next - - - #Check whether the candidate pair of cells consists of single tumour cells: - - - - candidates <- c(candidates,list(c(as.character(data2[it[1],1]),as.character(data2[it[2],1])))) + + for (it in all_elements_list) { + if (it[1] <= it[2]) next + + + # Check whether the candidate pair of cells consists of single tumour cells: + + + + candidates <- c(candidates, list(c(as.character(data2[it[1], 1]), as.character(data2[it[2], 1])))) candidate_index <- c(candidate_index, it[1], it[2]) iterator <- iterator + 1 - - if(iterator == number_of_output_pairs) break + + if (iterator == number_of_output_pairs) break } - if(iterator == number_of_output_pairs) break - } - if(length(unique(sort(candidate_index)))!=0){ - distance_matrix2 <- distance_matrix[unique(sort(candidate_index)),unique(sort(candidate_index))] - colnames(distance_matrix2) <- data2[unique(sort(candidate_index)),1]$X1 + if (iterator == number_of_output_pairs) break } - else{ + if (length(unique(sort(candidate_index))) != 0) { + 
distance_matrix2 <- distance_matrix[unique(sort(candidate_index)), unique(sort(candidate_index))] + colnames(distance_matrix2) <- data2[unique(sort(candidate_index)), 1]$X1 + } else { distance_matrix2 <- 0 } - - + + distance_matrix <- as.data.frame(distance_matrix) colnames(distance_matrix) <- data2$X1 rownames(distance_matrix) <- data2$X1 @@ -681,8 +703,8 @@ load_monoclonal_pairs <- function(inputFolder, treeName, cutoff = ""){ -callGenotypes <- function(){ - +callGenotypes <- function() { + } @@ -691,7 +713,7 @@ callGenotypes <- function(){ #################### -##Helper Functions## +## Helper Functions## #################### #' Forking-based parallelisation. @@ -711,7 +733,7 @@ callGenotypes <- function(){ #' array of “rank” (==length(dim(.))) one higher than the result of FUN(X[[i]]). #' @param USE.NAMES logical; if TRUE and if X is character, use X as names for #' the result unless it had names already. Since this argument follows ... its -#' name cannot be abbreviated. +#' name cannot be abbreviated. #' #' @returnFor sapply(simplify = TRUE) and replicate(simplify = TRUE): if X has #' length zero or n = 0, an empty list. Otherwise an atomic vector or matrix or @@ -723,12 +745,12 @@ callGenotypes <- function(){ #' @export #' #' @examples -#mcsapply <- function (X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE) { +# mcsapply <- function (X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE) { # FUN <- match.fun(FUN) # answer <- parallel::mclapply(X = X, FUN = FUN, ...) 
-# if (USE.NAMES && is.character(X) && is.null(names(answer))) +# if (USE.NAMES && is.character(X) && is.null(names(answer))) # names(answer) <- X -# if (!isFALSE(simplify) && length(answer)) +# if (!isFALSE(simplify) && length(answer)) # simplify2array(answer, higher = (simplify == "array")) # else answer -#} +# } diff --git a/Rcode/simulateCTCclusters.R b/Rcode/simulateCTCclusters.R index 5ba38dc..82961e6 100644 --- a/Rcode/simulateCTCclusters.R +++ b/Rcode/simulateCTCclusters.R @@ -1,4 +1,4 @@ -source('functions.R') +source("functions.R") library(viridis) library(VGAM) library(pscl) @@ -6,7 +6,7 @@ library(MASS) library(boot) ############ -#Config +# Config ############ inputFolder <- "../../projects/CTC_backup/input_folder" treeName <- "Br23" @@ -17,57 +17,59 @@ input <- load_data(inputFolder, treeName) ############ -#Exploratory data analysis +# Exploratory data analysis ############ -#input <- load_data(inputFolder, treeName) -#totalReadCounts <- input$totalReadCounts -#sampleDescription <- input$sample_description +# input <- load_data(inputFolder, treeName) +# totalReadCounts <- input$totalReadCounts +# sampleDescription <- input$sample_description -#totalReadCountVector <- totalReadCounts %>% unlist() +# totalReadCountVector <- totalReadCounts %>% unlist() -sum(totalReadCountVector == 0)/length(totalReadCountVector) +sum(totalReadCountVector == 0) / length(totalReadCountVector) -#fit1 <- glm(totalReadCountVector ~ 1, family = poisson(link = 'log')) +# fit1 <- glm(totalReadCountVector ~ 1, family = poisson(link = 'log')) fit2 <- glm.nb(totalReadCountVector ~ 1) -fit3 <- zeroinfl(totalReadCountVector ~1, dist = 'negbin') -#fit4 <- zeroinfl(totalReadCountVector ~1, dist = 'poisson') +fit3 <- zeroinfl(totalReadCountVector ~ 1, dist = "negbin") +# fit4 <- zeroinfl(totalReadCountVector ~1, dist = 'poisson') -#summary(fit1) +# summary(fit1) summary(fit2) summary(fit3) -#summary(fit4) -#exp(coef(fit)) -#coef(fit2) +# summary(fit4) +# exp(coef(fit)) +# coef(fit2) 
-#exp(summary(fit3)$coefficients$zero[1]) +# exp(summary(fit3)$coefficients$zero[1]) -#parameterOfNegBinom <- exp(summary(fit3)$coefficients$count[,1]) +# parameterOfNegBinom <- exp(summary(fit3)$coefficients$count[,1]) -#exp(coef(fit3)) -#summary(fit4) +# exp(coef(fit3)) +# summary(fit4) -#sim() <- +# sim() <- simNew <- ifelse(rbinom(length(totalReadCountVector), size = 1, prob = exp(coef(fit3))[2]) > 0, - 0, rnegbin(length(totalReadCountVector), exp(coef(fit3))[1], theta = exp(-0.76961))) + 0, rnegbin(length(totalReadCountVector), exp(coef(fit3))[1], theta = exp(-0.76961)) +) sim <- data.frame(sim = vector(), run = vector()) -for(i in 1:100){ +for (i in 1:100) { simNew <- ifelse(rbinom(length(totalReadCountVector), size = 1, prob = exp(coef(fit3))[2]) > 0, - 0, rnegbin(length(totalReadCountVector), exp(coef(fit3))[1], theta = exp(-0.76961))) - - #simNew <- rnegbin(length(totalReadCountVector), exp(coef(fit2)), theta = 0.9222) + 0, rnegbin(length(totalReadCountVector), exp(coef(fit3))[1], theta = exp(-0.76961)) + ) + + # simNew <- rnegbin(length(totalReadCountVector), exp(coef(fit2)), theta = 0.9222) sim <- rbind(sim, data.frame(sim = simNew, run = i)) } @@ -75,9 +77,9 @@ for(i in 1:100){ sim <- rbind(sim, data.frame(sim = totalReadCountVector, run = 0)) sim %>% - ggplot(aes(x = sim, group = run)) + - geom_histogram(data = sim[sim$run == 0,], alpha = 0.4, color = 'darkseagreen', fill = 'darkseagreen') + - geom_freqpoly(data = sim[sim$run != 0,], aes(x = sim), color = 'red', position = 'identity', alpha = 0.4) + ggplot(aes(x = sim, group = run)) + + geom_histogram(data = sim[sim$run == 0, ], alpha = 0.4, color = "darkseagreen", fill = "darkseagreen") + + geom_freqpoly(data = sim[sim$run != 0, ], aes(x = sim), color = "red", position = "identity", alpha = 0.4) @@ -87,25 +89,24 @@ sim %>% #' #' @param input The loaded dataset #' @param zeroInfl If this boolean value is FALSE, then a negative binomial is fit to the data -#' +#' #' #' @return The parameters of the 
distribution. If zeroInfl is false, then the zero probability #' is set to 0. #' @export #' #' @examples -fitReadCountDistribution <- function(input, zeroInfl = TRUE){ +fitReadCountDistribution <- function(input, zeroInfl = TRUE) { totalReadCounts <- input$totalReadCounts sampleDescription <- input$sample_description totalReadCountVector <- totalReadCounts %>% unlist() - - if(zeroInfl == TRUE){ - fit <- zeroinfl(totalReadCountVector ~ 1, dist = 'negbin') - return(list(zeroProb = inv.logit(summary(fit)$coefficients$zero[1]), theta = exp(summary(fit)$coefficients$count[2,1]), expValue = exp(summary(fit)$coefficients$count[1,1]) )) - } - else{ - fit <- glm.nb(totalReadCountVector ~ 1) - return(list(zeroProb = 0, theta = summary(fit)$theta, expValue = exp(coef(fit))) ) + + if (zeroInfl == TRUE) { + fit <- zeroinfl(totalReadCountVector ~ 1, dist = "negbin") + return(list(zeroProb = inv.logit(summary(fit)$coefficients$zero[1]), theta = exp(summary(fit)$coefficients$count[2, 1]), expValue = exp(summary(fit)$coefficients$count[1, 1]))) + } else { + fit <- glm.nb(totalReadCountVector ~ 1) + return(list(zeroProb = 0, theta = summary(fit)$theta, expValue = exp(coef(fit)))) } } @@ -114,8 +115,8 @@ fitReadCountDistribution <- function(input, zeroInfl = TRUE){ -#comparing different models it looks like a zero-inflated beta binomial model is -#appropriate to simulate read counts The coefficients are determined in fit3 +# comparing different models it looks like a zero-inflated beta binomial model is +# appropriate to simulate read counts The coefficients are determined in fit3 #' From a number of wildtyoe and mutated genotypes, read counts are simulated as #' follows: @@ -127,48 +128,47 @@ fitReadCountDistribution <- function(input, zeroInfl = TRUE){ #' model, given the total read counts sampled in step 2. #' 4. Each allele may flip its genotype at rate "errorRate". 
#' -#' @param nWildtypeAlleles -#' @param nMutatedAlleles -#' @param dropoutRate -#' @param errorRate -#' @param mu -#' @param theta +#' @param nWildtypeAlleles +#' @param nMutatedAlleles +#' @param dropoutRate +#' @param errorRate +#' @param mu +#' @param theta #' #' @return A pair of read counts; the first one being the total number of reads -#' and the second one being the number of mutated reads. +#' and the second one being the number of mutated reads. #' @export #' #' @examples -simulateReads <- function(nWildtypeAlleles,nMutatedAlleles, dropoutRate, errorRate, readCountFit){ - #draw from a binomial model to simulate dropouts - nWildtypeAlleles <- rbinom(1, size = nWildtypeAlleles,prob = (1-dropoutRate)) - nMutatedAlleles <- rbinom(1, size = nMutatedAlleles, prob = (1-dropoutRate)) - - - - - #draw from a negative-binomial to simulate the total read count +simulateReads <- function(nWildtypeAlleles, nMutatedAlleles, dropoutRate, errorRate, readCountFit) { + # draw from a binomial model to simulate dropouts + nWildtypeAlleles <- rbinom(1, size = nWildtypeAlleles, prob = (1 - dropoutRate)) + nMutatedAlleles <- rbinom(1, size = nMutatedAlleles, prob = (1 - dropoutRate)) + + + + + # draw from a negative-binomial to simulate the total read count isZero <- rbinom(1, size = 1, p = readCountFit$zeroProb) - if(isZero == TRUE){ + if (isZero == TRUE) { nReads <- 0 - } - else{ + } else { nReads <- rnegbin(1, mu = readCountFit$expValue, theta = readCountFit$theta) } - - - #draw from a beta-binomial to simulate overdispersion through multiple- - #displacement amplification + + + # draw from a beta-binomial to simulate overdispersion through multiple- + # displacement amplification nWildtypeReads <- rbetabinom.ab(n = 1, size = nReads, shape1 = nWildtypeAlleles, shape2 = nMutatedAlleles) - + nMutatedReads <- nReads - nWildtypeReads - - #randomly flip the genotypes of reads with a certain error rate - falsePositives <- rbinom(1, size = nReads-nMutatedReads, prob = errorRate) + 
+ # randomly flip the genotypes of reads with a certain error rate + falsePositives <- rbinom(1, size = nReads - nMutatedReads, prob = errorRate) falseNegatives <- rbinom(1, size = nMutatedReads, prob = errorRate) - + nMutatedReads <- nMutatedReads + falsePositives - falseNegatives - + return(list(read_counts = c(nReads, nMutatedReads))) } @@ -179,18 +179,17 @@ simulateReads <- function(nWildtypeAlleles,nMutatedAlleles, dropoutRate, errorRa -#' Calls genotypes of single cells based on the CTC-SCITE algorithm +#' Calls genotypes of single cells based on the CTC-SCITE algorithm #' #' @param nTreeSamplingEvents number of sampled trees. Appricimated postserior gets better the higher this number is. #' @param input The loaded data. #' #' @return returns a data frame in long format that gives the genotype and the -#' posterior genotype probability for each cell and sample. +#' posterior genotype probability for each cell and sample. #' @export #' #' @examples -getGenotypeMatrix <- function(nTreeSamplingEvents = 100, input){ - +getGenotypeMatrix <- function(nTreeSamplingEvents = 100, input) { postSampling <- input$postSampling nCells <- input$nCells nMutations <- input$nMutations @@ -199,55 +198,61 @@ getGenotypeMatrix <- function(nTreeSamplingEvents = 100, input){ ClusterID <- input$clusterID mutatedReadCounts <- input$mutatedReadCounts totalReadCounts <- input$totalReadCounts - - + + desired_values <- sample(1:length(postSampling), size = nTreeSamplingEvents, replace = FALSE) %>% sort() - postSampling <- postSampling[desired_values] - postSamplingTrees <- lapply(postSampling, FUN = function(entry){return(entry$Tree)}) - - - logGenotypes <- getProbabilityOfBeingMutated(postSampling, nCells, nMutations, nClusters, - alleleCount, ClusterID, mutatedReadCounts, totalReadCounts, - rep(0,nCells)) - + postSampling <- postSampling[desired_values] + postSamplingTrees <- lapply(postSampling, FUN = function(entry) { + return(entry$Tree) + }) + + + logGenotypes <- 
getProbabilityOfBeingMutated( + postSampling, nCells, nMutations, nClusters, + alleleCount, ClusterID, mutatedReadCounts, totalReadCounts, + rep(0, nCells) + ) + genotypes_wide <- lapply(logGenotypes, FUN = exp) - genotypes_wide <- data.frame(do.call(cbind,genotypes_wide)) - + genotypes_wide <- data.frame(do.call(cbind, genotypes_wide)) + genotypes <- genotypes_wide %>% as_tibble() %>% rownames_to_column("Mutation") %>% - pivot_longer(-Mutation,names_to = "Sample", values_to = "Posterior") - + pivot_longer(-Mutation, names_to = "Sample", values_to = "Posterior") + ggplot(genotypes, aes(Mutation, Sample)) + geom_tile(aes(fill = Posterior)) + scale_fill_viridis() - + genotypes$WBC <- input$sample_description$WBC[(genotypes$Sample %>% substr(start = 2, stop = nchar(.)) %>% as.numeric())] - - genotypes %>% mutate(WBC = as.factor(WBC)) %>% - filter(genotypes$WBC ==1) %>% + + genotypes %>% + mutate(WBC = as.factor(WBC)) %>% + filter(genotypes$WBC == 1) %>% ggplot(mapping = aes(x = Posterior, alpha = 0.6)) + geom_histogram(position = "identity", binwidth = 0.005) - - - + + + genotypes <- genotypes %>% mutate(Mutation = as.numeric(Mutation), Genotype = as.integer(Posterior > 0.5)) - - genotypes %>% filter(Sample %in% - paste0("X",which(input$sample_description$single_cell == TRUE & input$sample_description$WBC == FALSE))) %>% + + genotypes %>% + filter(Sample %in% + paste0("X", which(input$sample_description$single_cell == TRUE & input$sample_description$WBC == FALSE))) %>% ggplot(aes(Mutation, Sample)) + geom_tile(aes(fill = Genotype)) + scale_fill_viridis() - + return(genotypes) } #' Creates the input dataset for CTC-SCITE run with simulated CTC-clusters. -#' +#' #' For the simulation, the follwing steps were performed: #' 1. A zero-inflated negative binomial distribution is fit to the total read counts of a sample. #' 2. For a new cell cluster, total read counts for each genomic position are sampled from the distribution fit in (1). 
@@ -278,105 +283,112 @@ getGenotypeMatrix <- function(nTreeSamplingEvents = 100, input){ #' @export #' #' @examples -simulateCTCclusters <- function(samplingSize, clusterSizeVector, input, output_directory, output_label, dropoutRate = 0.3, errorRate = 0.001, seed = 123, zeroInflated = TRUE){ +simulateCTCclusters <- function(samplingSize, clusterSizeVector, input, output_directory, output_label, dropoutRate = 0.3, errorRate = 0.001, seed = 123, zeroInflated = TRUE) { set.seed(seed) - color_palette <- list("orchid", "orchid1", "orchid2", "orchid3", "orchid4", "darkorchid", "darkorchid1","darkorchid2", "darkorchid3", "darkorchid4", "purple", "purple1", "purple2", "purple3", "purple4") - - + color_palette <- list("orchid", "orchid1", "orchid2", "orchid3", "orchid4", "darkorchid", "darkorchid1", "darkorchid2", "darkorchid3", "darkorchid4", "purple", "purple1", "purple2", "purple3", "purple4") + + fit <- fitReadCountDistribution(input, zeroInfl = zeroInflated) - + print("Calling genotypes") genotypes <- getGenotypeMatrix(nTreeSamplingEvents = samplingSize, input = input) - - if(sum(clusterSizeVector) >= length(unique(genotypes$Sample))){ - stop('You want to sample more genotypes than can be provided') + + if (sum(clusterSizeVector) >= length(unique(genotypes$Sample))) { + stop("You want to sample more genotypes than can be provided") } - - - genotypesOutputFormat <- data.frame(matrix(0, nrow =input$nMutations, ncol = 0)) + + + genotypesOutputFormat <- data.frame(matrix(0, nrow = input$nMutations, ncol = 0)) sampleDescriptionOutputFormat <- data.frame(matrix(0, nrow = 0, ncol = 5)) - colnames(sampleDescriptionOutputFormat) <- c("sample_name", "total_number_cells", "tumor_cells", "WBCs", "description") - - cellIDs <- paste0("X",1:nrow(input$sample_description)) + colnames(sampleDescriptionOutputFormat) <- c("sample_name", "total_number_cells", "tumor_cells", "WBCs", "description") + + cellIDs <- paste0("X", 1:nrow(input$sample_description)) cells <- sample(size = 
sum(clusterSizeVector), x = cellIDs, replace = FALSE) iterator <- 0 - - - for (clusterSize in 1:length(clusterSizeVector)){ + + + for (clusterSize in 1:length(clusterSizeVector)) { clustersBySize <- 1 - while(clustersBySize <= clusterSizeVector[clusterSize]){ + while (clustersBySize <= clusterSizeVector[clusterSize]) { print(paste("Simulating CTC cluster", iterator)) print(paste("Number of cells:", clusterSize)) genotype <- genotypes %>% filter(Sample == cells[clustersBySize]) %>% arrange(Mutation) - + genotype <- pull(genotype, Genotype) - + nMutatedAlleles <- clusterSize * genotype nAllelesTotal <- clusterSize * rep(2, length(genotype)) nWildtypeAlleles <- nAllelesTotal - nMutatedAlleles data <- data.frame(nWildtypeAlleles = nWildtypeAlleles, nMutatedAlleles = nMutatedAlleles) print("Starting simulation of read counts") - reads <- apply(data, FUN = function(x){return(simulateReads(x[1], x[2], dropoutRate, errorRate, fit)$read_counts)}, MARGIN = 1) %>% t() + reads <- apply(data, FUN = function(x) { + return(simulateReads(x[1], x[2], dropoutRate, errorRate, fit)$read_counts) + }, MARGIN = 1) %>% t() genotypesOutputFormat <- cbind(genotypesOutputFormat, reads) print("Done") - - newSample <- data.frame(sample_name = paste0(input$sampleName, '_sim', iterator), - total_number_cells = clusterSize, tumor_cells = clusterSize, - WBCs = 0, - description = - paste0('[color=', color_palette[[iterator+1]], ',label="', input$sampleName, '_sim', iterator , '",fillcolor=', color_palette[[iterator+1]], ',image="../CTC-cluster-icons/cluster_', clusterSize,'-0.png"]') ) + + newSample <- data.frame( + sample_name = paste0(input$sampleName, "_sim", iterator), + total_number_cells = clusterSize, tumor_cells = clusterSize, + WBCs = 0, + description = + paste0("[color=", color_palette[[iterator + 1]], ',label="', input$sampleName, "_sim", iterator, '",fillcolor=', color_palette[[iterator + 1]], ',image="../CTC-cluster-icons/cluster_', clusterSize, '-0.png"]') + ) 
sampleDescriptionOutputFormat <- rbind(sampleDescriptionOutputFormat, newSample) - + iterator <- iterator + 1 clustersBySize <- clustersBySize + 1 } } print("Writing output files") - - - dir.create(file.path(output_directory,paste(input$sampleName, output_label, sep = '_')), recursive = TRUE) - description_data <- read_delim(file.path(input$directory, input$sampleName, paste0(input$sampleName, '_samples_nodeDescription.tsv')), delim = '\t', col_names = FALSE, quote = "none") - colnames(description_data) <- c("sample_name", "total_number_cells", "tumor_cells", "WBCs", "description") + + + dir.create(file.path(output_directory, paste(input$sampleName, output_label, sep = "_")), recursive = TRUE) + description_data <- read_delim(file.path(input$directory, input$sampleName, paste0(input$sampleName, "_samples_nodeDescription.tsv")), delim = "\t", col_names = FALSE, quote = "none") + colnames(description_data) <- c("sample_name", "total_number_cells", "tumor_cells", "WBCs", "description") description_data <- rbind(description_data, sampleDescriptionOutputFormat) - write_delim(x = description_data, file = file.path(output_directory,paste(input$sampleName, output_label, sep = '_'), paste0(input$sampleName, '_', output_label, '_samples_nodeDescription.tsv')), delim = '\t', col_names = FALSE, quote = "none", escape = "none") - - read_data <- read_delim(file.path(input$directory, input$sampleName, paste0(input$sampleName, '.txt')), delim = '\t', col_names = FALSE, escape_backslash = TRUE) + write_delim(x = description_data, file = file.path(output_directory, paste(input$sampleName, output_label, sep = "_"), paste0(input$sampleName, "_", output_label, "_samples_nodeDescription.tsv")), delim = "\t", col_names = FALSE, quote = "none", escape = "none") + + read_data <- read_delim(file.path(input$directory, input$sampleName, paste0(input$sampleName, ".txt")), delim = "\t", col_names = FALSE, escape_backslash = TRUE) read_data <- cbind(read_data, genotypesOutputFormat) - 
write_delim(x = read_data, file = file.path(output_directory,paste(input$sampleName, output_label, sep = '_'), paste0(input$sampleName,'_', output_label ,'.txt')), delim = '\t', col_names = FALSE, quote = "none", escape = "none") + write_delim(x = read_data, file = file.path(output_directory, paste(input$sampleName, output_label, sep = "_"), paste0(input$sampleName, "_", output_label, ".txt")), delim = "\t", col_names = FALSE, quote = "none", escape = "none") } - -#c("Br11", "Br16_AC_max2", "Br16_AC_max3", "Br16_AC_max4", "Br16_B_max1", "Br16_B_max2", "Br16_B_max3", "Br16_B_max4", "Br16_C_max1", "Br16_C_max2", "Br16_C_max3", "Br23", "Br26", "Br30", "Br37", "Br38", "Br39", "Br44", "Br45", "Br46", "Br53", "Br57", "Brx50", "Lu2", "Lu7", "Ov8", "Pr6", "Pr9") -for(tree in c("Br11", "Br16_AC_max2", "Br16_AC_max3", "Br16_AC_max4", "Br16_B_max1", "Br16_B_max2", "Br16_B_max3", "Br16_B_max4", "Br16_C_max1", "Br16_C_max2", "Br16_C_max3", "Br23", "Br26", "Br30", "Br37", "Br38", "Br39", "Br44", "Br45", "Br46", "Br53", "Br57", "Brx50", "Lu2", "Lu7", "Ov8", "Pr6", "Pr9")){ - +# c("Br11", "Br16_AC_max2", "Br16_AC_max3", "Br16_AC_max4", "Br16_B_max1", "Br16_B_max2", "Br16_B_max3", "Br16_B_max4", "Br16_C_max1", "Br16_C_max2", "Br16_C_max3", "Br23", "Br26", "Br30", "Br37", "Br38", "Br39", "Br44", "Br45", "Br46", "Br53", "Br57", "Brx50", "Lu2", "Lu7", "Ov8", "Pr6", "Pr9") + +for (tree in c("Br11", "Br16_AC_max2", "Br16_AC_max3", "Br16_AC_max4", "Br16_B_max1", "Br16_B_max2", "Br16_B_max3", "Br16_B_max4", "Br16_C_max1", "Br16_C_max2", "Br16_C_max3", "Br23", "Br26", "Br30", "Br37", "Br38", "Br39", "Br44", "Br45", "Br46", "Br53", "Br57", "Brx50", "Lu2", "Lu7", "Ov8", "Pr6", "Pr9")) { + } -simulation_metadata <- list("Br11" = c(0,4,0,0,0), "Br16_AC_max2", - "Br16_AC_max3", "Br16_AC_max4", - "Br16_B_max1", "Br16_B_max2", - "Br16_B_max3", "Br16_B_max4", - "Br16_C_max1", "Br16_C_max2", - "Br16_C_max3", "Br23", "Br26", - "Br30", "Br37", "Br38", "Br39", - "Br44", "Br45", "Br46", "Br53", - 
"Br57", "Brx50", "Lu2", "Lu7", "Ov8", - "Pr6", "Pr9") +simulation_metadata <- list( + "Br11" = c(0, 4, 0, 0, 0), "Br16_AC_max2", + "Br16_AC_max3", "Br16_AC_max4", + "Br16_B_max1", "Br16_B_max2", + "Br16_B_max3", "Br16_B_max4", + "Br16_C_max1", "Br16_C_max2", + "Br16_C_max3", "Br23", "Br26", + "Br30", "Br37", "Br38", "Br39", + "Br44", "Br45", "Br46", "Br53", + "Br57", "Brx50", "Lu2", "Lu7", "Ov8", + "Pr6", "Pr9" +) tree <- "Pr9" output_label <- 3 -clusterSizeVector <- c(0,0,3,0,0) +clusterSizeVector <- c(0, 0, 3, 0, 0) treeName <- tree -print(paste("Running simulation for",tree)) +print(paste("Running simulation for", tree)) input <- load_data(inputFolder, treeName) -simulateCTCclusters(samplingSize = 100, clusterSizeVector = clusterSizeVector, input, - output_directory = "../../simulations/simulations2",output_label = output_label, - dropoutRate = 0.35, errorRate = 0.0015, seed = 124, - zeroInflated = TRUE) - +simulateCTCclusters( + samplingSize = 100, clusterSizeVector = clusterSizeVector, input, + output_directory = "../../simulations/simulations2", output_label = output_label, + dropoutRate = 0.35, errorRate = 0.0015, seed = 124, + zeroInflated = TRUE +) diff --git a/Rcode/statistical_test_source.R b/Rcode/statistical_test_source.R index 3d073c4..8f0c06d 100644 --- a/Rcode/statistical_test_source.R +++ b/Rcode/statistical_test_source.R @@ -8,17 +8,18 @@ library(purrr, quietly = TRUE) -reverse_paste <- function(filename, string){ - paste0(string,filename) +reverse_paste <- function(filename, string) { + paste0(string, filename) } -load_cluster_df <- function(filename){ - df <- read_delim(filename, delim = "\t",col_names = FALSE) +load_cluster_df <- function(filename) { + df <- read_delim(filename, delim = "\t", col_names = FALSE) colnames(df)[1] <- "barcodes" colnames(df)[3] <- "counts" - df <- df %>% arrange(desc(counts)) %>% - mutate(prop_col = counts/sum(counts), cumprop_col = cumsum(prop_col)) + df <- df %>% + arrange(desc(counts)) %>% + mutate(prop_col = 
counts / sum(counts), cumprop_col = cumsum(prop_col)) return(df) } @@ -32,67 +33,70 @@ myfiles <- map(files, load_cluster_df) #' Load cluster files. -#' +#' #' load_cluster_files() laods all the CTC-cluster csv-files from the validation #' experiment and respecitve summaries intointo memory. -#' +#' #' #' @return A named list. #' summary: A data frame which contains sumamry statistics -#' cluster_data: a list. Each entry is a data frame with 4 colums: The first +#' cluster_data: a list. Each entry is a data frame with 4 colums: The first #' is a unique barcode identifier, and the third is the total read count. -#' +#' #' @export #' #' @examples load_cluster_files() #' @noRD -load_cluster_files <- function(){ - - +load_cluster_files <- function() { files <- list.files(path = "../../validation_experiment/Cluster_csv_files/", pattern = "\\.csv$") - - + + files <- map_chr(files, reverse_paste, "../../validation_experiment/Cluster_csv_files/") - + df_list <- map(files, load_cluster_df) save(df_list, file = "../../validation") - - -# create empty data frame to store summary information -summary_df <- data.frame(basename = character(), - num_rows_accumulating_nine = integer(), - num_rows_accumulating_ninefive = integer(), - stringsAsFactors = FALSE) - -for (i in seq_along(df_list)) { - df <- df_list[[i]] - cumprop_col <- df[, "cumprop_col"] - num_rows_accumulating_nine <- sum(cumprop_col <= 0.90) - num_rows_accumulating_ninefive <- sum(cumprop_col <= 0.95) - basename <- names(df_list)[i] - # determine the value based on the basename - value <- switch(substr(basename, 1, 4), - "10k_" = 10000, - "50k_" = 50000, - "100_" = 100, - "1000" = 1000, - NA) # if no match, assign NA - # determine if num_rows_accumulating > 1 - more_than_one <- ifelse(num_rows_accumulating_ninefive > 0, "Yes", "No") - prop_col_1 <- df$prop_col[1] - prop_col_2 <- df$prop_col[2] - summary_row <- data.frame(basename = basename, - num_rows_accumulating_nine = num_rows_accumulating_nine, - 
num_rows_accumulating_ninefive = num_rows_accumulating_ninefive, - prop_col_1 = prop_col_1, - prop_col_2 = prop_col_2, - value = value, - more_than_one = more_than_one, - stringsAsFactors = FALSE) - summary_df <- rbind(summary_df, summary_row) + + + # create empty data frame to store summary information + summary_df <- data.frame( + basename = character(), + num_rows_accumulating_nine = integer(), + num_rows_accumulating_ninefive = integer(), + stringsAsFactors = FALSE + ) + + for (i in seq_along(df_list)) { + df <- df_list[[i]] + cumprop_col <- df[, "cumprop_col"] + num_rows_accumulating_nine <- sum(cumprop_col <= 0.90) + num_rows_accumulating_ninefive <- sum(cumprop_col <= 0.95) + basename <- names(df_list)[i] + # determine the value based on the basename + value <- switch(substr(basename, 1, 4), + "10k_" = 10000, + "50k_" = 50000, + "100_" = 100, + "1000" = 1000, + NA + ) # if no match, assign NA + # determine if num_rows_accumulating > 1 + more_than_one <- ifelse(num_rows_accumulating_ninefive > 0, "Yes", "No") + prop_col_1 <- df$prop_col[1] + prop_col_2 <- df$prop_col[2] + summary_row <- data.frame( + basename = basename, + num_rows_accumulating_nine = num_rows_accumulating_nine, + num_rows_accumulating_ninefive = num_rows_accumulating_ninefive, + prop_col_1 = prop_col_1, + prop_col_2 = prop_col_2, + value = value, + more_than_one = more_than_one, + stringsAsFactors = FALSE + ) + summary_df <- rbind(summary_df, summary_row) } - save(summary_df, file="../../validation_experiment/output/summary_df_nine_ninefive.rds") + save(summary_df, file = "../../validation_experiment/output/summary_df_nine_ninefive.rds") return(list("summary" = summary_df, "cluster_data" = df_list)) } @@ -100,7 +104,7 @@ input_data <- load_cluster_files() ################# -####Debugging#### +#### Debugging#### ################# @@ -111,32 +115,35 @@ df_list <- list() basename <- gsub("\\.csv", "", "50k_7_910_b_S276_R2_001.fastq.gz_stats.csv") -df <- 
read_delim(paste0("../../validation_experiment/Cluster_csv_files/",file), delim = "\t",col_names = F) +df <- read_delim(paste0("../../validation_experiment/Cluster_csv_files/", file), delim = "\t", col_names = F) colnames(df)[1] <- "barcodes" colnames(df)[3] <- "counts" df_sorted <- df %>% arrange(desc(counts)) df_sorted <- df_sorted %>% - mutate(prop_col = counts/sum(counts), cumprop_col = cumsum(prop_col)) + mutate(prop_col = counts / sum(counts), cumprop_col = cumsum(prop_col)) df_list[[basename]] <- df_sorted for (file in files) { basename <- gsub("\\.csv", "", file) - tryCatch({ - df <- read_delim(paste0("../../validation_experiment/Cluster_csv_files/",file), delim = "\t",col_names = F) - colnames(df)[1] <- "barcodes" - colnames(df)[3] <- "counts" - df_sorted <- df %>% arrange(desc(counts)) - - #df_sorted <- df[order(-df[, 3]), ] #Sort data frame for decreasing read counts - df_sorted <- df_sorted %>% - mutate(prop_col = counts/sum(counts), cumprop_col = cumsum(prop_col)) - # Calculate the read proportion for each barcode and - # the cumulative proportion - - df_list[[basename]] <- df_sorted - }, error = function(e) { - cat("Error reading ", file, "- skipping\n") - }) + tryCatch( + { + df <- read_delim(paste0("../../validation_experiment/Cluster_csv_files/", file), delim = "\t", col_names = F) + colnames(df)[1] <- "barcodes" + colnames(df)[3] <- "counts" + df_sorted <- df %>% arrange(desc(counts)) + + # df_sorted <- df[order(-df[, 3]), ] #Sort data frame for decreasing read counts + df_sorted <- df_sorted %>% + mutate(prop_col = counts / sum(counts), cumprop_col = cumsum(prop_col)) + # Calculate the read proportion for each barcode and + # the cumulative proportion + + df_list[[basename]] <- df_sorted + }, + error = function(e) { + cat("Error reading ", file, "- skipping\n") + } + ) } ############## @@ -149,35 +156,53 @@ for (file in files) { summary_df$value[summary_df$basename %like% "^1000"] <- 1000 summary_df$value[is.na(summary_df$value)] <- 10000 
-summary_df$cluster_size <- ifelse(grepl("_0_", summary_df$basename), "0", - ifelse(grepl("_1_", summary_df$basename), "1", - ifelse(grepl("_2_", summary_df$basename), "2", - ifelse(grepl("_3_", summary_df$basename), "3", - ifelse(grepl("_4_", summary_df$basename), "4", - ifelse(grepl("_5_", summary_df$basename), "5", - ifelse(grepl("_6_", summary_df$basename), "6", - ifelse(grepl("_7_", summary_df$basename), "7", - ifelse(grepl("_8_", summary_df$basename), "8", - ifelse(grepl("_9_", summary_df$basename), "9", - ifelse(grepl("_10_", summary_df$basename), "10", - ifelse(grepl("_10plus_", summary_df$basename), "11", - ifelse(grepl("_11_", summary_df$basename), "11", - ifelse(grepl("_12_", summary_df$basename), "12", - ifelse(grepl("_13_", summary_df$basename), "13", - ifelse(grepl("_14_", summary_df$basename), "14", - ifelse(grepl("_20_", summary_df$basename), "20", - ifelse(grepl("_25_", summary_df$basename), "25", - NA)))))))))))))))))) +summary_df$cluster_size <- ifelse(grepl("_0_", summary_df$basename), "0", + ifelse(grepl("_1_", summary_df$basename), "1", + ifelse(grepl("_2_", summary_df$basename), "2", + ifelse(grepl("_3_", summary_df$basename), "3", + ifelse(grepl("_4_", summary_df$basename), "4", + ifelse(grepl("_5_", summary_df$basename), "5", + ifelse(grepl("_6_", summary_df$basename), "6", + ifelse(grepl("_7_", summary_df$basename), "7", + ifelse(grepl("_8_", summary_df$basename), "8", + ifelse(grepl("_9_", summary_df$basename), "9", + ifelse(grepl("_10_", summary_df$basename), "10", + ifelse(grepl("_10plus_", summary_df$basename), "11", + ifelse(grepl("_11_", summary_df$basename), "11", + ifelse(grepl("_12_", summary_df$basename), "12", + ifelse(grepl("_13_", summary_df$basename), "13", + ifelse(grepl("_14_", summary_df$basename), "14", + ifelse(grepl("_20_", summary_df$basename), "20", + ifelse(grepl("_25_", summary_df$basename), "25", + NA + ) + ) + ) + ) + ) + ) + ) + ) + ) + ) + ) + ) + ) + ) + ) + ) + ) +) summary_df$cluster_category_no_WBCs <- 
"None" -for (i in c(0:15,25)){ - pattern <- paste0("_",paste0(as.character(i),"_")) +for (i in c(0:15, 25)) { + pattern <- paste0("_", paste0(as.character(i), "_")) summary_df$cluster_category_no_WBCs[grepl(pattern, summary_df$basename)] <- as.character(i) } -#summary_df$cluster_category_no_WBCs <- ifelse(grepl("_0_", summary_df$basename), "0", +# summary_df$cluster_category_no_WBCs <- ifelse(grepl("_0_", summary_df$basename), "0", # ifelse(grepl("_1_", summary_df$basename), "1", # ifelse(grepl("_2_", summary_df$basename), "2", # ifelse(grepl("_3_", summary_df$basename), "3", @@ -202,4 +227,3 @@ prop_comp <- summary_df_filter %>% y <- prop_comp$num_more_than_one z <- prop_comp$num_more_than_one + prop_comp$num_total - diff --git a/Rcode/validation_cluster_data.R b/Rcode/validation_cluster_data.R index 706ff6e..fe1e350 100644 --- a/Rcode/validation_cluster_data.R +++ b/Rcode/validation_cluster_data.R @@ -5,33 +5,34 @@ library(DescTools, quietly = TRUE) library(ggplot2, quietly = TRUE) library(purrr, quietly = TRUE) -reverse_paste <- function(filename, string){ - paste0(string,filename) +reverse_paste <- function(filename, string) { + paste0(string, filename) } -reverse_paste <- function(filename, string){ - paste0(string,filename) +reverse_paste <- function(filename, string) { + paste0(string, filename) } -load_cluster_df <- function(filename){ - df <- read_delim(filename, delim = "\t",col_names = FALSE, col_select = c(1,3)) +load_cluster_df <- function(filename) { + df <- read_delim(filename, delim = "\t", col_names = FALSE, col_select = c(1, 3)) colnames(df)[1] <- "barcodes" colnames(df)[2] <- "counts" - df <- df %>% arrange(desc(counts)) %>% - mutate(prop_col = counts/sum(counts), cumprop_col = cumsum(prop_col)) + df <- df %>% + arrange(desc(counts)) %>% + mutate(prop_col = counts / sum(counts), cumprop_col = cumsum(prop_col)) return(df) } files <- list.files(path = "../../validation_experiment/Cluster_csv_files/", pattern = "\\.csv$") -#files <- map_chr(files, 
reverse_paste, "../../validation_experiment/Cluster_csv_files/") +# files <- map_chr(files, reverse_paste, "../../validation_experiment/Cluster_csv_files/") myfiles <- map(files, load_cluster_df) -names(myfiles)<- files +names(myfiles) <- files save(myfiles, file = "../../validation_experiment/validation_cluster_data.Rdata") load("../../validation_experiment/validation_cluster_data.Rdata") names(myfiles) diff --git a/experiments/data/htmls/Br30.html b/experiments/data/htmls/Br30.html deleted file mode 100644 index a5a2529..0000000 --- a/experiments/data/htmls/Br30.html +++ /dev/null @@ -1,615 +0,0 @@ - - - - - - - - - - - - - - - -Br30 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Splitting statistics

-

This code analyses splitting statistics for CTC-clusters.

-

The analysis takes a list of trees sampled from its posterior -distribution as input and samples mutations placements for each of the -trees.

-
-
-

Configure the script

-
inputFolder <- "/Users/jgawron/Documents/projects/CTC_backup/input_folder"
-simulationInputFolder <- "/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2"
-treeName <- "Br30"
-nTreeSamplingEvents <- 1000
-nMutationSamplingEvents <- 1000
-
-
-

Loading data

-
source("/Users/jgawron/Documents/projects/CTC-SCITE/CTC-SCITE/experiments/workflow/resources/functions.R")
-
## ── Attaching core tidyverse packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
-## ✔ dplyr     1.1.4     ✔ readr     2.1.5
-## ✔ forcats   1.0.0     ✔ stringr   1.5.1
-## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
-## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
-## ✔ purrr     1.0.2     
-## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
-## ✖ dplyr::filter() masks stats::filter()
-## ✖ dplyr::lag()    masks stats::lag()
-## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
-
input <- load_data(inputFolder, treeName)
-
## Rows: 40000 Columns: 5
-## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (1): Tree
-## dbl (4): LogScore, SequencingErrorRate, DropoutRate, LogTau
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 31 Columns: 22
-## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (19): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X1...
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 9 Columns: 5
-## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-
postSampling <- input$postSampling
-nClusters <- input$nClusters
-ClusterID <- input$clusterID
-nCells <- input$nCells  
-nMutations <- input$nMutations
-nClusters <- input$nClusters
-alleleCount <- input$alleleCount
-mutatedReadCounts <- input$mutatedReadCounts
-totalReadCounts <- input$totalReadCounts
-sampleDescription <- input$sample_description
-
-
-

Sample description

-

Each row corresponds to a cell. Column description: - Cluster: An -number indicating which sample the cell belongs to. - ClusterName: The -name of the sample in the nodeDescription.tsv file - WBC: a binary -vector indicating whether the cell is a white blood cell (1) or not (0). -- color: Indicates the color of the cluster in the tree, as described in -the nodeDescription.tsv file.

-
print(sampleDescription)
-
##    Cluster ClusterName WBC      color single_cell
-## 1        0      Br30_1   0     gray93        TRUE
-## 2        1     Br30_11   1 ghostwhite        TRUE
-## 3        2     Br30_12   1 lightcoral       FALSE
-## 4        2     Br30_12   1 lightcoral       FALSE
-## 5        2     Br30_12   1 lightcoral       FALSE
-## 6        3     Br30_13   0     gray93        TRUE
-## 7        4      Br30_2   0     gray93        TRUE
-## 8        5      Br30_3   0     gray93        TRUE
-## 9        6      Br30_6   0     gray93        TRUE
-## 10       7      Br30_7   1 sandybrown       FALSE
-## 11       7      Br30_7   0 sandybrown       FALSE
-## 12       8      Br30_8   1   skyblue3       FALSE
-## 13       8      Br30_8   0   skyblue3       FALSE
-

Get null distributions of relevant statistics, stratified by -sample:

-
cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector())
-cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector())
-
-for (clusterSize in 2:5){
-  try(
-  {treeNameSimulated <- paste(treeName, clusterSize, sep = '_')
-
-
-  inputSimulated <- load_data(simulationInputFolder, treeNameSimulated)
-
-  postSamplingSimulated <- inputSimulated$postSampling
-  nClustersSimulated <- inputSimulated$nClusters
-  ClusterIDSimulated <- inputSimulated$clusterID
-  nCellsSimulated <- inputSimulated$nCells  
-  nMutationsSimulated <- inputSimulated$nMutations
-  nClustersSimulated <- inputSimulated$nClusters
-  alleleCountSimulated <- inputSimulated$alleleCount
-  mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts
-  totalReadCountsSimulated <- inputSimulated$totalReadCounts
-  sampleDescriptionSimulated <- inputSimulated$sample_description
-  
-  distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated,
-                     nMutationsSimulated, nClustersSimulated,
-                     alleleCountSimulated,
-                     mutatedReadCountsSimulated, totalReadCountsSimulated,
-                     nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents,
-                     cellPairSelection = c("orchid", "orchid1", "orchid2",
-                                           "orchid3", "orchid4", "darkorchid",
-                                           "darkorchid1","darkorchid2", "darkorchid3",
-                                           "darkorchid4", "purple", "purple1",
-                                           "purple2", "purple3", "purple4"))
-
-  
-
-  plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) +
-    geom_boxplot())
-  cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) ))
-  
-  ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are
-  ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more
-  ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters
-  ## of the same size, it is however the potential source of a future bug!!
-  
-  plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) +
-    geom_histogram(binwidth = 0.01))
-  print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] ))
-  cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] ))
-  })
-}
-
## Error : '/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2/Br30_2/Br30_2_postSampling.tsv' does not exist.
-## Error : '/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2/Br30_3/Br30_3_postSampling.tsv' does not exist.
-## Error : '/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2/Br30_4/Br30_4_postSampling.tsv' does not exist.
-## Error : '/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2/Br30_5/Br30_5_postSampling.tsv' does not exist.
-

Get the relevant statistics for each of the clusters of a dataset and -output numbers of oligoclonal clusters:

-
nTumorClusters <- 0
-nOligoclonalClusters1 <- 0
-nOligoclonalClusters2 <- 0
-splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector())
-splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector())
-
-for(clusterSize in 2:5){
-  try({
-    clusterColor <- sampleDescription %>%
-    filter(WBC ==0 &  color != 'gray93') %>%
-    group_by(color) %>%
-    filter(n() == clusterSize) %>%
-    pull(color) %>%
-    unique() 
-    
-    for(color in clusterColor){
-      distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells,
-                     nMutations, nClusters,
-                     alleleCount,
-                     mutatedReadCounts, totalReadCounts,
-                     nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents,
-                     cellPairSelection = c(color))
-
-      splittingProbs <- mean(distance$splittingProbs$Splitting_probability)
-      branchingProbs <- mean(distance$aggregatedBranchingProbabilities)
-    
-      nTumorClusters <- nTumorClusters + 1
-      oligoclonal <- FALSE
-      print(clusterSize)
-      print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])
-      if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){
-        nOligoclonalClusters1 <- nOligoclonalClusters1 + 1
-        oligoclonal <- TRUE
-      }
-      splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize))
-      oligoclonal <- FALSE
-      if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){
-        nOligoclonalClusters2 <- nOligoclonalClusters2 + 1
-        oligoclonal <- TRUE
-      }
-      splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize))
-    }
-  })
-}
-
-
-numberOfCancerClusters <- sampleDescription %>%
-    filter(WBC ==0 &  color != 'gray93') %>%
-    group_by(color) %>%
-    filter(n() > 1) %>%
-    pull(color) %>%
-    unique() %>% length() 
-
-print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName))
-
## [1] "0 out of 0 clusters were found to be oligoclonal in Br30, using method 1"
-
print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName))
-
## [1] "0 out of 0 clusters were found to be oligoclonal in Br30, using method 2"
-
print(splittingSummary1)
-
## [1] Color       Oligoclonal ClusterSize
-## <0 Zeilen> (oder row.names mit Länge 0)
-
print(splittingSummary2)
-
## [1] Color       Oligoclonal ClusterSize
-## <0 Zeilen> (oder row.names mit Länge 0)
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/experiments/data/htmls/Br44.html b/experiments/data/htmls/Br44.html deleted file mode 100644 index 552cfb3..0000000 --- a/experiments/data/htmls/Br44.html +++ /dev/null @@ -1,605 +0,0 @@ - - - - - - - - - - - - - - - -Br44 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Splitting statistics

-

This code analyses splitting statistics for CTC-clusters.

-

The analysis takes a list of trees sampled from the posterior -distribution as input and computes the mutation placement probability -distribution for each one of them. From this distribution we derive a -score that quantifies the probability that two cells have experienced -divergent evolution. This score is called the splitting score.

-
-
-

Configure the script

-
inputFolder <- "/Users/jgawron/Documents/projects/CTC_backup/input_folder"
-simulationInputFolder <- "/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2"
-treeName <- "Br44"
-nTreeSamplingEvents <- 1000
-nMutationSamplingEvents <- 1000
-
-
-

Loading data

-
source("/Users/jgawron/Documents/projects/CTC-SCITE/CTC-SCITE/experiments/workflow/resources/functions.R")
-
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
-## ✔ dplyr     1.1.4     ✔ readr     2.1.5
-## ✔ forcats   1.0.0     ✔ stringr   1.5.1
-## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
-## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
-## ✔ purrr     1.0.2     
-## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
-## ✖ dplyr::filter() masks stats::filter()
-## ✖ dplyr::lag()    masks stats::lag()
-## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
-
input <- load_data(inputFolder, treeName)
-
## Rows: 0 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## 
-## ??? (5): LogScore, SequencingErrorRate, DropoutRate, LogTau, Tree
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 1 Columns: 16
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (13): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 6 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-
-
-

Sample description

-

Each row corresponds to a cell. Column description: - Cluster: A -number indicating the sample the cell belongs to. - ClusterName: The -name of the sample in the nodeDescription.tsv file - WBC: a binary -vector indicating whether the cell is a white blood cell (1) or not (0). -- color: Indicates the color of the cluster in the tree, as described in -the nodeDescription.tsv file.

-
print(input$sample_description)
-
##   Cluster ClusterName WBC      color single_cell
-## 1       0      Br44_1   1 lightcoral        TRUE
-## 2       1      Br44_3   0     gray93        TRUE
-## 3       2      Br44_5   0     gray93        TRUE
-## 4       3      Br44_6   1 ghostwhite        TRUE
-## 5       4      Br44_7   1 ghostwhite        TRUE
-## 6       5      Br44_8   1 ghostwhite        TRUE
-
-
-

General overview

-

We sample 1000 trees.

-

For each pair of cells in the same cluster and each sampled tree we -compute the splitting score, that is, the probability that the two cells -have experienced divergent evolution. A low splitting score (close to 0) -indicates that the two cells are likely genealogically closely related, -while a high splitting score (close to 1) indicates that the two cells -have evolved in a divergent manner.

-

Throughout the sampling of trees, this gives rise to an empirical -distribution of splitting scores for each pair of cells in the same -cluster. Intuitively, this distribution takes into account the -uncertainty in the tree estimation. To be able to interpret the -splitting score appropriately (e.g. to answer the question of when a -splitting score is high enough to call oligo-clonality) we need to -calibrate our expectations.

-

We do this by assessing the distributions of splitting scores when we -know that the cluster is mono-clonal. To this end, we simulate -reference and alternative read count data of monoclonal clusters of -different sizes (2-, 3-, 4- and 5-cell clusters) and add these to the -original dataset, run the tree inference algorithm and compute the -splitting score distributions for all pairs of cells in the same -cluster. To ensure that the simulated data does not confound the tree -inference too much, we do this one cluster at a time.

-

For each simulated cluster we pick one pair of cells and print the -splitting score distribution below. With a high number of sampled trees, -the distributions of all pairs of cells from the same cluster are very -similar, since the model treats all cells from the same cluster as -interchangeable.

-

Finally, we print the empirical distribution of the splitting -scores for all clusters of the same size.

-

The latter is used to specify the cutoff for oligo-clonality: it is -defined as the 95th percentile of the aggregated distribution of -splitting scores.

-
cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector())
-cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector())
-
-for (clusterSize in 2:5){
-  try(
-  {treeNameSimulated <- paste(treeName, clusterSize, sep = '_')
-
-
-  inputSimulated <- load_data(simulationInputFolder, treeNameSimulated)
-
-  sampleDescriptionSimulated <- inputSimulated$sample_description
-  
-  distance <- computeClusterSplits(inputSimulated$sample_description, inputSimulated$postSampling, treeNameSimulated, inputSimulated$nCells,
-                     inputSimulated$nMutations, inputSimulated$nClusters,
-                     inputSimulated$alleleCount,
-                     inputSimulated$mutatedReadCounts, inputSimulated$totalReadCounts,
-                     nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents,
-                     cellPairSelection = c("orchid", "orchid1", "orchid2",
-                                           "orchid3", "orchid4", "darkorchid",
-                                           "darkorchid1","darkorchid2", "darkorchid3",
-                                           "darkorchid4", "purple", "purple1",
-                                           "purple2", "purple3", "purple4"))
-
-  
-  cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) ))
-
-  plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) +
-    geom_histogram(binwidth = 0.01))
-  cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] ))
-  })
-}
-
## Error : '/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2/Br44_2/Br44_2_postSampling.tsv' does not exist.
-## Error : '/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2/Br44_3/Br44_3_postSampling.tsv' does not exist.
-## Error : '/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2/Br44_4/Br44_4_postSampling.tsv' does not exist.
-## Error : '/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2/Br44_5/Br44_5_postSampling.tsv' does not exist.
-
print(cutoffsBranchingProbabilities)
-
## [1] clusterSize Cutoff     
-## <0 Zeilen> (oder row.names mit Länge 0)
-

Now we can compute the aggregated splitting score distributions for -each cluster. The distribution’s mean is compared to the cutoffs -computed above, and if it is higher than the cutoff, we call the cluster -oligo-clonal.

-
nTumorClusters <- 0
-nOligoclonalClusters2 <- 0
-splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector())
-
-for(clusterSize in 2:5){
-  try({
-    clusterColor <- input$sample_description %>%
-    filter(WBC ==0 &  color != 'gray93') %>%
-    group_by(color) %>%
-    filter(n() == clusterSize) %>%
-    pull(color) %>%
-    unique() 
-    
-    for(color in clusterColor){
-      distance <- computeClusterSplits(input$sample_description, input$postSampling, treeName, input$nCells,
-                     input$nMutations, input$nClusters,
-                     input$alleleCount,
-                     input$mutatedReadCounts, input$totalReadCounts,
-                     nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents,
-                     cellPairSelection = c(color))
-
-      splittingProbs <- mean(distance$splittingProbs$Splitting_probability)
-      branchingProbs <- mean(distance$aggregatedBranchingProbabilities)
-      
-      nTumorClusters <- nTumorClusters + 1
-      oligoclonal <- FALSE
-
-      if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){
-        nOligoclonalClusters2 <- nOligoclonalClusters2 + 1
-        oligoclonal <- TRUE
-      }
-      splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize))
-    }
-  })
-}
-
-
-numberOfCancerClusters <- input$sample_description %>%
-    filter(WBC ==0 &  color != 'gray93') %>%
-    group_by(color) %>%
-    filter(n() > 1) %>%
-    pull(color) %>%
-    unique() %>% length() 
-
-print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName))
-
## [1] "0 out of 0 clusters were found to be oligoclonal in Br44, using method 2"
-
print(splittingSummary2)
-
## [1] Color       Oligoclonal ClusterSize
-## <0 Zeilen> (oder row.names mit Länge 0)
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/experiments/data/htmls/Br46.html b/experiments/data/htmls/Br46.html deleted file mode 100644 index 540f87c..0000000 --- a/experiments/data/htmls/Br46.html +++ /dev/null @@ -1,606 +0,0 @@ - - - - - - - - - - - - - - - -Br46 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Splitting statistics

-

This code analyses splitting statistics for CTC-clusters.

-

The analysis takes a list of trees sampled from the posterior -distribution as input and computes the mutation placement probability -distribution for each one of them. From this distribution we derive a -score that quantifies the probability that two cells have experienced -divergent evolution. This score is called the splitting score.

-
-
-

Configure the script

-
inputFolder <- "/Users/jgawron/Documents/projects/CTC_backup/input_folder"
-simulationInputFolder <- "/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2"
-treeName <- "Br46"
-nTreeSamplingEvents <- 1000
-nMutationSamplingEvents <- 1000
-
-
-

Loading data

-
source("/Users/jgawron/Documents/projects/CTC-SCITE/CTC-SCITE/experiments/workflow/resources/functions.R")
-
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
-## ✔ dplyr     1.1.4     ✔ readr     2.1.5
-## ✔ forcats   1.0.0     ✔ stringr   1.5.1
-## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
-## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
-## ✔ purrr     1.0.2     
-## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
-## ✖ dplyr::filter() masks stats::filter()
-## ✖ dplyr::lag()    masks stats::lag()
-## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
-
input <- load_data(inputFolder, treeName)
-
## Rows: 40000 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (1): Tree
-## dbl (4): LogScore, SequencingErrorRate, DropoutRate, LogTau
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 11 Columns: 16
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (13): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 6 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-
-
-

Sample description

-

Each row corresponds to a cell. Column description: - Cluster: A -number indicating the sample the cell belongs to. - ClusterName: The -name of the sample in the nodeDescription.tsv file - WBC: a binary -vector indicating whether the cell is a white blood cell (1) or not (0). -- color: Indicates the color of the cluster in the tree, as described in -the nodeDescription.tsv file.

-
print(input$sample_description)
-
##   Cluster ClusterName WBC      color single_cell
-## 1       0   Br46_1_45   0     gray93        TRUE
-## 2       1   Br46_2_46   0     gray93        TRUE
-## 3       2   Br46_3_47   0     gray93        TRUE
-## 4       3   Br46_4_48   1 ghostwhite        TRUE
-## 5       4   Br46_5_49   1 ghostwhite        TRUE
-## 6       5      Br46_6   1 ghostwhite        TRUE
-
-
-

General overview

-

We sample 1000 trees.

-

For each pair of cells in the same cluster and each sampled tree we -compute the splitting score, that is, the probability that the two cells -have experienced divergent evolution. A low splitting score (close to 0) -indicates that the two cells are likely genealogically closely related, -while a high splitting score (close to 1) indicates that the two cells -have evolved in a divergent manner.

-

Throughout the sampling of trees, this gives rise to an empirical -distribution of splitting scores for each pair of cells in the same -cluster. Intuitively, this distribution takes into account the -uncertainty in the tree estimation. To be able to interpret the -splitting score appropriately (e.g. to answer the question of when a -splitting score is high enough to call oligo-clonality) we need to -calibrate our expectations.

-

We do this by assessing the distributions of splitting scores when we -know that the cluster is mono-clonal. To this end, we simulate -reference and alternative read count data of monoclonal clusters of -different sizes (2-, 3-, 4- and 5-cell clusters) and add these to the -original dataset, run the tree inference algorithm and compute the -splitting score distributions for all pairs of cells in the same -cluster. To ensure that the simulated data does not confound the tree -inference too much, we do this one cluster at a time.

-

For each simulated cluster we pick one pair of cells and print the -splitting score distribution below. With a high number of sampled trees, -the distributions of all pairs of cells from the same cluster are very -similar, since the model treats all cells from the same cluster as -interchangeable.

-

Finally, we print the empirical distribution of the splitting -scores for all clusters of the same size.

-

The latter is used to specify the cutoff for oligo-clonality: it is -defined as the 95th percentile of the aggregated distribution of -splitting scores.

-
cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector())
-cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector())
-
-for (clusterSize in 2:5){
-  try(
-  {treeNameSimulated <- paste(treeName, clusterSize, sep = '_')
-
-
-  inputSimulated <- load_data(simulationInputFolder, treeNameSimulated)
-
-  sampleDescriptionSimulated <- inputSimulated$sample_description
-  
-  distance <- computeClusterSplits(inputSimulated$sample_description, inputSimulated$postSampling, treeNameSimulated, inputSimulated$nCells,
-                     inputSimulated$nMutations, inputSimulated$nClusters,
-                     inputSimulated$alleleCount,
-                     inputSimulated$mutatedReadCounts, inputSimulated$totalReadCounts,
-                     nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents,
-                     cellPairSelection = c("orchid", "orchid1", "orchid2",
-                                           "orchid3", "orchid4", "darkorchid",
-                                           "darkorchid1","darkorchid2", "darkorchid3",
-                                           "darkorchid4", "purple", "purple1",
-                                           "purple2", "purple3", "purple4"))
-
-  
-  cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) ))
-
-  plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) +
-    geom_histogram(binwidth = 0.01))
-  cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] ))
-  })
-}
-
## Error : '/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2/Br46_2/Br46_2_postSampling.tsv' does not exist.
-## Error : '/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2/Br46_3/Br46_3_postSampling.tsv' does not exist.
-## Error : '/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2/Br46_4/Br46_4_postSampling.tsv' does not exist.
-## Error : '/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2/Br46_5/Br46_5_postSampling.tsv' does not exist.
-
print(cutoffsBranchingProbabilities)
-
## [1] clusterSize Cutoff     
-## <0 Zeilen> (oder row.names mit Länge 0)
-

Now we can compute the aggregated splitting score distributions for -each cluster. The distribution’s mean is compared to the cutoffs -computed above, and if it is higher than the cutoff, we call the cluster -oligo-clonal.

-
nTumorClusters <- 0
-nOligoclonalClusters2 <- 0
-splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector())
-
-for(clusterSize in 2:5){
-  try({
-    clusterColor <- input$sample_description %>%
-    filter(WBC ==0 &  color != 'gray93') %>%
-    group_by(color) %>%
-    filter(n() == clusterSize) %>%
-    pull(color) %>%
-    unique() 
-    
-    for(color in clusterColor){
-      distance <- computeClusterSplits(input$sample_description, input$postSampling, treeName, input$nCells,
-                     input$nMutations, input$nClusters,
-                     input$alleleCount,
-                     input$mutatedReadCounts, input$totalReadCounts,
-                     nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents,
-                     cellPairSelection = c(color))
-
-      splittingProbs <- mean(distance$splittingProbs$Splitting_probability)
-      branchingProbs <- mean(distance$aggregatedBranchingProbabilities)
-      
-      nTumorClusters <- nTumorClusters + 1
-      oligoclonal <- FALSE
-
-      if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){
-        nOligoclonalClusters2 <- nOligoclonalClusters2 + 1
-        oligoclonal <- TRUE
-      }
-      splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize))
-    }
-  })
-}
-
-
-numberOfCancerClusters <- input$sample_description %>%
-    filter(WBC ==0 &  color != 'gray93') %>%
-    group_by(color) %>%
-    filter(n() > 1) %>%
-    pull(color) %>%
-    unique() %>% length() 
-
-print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName))
-
## [1] "0 out of 0 clusters were found to be oligoclonal in Br46, using method 2"
-
print(splittingSummary2)
-
## [1] Color       Oligoclonal ClusterSize
-## <0 Zeilen> (oder row.names mit Länge 0)
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/experiments/data/htmls/Br61.html b/experiments/data/htmls/Br61.html deleted file mode 100644 index f371fe8..0000000 --- a/experiments/data/htmls/Br61.html +++ /dev/null @@ -1,823 +0,0 @@ - - - - - - - - - - - - - - - -Br61 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Splitting statistics

-

This code analyses splitting statistics for CTC-clusters.

-

The analysis takes a list of trees sampled from the posterior -distribution as input and computes the mutation placement probability -distribution for each one of them. From this distribution we derive a -score that quantifies the probability that two cells have experienced -divergent evolution. This score is called the splitting score.

-
-
-

Configure the script

-
inputFolder <- "/Users/jgawron/Documents/projects/CTC_backup/input_folder"
-simulationInputFolder <- "/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2"
-treeName <- "Br61"
-nTreeSamplingEvents <- 1000
-nMutationSamplingEvents <- 1000
-
-
-

Loading data

-
source("/Users/jgawron/Documents/projects/CTC-SCITE/CTC-SCITE/experiments/workflow/resources/functions.R")
-
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
-## ✔ dplyr     1.1.4     ✔ readr     2.1.5
-## ✔ forcats   1.0.0     ✔ stringr   1.5.1
-## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
-## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
-## ✔ purrr     1.0.2     
-## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
-## ✖ dplyr::filter() masks stats::filter()
-## ✖ dplyr::lag()    masks stats::lag()
-## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
-
input <- load_data(inputFolder, treeName)
-
## Rows: 40000 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (1): Tree
-## dbl (4): LogScore, SequencingErrorRate, DropoutRate, LogTau
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 280 Columns: 90
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (87): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X1...
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 43 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-
-
-

Sample description

-

Each row corresponds to a cell. Column description: - Cluster: A -number indicating the sample the cell belongs to. - ClusterName: The -name of the sample in the nodeDescription.tsv file - WBC: a binary -vector indicating whether the cell is a white blood cell (1) or not (0). -- color: Indicates the color of the cluster in the tree, as described in -the nodeDescription.tsv file.

-
print(input$sample_description)
-
##    Cluster ClusterName WBC          color single_cell
-## 1        0  Br61_CTC_1   0         gray93        TRUE
-## 2        1 Br61_CTC_10   0         gray93        TRUE
-## 3        2 Br61_CTC_12   0         gray93        TRUE
-## 4        3 Br61_CTC_13   0         gray93        TRUE
-## 5        4 Br61_CTC_14   1     lightcoral        TRUE
-## 6        5 Br61_CTC_15   0     lightcoral        TRUE
-## 7        6 Br61_CTC_16   0     lightcoral        TRUE
-## 8        7 Br61_CTC_17   0         gray93        TRUE
-## 9        8 Br61_CTC_18   0     sandybrown        TRUE
-## 10       9 Br61_CTC_19   0     sandybrown        TRUE
-## 11      10 Br61_CTC_20   0         gray93        TRUE
-## 12      11 Br61_CTC_22   0       skyblue3        TRUE
-## 13      12 Br61_CTC_23   0       skyblue3       FALSE
-## 14      12 Br61_CTC_23   0       skyblue3       FALSE
-## 15      13 Br61_CTC_24   0         khaki3       FALSE
-## 16      13 Br61_CTC_24   0         khaki3       FALSE
-## 17      14 Br61_CTC_25   0         gray93        TRUE
-## 18      15 Br61_CTC_26   1        thistle        TRUE
-## 19      16 Br61_CTC_27   0        thistle        TRUE
-## 20      17 Br61_CTC_28   0         gray93        TRUE
-## 21      18 Br61_CTC_29   0         gray93        TRUE
-## 22      19 Br61_CTC_31   1  darkseagreen4       FALSE
-## 23      19 Br61_CTC_31   0  darkseagreen4       FALSE
-## 24      20 Br61_CTC_32   0         gray93        TRUE
-## 25      21 Br61_CTC_33   0   lemonchiffon        TRUE
-## 26      22 Br61_CTC_34   0   lemonchiffon        TRUE
-## 27      23 Br61_CTC_35   0         gray93        TRUE
-## 28      24 Br61_CTC_36   1           gold       FALSE
-## 29      24 Br61_CTC_36   0           gold       FALSE
-## 30      25 Br61_CTC_38   0           plum       FALSE
-## 31      25 Br61_CTC_38   0           plum       FALSE
-## 32      26 Br61_CTC_39   1    yellowgreen       FALSE
-## 33      26 Br61_CTC_39   0    yellowgreen       FALSE
-## 34      27  Br61_CTC_4   0         gray93        TRUE
-## 35      28 Br61_CTC_40   0     violetred3        TRUE
-## 36      29 Br61_CTC_41   1     violetred3        TRUE
-## 37      30 Br61_CTC_42   0     violetred3        TRUE
-## 38      31 Br61_CTC_43   1 lightslateblue        TRUE
-## 39      32 Br61_CTC_44   0 lightslateblue        TRUE
-## 40      33 Br61_CTC_45   0 paleturquoise3        TRUE
-## 41      34 Br61_CTC_46   0 paleturquoise3        TRUE
-## 42      35 Br61_CTC_47   1   navajowhite2       FALSE
-## 43      35 Br61_CTC_47   1   navajowhite2       FALSE
-## 44      35 Br61_CTC_47   1   navajowhite2       FALSE
-## 45      35 Br61_CTC_47   1   navajowhite2       FALSE
-## 46      36 Br61_CTC_48   1        crimson       FALSE
-## 47      36 Br61_CTC_48   1        crimson       FALSE
-## 48      36 Br61_CTC_48   1        crimson       FALSE
-## 49      36 Br61_CTC_48   1        crimson       FALSE
-## 50      37 Br61_CTC_49   1      cadetblue       FALSE
-## 51      37 Br61_CTC_49   1      cadetblue       FALSE
-## 52      37 Br61_CTC_49   1      cadetblue       FALSE
-## 53      37 Br61_CTC_49   1      cadetblue       FALSE
-## 54      37 Br61_CTC_49   1      cadetblue       FALSE
-## 55      38  Br61_CTC_5   0         gray93        TRUE
-## 56      39 Br61_CTC_50   1     ghostwhite        TRUE
-## 57      40  Br61_CTC_6   0         gray93        TRUE
-## 58      41  Br61_CTC_8   0         gray93        TRUE
-## 59      42  Br61_CTC_9   0         gray93        TRUE
-
-
-

General overview

-

We sample 1000 trees.

-

For each pair of cells in the same cluster and each sampled tree we -compute the splitting score, that is, the probability that the two cells -have experienced divergent evolution. A low splitting score (close to 0) -indicates that the two cells are likely genealogically closely related, -while a high splitting score (close to 1) indicates that the two cells -have evolved in a divergent manner.

-

Throughout the sampling of trees, this gives rise to an empirical distribution of splitting scores for each pair of cells in the same cluster. Intuitively, this distribution takes into account the uncertainty in the tree estimation. To be able to interpret the splitting score appropriately (e.g. to answer the question of when a splitting score is high enough to call oligo-clonality), we need to calibrate our expectations.

-

We do this by assessing the distributions of splitting scores when we know that the clusters are mono-clonal. To this end, we simulate reference and alternative read count data of mono-clonal clusters of different sizes (2-, 3-, 4- and 5-cell clusters), add these to the original dataset, run the tree inference algorithm and compute the splitting score distributions for all pairs of cells in the same cluster. To ensure that the simulated data does not confound the tree inference too much, we do this one cluster at a time.

-

For each simulated cluster we pick one pair of cells and print its splitting score distribution below. With a high number of sampled trees, the distributions of all pairs of cells from the same cluster are very similar, since the model treats all cells from the same cluster as interchangeable.

-

Finally, we print the empirical distribution of the splitting scores for all clusters of the same size.

-

The latter is used to specify the cutoff for oligo-clonality: it is defined as the 95th percentile of the aggregated distribution of splitting scores.

-
# Build the null-distribution cutoffs, one row per simulated cluster size.
# Both tables start empty and get one row appended per successful iteration.
cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector())
-cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector())
-
# For each cluster size, load the simulated dataset named "<treeName>_<size>"
# and score the cell pairs selected by color.
# NOTE(review): try() wraps the whole iteration, so any failure (e.g. a missing
# or empty simulation file) is only printed and leaves NO cutoff row for that
# cluster size; downstream lookups for that size then return a zero-length value.
-for (clusterSize in 2:5){
-  try(
-  {treeNameSimulated <- paste(treeName, clusterSize, sep = '_')
-
-
-  inputSimulated <- load_data(simulationInputFolder, treeNameSimulated)
-
# NOTE(review): sampleDescriptionSimulated is assigned but not used in this
# chunk; the call below reads inputSimulated$sample_description directly.
-  sampleDescriptionSimulated <- inputSimulated$sample_description
-  
# cellPairSelection lists cluster colors; presumably these are the colors given
# to the simulated clusters -- verify against computeClusterSplits().
-  distance <- computeClusterSplits(inputSimulated$sample_description, inputSimulated$postSampling, treeNameSimulated, inputSimulated$nCells,
-                     inputSimulated$nMutations, inputSimulated$nClusters,
-                     inputSimulated$alleleCount,
-                     inputSimulated$mutatedReadCounts, inputSimulated$totalReadCounts,
-                     nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents,
-                     cellPairSelection = c("orchid", "orchid1", "orchid2",
-                                           "orchid3", "orchid4", "darkorchid",
-                                           "darkorchid1","darkorchid2", "darkorchid3",
-                                           "darkorchid4", "purple", "purple1",
-                                           "purple2", "purple3", "purple4"))
-
-  
# Splitting-probability cutoff: mean + 2 standard deviations of the simulated values.
-  cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) ))
-
# Histogram of the aggregated branching probabilities (bin width 0.01), then
# the branching-probability cutoff: their 95% quantile.
-  plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) +
-    geom_histogram(binwidth = 0.01))
-  cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] ))
-  })
-}
-
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
-## e.g.:
-##   dat <- vroom(...)
-##   problems(dat)
-
## Rows: 47013 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (1): Tree
-## dbl (4): LogScore, SequencingErrorRate, DropoutRate, LogTau
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 280 Columns: 98
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (95): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X1...
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 47 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-
## [1] "Computing genomic distances of leaves: 60 59"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 62 61"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 64 63"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 66 65"
-## [1] "Computing the posterior distribution"
-

-
## Rows: 12100 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (1): Tree
-## dbl (4): LogScore, SequencingErrorRate, DropoutRate, LogTau
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 280 Columns: 96
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (93): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X1...
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 46 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-

-
## [1] "Computing genomic distances of leaves: 60 59"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 63 62"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 66 65"
-## [1] "Computing the posterior distribution"
-

-
## Rows: 12353 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (1): Tree
-## dbl (4): LogScore, SequencingErrorRate, DropoutRate, LogTau
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 280 Columns: 94
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (91): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X1...
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 45 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-

-
## [1] "Computing genomic distances of leaves: 60 59"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 64 63"
-## [1] "Computing the posterior distribution"
-

-
## Rows: 12809 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (1): Tree
-## dbl (4): LogScore, SequencingErrorRate, DropoutRate, LogTau
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 280 Columns: 94
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (91): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X1...
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 45 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-

-
## [1] "Computing genomic distances of leaves: 60 59"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 65 64"
-## [1] "Computing the posterior distribution"
-

-
# Show the per-cluster-size 95% quantile cutoffs of the branching probabilities.
print(cutoffsBranchingProbabilities)
-
##   clusterSize       Cutoff
-## 1           2 1.714184e-04
-## 2           3 4.937160e-01
-## 3           4 7.319855e-08
-## 4           5 1.151823e-02
-

Now we can compute the aggregated splitting score distributions for -each cluster. The distribution’s mean is compared to the cutoffs -computed above, and if it is higher than the cutoff, we call the cluster -oligo-clonal.

-
# Classify each tumor cluster of the real dataset as oligoclonal or not by
# comparing its mean aggregated branching probability with the cutoff for its size.
nTumorClusters <- 0
-nOligoclonalClusters2 <- 0
-splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector())
-
-for(clusterSize in 2:5){
-  try({
# Colors of clusters that contain exactly `clusterSize` non-WBC cells,
# ignoring color 'gray93' (presumably the color of unclustered cells -- confirm).
-    clusterColor <- input$sample_description %>%
-    filter(WBC ==0 &  color != 'gray93') %>%
-    group_by(color) %>%
-    filter(n() == clusterSize) %>%
-    pull(color) %>%
-    unique() 
-    
-    for(color in clusterColor){
-      distance <- computeClusterSplits(input$sample_description, input$postSampling, treeName, input$nCells,
-                     input$nMutations, input$nClusters,
-                     input$alleleCount,
-                     input$mutatedReadCounts, input$totalReadCounts,
-                     nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents,
-                     cellPairSelection = c(color))
-
# NOTE(review): splittingProbs is computed but not used further in this chunk.
-      splittingProbs <- mean(distance$splittingProbs$Splitting_probability)
-      branchingProbs <- mean(distance$aggregatedBranchingProbabilities)
-      
-      nTumorClusters <- nTumorClusters + 1
-      oligoclonal <- FALSE
-
# NOTE(review): if cutoffsBranchingProbabilities has no row for this
# clusterSize, the lookup is zero-length and `if` raises an error; the
# surrounding try() then abandons the remaining colors of this size without
# recording them in splittingSummary2.
-      if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){
-        nOligoclonalClusters2 <- nOligoclonalClusters2 + 1
-        oligoclonal <- TRUE
-      }
-      splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize))
-    }
-  })
-}
-
## [1] "Computing genomic distances of leaves: 6 5"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 9 8"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 15 14"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 25 24"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 30 29"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 36 34"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 40 39"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 12 11"
-## [1] "Computing the posterior distribution"
-

-
# Count the distinct cluster colors that have more than one non-WBC cell
# (color 'gray93' excluded) and report how many were called oligoclonal.
# NOTE(review): this denominator includes clusters of any size > 1, whereas the
# classification loop only visits sizes 2 to 5 and tracks its own count in
# nTumorClusters -- confirm which denominator is intended.
numberOfCancerClusters <- input$sample_description %>%
-    filter(WBC ==0 &  color != 'gray93') %>%
-    group_by(color) %>%
-    filter(n() > 1) %>%
-    pull(color) %>%
-    unique() %>% length() 
-
-print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName))
-
## [1] "7 out of 8 clusters were found to be oligoclonal in Br61, using method 2"
-
# Per-cluster result table: color, oligoclonal call and cluster size.
print(splittingSummary2)
-
##            Color Oligoclonal ClusterSize
-## 1     lightcoral        TRUE           2
-## 2     sandybrown        TRUE           2
-## 3         khaki3        TRUE           2
-## 4   lemonchiffon        TRUE           2
-## 5           plum        TRUE           2
-## 6     violetred3        TRUE           2
-## 7 paleturquoise3        TRUE           2
-## 8       skyblue3       FALSE           3
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/experiments/data/htmls/Br7.html b/experiments/data/htmls/Br7.html deleted file mode 100644 index f497f97..0000000 --- a/experiments/data/htmls/Br7.html +++ /dev/null @@ -1,766 +0,0 @@ - - - - - - - - - - - - - - - -Br7 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Splitting statistics

-

This code analyses splitting statistics for CTC-clusters.

-

The analysis takes a list of trees sampled from its posterior -distribution as input and samples mutations placements for each of the -trees.

-
-
-

Configure the script

-
# Script configuration.
# inputFolder / simulationInputFolder are the folders handed to load_data() for
# the real and the simulated datasets; treeName selects the dataset.
# nTreeSamplingEvents / nMutationSamplingEvents are forwarded to
# computeClusterSplits().
# NOTE(review): both folders are absolute, machine-specific paths; the report
# cannot be re-run elsewhere without editing them.
inputFolder <- "/Users/jgawron/Documents/projects/CTC_backup/input_folder"
-simulationInputFolder <- "/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2"
-treeName <- "Br7"
-nTreeSamplingEvents <- 1000
-nMutationSamplingEvents <- 1000
-
-
-

Loading data

-
# Load the project's helper functions (the calls below use load_data() and
# computeClusterSplits()). NOTE(review): absolute, machine-specific path.
source("/Users/jgawron/Documents/projects/CTC-SCITE/CTC-SCITE/experiments/workflow/resources/functions.R")
-
## ── Attaching core tidyverse packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
-## ✔ dplyr     1.1.4     ✔ readr     2.1.5
-## ✔ forcats   1.0.0     ✔ stringr   1.5.1
-## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
-## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
-## ✔ purrr     1.0.2     
-## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
-## ✖ dplyr::filter() masks stats::filter()
-## ✖ dplyr::lag()    masks stats::lag()
-## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
-
# Load the dataset named `treeName` from `inputFolder`; the fields of the
# returned list are unpacked in the next chunk.
input <- load_data(inputFolder, treeName)
-
## Rows: 40000 Columns: 5
-## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (1): Tree
-## dbl (4): LogScore, SequencingErrorRate, DropoutRate, LogTau
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 40 Columns: 26
-## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (23): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X1...
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 11 Columns: 5
-## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-
# Unpack the fields of `input` into top-level variables used by later chunks.
# NOTE(review): nClusters is assigned twice with the same value; the second
# assignment is redundant.
postSampling <- input$postSampling
-nClusters <- input$nClusters
-ClusterID <- input$clusterID
-nCells <- input$nCells  
-nMutations <- input$nMutations
-nClusters <- input$nClusters
-alleleCount <- input$alleleCount
-mutatedReadCounts <- input$mutatedReadCounts
-totalReadCounts <- input$totalReadCounts
-sampleDescription <- input$sample_description
-
-
-

Sample description

-

Each row corresponds to a cell. Column description: - Cluster: A number indicating which sample the cell belongs to. - ClusterName: The name of the sample in the nodeDescription.tsv file. - WBC: A binary vector indicating whether the cell is a white blood cell (1) or not (0). - color: Indicates the color of the cluster in the tree, as described in the nodeDescription.tsv file.

-
# Print the per-cell sample description table.
print(sampleDescription)
-
##    Cluster ClusterName WBC        color single_cell
-## 1        0    Br7_1_BS   1   lightcoral       FALSE
-## 2        0    Br7_1_BS   1   lightcoral       FALSE
-## 3        0    Br7_1_BS   0   lightcoral       FALSE
-## 4        0    Br7_1_BS   0   lightcoral       FALSE
-## 5        1    Br7_1_SG   1   sandybrown       FALSE
-## 6        1    Br7_1_SG   1   sandybrown       FALSE
-## 7        1    Br7_1_SG   1   sandybrown       FALSE
-## 8        2       Br7_2   0       gray93        TRUE
-## 9        3       Br7_3   0     skyblue3       FALSE
-## 10       3       Br7_3   0     skyblue3       FALSE
-## 11       3       Br7_3   0     skyblue3       FALSE
-## 12       4       Br7_4   0       gray93        TRUE
-## 13       5   Br7_4_WBC   1   ghostwhite        TRUE
-## 14       6       Br7_5   0       gray93        TRUE
-## 15       7   Br7_5_WBC   1   ghostwhite        TRUE
-## 16       8       Br7_6   1      thistle       FALSE
-## 17       8       Br7_6   1      thistle       FALSE
-## 18       8       Br7_6   0      thistle       FALSE
-## 19       8       Br7_6   0      thistle       FALSE
-## 20       9       Br7_7   1 lemonchiffon       FALSE
-## 21       9       Br7_7   1 lemonchiffon       FALSE
-## 22      10    Br7_7_12   1   violetred3       FALSE
-## 23      10    Br7_7_12   1   violetred3       FALSE
-## 24      10    Br7_7_12   1   violetred3       FALSE
-

Get null distributions of relevant statistics, stratified by -sample:

-
# Build the null-distribution cutoffs, one row per simulated cluster size.
# Both tables start empty and get one row appended per successful iteration.
cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector())
-cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector())
-
# For each cluster size, load the simulated dataset named "<treeName>_<size>"
# and score the cell pairs selected by color.
# NOTE(review): try() wraps the whole iteration, so a failing size is only
# printed and contributes no row to either cutoff table.
-for (clusterSize in 2:5){
-  try(
-  {treeNameSimulated <- paste(treeName, clusterSize, sep = '_')
-
-
-  inputSimulated <- load_data(simulationInputFolder, treeNameSimulated)
-
# Unpack the simulated dataset. NOTE(review): nClustersSimulated is assigned
# twice and ClusterIDSimulated is not used in this chunk.
-  postSamplingSimulated <- inputSimulated$postSampling
-  nClustersSimulated <- inputSimulated$nClusters
-  ClusterIDSimulated <- inputSimulated$clusterID
-  nCellsSimulated <- inputSimulated$nCells  
-  nMutationsSimulated <- inputSimulated$nMutations
-  nClustersSimulated <- inputSimulated$nClusters
-  alleleCountSimulated <- inputSimulated$alleleCount
-  mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts
-  totalReadCountsSimulated <- inputSimulated$totalReadCounts
-  sampleDescriptionSimulated <- inputSimulated$sample_description
-  
# cellPairSelection lists cluster colors; presumably these are the colors given
# to the simulated clusters -- verify against computeClusterSplits().
-  distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated,
-                     nMutationsSimulated, nClustersSimulated,
-                     alleleCountSimulated,
-                     mutatedReadCountsSimulated, totalReadCountsSimulated,
-                     nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents,
-                     cellPairSelection = c("orchid", "orchid1", "orchid2",
-                                           "orchid3", "orchid4", "darkorchid",
-                                           "darkorchid1","darkorchid2", "darkorchid3",
-                                           "darkorchid4", "purple", "purple1",
-                                           "purple2", "purple3", "purple4"))
-
-  
-
# Box plot of the simulated splitting probabilities, then the
# splitting-probability cutoff: mean + 2 standard deviations.
-  plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) +
-    geom_boxplot())
-  cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) ))
-  
-  ## Note: aggregatedBranchingProbabilities pools all pairs of cells from the same cluster,
-  ## so clusters with more cells are counted more often and contribute more to the shape
-  ## of the final distribution. This is not a problem right now because we only aggregate
-  ## over clusters of the same size, but it is a potential source of a future bug!
-  
# Histogram of the aggregated branching probabilities (bin width 0.01); their
# 95% quantile is printed and stored as the branching-probability cutoff.
-  plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) +
-    geom_histogram(binwidth = 0.01))
-  print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] ))
-  cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] ))
-  })
-}
-
## Rows: 15273 Columns: 5
-## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (1): Tree
-## dbl (4): LogScore, SequencingErrorRate, DropoutRate, LogTau
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 40 Columns: 34
-## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (31): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X1...
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 15 Columns: 5
-## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-
## [1] "Computing genomic distances of leaves: 25 24"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 27 26"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 29 28"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 31 30"
-## [1] "Computing the posterior distribution"
-

-
##   clusterSize     Cutoff
-## 1           2 0.03323302
-
## Rows: 13973 Columns: 5
-## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (1): Tree
-## dbl (4): LogScore, SequencingErrorRate, DropoutRate, LogTau
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 40 Columns: 32
-## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (29): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X1...
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 14 Columns: 5
-## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-

-
## [1] "Computing genomic distances of leaves: 25 24"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 28 27"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 31 30"
-## [1] "Computing the posterior distribution"
-

-
##   clusterSize      Cutoff
-## 1           3 0.009729615
-
## Rows: 0 Columns: 5
-## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-## 
-## ??? (5): LogScore, SequencingErrorRate, DropoutRate, LogTau, Tree
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 40 Columns: 30
-## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (27): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X1...
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 13 Columns: 5
-## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-
## Error in sample.int(length(x), size, replace, prob) : 
-##   kann keine Stichprobe größer als die Grundgesamtheit nehmen,
-##  wenn 'replace = FALSE'
-
## Rows: 0 Columns: 5
-## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-## 
-## ??? (5): LogScore, SequencingErrorRate, DropoutRate, LogTau, Tree
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 40 Columns: 30
-## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (27): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X1...
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 13 Columns: 5
-## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-

-
## Error in sample.int(length(x), size, replace, prob) : 
-##   kann keine Stichprobe größer als die Grundgesamtheit nehmen,
-##  wenn 'replace = FALSE'
-

Get the relevant statistics for each of the clusters of a dataset and -output numbers of oligoclonal clusters:

-
# Classify each tumor cluster of the real dataset with two methods:
#   method 1: mean splitting probability vs. cutoffsSplittingProbs
#   method 2: mean aggregated branching probability vs. cutoffsBranchingProbabilities
# Results are accumulated in splittingSummary1 / splittingSummary2 and the
# nOligoclonalClusters1 / nOligoclonalClusters2 counters.
nTumorClusters <- 0
-nOligoclonalClusters1 <- 0
-nOligoclonalClusters2 <- 0
-splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector())
-splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector())
-
-for(clusterSize in 2:5){
-  try({
# Colors of clusters that contain exactly `clusterSize` non-WBC cells,
# ignoring color 'gray93' (presumably the color of unclustered cells -- confirm).
-    clusterColor <- sampleDescription %>%
-    filter(WBC ==0 &  color != 'gray93') %>%
-    group_by(color) %>%
-    filter(n() == clusterSize) %>%
-    pull(color) %>%
-    unique() 
-    
-    for(color in clusterColor){
-      distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells,
-                     nMutations, nClusters,
-                     alleleCount,
-                     mutatedReadCounts, totalReadCounts,
-                     nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents,
-                     cellPairSelection = c(color))
-
-      splittingProbs <- mean(distance$splittingProbs$Splitting_probability)
-      branchingProbs <- mean(distance$aggregatedBranchingProbabilities)
-    
-      nTumorClusters <- nTumorClusters + 1
-      oligoclonal <- FALSE
# Debug output: the cluster size and the method-1 cutoff looked up for it.
-      print(clusterSize)
-      print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])
# NOTE(review): if a cutoff table has no row for this clusterSize (its
# simulation failed), the lookup is zero-length and `if` raises an error; the
# surrounding try() then abandons the remaining colors of this size without
# recording them in either summary.
-      if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){
-        nOligoclonalClusters1 <- nOligoclonalClusters1 + 1
-        oligoclonal <- TRUE
-      }
-      splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize))
# Reset the flag before applying method 2.
-      oligoclonal <- FALSE
-      if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){
-        nOligoclonalClusters2 <- nOligoclonalClusters2 + 1
-        oligoclonal <- TRUE
-      }
-      splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize))
-    }
-  })
-}
-
## [1] "Computing genomic distances of leaves: 3 2"
-## [1] "Computing the posterior distribution"
-

-
## [1] 2
-## [1] 0.06828773
-## [1] "Computing genomic distances of leaves: 18 17"
-## [1] "Computing the posterior distribution"
-

-
## [1] 2
-## [1] 0.06828773
-## [1] "Computing genomic distances of leaves: 9 8"
-## [1] "Computing the posterior distribution"
-

-
## [1] 3
-## [1] 0.04414859
-
numberOfCancerClusters <- sampleDescription %>%
-    filter(WBC ==0 &  color != 'gray93') %>%
-    group_by(color) %>%
-    filter(n() > 1) %>%
-    pull(color) %>%
-    unique() %>% length() 
-
-print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName))
-
## [1] "3 out of 3 clusters were found to be oligoclonal in Br7, using method 1"
-
print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName))
-
## [1] "3 out of 3 clusters were found to be oligoclonal in Br7, using method 2"
-
print(splittingSummary1)
-
##        Color Oligoclonal ClusterSize
-## 1 lightcoral        TRUE           2
-## 2    thistle        TRUE           2
-## 3   skyblue3        TRUE           3
-
print(splittingSummary2)
-
##        Color Oligoclonal ClusterSize
-## 1 lightcoral        TRUE           2
-## 2    thistle        TRUE           2
-## 3   skyblue3        TRUE           3
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/experiments/data/htmls/Lu2.html b/experiments/data/htmls/Lu2.html deleted file mode 100644 index b88ddc9..0000000 --- a/experiments/data/htmls/Lu2.html +++ /dev/null @@ -1,705 +0,0 @@ - - - - - - - - - - - - - - - -Lu2 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Splitting statistics

-

This code analyses splitting statistics for CTC-clusters.

-

The analysis takes a list of trees sampled from their posterior -distribution as input and samples mutation placements for each of the -trees.

-
-
-

Configure the script

-
inputFolder <- "/Users/jgawron/Documents/projects/CTC_backup/input_folder"
-simulationInputFolder <- "/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2"
-treeName <- "Lu2"
-nTreeSamplingEvents <- 1000
-nMutationSamplingEvents <- 1000
-
-
-

Loading data

-
source("/Users/jgawron/Documents/projects/CTC-SCITE/CTC-SCITE/experiments/workflow/resources/functions.R")
-
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
-## ✔ dplyr     1.1.4     ✔ readr     2.1.5
-## ✔ forcats   1.0.0     ✔ stringr   1.5.1
-## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
-## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
-## ✔ purrr     1.0.2     
-## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
-## ✖ dplyr::filter() masks stats::filter()
-## ✖ dplyr::lag()    masks stats::lag()
-## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
-
input <- load_data(inputFolder, treeName)
-
## Rows: 40000 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (1): Tree
-## dbl (4): LogScore, SequencingErrorRate, DropoutRate, LogTau
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 158 Columns: 22
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (19): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X1...
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 9 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-
-
-

Sample description

-

Each row corresponds to a cell. Column description: - Cluster: A -number indicating the sample the cell belongs to. - ClusterName: The -name of the sample in the nodeDescription.tsv file - WBC: a binary -vector indicating whether the cell is a white blood cell (1) or not (0). -- color: Indicates the color of the cluster in the tree, as described in -the nodeDescription.tsv file.

-
print(input$sample_description)
-
##    Cluster ClusterName WBC      color single_cell
-## 1        0      Lu2_10   0     gray93        TRUE
-## 2        1      Lu2_11   0 lightcoral       FALSE
-## 3        1      Lu2_11   0 lightcoral       FALSE
-## 4        2      Lu2_13   0     gray93        TRUE
-## 5        3      Lu2_15   0     gray93        TRUE
-## 6        4      Lu2_17   0     gray93        TRUE
-## 7        5       Lu2_6   0     gray93        TRUE
-## 8        6       Lu2_7   0     gray93        TRUE
-## 9        7       Lu2_8   0     gray93        TRUE
-## 10       8       Lu2_9   0     gray93        TRUE
-
-
-

General overview

-

We sample 1000 trees.

-

For each pair of cells in the same cluster and each sampled tree we -compute the splitting score, that is, the probability that the two cells -have experienced divergent evolution. A low splitting score (close to 0) -indicates that the two cells are likely genealogically closely related, -while a high splitting score (close to 1) indicates that the two cells -have evolved in a divergent manner.

-

Throughout the sampling of trees, this gives rise to an empirical -distribution of splitting scores for each pair of cells in the same -cluster. Intuitively, this distribution takes into account the -uncertainty in the tree estimation. To be able to interpret the -splitting score appropriately (e.g. to answer the question of when a -splitting score is high enough to call oligo-clonality) we need to -calibrate our expectations.

-

We do this by assessing the distributions of splitting scores when we -know that the cluster is mono-clonal. To this end, we simulate -reference and alternative read count data of monoclonal clusters of -different sizes (2-, 3-, 4- and 5-cell clusters) and add these to the -original dataset, run the tree inference algorithm and compute the -splitting score distributions for all pairs of cells in the same -cluster. To ensure that the simulated data does not confound the tree -inference too much, we do this one cluster at a time.

-

For each simulated cluster we pick one pair of cells and print the -splitting score distribution below. With a high number of sampled trees, -the distributions of all pairs of cells from the same cluster are very -similar, since the model treats all cells from the same cluster as -interchangeable.

-

Finally, we print the empirical distribution of the splitting -scores for all clusters of the same size.

-

The latter is used to specify the cutoff for oligo-clonality: It is -defined as the 95th percentile of the aggregated distribution of -splitting scores.

-
cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector())
-cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector())
-
-for (clusterSize in 2:5){
-  try(
-  {treeNameSimulated <- paste(treeName, clusterSize, sep = '_')
-
-
-  inputSimulated <- load_data(simulationInputFolder, treeNameSimulated)
-
-  sampleDescriptionSimulated <- inputSimulated$sample_description
-  
-  distance <- computeClusterSplits(inputSimulated$sample_description, inputSimulated$postSampling, treeNameSimulated, inputSimulated$nCells,
-                     inputSimulated$nMutations, inputSimulated$nClusters,
-                     inputSimulated$alleleCount,
-                     inputSimulated$mutatedReadCounts, inputSimulated$totalReadCounts,
-                     nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents,
-                     cellPairSelection = c("orchid", "orchid1", "orchid2",
-                                           "orchid3", "orchid4", "darkorchid",
-                                           "darkorchid1","darkorchid2", "darkorchid3",
-                                           "darkorchid4", "purple", "purple1",
-                                           "purple2", "purple3", "purple4"))
-
-  
-  cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) ))
-
-  plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) +
-    geom_histogram(binwidth = 0.01))
-  cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] ))
-  })
-}
-
## Error : '/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2/Lu2_2/Lu2_2_postSampling.tsv' does not exist.
-
## Rows: 7301 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (1): Tree
-## dbl (4): LogScore, SequencingErrorRate, DropoutRate, LogTau
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 158 Columns: 28
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (25): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X1...
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 12 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-
## [1] "Computing genomic distances of leaves: 11 10"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 14 13"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 17 16"
-## [1] "Computing the posterior distribution"
-

-
## Rows: 7513 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (1): Tree
-## dbl (4): LogScore, SequencingErrorRate, DropoutRate, LogTau
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 158 Columns: 26
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (23): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X1...
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 11 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-

-
## [1] "Computing genomic distances of leaves: 11 10"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 15 14"
-## [1] "Computing the posterior distribution"
-

-
## Rows: 6916 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (1): Tree
-## dbl (4): LogScore, SequencingErrorRate, DropoutRate, LogTau
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 158 Columns: 26
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (23): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X1...
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 11 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-

-
## [1] "Computing genomic distances of leaves: 11 10"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 16 15"
-## [1] "Computing the posterior distribution"
-

-
print(cutoffsBranchingProbabilities)
-
##   clusterSize       Cutoff
-## 1           3 7.288943e-10
-## 2           4 1.400082e-09
-## 3           5 2.603720e-10
-

Now we can compute the aggregated splitting score distributions for -each cluster. The distribution’s mean is compared to the cutoffs -computed above, and if it is higher than the cutoff, we call the cluster -oligo-clonal.

-
nTumorClusters <- 0
-nOligoclonalClusters2 <- 0
-splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector())
-
-for(clusterSize in 2:5){
-  try({
-    clusterColor <- input$sample_description %>%
-    filter(WBC ==0 &  color != 'gray93') %>%
-    group_by(color) %>%
-    filter(n() == clusterSize) %>%
-    pull(color) %>%
-    unique() 
-    
-    for(color in clusterColor){
-      distance <- computeClusterSplits(input$sample_description, input$postSampling, treeName, input$nCells,
-                     input$nMutations, input$nClusters,
-                     input$alleleCount,
-                     input$mutatedReadCounts, input$totalReadCounts,
-                     nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents,
-                     cellPairSelection = c(color))
-
-      splittingProbs <- mean(distance$splittingProbs$Splitting_probability)
-      branchingProbs <- mean(distance$aggregatedBranchingProbabilities)
-    
-      nTumorClusters <- nTumorClusters + 1
-      oligoclonal <- FALSE
-
-      if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){
-        nOligoclonalClusters2 <- nOligoclonalClusters2 + 1
-        oligoclonal <- TRUE
-      }
-      splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize))
-    }
-  })
-}
-
## [1] "Computing genomic distances of leaves: 2 1"
-## [1] "Computing the posterior distribution"
-

-
## Error in if (branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize ==  : 
-##   Argument hat Länge 0
-
numberOfCancerClusters <- input$sample_description %>%
-    filter(WBC ==0 &  color != 'gray93') %>%
-    group_by(color) %>%
-    filter(n() > 1) %>%
-    pull(color) %>%
-    unique() %>% length() 
-
-print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName))
-
## [1] "0 out of 1 clusters were found to be oligoclonal in Lu2, using method 2"
-
print(splittingSummary2)
-
## [1] Color       Oligoclonal ClusterSize
-## <0 Zeilen> (oder row.names mit Länge 0)
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/experiments/data/htmls/Lu7.html b/experiments/data/htmls/Lu7.html deleted file mode 100644 index 0508eea..0000000 --- a/experiments/data/htmls/Lu7.html +++ /dev/null @@ -1,621 +0,0 @@ - - - - - - - - - - - - - - - -Lu7 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Splitting statistics

-

This code analyses splitting statistics for CTC-clusters.

-

The analysis takes a list of trees sampled from their posterior -distribution as input and samples mutation placements for each of the -trees.

-
-
-

Configure the script

-
inputFolder <- "/Users/jgawron/Documents/projects/CTC_backup/input_folder"
-simulationInputFolder <- "/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2"
-treeName <- "Lu7"
-nTreeSamplingEvents <- 1000
-nMutationSamplingEvents <- 1000
-
-
-

Loading data

-
source("/Users/jgawron/Documents/projects/CTC-SCITE/CTC-SCITE/experiments/workflow/resources/functions.R")
-
## ── Attaching core tidyverse packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
-## ✔ dplyr     1.1.4     ✔ readr     2.1.5
-## ✔ forcats   1.0.0     ✔ stringr   1.5.1
-## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
-## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
-## ✔ purrr     1.0.2     
-## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
-## ✖ dplyr::filter() masks stats::filter()
-## ✖ dplyr::lag()    masks stats::lag()
-## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
-
input <- load_data(inputFolder, treeName)
-
## Rows: 40000 Columns: 5
-## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (1): Tree
-## dbl (4): LogScore, SequencingErrorRate, DropoutRate, LogTau
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 47 Columns: 42
-## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (39): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X1...
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 19 Columns: 5
-## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-
postSampling <- input$postSampling
-nClusters <- input$nClusters
-ClusterID <- input$clusterID
-nCells <- input$nCells  
-nMutations <- input$nMutations
-nClusters <- input$nClusters
-alleleCount <- input$alleleCount
-mutatedReadCounts <- input$mutatedReadCounts
-totalReadCounts <- input$totalReadCounts
-sampleDescription <- input$sample_description
-
-
-

Sample description

-

Each row corresponds to a cell. Column description: - Cluster: A -number indicating which sample the cell belongs to. - ClusterName: The -name of the sample in the nodeDescription.tsv file - WBC: a binary -vector indicating whether the cell is a white blood cell (1) or not (0). -- color: Indicates the color of the cluster in the tree, as described in -the nodeDescription.tsv file.

-
print(sampleDescription)
-
##    Cluster ClusterName WBC      color single_cell
-## 1        0       Lu7_1   0     gray93        TRUE
-## 2        1      Lu7_10   0     gray93        TRUE
-## 3        2      Lu7_11   0     gray93        TRUE
-## 4        3      Lu7_12   0     gray93        TRUE
-## 5        4      Lu7_13   0     gray93        TRUE
-## 6        5      Lu7_14   0     gray93        TRUE
-## 7        6      Lu7_15   0     gray93        TRUE
-## 8        7      Lu7_16   0     gray93        TRUE
-## 9        8      Lu7_17   0     gray93        TRUE
-## 10       9      Lu7_18   1 ghostwhite        TRUE
-## 11      10      Lu7_19   1 ghostwhite        TRUE
-## 12      11       Lu7_2   0     gray93        TRUE
-## 13      12       Lu7_3   0     gray93        TRUE
-## 14      13       Lu7_4   0     gray93        TRUE
-## 15      14       Lu7_5   0     gray93        TRUE
-## 16      15       Lu7_6   0     gray93        TRUE
-## 17      16       Lu7_7   0     gray93        TRUE
-## 18      17       Lu7_8   0     gray93        TRUE
-## 19      18       Lu7_9   0     gray93        TRUE
-

Get null distributions of relevant statistics, stratified by -sample:

-
cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector())
-cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector())
-
-for (clusterSize in 2:5){
-  try(
-  {treeNameSimulated <- paste(treeName, clusterSize, sep = '_')
-
-
-  inputSimulated <- load_data(simulationInputFolder, treeNameSimulated)
-
-  postSamplingSimulated <- inputSimulated$postSampling
-  nClustersSimulated <- inputSimulated$nClusters
-  ClusterIDSimulated <- inputSimulated$clusterID
-  nCellsSimulated <- inputSimulated$nCells  
-  nMutationsSimulated <- inputSimulated$nMutations
-  nClustersSimulated <- inputSimulated$nClusters
-  alleleCountSimulated <- inputSimulated$alleleCount
-  mutatedReadCountsSimulated <- inputSimulated$mutatedReadCounts
-  totalReadCountsSimulated <- inputSimulated$totalReadCounts
-  sampleDescriptionSimulated <- inputSimulated$sample_description
-  
-  distance <- computeClusterSplits(sampleDescriptionSimulated, postSamplingSimulated, treeNameSimulated, nCellsSimulated,
-                     nMutationsSimulated, nClustersSimulated,
-                     alleleCountSimulated,
-                     mutatedReadCountsSimulated, totalReadCountsSimulated,
-                     nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents,
-                     cellPairSelection = c("orchid", "orchid1", "orchid2",
-                                           "orchid3", "orchid4", "darkorchid",
-                                           "darkorchid1","darkorchid2", "darkorchid3",
-                                           "darkorchid4", "purple", "purple1",
-                                           "purple2", "purple3", "purple4"))
-
-  
-
-  plot(ggplot(distance$splittingProbs, aes(x = "Values", y = Splitting_probability, fill = 'Splitting_probability')) +
-    geom_boxplot())
-  cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) ))
-  
-  ##Note that the way the aggregatedBranchingProbabilities are computed all pairs of cells from the same cluster are
-  ## taken into account. This has the effect that clusters with more cells would be counted more often and contribute more
-  ## to the shape of the final distribution. This is no problem right now as we only aggregate counts from clusters
-  ## of the same size, it is however the potential source of a future bug!!
-  
-  plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) +
-    geom_histogram(binwidth = 0.01))
-  print(data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] ))
-  cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] ))
-  })
-}
-
## Error : '/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2/Lu7_2/Lu7_2_postSampling.tsv' does not exist.
-## Error : '/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2/Lu7_3/Lu7_3_postSampling.tsv' does not exist.
-## Error : '/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2/Lu7_4/Lu7_4_postSampling.tsv' does not exist.
-## Error : '/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2/Lu7_5/Lu7_5_postSampling.tsv' does not exist.
-

Get the relevant statistics for each of the clusters of a dataset and -output numbers of oligoclonal clusters:

-
nTumorClusters <- 0
-nOligoclonalClusters1 <- 0
-nOligoclonalClusters2 <- 0
-splittingSummary1 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector())
-splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector())
-
-for(clusterSize in 2:5){
-  try({
-    clusterColor <- sampleDescription %>%
-    filter(WBC ==0 &  color != 'gray93') %>%
-    group_by(color) %>%
-    filter(n() == clusterSize) %>%
-    pull(color) %>%
-    unique() 
-    
-    for(color in clusterColor){
-      distance <- computeClusterSplits(sampleDescription, postSampling, treeName, nCells,
-                     nMutations, nClusters,
-                     alleleCount,
-                     mutatedReadCounts, totalReadCounts,
-                     nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents,
-                     cellPairSelection = c(color))
-
-      splittingProbs <- mean(distance$splittingProbs$Splitting_probability)
-      branchingProbs <- mean(distance$aggregatedBranchingProbabilities)
-    
-      nTumorClusters <- nTumorClusters + 1
-      oligoclonal <- FALSE
-      print(clusterSize)
-      print(cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])
-      if(splittingProbs > (cutoffsSplittingProbs[(cutoffsSplittingProbs$clusterSize == clusterSize), 2])){
-        nOligoclonalClusters1 <- nOligoclonalClusters1 + 1
-        oligoclonal <- TRUE
-      }
-      splittingSummary1 <- rbind(splittingSummary1, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize))
-      oligoclonal <- FALSE
-      if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){
-        nOligoclonalClusters2 <- nOligoclonalClusters2 + 1
-        oligoclonal <- TRUE
-      }
-      splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize))
-    }
-  })
-}
-
-
-numberOfCancerClusters <- sampleDescription %>%
-    filter(WBC ==0 &  color != 'gray93') %>%
-    group_by(color) %>%
-    filter(n() > 1) %>%
-    pull(color) %>%
-    unique() %>% length() 
-
-print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 1', nOligoclonalClusters1, numberOfCancerClusters, treeName))
-
## [1] "0 out of 0 clusters were found to be oligoclonal in Lu7, using method 1"
-
print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName))
-
## [1] "0 out of 0 clusters were found to be oligoclonal in Lu7, using method 2"
-
print(splittingSummary1)
-
## [1] Color       Oligoclonal ClusterSize
-## <0 Zeilen> (oder row.names mit Länge 0)
-
print(splittingSummary2)
-
## [1] Color       Oligoclonal ClusterSize
-## <0 Zeilen> (oder row.names mit Länge 0)
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/experiments/data/htmls/Pr9.html b/experiments/data/htmls/Pr9.html deleted file mode 100644 index e308295..0000000 --- a/experiments/data/htmls/Pr9.html +++ /dev/null @@ -1,765 +0,0 @@ - - - - - - - - - - - - - - - -Pr9 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -
-

Splitting statistics

-

This code analyses splitting statistics for CTC-clusters.

-

The analysis takes a list of trees sampled from their posterior -distribution as input and computes the mutation placement probability -distribution for each one of them. From this distribution we derive a -score that quantifies the probability that two cells have experienced -divergent evolution. This score is called the splitting score.

-
-
-

Configure the script

-
inputFolder <- "/Users/jgawron/Documents/projects/CTC_backup/input_folder"
-simulationInputFolder <- "/Users/jgawron/Documents/projects/CTC_backup/simulations/simulations2"
-treeName <- "Pr9"
-nTreeSamplingEvents <- 1000
-nMutationSamplingEvents <- 1000
-
-
-

Loading data

-
source("/Users/jgawron/Documents/projects/CTC-SCITE/CTC-SCITE/experiments/workflow/resources/functions.R")
-
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
-## ✔ dplyr     1.1.4     ✔ readr     2.1.5
-## ✔ forcats   1.0.0     ✔ stringr   1.5.1
-## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
-## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
-## ✔ purrr     1.0.2     
-## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
-## ✖ dplyr::filter() masks stats::filter()
-## ✖ dplyr::lag()    masks stats::lag()
-## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
-
input <- load_data(inputFolder, treeName)
-
## Rows: 40000 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (1): Tree
-## dbl (4): LogScore, SequencingErrorRate, DropoutRate, LogTau
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 109 Columns: 54
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (51): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X1...
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 25 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-
-
-

Sample description

-

Each row corresponds to a cell. Column description: - Cluster: A -number indicating the sample the cell belongs to. - ClusterName: The -name of the sample in the nodeDescription.tsv file. - WBC: A binary -value indicating whether the cell is a white blood cell (1) or not (0). -- color: Indicates the color of the cluster in the tree, as described in -the nodeDescription.tsv file.

-
print(input$sample_description)
-
##    Cluster ClusterName WBC      color single_cell
-## 1        0   Pr9_CTC_1   0     gray93        TRUE
-## 2        1  Pr9_CTC_10   0     gray93        TRUE
-## 3        2  Pr9_CTC_11   0     gray93        TRUE
-## 4        3  Pr9_CTC_12   0     gray93        TRUE
-## 5        4  Pr9_CTC_13   0     gray93        TRUE
-## 6        5  Pr9_CTC_14   0     gray93        TRUE
-## 7        6  Pr9_CTC_15   0     gray93        TRUE
-## 8        7  Pr9_CTC_16   0     gray93        TRUE
-## 9        8  Pr9_CTC_17   0     gray93        TRUE
-## 10       9  Pr9_CTC_18   0     gray93        TRUE
-## 11      10  Pr9_CTC_19   0     gray93        TRUE
-## 12      11   Pr9_CTC_2   0     gray93        TRUE
-## 13      12  Pr9_CTC_20   0     gray93        TRUE
-## 14      13  Pr9_CTC_21   0     gray93        TRUE
-## 15      14  Pr9_CTC_22   0     gray93        TRUE
-## 16      15  Pr9_CTC_23   0     gray93        TRUE
-## 17      16  Pr9_CTC_24   0     gray93        TRUE
-## 18      17  Pr9_CTC_25   0     gray93        TRUE
-## 19      18  Pr9_CTC_26   0 lightcoral       FALSE
-## 20      18  Pr9_CTC_26   0 lightcoral       FALSE
-## 21      19  Pr9_CTC_27   0     gray93        TRUE
-## 22      20  Pr9_CTC_28   0     gray93        TRUE
-## 23      21   Pr9_CTC_5   0     gray93        TRUE
-## 24      22   Pr9_CTC_6   0 sandybrown       FALSE
-## 25      22   Pr9_CTC_6   0 sandybrown       FALSE
-## 26      22   Pr9_CTC_6   0 sandybrown       FALSE
-## 27      23   Pr9_CTC_7   1   skyblue3       FALSE
-## 28      23   Pr9_CTC_7   1   skyblue3       FALSE
-## 29      24   Pr9_CTC_9   0     gray93        TRUE
-
-
-

General overview

-

We sample 1000 trees.

-

For each pair of cells in the same cluster and each sampled tree we -compute the splitting score, that is, the probability that the two cells -have experienced divergent evolution. A low splitting score (close to 0) -indicates that the two cells are likely genealogically closely related, -while a high splitting score (close to 1) indicates that the two cells -have evolved in a divergent manner.

-

Throughout the sampling of trees, this gives rise to an empirical -distribution of splitting scores for each pair of cells in the same -cluster. Intuitively, this distribution takes into account the -uncertainty in the tree estimation. To be able to interpret the -splitting score appropriately (e.g. to answer the question of when a -splitting score is high enough to call oligo-clonality), we need to -calibrate our expectations.

-

We do this by assessing the distributions of splitting scores when we -know that the cluster is mono-clonal. To this end, we simulate -reference and alternative read count data of monoclonal clusters of -different sizes (2-, 3-, 4- and 5-cell clusters) and add these to the -original dataset, run the tree inference algorithm and compute the -splitting score distributions for all pairs of cells in the same -cluster. To ensure that the simulated data does not confound the tree -inference too much, we do this one cluster at a time.

-

For each simulated cluster we pick one pair of cells and print the -splitting score distribution below. With a high number of sampled trees, -the distributions of all pairs of cells from the same cluster are very -similar, since the model treats all cells from the same cluster as -interchangeable.

-

Finally, we print the empirical distribution of the splitting -scores for all clusters of the same size.

-

The latter is used to specify the cutoff for oligo-clonality: it is -defined as the 95th percentile of the aggregated distribution of -splitting scores.

-
cutoffsSplittingProbs <- data.frame(clusterSize = vector(), Cutoff = vector())
-cutoffsBranchingProbabilities <- data.frame(clusterSize = vector(), Cutoff = vector())
-
-for (clusterSize in 2:5){
-  try(
-  {treeNameSimulated <- paste(treeName, clusterSize, sep = '_')
-
-
-  inputSimulated <- load_data(simulationInputFolder, treeNameSimulated)
-
-  sampleDescriptionSimulated <- inputSimulated$sample_description
-  
-  distance <- computeClusterSplits(inputSimulated$sample_description, inputSimulated$postSampling, treeNameSimulated, inputSimulated$nCells,
-                     inputSimulated$nMutations, inputSimulated$nClusters,
-                     inputSimulated$alleleCount,
-                     inputSimulated$mutatedReadCounts, inputSimulated$totalReadCounts,
-                     nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents,
-                     cellPairSelection = c("orchid", "orchid1", "orchid2",
-                                           "orchid3", "orchid4", "darkorchid",
-                                           "darkorchid1","darkorchid2", "darkorchid3",
-                                           "darkorchid4", "purple", "purple1",
-                                           "purple2", "purple3", "purple4"))
-
-  
-  cutoffsSplittingProbs <- rbind(cutoffsSplittingProbs, data.frame(clusterSize = clusterSize, Cutoff = mean(distance$splittingProbs$Splitting_probability) + 2 * sd(distance$splittingProbs$Splitting_probability) ))
-
-  plot(ggplot(data.frame(x = distance$aggregatedBranchingProbabilities), aes(x = x)) +
-    geom_histogram(binwidth = 0.01))
-  cutoffsBranchingProbabilities <- rbind(cutoffsBranchingProbabilities, data.frame(clusterSize = clusterSize, Cutoff = quantile(distance$aggregatedBranchingProbabilities, probs = 0.95, names = FALSE)[1] ))
-  })
-}
-
## Rows: 58058 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (1): Tree
-## dbl (4): LogScore, SequencingErrorRate, DropoutRate, LogTau
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 109 Columns: 62
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (59): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X1...
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 29 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-
## [1] "Computing genomic distances of leaves: 30 29"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 32 31"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 34 33"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 36 35"
-## [1] "Computing the posterior distribution"
-

-
## Rows: 57822 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (1): Tree
-## dbl (4): LogScore, SequencingErrorRate, DropoutRate, LogTau
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 109 Columns: 60
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (57): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X1...
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 28 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-

-
## [1] "Computing genomic distances of leaves: 30 29"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 33 32"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 36 35"
-## [1] "Computing the posterior distribution"
-

-
## Rows: 16255 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (1): Tree
-## dbl (4): LogScore, SequencingErrorRate, DropoutRate, LogTau
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 109 Columns: 58
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (55): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X1...
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 27 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-

-
## [1] "Computing genomic distances of leaves: 30 29"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 34 33"
-## [1] "Computing the posterior distribution"
-

-
## Rows: 14665 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (1): Tree
-## dbl (4): LogScore, SequencingErrorRate, DropoutRate, LogTau
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 109 Columns: 58
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr  (3): X1, X3, X4
-## dbl (55): X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17, X1...
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-## Rows: 27 Columns: 5
-## ── Column specification ────────────────────────────────────────────────────────
-## Delimiter: "\t"
-## chr (2): Cluster, Description
-## dbl (3): CellCount, TCs, WBCs
-## 
-## ℹ Use `spec()` to retrieve the full column specification for this data.
-## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-

-
## [1] "Computing genomic distances of leaves: 30 29"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 35 34"
-## [1] "Computing the posterior distribution"
-

-
print(cutoffsBranchingProbabilities)
-
##   clusterSize     Cutoff
-## 1           2 0.04707869
-## 2           3 0.49998666
-## 3           4 0.72476307
-## 4           5 0.31157830
-

Now we can compute the aggregated splitting score distributions for -each cluster. The distribution’s mean is compared to the cutoffs -computed above, and if it is higher than the cutoff, we call the cluster -oligo-clonal.

-
nTumorClusters <- 0
-nOligoclonalClusters2 <- 0
-splittingSummary2 <- data.frame(Color = vector(), Oligoclonal = vector(), ClusterSize = vector())
-
-for(clusterSize in 2:5){
-  try({
-    clusterColor <- input$sample_description %>%
-    filter(WBC ==0 &  color != 'gray93') %>%
-    group_by(color) %>%
-    filter(n() == clusterSize) %>%
-    pull(color) %>%
-    unique() 
-    
-    for(color in clusterColor){
-      distance <- computeClusterSplits(input$sample_description, input$postSampling, treeName, input$nCells,
-                     input$nMutations, input$nClusters,
-                     input$alleleCount,
-                     input$mutatedReadCounts, input$totalReadCounts,
-                     nMutationSamplingEvents = nMutationSamplingEvents, nTreeSamplingEvents = nTreeSamplingEvents,
-                     cellPairSelection = c(color))
-
-      splittingProbs <- mean(distance$splittingProbs$Splitting_probability)
-      branchingProbs <- mean(distance$aggregatedBranchingProbabilities)
-      
-      nTumorClusters <- nTumorClusters + 1
-      oligoclonal <- FALSE
-
-      if(branchingProbs > cutoffsBranchingProbabilities[(cutoffsBranchingProbabilities$clusterSize == clusterSize), 2]){
-        nOligoclonalClusters2 <- nOligoclonalClusters2 + 1
-        oligoclonal <- TRUE
-      }
-      splittingSummary2 <- rbind(splittingSummary2, data.frame(Color = color, Oligoclonal = oligoclonal, ClusterSize = clusterSize))
-    }
-  })
-}
-
## [1] "Computing genomic distances of leaves: 19 18"
-## [1] "Computing the posterior distribution"
-

-
## [1] "Computing genomic distances of leaves: 24 23"
-## [1] "Computing the posterior distribution"
-

-
numberOfCancerClusters <- input$sample_description %>%
-    filter(WBC ==0 &  color != 'gray93') %>%
-    group_by(color) %>%
-    filter(n() > 1) %>%
-    pull(color) %>%
-    unique() %>% length() 
-
-print(sprintf('%d out of %d clusters were found to be oligoclonal in %s, using method 2', nOligoclonalClusters2, numberOfCancerClusters, treeName))
-
## [1] "1 out of 2 clusters were found to be oligoclonal in Pr9, using method 2"
-
print(splittingSummary2)
-
##        Color Oligoclonal ClusterSize
-## 1 lightcoral        TRUE           2
-## 2 sandybrown       FALSE           3
-
- - - - -
- - - - - - - - - - - - - - - diff --git a/experiments/data/markdowns/Br16_AC_topSeparators.Rmd b/experiments/data/markdowns/Br16_AC_topSeparators.Rmd index c4c9b3c..b1e42df 100755 --- a/experiments/data/markdowns/Br16_AC_topSeparators.Rmd +++ b/experiments/data/markdowns/Br16_AC_topSeparators.Rmd @@ -18,9 +18,9 @@ output: ## Data ```{r initialization, message = FALSE} -source('../software/Rcode/annotateVariants.R') -sampleName <- 'Br16_AC' -inputFolder <- '../input_folder/' +source("../software/Rcode/annotateVariants.R") +sampleName <- "Br16_AC" +inputFolder <- "../input_folder/" annotations <- annotate_variants(sampleName, inputFolder) ``` @@ -40,10 +40,10 @@ This is a generalization of the earlier method to find the top seperating mutati # sandybrown ```{r} -clusterName <- 'sandybrown' +clusterName <- "sandybrown" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -51,12 +51,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. 
```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -83,7 +82,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.5) # select those below 0.5 say +selected_muts <- which(min_dist < 0.5) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -100,8 +99,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` @@ -111,13 +110,13 @@ We define a cut point to get distinct branches. 
These should roughly represent t ```{r} -par(cex=0.6) +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") -abline(h=0.8, lwd = 2, lty = 2, col = "green") +abline(h = 0.8, lwd = 2, lty = 2, col = "green") ``` ```{r} -#geneGroups <- cutree(hc, k = NULL, h = 0.6) +# geneGroups <- cutree(hc, k = NULL, h = 0.6) geneGroups <- cutree(hc, k = 2) ``` @@ -139,7 +138,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -157,14 +156,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_1 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_1 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_1 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_1 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_1 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_1 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_1 == "HIGH"))) ``` @@ -183,7 +182,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -201,14 +200,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_2 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_2 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_2 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_2 
== 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_2 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH"))) ``` @@ -216,10 +215,10 @@ print(sprintf('Number of mutations in cluster %s with high functional impact: %d # mistyrose ```{r} -clusterName <- 'mistyrose' +clusterName <- "mistyrose" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -227,12 +226,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. 
```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -259,7 +257,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.9) # select those below 0.5 say +selected_muts <- which(min_dist < 0.9) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -276,8 +274,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` -The choice of clustering is highly dependent on the filtering, so I will stop here. \ No newline at end of file +The choice of clustering is highly dependent on the filtering, so I will stop here. 
diff --git a/experiments/data/markdowns/Br16_B_topSeparators.Rmd b/experiments/data/markdowns/Br16_B_topSeparators.Rmd index 658bf58..139bf00 100755 --- a/experiments/data/markdowns/Br16_B_topSeparators.Rmd +++ b/experiments/data/markdowns/Br16_B_topSeparators.Rmd @@ -17,9 +17,9 @@ output: ## Data ```{r initialization, message = FALSE} -source('../software/Rcode/annotateVariants.R') -sampleName <- 'Br16_B' -inputFolder <- '../input_folder/' +source("../software/Rcode/annotateVariants.R") +sampleName <- "Br16_B" +inputFolder <- "../input_folder/" annotations <- annotate_variants(sampleName, inputFolder) ``` @@ -40,10 +40,10 @@ This is a generalization of the earlier method to find the top seperating mutati This is a 3-cell CTC-cluster. ```{r} -clusterName <- 'lightpink2' +clusterName <- "lightpink2" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -51,12 +51,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. 
```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -83,7 +82,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.5) # select those below 0.5 say +selected_muts <- which(min_dist < 0.5) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -100,8 +99,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` @@ -111,13 +110,13 @@ We define a cut point to get distinct branches. 
These should roughly represent t ```{r} -par(cex=0.6) +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") -abline(h=0.8, lwd = 2, lty = 2, col = "green") +abline(h = 0.8, lwd = 2, lty = 2, col = "green") ``` ```{r} -#geneGroups <- cutree(hc, k = NULL, h = 0.6) +# geneGroups <- cutree(hc, k = NULL, h = 0.6) geneGroups <- cutree(hc, k = 3) ``` @@ -139,7 +138,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -157,14 +156,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_1 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_1 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_1 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_1 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_1 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_1 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_1 == "HIGH"))) ``` @@ -183,7 +182,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -201,14 +200,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_2 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_2 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_2 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_2 
== 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_2 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH"))) ``` @@ -225,7 +224,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -243,14 +242,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_2 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_2 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_2 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_2 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_2 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH"))) ``` @@ -260,10 +259,10 @@ print(sprintf('Number of mutations in cluster %s with high functional impact: %d This is a 3-cell CTC-cluster. 
```{r} -clusterName <- 'orangered4' +clusterName <- "orangered4" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -271,12 +270,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. ```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -303,7 +301,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.7) # select those below 0.5 say +selected_muts <- which(min_dist < 0.7) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -320,8 +318,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance 
matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` @@ -331,13 +329,13 @@ We define a cut point to get distinct branches. These should roughly represent t ```{r} -par(cex=0.6) +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") -abline(h=0.85, lwd = 2, lty = 2, col = "green") +abline(h = 0.85, lwd = 2, lty = 2, col = "green") ``` ```{r} -#geneGroups <- cutree(hc, k = NULL, h = 0.6) +# geneGroups <- cutree(hc, k = NULL, h = 0.6) geneGroups <- cutree(hc, k = 3) ``` @@ -359,7 +357,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -377,14 +375,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_1 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_1 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_1 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_1 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_1 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_1 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_1 == "HIGH"))) ``` @@ -403,7 +401,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -421,14 +419,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_2 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_2 <- inner_join(top_df, 
coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_2 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_2 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_2 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH"))) ``` @@ -445,7 +443,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -463,14 +461,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_2 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_2 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_2 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_2 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_2 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH"))) ``` @@ -479,10 +477,10 @@ print(sprintf('Number of mutations in cluster %s with high functional impact: %d This is a 3-cell CTC-cluster. 
```{r} -clusterName <- 'goldenrod' +clusterName <- "goldenrod" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -490,12 +488,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. ```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -522,7 +519,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.7) # select those below 0.5 say +selected_muts <- which(min_dist < 0.7) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -539,8 +536,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix 
+par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` @@ -550,13 +547,13 @@ We define a cut point to get distinct branches. These should roughly represent t ```{r} -par(cex=0.6) +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") -abline(h=0.82, lwd = 2, lty = 2, col = "green") +abline(h = 0.82, lwd = 2, lty = 2, col = "green") ``` ```{r} -#geneGroups <- cutree(hc, k = NULL, h = 0.6) +# geneGroups <- cutree(hc, k = NULL, h = 0.6) geneGroups <- cutree(hc, k = 2) ``` @@ -578,7 +575,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -596,14 +593,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_1 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_1 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_1 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_1 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_1 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_1 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_1 == "HIGH"))) ``` @@ -622,7 +619,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -640,14 +637,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_2 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_2 <- inner_join(top_df, coverage, by 
= "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_2 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_2 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_2 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH"))) ``` @@ -656,10 +653,10 @@ print(sprintf('Number of mutations in cluster %s with high functional impact: %d This is a 4-cell CTC-cluster. ```{r} -clusterName <- 'sienna2' +clusterName <- "sienna2" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -667,12 +664,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. 
```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -699,7 +695,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.5) # select those below 0.5 say +selected_muts <- which(min_dist < 0.5) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -716,8 +712,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` @@ -727,13 +723,13 @@ We define a cut point to get distinct branches. 
These should roughly represent t ```{r} -par(cex=0.6) +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") -abline(h=0.8, lwd = 2, lty = 2, col = "green") +abline(h = 0.8, lwd = 2, lty = 2, col = "green") ``` ```{r} -#geneGroups <- cutree(hc, k = NULL, h = 0.6) +# geneGroups <- cutree(hc, k = NULL, h = 0.6) geneGroups <- cutree(hc, k = 2) ``` @@ -755,7 +751,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -773,14 +769,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_1 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_1 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_1 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_1 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_1 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_1 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_1 == "HIGH"))) ``` @@ -799,7 +795,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -817,14 +813,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_2 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_2 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_2 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_2 
== 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_2 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH"))) ``` @@ -836,10 +832,10 @@ print(sprintf('Number of mutations in cluster %s with high functional impact: %d This is a 4-cell CTC-cluster. ```{r} -clusterName <- 'springgreen' +clusterName <- "springgreen" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -847,12 +843,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. 
```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -879,7 +874,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.7) # select those below 0.5 say +selected_muts <- which(min_dist < 0.7) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -896,8 +891,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` @@ -907,13 +902,13 @@ We define a cut point to get distinct branches. 
These should roughly represent t ```{r} -par(cex=0.6) +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") -abline(h=0.9, lwd = 2, lty = 2, col = "green") +abline(h = 0.9, lwd = 2, lty = 2, col = "green") ``` ```{r} -#geneGroups <- cutree(hc, k = NULL, h = 0.6) +# geneGroups <- cutree(hc, k = NULL, h = 0.6) geneGroups <- cutree(hc, k = 2) ``` @@ -935,7 +930,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -953,14 +948,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_1 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_1 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_1 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_1 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_1 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_1 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_1 == "HIGH"))) ``` @@ -979,7 +974,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -997,14 +992,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_2 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_2 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_2 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_2 
== 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_2 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH"))) ``` @@ -1018,10 +1013,10 @@ print(sprintf('Number of mutations in cluster %s with high functional impact: %d This is a 4-cell CTC-cluster. ```{r} -clusterName <- 'palegreen3' +clusterName <- "palegreen3" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -1029,12 +1024,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. 
```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -1061,7 +1055,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.75) # select those below 0.5 say +selected_muts <- which(min_dist < 0.75) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -1078,8 +1072,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` @@ -1089,13 +1083,13 @@ We define a cut point to get distinct branches. 
These should roughly represent t ```{r} -par(cex=0.6) +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") -abline(h=0.8, lwd = 2, lty = 2, col = "green") +abline(h = 0.8, lwd = 2, lty = 2, col = "green") ``` ```{r} -#geneGroups <- cutree(hc, k = NULL, h = 0.6) +# geneGroups <- cutree(hc, k = NULL, h = 0.6) geneGroups <- cutree(hc, k = 2) ``` @@ -1117,7 +1111,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -1135,14 +1129,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_1 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_1 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_1 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_1 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_1 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_1 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_1 == "HIGH"))) ``` @@ -1161,7 +1155,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -1179,14 +1173,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_2 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_2 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_2 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, 
sum(top_muts_2 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_2 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH"))) ``` diff --git a/experiments/data/markdowns/Br23_topSeparators.Rmd b/experiments/data/markdowns/Br23_topSeparators.Rmd index a36dd4c..0bbfd08 100755 --- a/experiments/data/markdowns/Br23_topSeparators.Rmd +++ b/experiments/data/markdowns/Br23_topSeparators.Rmd @@ -17,9 +17,9 @@ output: ## Data ```{r initialization} -source('../software/Rcode/annotateVariants.R') -sampleName <- 'Br23' -inputFolder <- '../input_folder/' +source("../software/Rcode/annotateVariants.R") +sampleName <- "Br23" +inputFolder <- "../input_folder/" ``` #### Mutation distance matrix @@ -36,10 +36,10 @@ This is a generalization of the earlier method to find the top seperating mutati # sandybrown ```{r} -clusterName <- 'sandybrown' +clusterName <- "sandybrown" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -47,13 +47,12 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. 
```{r message =FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) annotations <- annotate_variants(sampleName, inputFolder) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -81,7 +80,7 @@ ggheatmap(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.5) # select those below 0.5 say +selected_muts <- which(min_dist < 0.5) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -105,8 +104,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` @@ -117,10 +116,10 @@ No apparent clustering visible. ```{r} -clusterName <- 'skyblue3' +clusterName <- "skyblue3" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -128,13 +127,12 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. 
This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. ```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) annotations <- annotate_variants(sampleName, inputFolder) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -159,7 +157,7 @@ ggheatmap(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.8) # select those below 0.5 say +selected_muts <- which(min_dist < 0.8) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -179,9 +177,9 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` -No apparent clustering visible. \ No newline at end of file +No apparent clustering visible. 
diff --git a/experiments/data/markdowns/Br26_topSeparators.Rmd b/experiments/data/markdowns/Br26_topSeparators.Rmd index 8ca9c4e..89f49d1 100755 --- a/experiments/data/markdowns/Br26_topSeparators.Rmd +++ b/experiments/data/markdowns/Br26_topSeparators.Rmd @@ -17,9 +17,9 @@ output: ## Data ```{r initialization, message = FALSE} -source('../software/Rcode/annotateVariants.R') -sampleName <- 'Br26' -inputFolder <- '../input_folder/' +source("../software/Rcode/annotateVariants.R") +sampleName <- "Br26" +inputFolder <- "../input_folder/" annotations <- annotate_variants(sampleName, inputFolder) ``` @@ -38,10 +38,10 @@ This is a generalization of the earlier method to find the top seperating mutati # plum ```{r} -clusterName <- 'lightcoral' +clusterName <- "lightcoral" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -49,12 +49,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. 
```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -81,7 +80,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<1) # select those below 0.5 say +selected_muts <- which(min_dist < 1) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -98,8 +97,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` @@ -118,13 +117,13 @@ We define a cut point to get distinct branches. 
These should roughly represent t ```{r} -par(cex=0.6) +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") -abline(h=0.98, lwd = 2, lty = 2, col = "green") +abline(h = 0.98, lwd = 2, lty = 2, col = "green") ``` ```{r} -#geneGroups <- cutree(hc, k = NULL, h = 0.6) +# geneGroups <- cutree(hc, k = NULL, h = 0.6) geneGroups <- cutree(hc, k = 2) ``` @@ -156,12 +155,12 @@ top_df$variantName <- cluster1 ``` ```{r} -top_muts_1 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_1 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_1 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_1 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_1 == 'HIGH'))) -``` \ No newline at end of file +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_1 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_1 == "HIGH"))) +``` diff --git a/experiments/data/markdowns/Br38_topSeparators.Rmd b/experiments/data/markdowns/Br38_topSeparators.Rmd index cb3399f..fb0aba8 100755 --- a/experiments/data/markdowns/Br38_topSeparators.Rmd +++ b/experiments/data/markdowns/Br38_topSeparators.Rmd @@ -17,9 +17,9 @@ output: ## Data ```{r initialization, message = FALSE} -source('../software/Rcode/annotateVariants.R') -sampleName <- 'Br38' -inputFolder <- '../input_folder/' +source("../software/Rcode/annotateVariants.R") +sampleName <- "Br38" +inputFolder <- "../input_folder/" annotations <- annotate_variants(sampleName, inputFolder) ``` @@ -38,10 +38,10 @@ This is a generalization of the earlier method to find the top seperating mutati # plum ```{r} -clusterName <- 
'lightcoral' +clusterName <- "lightcoral" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -49,12 +49,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. ```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -81,7 +80,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.7) # select those below 0.5 say +selected_muts <- which(min_dist < 0.7) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -98,8 +97,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, 
main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` -No apparent clustering visible. \ No newline at end of file +No apparent clustering visible. diff --git a/experiments/data/markdowns/Br61_topSeparators.Rmd b/experiments/data/markdowns/Br61_topSeparators.Rmd index 21d9cb2..0ea1255 100755 --- a/experiments/data/markdowns/Br61_topSeparators.Rmd +++ b/experiments/data/markdowns/Br61_topSeparators.Rmd @@ -17,9 +17,9 @@ output: ## Data ```{r initialization, message = FALSE} -source('../software/Rcode/annotateVariants.R') -sampleName <- 'Br61' -inputFolder <- '../input_folder/' +source("../software/Rcode/annotateVariants.R") +sampleName <- "Br61" +inputFolder <- "../input_folder/" annotations <- annotate_variants(sampleName, inputFolder) ``` @@ -38,10 +38,10 @@ This is a generalization of the earlier method to find the top seperating mutati # lightcoral ```{r} -clusterName <- 'lightcoral' +clusterName <- "lightcoral" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -49,12 +49,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. 
```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -81,7 +80,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.6) # select those below 0.5 say +selected_muts <- which(min_dist < 0.6) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -103,8 +102,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` No apparent clustering visible. @@ -115,10 +114,10 @@ No apparent clustering visible. # sandybrown ```{r} -clusterName <- 'sandybrown' +clusterName <- "sandybrown" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -126,12 +125,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. 
This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. ```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -158,7 +156,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.5) # select those below 0.5 say +selected_muts <- which(min_dist < 0.5) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -175,8 +173,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` @@ -186,13 +184,13 @@ We define a cut point to get distinct branches. 
These should roughly represent t ```{r} -par(cex=0.6) +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") -abline(h=0.8, lwd = 2, lty = 2, col = "green") +abline(h = 0.8, lwd = 2, lty = 2, col = "green") ``` ```{r} -#geneGroups <- cutree(hc, k = NULL, h = 0.6) +# geneGroups <- cutree(hc, k = NULL, h = 0.6) geneGroups <- cutree(hc, k = 2) ``` @@ -214,7 +212,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -232,14 +230,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_1 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_1 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_1 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_1 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_1 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_1 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_1 == "HIGH"))) ``` @@ -258,7 +256,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -276,23 +274,23 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_2 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_2 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_2 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_2 
== 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_2 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH"))) ``` # khaki3 ```{r} -clusterName <- 'khaki3' +clusterName <- "khaki3" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -300,12 +298,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. 
```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -332,7 +329,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.5) # select those below 0.5 say +selected_muts <- which(min_dist < 0.5) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -353,8 +350,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` No apparent clustering visible. @@ -363,10 +360,10 @@ No apparent clustering visible. # lemonchiffon ```{r} -clusterName <- 'lemonchiffon' +clusterName <- "lemonchiffon" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -374,12 +371,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. 
This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. ```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -406,7 +402,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.7) # select those below 0.5 say +selected_muts <- which(min_dist < 0.7) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -423,8 +419,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` @@ -434,13 +430,13 @@ We define a cut point to get distinct branches. 
These should roughly represent t ```{r} -par(cex=0.6) +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") -abline(h=0.82, lwd = 2, lty = 2, col = "green") +abline(h = 0.82, lwd = 2, lty = 2, col = "green") ``` ```{r} -#geneGroups <- cutree(hc, k = NULL, h = 0.6) +# geneGroups <- cutree(hc, k = NULL, h = 0.6) geneGroups <- cutree(hc, k = 2) ``` @@ -462,7 +458,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -480,14 +476,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_1 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_1 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_1 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_1 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_1 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_1 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_1 == "HIGH"))) ``` @@ -506,7 +502,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -524,24 +520,24 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_2 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_2 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_2 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_2 
== 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_2 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH"))) ``` # plum ```{r} -clusterName <- 'plum' +clusterName <- "plum" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -549,12 +545,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. 
```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -581,7 +576,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.5) # select those below 0.5 say +selected_muts <- which(min_dist < 0.5) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -603,8 +598,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` No apparent clustering visible. @@ -614,10 +609,10 @@ No apparent clustering visible. # violetred3 ```{r} -clusterName <- 'violetred3' +clusterName <- "violetred3" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -625,12 +620,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. 
This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. ```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -657,7 +651,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.8) # select those below 0.5 say +selected_muts <- which(min_dist < 0.8) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -674,8 +668,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` @@ -685,13 +679,13 @@ We define a cut point to get distinct branches. 
These should roughly represent t ```{r} -par(cex=0.6) +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") -abline(h=0.95, lwd = 2, lty = 2, col = "green") +abline(h = 0.95, lwd = 2, lty = 2, col = "green") ``` ```{r} -#geneGroups <- cutree(hc, k = NULL, h = 0.6) +# geneGroups <- cutree(hc, k = NULL, h = 0.6) geneGroups <- cutree(hc, k = 2) ``` @@ -713,7 +707,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -731,14 +725,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_1 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_1 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_1 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_1 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_1 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_1 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_1 == "HIGH"))) ``` @@ -757,7 +751,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -775,14 +769,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_2 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_2 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_2 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_2 
== 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_2 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH"))) ``` @@ -791,10 +785,10 @@ print(sprintf('Number of mutations in cluster %s with high functional impact: %d # paleturquoise3 ```{r} -clusterName <- 'paleturquoise3' +clusterName <- "paleturquoise3" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -802,12 +796,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. 
```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -834,7 +827,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.5) # select those below 0.5 say +selected_muts <- which(min_dist < 0.5) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -851,8 +844,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` @@ -862,13 +855,13 @@ We define a cut point to get distinct branches. 
These should roughly represent t ```{r} -par(cex=0.6) +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") -abline(h=0.8, lwd = 2, lty = 2, col = "green") +abline(h = 0.8, lwd = 2, lty = 2, col = "green") ``` ```{r} -#geneGroups <- cutree(hc, k = NULL, h = 0.6) +# geneGroups <- cutree(hc, k = NULL, h = 0.6) geneGroups <- cutree(hc, k = 2) ``` @@ -890,7 +883,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -908,14 +901,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_1 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_1 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_1 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_1 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_1 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_1 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_1 == "HIGH"))) ``` @@ -934,7 +927,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -952,14 +945,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_2 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_2 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_2 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_2 
== 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_2 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH"))) ``` @@ -967,10 +960,10 @@ print(sprintf('Number of mutations in cluster %s with high functional impact: %d # skyblue3 ```{r} -clusterName <- 'skyblue3' +clusterName <- "skyblue3" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -978,12 +971,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. 
```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -1010,7 +1002,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.5) # select those below 0.5 say +selected_muts <- which(min_dist < 0.5) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -1032,8 +1024,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` -No apparent clustering visible. \ No newline at end of file +No apparent clustering visible. 
diff --git a/experiments/data/markdowns/Brx50_topSeparators.Rmd b/experiments/data/markdowns/Brx50_topSeparators.Rmd index 749864f..6001d18 100755 --- a/experiments/data/markdowns/Brx50_topSeparators.Rmd +++ b/experiments/data/markdowns/Brx50_topSeparators.Rmd @@ -17,9 +17,9 @@ output: ## Data ```{r initialization, message = FALSE} -source('../software/Rcode/annotateVariants.R') -sampleName <- 'Brx50' -inputFolder <- file.path('..','input_folder') +source("../software/Rcode/annotateVariants.R") +sampleName <- "Brx50" +inputFolder <- file.path("..", "input_folder") annotations <- annotate_variants(sampleName, inputFolder) ``` @@ -38,10 +38,10 @@ This is a generalization of the earlier method to find the top seperating mutati # lightcoral ```{r} -clusterName <- 'lightcoral' +clusterName <- "lightcoral" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -49,12 +49,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. 
```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -81,7 +80,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.5) # select those below 0.5 say +selected_muts <- which(min_dist < 0.5) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -98,8 +97,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` No apparent clustering structure visible. @@ -108,10 +107,10 @@ No apparent clustering structure visible. 
# sandybrown ```{r} -clusterName <- 'sandybrown' +clusterName <- "sandybrown" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -119,12 +118,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. ```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -151,7 +149,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.8) # select those below 0.5 say +selected_muts <- which(min_dist < 0.8) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -168,8 +166,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on 
distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` -No apparent clustering structure visible. \ No newline at end of file +No apparent clustering structure visible. diff --git a/experiments/data/markdowns/LM2_topSeparators.Rmd b/experiments/data/markdowns/LM2_topSeparators.Rmd index 6215a8f..08820ae 100755 --- a/experiments/data/markdowns/LM2_topSeparators.Rmd +++ b/experiments/data/markdowns/LM2_topSeparators.Rmd @@ -17,9 +17,9 @@ output: ## Data ```{r initialization, message = FALSE} -source('../software/Rcode/annotateVariants.R') -sampleName <- 'LM2' -inputFolder <- '../input_folder/' +source("../software/Rcode/annotateVariants.R") +sampleName <- "LM2" +inputFolder <- "../input_folder/" annotations <- annotate_variants(sampleName, inputFolder) ``` @@ -38,10 +38,10 @@ This is a generalization of the earlier method to find the top seperating mutati # thistle ```{r} -clusterName <- 'thistle' +clusterName <- "thistle" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -49,12 +49,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. 
```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -81,7 +80,7 @@ ggheatmap(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.85) # select those below 0.5 say +selected_muts <- which(min_dist < 0.85) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -100,8 +99,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` No apparent clustering visible. @@ -111,10 +110,10 @@ No apparent clustering visible. # violetred3 ```{r} -clusterName <- 'violetred3' +clusterName <- "violetred3" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -122,12 +121,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. 
This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. ```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -152,7 +150,7 @@ ggheatmap(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.5) # select those below 0.5 say +selected_muts <- which(min_dist < 0.5) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -172,8 +170,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` No apparent clustering visible. @@ -183,10 +181,10 @@ No apparent clustering visible. 
# lightslateblue ```{r} -clusterName <- 'lightslateblue' +clusterName <- "lightslateblue" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -194,12 +192,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. ```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -224,7 +221,7 @@ ggheatmap(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.5) # select those below 0.5 say +selected_muts <- which(min_dist < 0.5) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -246,8 +243,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of 
mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` No apparent clustering visible. @@ -256,10 +253,10 @@ No apparent clustering visible. # paleturquoise3 ```{r} -clusterName <- 'paleturquoise3' +clusterName <- "paleturquoise3" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -267,12 +264,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. 
```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -297,7 +293,7 @@ ggheatmap(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.5) # select those below 0.5 say +selected_muts <- which(min_dist < 0.5) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -317,8 +313,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` No apparent clustering visible. @@ -328,10 +324,10 @@ No apparent clustering visible. # khaki3 ```{r} -clusterName <- 'khaki3' +clusterName <- "khaki3" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -339,12 +335,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. 
This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. ```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -369,7 +364,7 @@ ggheatmap(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.5) # select those below 0.5 say +selected_muts <- which(min_dist < 0.5) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -390,8 +385,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` No apparent clustering visible. @@ -400,10 +395,10 @@ No apparent clustering visible. 
# darkseagreen4 ```{r} -clusterName <- 'darkseagreen4' +clusterName <- "darkseagreen4" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -411,12 +406,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. ```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -441,7 +435,7 @@ ggheatmap(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.56) # select those below 0.5 say +selected_muts <- which(min_dist < 0.56) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -462,8 +456,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations 
based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` No apparent clustering visible. @@ -471,10 +465,10 @@ No apparent clustering visible. # navajowhite2 ```{r} -clusterName <- 'navajowhite2' +clusterName <- "navajowhite2" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -482,12 +476,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. 
```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -512,7 +505,7 @@ ggheatmap(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.5) # select those below 0.5 say +selected_muts <- which(min_dist < 0.5) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -532,8 +525,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` @@ -544,10 +537,10 @@ No apparent clustering visible. # lemonchiffon ```{r} -clusterName <- 'lemonchiffon' +clusterName <- "lemonchiffon" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -555,12 +548,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. 
This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. ```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -585,7 +577,7 @@ ggheatmap(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.57) # select those below 0.5 say +selected_muts <- which(min_dist < 0.57) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -602,8 +594,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` @@ -613,13 +605,13 @@ We define a cut point to get distinct branches. 
These should roughly represent t ```{r} -par(cex=0.6) +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") -abline(h=0.66, lwd = 2, lty = 2, col = "green") +abline(h = 0.66, lwd = 2, lty = 2, col = "green") ``` ```{r} -#geneGroups <- cutree(hc, k = NULL, h = 0.6) +# geneGroups <- cutree(hc, k = NULL, h = 0.6) geneGroups <- cutree(hc, k = 2) ``` @@ -641,7 +633,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -659,14 +651,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_1 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_1 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_1 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_1 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_1 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_1 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_1 == "HIGH"))) ``` @@ -685,7 +677,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -703,14 +695,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_2 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_2 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_2 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_2 
== 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_2 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH"))) ``` Two genetic clusters could be identified in the 3-cell CTC cluster, but only with moderate functional impact variants. @@ -720,10 +712,10 @@ Two genetic clusters could be identified in the 3-cell CTC cluster, but only wit # plum ```{r} -clusterName <- 'plum' +clusterName <- "plum" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -731,12 +723,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. 
```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -761,7 +752,7 @@ ggheatmap(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.5) # select those below 0.5 say +selected_muts <- which(min_dist < 0.5) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -778,8 +769,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` @@ -789,13 +780,13 @@ We define a cut point to get distinct branches. 
These should roughly represent t ```{r} -par(cex=0.6) +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") -abline(h=0.58, lwd = 2, lty = 2, col = "green") +abline(h = 0.58, lwd = 2, lty = 2, col = "green") ``` ```{r} -#geneGroups <- cutree(hc, k = NULL, h = 0.6) +# geneGroups <- cutree(hc, k = NULL, h = 0.6) geneGroups <- cutree(hc, k = 3) ``` @@ -816,7 +807,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -834,14 +825,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_1 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_1 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_1 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_1 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_1 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_1 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_1 == "HIGH"))) ``` @@ -860,7 +851,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -878,14 +869,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_2 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_2 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_2 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_2 
== 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_2 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH"))) ``` @@ -902,7 +893,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -920,14 +911,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_2 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_2 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_2 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_2 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_2 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH"))) ``` @@ -935,10 +926,10 @@ print(sprintf('Number of mutations in cluster %s with high functional impact: %d # gold This is a 5-cell CTC cluster, so I am expecting up to 5 genetic clusters. 
```{r} -clusterName <- 'gold' +clusterName <- "gold" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -946,12 +937,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. ```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -976,7 +966,7 @@ ggheatmap(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.5) # select those below 0.5 say +selected_muts <- which(min_dist < 0.5) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -993,8 +983,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 
0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` @@ -1004,13 +994,13 @@ We define a cut point to get distinct branches. These should roughly represent t ```{r} -par(cex=0.6) +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") -abline(h=0.68, lwd = 2, lty = 2, col = "green") +abline(h = 0.68, lwd = 2, lty = 2, col = "green") ``` ```{r} -#geneGroups <- cutree(hc, k = NULL, h = 0.6) +# geneGroups <- cutree(hc, k = NULL, h = 0.6) geneGroups <- cutree(hc, k = 2) ``` @@ -1032,7 +1022,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -1050,14 +1040,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_1 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_1 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_1 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_1 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_1 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_1 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_1 == "HIGH"))) ``` @@ -1076,7 +1066,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -1094,14 +1084,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_2 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_2 <- inner_join(top_df, coverage, by = 
"variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_2 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_2 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_2 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH"))) ``` We found two distinct genetic clusters. @@ -1111,10 +1101,10 @@ We found two distinct genetic clusters. # yellowgreen ```{r} -clusterName <- 'yellowgreen' +clusterName <- "yellowgreen" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -1122,12 +1112,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. 
```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -1148,7 +1137,7 @@ mutations: mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<1) # select those below 0.5 say +selected_muts <- which(min_dist < 1) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -1161,7 +1150,7 @@ heatmaply(mat3) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.93) # select those below 0.5 say +selected_muts <- which(min_dist < 0.93) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -1178,8 +1167,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` @@ -1189,13 +1178,13 @@ We define a cut point to get distinct branches. 
These should roughly represent t ```{r} -par(cex=0.6) +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") -abline(h=0.94, lwd = 2, lty = 2, col = "green") +abline(h = 0.94, lwd = 2, lty = 2, col = "green") ``` ```{r} -#geneGroups <- cutree(hc, k = NULL, h = 0.6) +# geneGroups <- cutree(hc, k = NULL, h = 0.6) geneGroups <- cutree(hc, k = 4) ``` @@ -1217,7 +1206,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -1235,14 +1224,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_1 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_1 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_1 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_1 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_1 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_1 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_1 == "HIGH"))) ``` @@ -1259,7 +1248,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -1277,14 +1266,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_2 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_2 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_2 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, 
sum(top_muts_2 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_2 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH"))) ``` @@ -1305,7 +1294,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -1323,14 +1312,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_2 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_2 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_2 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_2 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_2 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH"))) ``` @@ -1341,7 +1330,7 @@ print(sprintf('Number of mutations in cluster %s with high functional impact: %d #### Fourth cluster: ```{r} -cluster1 <- names(geneGroups)[geneGroups ==4] +cluster1 <- names(geneGroups)[geneGroups == 4] ``` Mutations in cluster: @@ -1351,7 +1340,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -1369,14 +1358,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_2 <- inner_join(top_df, coverage, by='variantName') 
-#colnames(top_muts_1)[1] <- "mutation" +top_muts_2 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_2 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_2 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_2 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH"))) ``` diff --git a/experiments/data/markdowns/Pr9_topSeparators.Rmd b/experiments/data/markdowns/Pr9_topSeparators.Rmd index 521a4fe..6fea5ac 100755 --- a/experiments/data/markdowns/Pr9_topSeparators.Rmd +++ b/experiments/data/markdowns/Pr9_topSeparators.Rmd @@ -17,9 +17,9 @@ output: ## Data ```{r initialization, message = FALSE} -source('../software/Rcode/annotateVariants.R') -sampleName <- 'Pr9' -inputFolder <- '../input_folder/' +source("../software/Rcode/annotateVariants.R") +sampleName <- "Pr9" +inputFolder <- "../input_folder/" annotations <- annotate_variants(sampleName, inputFolder) ``` @@ -38,10 +38,10 @@ This is a generalization of the earlier method to find the top seperating mutati # lightcoral ```{r} -clusterName <- 'lightcoral' +clusterName <- "lightcoral" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -49,12 +49,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this 
position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. ```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -81,7 +80,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.9) # select those below 0.5 say +selected_muts <- which(min_dist < 0.9) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -98,8 +97,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` This looks rather complicated and I won't continue here, as choosing clusters would be arbitrary. @@ -112,10 +111,10 @@ This looks rather complicated and I won't continue here, as choosing clusters wo This is a 3-cell CTC-cluster, so I am expecting up to 3 distinct genotype clusters. 
```{r} -clusterName <- 'sandybrown' +clusterName <- "sandybrown" -d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) -mat<-as.matrix(d) +d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, "_postSampling_", clusterName, ".txt")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) +mat <- as.matrix(d) mat[1:4, 1:4] ``` @@ -123,12 +122,11 @@ mat[1:4, 1:4] For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants. ```{r message=FALSE} -coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1) +coverage <- read.table(file.path(inputFolder, sampleName, paste(sampleName, "covScore.txt", sep = "_")), header = TRUE, sep = "\t", stringsAsFactors = F, row.names = 1) coverage$variantName <- rownames(coverage) head(coverage) coverage <- inner_join(coverage, annotations, by = "variantName") - ``` ## Method @@ -155,7 +153,7 @@ heatmaply(mat) mat2 <- mat diag(mat2) <- 1 min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations -selected_muts <- which(min_dist<0.5) # select those below 0.5 say +selected_muts <- which(min_dist < 0.5) # select those below 0.5 say mat3 <- mat[selected_muts, selected_muts] ``` @@ -172,8 +170,8 @@ To cluster mutations, we create a dendrogram based on the pairwise distances: ```{r} mat <- mat3 d_mat <- as.dist(mat) -hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix -par(cex=0.6) +hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance 
matrix +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") ``` @@ -186,13 +184,13 @@ We define a cut point to get distinct branches. These should roughly represent t ```{r} -par(cex=0.6) +par(cex = 0.6) plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations") -abline(h=0.65, lwd = 2, lty = 2, col = "green") +abline(h = 0.65, lwd = 2, lty = 2, col = "green") ``` ```{r} -#geneGroups <- cutree(hc, k = NULL, h = 0.6) +# geneGroups <- cutree(hc, k = NULL, h = 0.6) geneGroups <- cutree(hc, k = 3) ``` @@ -214,7 +212,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -232,14 +230,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_1 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_1 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_1 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_1 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_1 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_1 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_1 == "HIGH"))) ``` @@ -258,7 +256,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -276,14 +274,14 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_2 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_2 <- inner_join(top_df, 
coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_2 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_2 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_2 == 'HIGH'))) +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH"))) ``` @@ -301,7 +299,7 @@ cluster1 Distances within cluster: ```{r} -d1 <- d[cluster1 , cluster1] +d1 <- d[cluster1, cluster1] ``` Average distance to other mutations in cluster: @@ -319,12 +317,12 @@ top_df$variantName <- rownames(top_df) ``` ```{r} -top_muts_2 <- inner_join(top_df, coverage, by='variantName') -#colnames(top_muts_1)[1] <- "mutation" +top_muts_2 <- inner_join(top_df, coverage, by = "variantName") +# colnames(top_muts_1)[1] <- "mutation" top_muts_2 ``` ```{r} -print(sprintf('Number of mutations in cluster %s with moderate functional impact: %d', clusterName, sum(top_muts_2 == 'MODERATE'))) -print(sprintf('Number of mutations in cluster %s with high functional impact: %d', clusterName, sum(top_muts_2 == 'HIGH'))) -``` \ No newline at end of file +print(sprintf("Number of mutations in cluster %s with moderate functional impact: %d", clusterName, sum(top_muts_2 == "MODERATE"))) +print(sprintf("Number of mutations in cluster %s with high functional impact: %d", clusterName, sum(top_muts_2 == "HIGH"))) +``` diff --git a/experiments/workflow/Snakefile b/experiments/workflow/Snakefile index bedb35c..921935f 100644 --- a/experiments/workflow/Snakefile +++ b/experiments/workflow/Snakefile @@ -1,26 +1,30 @@ from pathlib import Path -SAMPLES = config['sample'] +SAMPLES = config["sample"] PROJECT_DIR = Path(workflow.basedir).parent -SCRIPT_DIR = 
Path(workflow.basedir) / 'scripts' -RESOURCES_DIR = Path(workflow.basedir) / 'resources' -markdown_helper_functions = (RESOURCES_DIR / 'functions.R').resolve().as_posix().replace('/', '\\/') -input_folder = Path(config['input']).resolve().as_posix().replace('/','\\/') -simulations_folder = Path(config['simulations']).resolve().as_posix().replace('/','\\/') +SCRIPT_DIR = Path(workflow.basedir) / "scripts" +RESOURCES_DIR = Path(workflow.basedir) / "resources" +markdown_helper_functions = ( + (RESOURCES_DIR / "functions.R").resolve().as_posix().replace("/", "\\/") +) +input_folder = Path(config["input"]).resolve().as_posix().replace("/", "\\/") +simulations_folder = ( + Path(config["simulations"]).resolve().as_posix().replace("/", "\\/") +) print(markdown_helper_functions) -######Rules###### -include: 'rules/base.smk' +######Rules###### +include: "rules/base.smk" all_files = [] for sample in SAMPLES: - all_files.append((PROJECT_DIR / 'data' / 'htmls' / f'{sample}.html').resolve()) + all_files.append((PROJECT_DIR / "data" / "htmls" / f"{sample}.html").resolve()) rule all: input: - all_files + all_files, diff --git a/experiments/workflow/resources/UnitTests.R b/experiments/workflow/resources/UnitTests.R index e959740..352f534 100755 --- a/experiments/workflow/resources/UnitTests.R +++ b/experiments/workflow/resources/UnitTests.R @@ -1,69 +1,70 @@ - - ############ -#Test functions +# Test functions ############ -test_compute_pairwise_distance_of_leaves1 <- function(){ - - treeParentVectorFormat <- c(6,8,1,1,3,3,8,6) - bestMutationPlacement <- c(5,8,6,0) - - outcome <- c(0,0) - - +test_compute_pairwise_distance_of_leaves1 <- function() { + treeParentVectorFormat <- c(6, 8, 1, 1, 3, 3, 8, 6) + bestMutationPlacement <- c(5, 8, 6, 0) + + outcome <- c(0, 0) + + pairwiseGenealogy <- findMostRecentCommonAncestor(treeParentVectorFormat, 5, 5) - - - dist <- computePairwiseDistanceOfLeaves2(treeParentVectorFormat, 5, 5, - bestMutationPlacement, pairwiseGenealogy) - + + + dist <- 
computePairwiseDistanceOfLeaves2( + treeParentVectorFormat, 5, 5, + bestMutationPlacement, pairwiseGenealogy + ) + PASS <- TRUE - - if (sum(dist!= outcome)){ - PASS = FALSE + + if (sum(dist != outcome)) { + PASS <- FALSE } return(PASS) } -test_compute_pairwise_distance_of_leaves2 <- function(){ - - treeParentVectorFormat <- c(6,8,1,1,3,3,8,6) - bestMutationPlacement <- c(5,8,6,0) - - outcome <- c(0,0) - +test_compute_pairwise_distance_of_leaves2 <- function() { + treeParentVectorFormat <- c(6, 8, 1, 1, 3, 3, 8, 6) + bestMutationPlacement <- c(5, 8, 6, 0) + + outcome <- c(0, 0) + pairwiseGenealogy <- findMostRecentCommonAncestor(treeParentVectorFormat, 2, 4) - - dist <- computePairwiseDistanceOfLeaves2(treeParentVectorFormat, 2, 4, - bestMutationPlacement,pairwiseGenealogy) - + + dist <- computePairwiseDistanceOfLeaves2( + treeParentVectorFormat, 2, 4, + bestMutationPlacement, pairwiseGenealogy + ) + PASS <- TRUE - - if (sum(dist != outcome)){ - PASS = FALSE + + if (sum(dist != outcome)) { + PASS <- FALSE } return(PASS) } -test_compute_pairwise_distance_of_leaves3 <- function(){ - - treeParentVectorFormat <- c(6,8,1,1,3,3,8,6) - bestMutationPlacement <- c(5,8,6,0) - outcome <- c(3,1) - +test_compute_pairwise_distance_of_leaves3 <- function() { + treeParentVectorFormat <- c(6, 8, 1, 1, 3, 3, 8, 6) + bestMutationPlacement <- c(5, 8, 6, 0) + outcome <- c(3, 1) + pairwiseGenealogy <- findMostRecentCommonAncestor(treeParentVectorFormat, 0, 5) - - dist <- computePairwiseDistanceOfLeaves2(treeParentVectorFormat, 0, 5, - bestMutationPlacement, pairwiseGenealogy) - + + dist <- computePairwiseDistanceOfLeaves2( + treeParentVectorFormat, 0, 5, + bestMutationPlacement, pairwiseGenealogy + ) + PASS <- TRUE - - if (sum(dist != outcome)){ - PASS = FALSE + + if (sum(dist != outcome)) { + PASS <- FALSE } return(PASS) } @@ -72,66 +73,65 @@ test_compute_pairwise_distance_of_leaves3 <- function(){ -test_find_MRCA1 <- function(){ - - treeParentVectorFormat <- c(6,8,1,1,3,3,8,6) - - 
+test_find_MRCA1 <- function() { + treeParentVectorFormat <- c(6, 8, 1, 1, 3, 3, 8, 6) + + res <- findMostRecentCommonAncestor(treeParentVectorFormat, 5, 5) - + outcome <- list(c(5, 3, 1, 8), c(), 5, 5) PASS <- TRUE - - for (i in c(1,3,4)){ - if (sum(outcome[[i]] != res[[i]])>0){ + + for (i in c(1, 3, 4)) { + if (sum(outcome[[i]] != res[[i]]) > 0) { PASS <- FALSE break } } - - if(length(res[[2]])>0) { + + if (length(res[[2]]) > 0) { PASS <- FALSE } - - + + return(PASS) } -test_find_MRCA2 <- function(){ - treeParentVectorFormat <- c(6,8,1,1,3,3,8,6) +test_find_MRCA2 <- function() { + treeParentVectorFormat <- c(6, 8, 1, 1, 3, 3, 8, 6) res <- findMostRecentCommonAncestor(treeParentVectorFormat, 2, 4) - - outcome <- list(c(2,1,8), c(4,3), 1, c(2,1,3,4)) + + outcome <- list(c(2, 1, 8), c(4, 3), 1, c(2, 1, 3, 4)) PASS <- TRUE - - for (i in 1:4){ - if (sum(outcome[[i]] != res[[i]])>0){ + + for (i in 1:4) { + if (sum(outcome[[i]] != res[[i]]) > 0) { PASS <- FALSE break } } - - - + + + return(PASS) } -test_find_MRCA3 <- function(){ - treeParentVectorFormat <- c(6,8,1,1,3,3,8,6) +test_find_MRCA3 <- function() { + treeParentVectorFormat <- c(6, 8, 1, 1, 3, 3, 8, 6) res <- findMostRecentCommonAncestor(treeParentVectorFormat, 1, 6) - - outcome <- list(c(1, 8), 6, 8, c(1,8,6), c(1,8,6)) + + outcome <- list(c(1, 8), 6, 8, c(1, 8, 6), c(1, 8, 6)) PASS <- TRUE - - for (i in 1:3){ - if (sum(outcome[[i]] != res[[i]])>0){ + + for (i in 1:3) { + if (sum(outcome[[i]] != res[[i]]) > 0) { PASS <- FALSE break } } - - - + + + return(PASS) } @@ -139,136 +139,146 @@ test_find_MRCA3 <- function(){ -test_computePairwiseDistanceOfLeavesGivenTree <- function(){ +test_computePairwiseDistanceOfLeavesGivenTree <- function() { source("functions.R") - + inputFolder <- "../../input_folder" treeName <- "Br7" - + input <- load_data(inputFolder, treeName) - + postSampling <- input$postSampling - + nCells <- 24 nMutations <- 10 nClusters <- 11 - + alleleCount <- c(8, 6, 2, 6, 2, 2, 2, 2, 8, 4, 6) - 
ClusterID <- c(0, 0, 0, 0, 1, 1, 1, 2, 3, 3, 3, 4, 5, 6, 7, 8, 8, 8, 8, 9, 9, 10, 10, 10) - mutatedReadCounts <- list(c(0 , 4, 0, 0, 0, 0, 4, 0, 11, 0, 0), - c(4, 0, 1, 3, 0, 0, 0, 0, 1, 0, 0), - c(0, 0, 0, 25, 0, 0, 0, 0, 4, 0, 0), - c(3, 0, 0, 3, 0, 0, 0, 0, 1, 0, 0), - c(8, 1, 16, 4, 73, 0, 0, 0, 5, 0, 0), - c(0, 0, 9, 0, 0, 0, 26, 0, 8, 0, 0), - c(0, 0, 2, 0, 22, 0, 0, 0, 8, 0, 0), - c(12, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0), - c(7, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0), - c(16, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0)) - - totalReadCounts <- list(c(31, 238, 0, 234, 0, 0, 20, 0, 147, 0, 245), - c(16, 31, 8, 68, 0, 0, 3, 0, 34, 0, 0), - c(5, 5, 64, 114, 0, 0, 128, 0, 51, 0, 0), - c(13, 16, 13, 26, 10, 0, 15, 0, 14, 3, 14), - c(120, 98, 102, 503, 181, 0, 0, 0, 50, 0, 156), - c(22, 6, 47, 0, 0, 0, 64, 0, 23, 14, 5), - c(0, 63, 62, 0, 45, 0, 171, 0, 99, 0, 93), - c(24, 12, 0, 6, 2, 0, 0, 0, 7, 0, 0), - c(14, 0, 0, 0, 0, 0, 0, 0, 49, 0, 24), - c(32, 42, 0, 282, 0, 0, 0, 0, 119, 0, 19)) - + ClusterID <- c(0, 0, 0, 0, 1, 1, 1, 2, 3, 3, 3, 4, 5, 6, 7, 8, 8, 8, 8, 9, 9, 10, 10, 10) + mutatedReadCounts <- list( + c(0, 4, 0, 0, 0, 0, 4, 0, 11, 0, 0), + c(4, 0, 1, 3, 0, 0, 0, 0, 1, 0, 0), + c(0, 0, 0, 25, 0, 0, 0, 0, 4, 0, 0), + c(3, 0, 0, 3, 0, 0, 0, 0, 1, 0, 0), + c(8, 1, 16, 4, 73, 0, 0, 0, 5, 0, 0), + c(0, 0, 9, 0, 0, 0, 26, 0, 8, 0, 0), + c(0, 0, 2, 0, 22, 0, 0, 0, 8, 0, 0), + c(12, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0), + c(7, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0), + c(16, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0) + ) + + totalReadCounts <- list( + c(31, 238, 0, 234, 0, 0, 20, 0, 147, 0, 245), + c(16, 31, 8, 68, 0, 0, 3, 0, 34, 0, 0), + c(5, 5, 64, 114, 0, 0, 128, 0, 51, 0, 0), + c(13, 16, 13, 26, 10, 0, 15, 0, 14, 3, 14), + c(120, 98, 102, 503, 181, 0, 0, 0, 50, 0, 156), + c(22, 6, 47, 0, 0, 0, 64, 0, 23, 14, 5), + c(0, 63, 62, 0, 45, 0, 171, 0, 99, 0, 93), + c(24, 12, 0, 6, 2, 0, 0, 0, 7, 0, 0), + c(14, 0, 0, 0, 0, 0, 0, 0, 49, 0, 24), + c(32, 42, 0, 282, 0, 0, 0, 0, 119, 0, 19) + ) + seqErrRate <- 0.00225 
dropoutRate <- 0.301 wbcStatus <- c(1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1) - - res <- computePairwiseDistanceOfLeavesGivenTree(postSampling$'1', 7,1, - nCells, nMutations,nClusters, alleleCount, - ClusterID, mutatedReadCounts, totalReadCounts, wbcStatus, - nSamplingEvents = 10) + + res <- computePairwiseDistanceOfLeavesGivenTree(postSampling$"1", 7, 1, + nCells, nMutations, nClusters, alleleCount, + ClusterID, mutatedReadCounts, totalReadCounts, wbcStatus, + nSamplingEvents = 10 + ) outcome <- c(4, 7, 3, 0, 0, 0, 0, 0, 2, 0) - + PASS <- TRUE - - if(sum(res != outcome)>0) PASS <- FALSE - + + if (sum(res != outcome) > 0) PASS <- FALSE + return(PASS) } -test_transposeMatrix <- function(){ - matrix <- list(c(1,2),c(3,4), c(5,6)) - - res <- transposeMatrix(matrix, 3,2) - - expected <- list(c(1,3,5),c(2,4,6)) +test_transposeMatrix <- function() { + matrix <- list(c(1, 2), c(3, 4), c(5, 6)) + + res <- transposeMatrix(matrix, 3, 2) + + expected <- list(c(1, 3, 5), c(2, 4, 6)) PASS <- TRUE - - if(sum(expected[[1]] != res[[1]])> 0) PASS <- FALSE - if(sum(expected[[2]] != res[[2]])> 0) PASS <- FALSE - - + + if (sum(expected[[1]] != res[[1]]) > 0) PASS <- FALSE + if (sum(expected[[2]] != res[[2]]) > 0) PASS <- FALSE + + return(PASS) } -test_mutation_distribution <- function(){ - +test_mutation_distribution <- function() { nCells <- 24 nMutations <- 10 nClusters <- 11 - + alleleCount <- c(8, 6, 2, 6, 2, 2, 2, 2, 8, 4, 6) - ClusterID <- c(0, 0, 0, 0, 1, 1, 1, 2, 3, 3, 3, 4, 5, 6, 7, 8, 8, 8, 8, 9, 9, 10, 10, 10) - mutatedReadCounts <- list(c(0 , 4, 0, 0, 0, 0, 4, 0, 11, 0, 0), - c(4, 0, 1, 3, 0, 0, 0, 0, 1, 0, 0), - c(0, 0, 0, 25, 0, 0, 0, 0, 4, 0, 0), - c(3, 0, 0, 3, 0, 0, 0, 0, 1, 0, 0), - c(8, 1, 16, 4, 73, 0, 0, 0, 5, 0, 0), - c(0, 0, 9, 0, 0, 0, 26, 0, 8, 0, 0), - c(0, 0, 2, 0, 22, 0, 0, 0, 8, 0, 0), - c(12, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0), - c(7, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0), - c(16, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0)) - - totalReadCounts <- 
list(c(31, 238, 0, 234, 0, 0, 20, 0, 147, 0, 245), - c(16, 31, 8, 68, 0, 0, 3, 0, 34, 0, 0), - c(5, 5, 64, 114, 0, 0, 128, 0, 51, 0, 0), - c(13, 16, 13, 26, 10, 0, 15, 0, 14, 3, 14), - c(120, 98, 102, 503, 181, 0, 0, 0, 50, 0, 156), - c(22, 6, 47, 0, 0, 0, 64, 0, 23, 14, 5), - c(0, 63, 62, 0, 45, 0, 171, 0, 99, 0, 93), - c(24, 12, 0, 6, 2, 0, 0, 0, 7, 0, 0), - c(14, 0, 0, 0, 0, 0, 0, 0, 49, 0, 24), - c(32, 42, 0, 282, 0, 0, 0, 0, 119, 0, 19)) - + ClusterID <- c(0, 0, 0, 0, 1, 1, 1, 2, 3, 3, 3, 4, 5, 6, 7, 8, 8, 8, 8, 9, 9, 10, 10, 10) + mutatedReadCounts <- list( + c(0, 4, 0, 0, 0, 0, 4, 0, 11, 0, 0), + c(4, 0, 1, 3, 0, 0, 0, 0, 1, 0, 0), + c(0, 0, 0, 25, 0, 0, 0, 0, 4, 0, 0), + c(3, 0, 0, 3, 0, 0, 0, 0, 1, 0, 0), + c(8, 1, 16, 4, 73, 0, 0, 0, 5, 0, 0), + c(0, 0, 9, 0, 0, 0, 26, 0, 8, 0, 0), + c(0, 0, 2, 0, 22, 0, 0, 0, 8, 0, 0), + c(12, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0), + c(7, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0), + c(16, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0) + ) + + totalReadCounts <- list( + c(31, 238, 0, 234, 0, 0, 20, 0, 147, 0, 245), + c(16, 31, 8, 68, 0, 0, 3, 0, 34, 0, 0), + c(5, 5, 64, 114, 0, 0, 128, 0, 51, 0, 0), + c(13, 16, 13, 26, 10, 0, 15, 0, 14, 3, 14), + c(120, 98, 102, 503, 181, 0, 0, 0, 50, 0, 156), + c(22, 6, 47, 0, 0, 0, 64, 0, 23, 14, 5), + c(0, 63, 62, 0, 45, 0, 171, 0, 99, 0, 93), + c(24, 12, 0, 6, 2, 0, 0, 0, 7, 0, 0), + c(14, 0, 0, 0, 0, 0, 0, 0, 49, 0, 24), + c(32, 42, 0, 282, 0, 0, 0, 0, 119, 0, 19) + ) + seqErrRate <- 0.00225 dropoutRate <- 0.301 wbcStatus <- c(1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1) - - tree <- c(34, 44, 26, 41, 33, 30, 38, 35, 37, 31, 26, 43, 44, 40, 32, 45, 25, 35, 42, 24 ,39 ,32 ,28, 27, 30, 33, 42, 29 ,46 ,46, 25 ,37, 39, 38, 28, 40, 31, 24, 45, 27, 36, 36, 43, 41, 34, 29) - + + tree <- c(34, 44, 26, 41, 33, 30, 38, 35, 37, 31, 26, 43, 44, 40, 32, 45, 25, 35, 42, 24, 39, 32, 28, 27, 30, 33, 42, 29, 46, 46, 25, 37, 39, 38, 28, 40, 31, 24, 45, 27, 36, 36, 43, 41, 34, 29) + ancestorMatrix <- 
parentVector2ancMatrix(tree, length(tree)) - - - mutationDistributions <- computeMutationDistribution(nCells, nMutations, nClusters, ancestorMatrix, - alleleCount, ClusterID, mutatedReadCounts, - totalReadCounts,dropoutRate, seqErrRate, 1, - wbcStatus) - + + + mutationDistributions <- computeMutationDistribution( + nCells, nMutations, nClusters, ancestorMatrix, + alleleCount, ClusterID, mutatedReadCounts, + totalReadCounts, dropoutRate, seqErrRate, 1, + wbcStatus + ) + PASS <- TRUE - if(length(mutationDistributions) != nMutations) PASS <- FALSE - - - numberOfNodesInTree <- 2*nCells-1 - - for (i in 1:length(mutationDistributions)){ - if(length(mutationDistributions[[i]]) != numberOfNodesInTree) PASS <- FALSE + if (length(mutationDistributions) != nMutations) PASS <- FALSE + + + numberOfNodesInTree <- 2 * nCells - 1 + + for (i in 1:length(mutationDistributions)) { + if (length(mutationDistributions[[i]]) != numberOfNodesInTree) PASS <- FALSE } - + return(list("logMutationDistributions" = mutationDistributions, "PASS" = PASS)) } @@ -277,84 +287,90 @@ test_mutation_distribution <- function(){ -test_transposeMatrix <- function(){ - matrix <- list(c(1,2),c(3,4), c(5,6)) - - res <- transposeMatrix(matrix, 3,2) - - expected <- list(c(1,3,5),c(2,4,6)) +test_transposeMatrix <- function() { + matrix <- list(c(1, 2), c(3, 4), c(5, 6)) + + res <- transposeMatrix(matrix, 3, 2) + + expected <- list(c(1, 3, 5), c(2, 4, 6)) PASS <- TRUE - - if(sum(expected[[1]] != res[[1]])> 0) PASS <- FALSE - if(sum(expected[[2]] != res[[2]])> 0) PASS <- FALSE - - + + if (sum(expected[[1]] != res[[1]]) > 0) PASS <- FALSE + if (sum(expected[[2]] != res[[2]]) > 0) PASS <- FALSE + + return(PASS) } -test_sampleMutationPlacements <- function(){ - +test_sampleMutationPlacements <- function() { nSamplings <- 100000 nMutations <- 10 nCells <- 24 logProbs <- test_mutation_distribution()$logMutationDistributions - - mutationSampling <- transposeMatrix(sampleMutationsPlacement(nSamplings, - nMutations, 
logProbs), - nSamplings, nMutations) - - + + mutationSampling <- transposeMatrix( + sampleMutationsPlacement( + nSamplings, + nMutations, logProbs + ), + nSamplings, nMutations + ) + + PASS <- TRUE - for(mutation in 1:length(mutationSampling)){ + for (mutation in 1:length(mutationSampling)) { sampledPlacement <- vector() - for(i in 0:(2*nCells-2)){ - sampledPlacement <- c(sampledPlacement,sum(mutationSampling[[mutation]] == i)) + for (i in 0:(2 * nCells - 2)) { + sampledPlacement <- c(sampledPlacement, sum(mutationSampling[[mutation]] == i)) } - sampledPlacement <- sampledPlacement/sum(sampledPlacement) + sampledPlacement <- sampledPlacement / sum(sampledPlacement) Probs <- logProbs[[mutation]] %>% exp() - Probs <- Probs/sum(Probs) - if(((Probs-sampledPlacement)^2 %>% sum()) > 10e-4) PASS <- FALSE + Probs <- Probs / sum(Probs) + if (((Probs - sampledPlacement)^2 %>% sum()) > 10e-4) PASS <- FALSE } return(PASS) } -test_ComputePerMutationProbabilityOfPolyclonality <- function(){ - - treeParentVectorFormat <- c(6,8,1,1,3,3,8,6) - - - pairwiseGenealogy <- list(c(0,6,8), c(2,1), c(8), c(0,6,8,1,2)) +test_ComputePerMutationProbabilityOfPolyclonality <- function() { + treeParentVectorFormat <- c(6, 8, 1, 1, 3, 3, 8, 6) + + + pairwiseGenealogy <- list(c(0, 6, 8), c(2, 1), c(8), c(0, 6, 8, 1, 2)) nCells <- 5 nMutations <- 2 - - + + PASS <- TRUE - - logMutationPlacementProbs <- list(log(c(0.5,0.5,0,0,0,0,0,0,0)), log(c(0,0.5,0,0.5,0,0,0,0,0))) + + logMutationPlacementProbs <- list(log(c(0.5, 0.5, 0, 0, 0, 0, 0, 0, 0)), log(c(0, 0.5, 0, 0.5, 0, 0, 0, 0, 0))) outcome <- 0.5 - res <- ComputePerMutationProbabilityOfPolyclonality(pairwiseGenealogy, - logMutationPlacementProbs, - nMutations, nCells) - if(res != outcome) PASS <- FALSE - - - logMutationPlacementProbs <- list(log(c(0,0.6,0.4,0,0,0,0,0,0)), log(c(0,0.4,0,0.6,0,0,0,0,0))) + res <- ComputePerMutationProbabilityOfPolyclonality( + pairwiseGenealogy, + logMutationPlacementProbs, + nMutations, nCells + ) + if (res != 
outcome) PASS <- FALSE + + + logMutationPlacementProbs <- list(log(c(0, 0.6, 0.4, 0, 0, 0, 0, 0, 0)), log(c(0, 0.4, 0, 0.6, 0, 0, 0, 0, 0))) outcome <- 0 - res <- ComputePerMutationProbabilityOfPolyclonality(pairwiseGenealogy, - logMutationPlacementProbs, - nMutations, nCells) - if(res != outcome) PASS <- FALSE - - logMutationPlacementProbs <- list(log(c(0.4,0.6,00,0,0,0,0,0,0)), log(c(0,0.4,0,0.6,0,0,0,0,0))) + res <- ComputePerMutationProbabilityOfPolyclonality( + pairwiseGenealogy, + logMutationPlacementProbs, + nMutations, nCells + ) + if (res != outcome) PASS <- FALSE + + logMutationPlacementProbs <- list(log(c(0.4, 0.6, 00, 0, 0, 0, 0, 0, 0)), log(c(0, 0.4, 0, 0.6, 0, 0, 0, 0, 0))) outcome <- 0.4 - res <- ComputePerMutationProbabilityOfPolyclonality(pairwiseGenealogy, - logMutationPlacementProbs, - nMutations, nCells) - if(res != outcome) PASS <- FALSE - + res <- ComputePerMutationProbabilityOfPolyclonality( + pairwiseGenealogy, + logMutationPlacementProbs, + nMutations, nCells + ) + if (res != outcome) PASS <- FALSE + return(PASS) } - - diff --git a/experiments/workflow/resources/functions.R b/experiments/workflow/resources/functions.R index 6254d70..0916f1b 100755 --- a/experiments/workflow/resources/functions.R +++ b/experiments/workflow/resources/functions.R @@ -1,6 +1,5 @@ - ############ -#Function Definitions +# Function Definitions ############ library(Rcpp) @@ -9,16 +8,16 @@ library(tidyverse) sourceCpp("../../workflow/resources/mutations_placement.cpp") -#source('UnitTests.R') +# source('UnitTests.R') #' Takes a list of mutations and outputs which one of these is a driver. -#' +#' #' #' @param mutations a names vector containing chromosomes in the format "chrN" in the first -#' column and an integer chromosomal position on the second column +#' column and an integer chromosomal position on the second column #' @param annotations an annotation data frame. 
Must contain the columns #' - 'CGI-Oncogenic Summary': entry can be 'driver (oncodriveMUT)' or somerthing else #' - 'CGI-Oncogenic Prediction': entry can be 'oncogenic (predicted)' or something ele @@ -28,17 +27,17 @@ sourceCpp("../../workflow/resources/mutations_placement.cpp") #' @export #' #' @examples -IsDriver <- function(mutations, annotations){ +IsDriver <- function(mutations, annotations) { annotated_mutations <- annotations %>% - filter(annotations$'#CHROM' == as.character(mutations[1]) & annotations$POS == as.numeric(mutations[2])) - + filter(annotations$"#CHROM" == as.character(mutations[1]) & annotations$POS == as.numeric(mutations[2])) + check <- annotated_mutations %>% - select(c('CGI-Oncogenic Summary','CGI-Oncogenic Prediction', 'CGI-External oncogenic annotation')) %in% - c('oncogenic (predicted)', 'driver (oncodriveMUT)') %>% + select(c("CGI-Oncogenic Summary", "CGI-Oncogenic Prediction", "CGI-External oncogenic annotation")) %in% + c("oncogenic (predicted)", "driver (oncodriveMUT)") %>% sum() - + driver <- FALSE - if(check > 0){ + if (check > 0) { driver <- TRUE } return(driver) @@ -50,43 +49,42 @@ IsDriver <- function(mutations, annotations){ -#Legacy -#Input: a tree in parent vector format, meaning that the i'th entry of the vector -# is te parent node of the entry i. Nodes are counted from zero and the root is +# Legacy +# Input: a tree in parent vector format, meaning that the i'th entry of the vector +# is te parent node of the entry i. Nodes are counted from zero and the root is # length(Tree) -#Output: A list with three entries: +# Output: A list with three entries: # - the first entry is a vector of nodes tracing back leaf 1 to the root # - the second entry is a vector of nodes tracing back leaf2 to the descendant # of the MRCA in the lineage # - the thirs entry is the MRCA -find_most_recent_common_ancestor <- function(treeParentVectorFormat, leaf1, leaf2){ - ##Trace back the lineage of the tree for one leaf. 
- ##Then trace back the lineage of the tree for the other leaf and for every node - ##whether is lies in the lineage of the first leaf. - ##The first node that does is the most recent common ancestor node. - ##Concatenating these two will form the shortest path through the tree. +find_most_recent_common_ancestor <- function(treeParentVectorFormat, leaf1, leaf2) { + ## Trace back the lineage of the tree for one leaf. + ## Then trace back the lineage of the tree for the other leaf and for every node + ## whether is lies in the lineage of the first leaf. + ## The first node that does is the most recent common ancestor node. + ## Concatenating these two will form the shortest path through the tree. ## If there is a mutation on the tree, then this means that the cells are - ##split by the tree, if there is none, then they aren't. - - ##Note that the nodes and leaves of the tree are encoded from 0 to the number of nodes minus 1 + ## split by the tree, if there is none, then they aren't. + + ## Note that the nodes and leaves of the tree are encoded from 0 to the number of nodes minus 1 ## Therefore, I add 1 to the indices to be compatible with R indication starting at 1 lineage1 <- leaf1 - repeat { - #print(treeParentVectorFormat[lineage1[length(lineage1)] + 1]) + repeat { + # print(treeParentVectorFormat[lineage1[length(lineage1)] + 1]) lineage1 <- c(lineage1, treeParentVectorFormat[lineage1[length(lineage1)] + 1]) - if(lineage1[length(lineage1)] == length(treeParentVectorFormat)) break + if (lineage1[length(lineage1)] == length(treeParentVectorFormat)) break } lineage2 <- leaf2 nextParent <- treeParentVectorFormat[leaf2 + 1] - while(!(nextParent %in% lineage1)) { + while (!(nextParent %in% lineage1)) { lineage2 <- c(lineage2, nextParent) nextParent <- treeParentVectorFormat[nextParent + 1] - #print(nextParent) - #print(!(nextParent %in% lineage1)) - + # print(nextParent) + # print(!(nextParent %in% lineage1)) } MRCA <- nextParent - return(list(lineage1,lineage2, MRCA)) 
+ return(list(lineage1, lineage2, MRCA)) } @@ -97,45 +95,49 @@ find_most_recent_common_ancestor <- function(treeParentVectorFormat, leaf1, leaf # Take a tree and a pair of mutations and do the following: compute_pairwise_distance_of_leaves <- function(treeData, leaf1, leaf2, nCells, nMutations, nClusters, - alleleCount,ClusterID, - mutatedReadCounts, totalReadCounts,wbcStatus){ + alleleCount, ClusterID, + mutatedReadCounts, totalReadCounts, wbcStatus) { tree <- treeData$Tree treeParentVectorFormat <- as.numeric(unlist(strsplit(tree, " "))) dropoutRate <- treeData$DropoutRate seqErrRate <- treeData$SequencingErrorRate - + ### Now I need to compute the best mutation placement on the tree. This is done - ##using the scoreTree C++ function (taken from CTC_treeScoring.cpp). - - #print("Preprocess tree") - ancestorMatrix <- parentVector2ancMatrix(treeParentVectorFormat, - length(treeParentVectorFormat)) - - - #print("Find best Mutation placement") - bestMutationPlacement <- getMutationPlacement (nCells, nMutations, nClusters, - ancestorMatrix, alleleCount, - ClusterID,mutatedReadCounts, - totalReadCounts, - dropoutRate, seqErrRate, 1, - wbcStatus)## This crashes when executed on posterior sampling of Br61 - #print("Finding most recent common ancestor") - pairwiseGenealogy <- findMostRecentCommonAncestor(treeParentVectorFormat, leaf1,leaf2) - + ## using the scoreTree C++ function (taken from CTC_treeScoring.cpp). 
+ + # print("Preprocess tree") + ancestorMatrix <- parentVector2ancMatrix( + treeParentVectorFormat, + length(treeParentVectorFormat) + ) + + + # print("Find best Mutation placement") + bestMutationPlacement <- getMutationPlacement( + nCells, nMutations, nClusters, + ancestorMatrix, alleleCount, + ClusterID, mutatedReadCounts, + totalReadCounts, + dropoutRate, seqErrRate, 1, + wbcStatus + ) ## This crashes when executed on posterior sampling of Br61 + # print("Finding most recent common ancestor") + pairwiseGenealogy <- findMostRecentCommonAncestor(treeParentVectorFormat, leaf1, leaf2) + positionOfMRCA <- which(pairwiseGenealogy[[1]] == pairwiseGenealogy[[3]]) firstLeafToMRCA <- pairwiseGenealogy[[1]][1:(positionOfMRCA)] - + secondLeafToMRCA <- pairwiseGenealogy[[2]] - - #print(firstLeafToMRCA) - #print(secondLeafToMRCA) - pathBetweenLeaves <- c(firstLeafToMRCA,rev(secondLeafToMRCA)) - #print(pathBetweenLeaves) - + + # print(firstLeafToMRCA) + # print(secondLeafToMRCA) + pathBetweenLeaves <- c(firstLeafToMRCA, rev(secondLeafToMRCA)) + # print(pathBetweenLeaves) + ## Now count an output how many of the mutations lie in the shortest path between the leaves. 
- ##This equals the Hamming distance between the inferred Genotypes of two leaves - ##Need to exclude the MRCA for this - + ## This equals the Hamming distance between the inferred Genotypes of two leaves + ## Need to exclude the MRCA for this + return(sum(bestMutationPlacement %in% pathBetweenLeaves[pathBetweenLeaves != firstLeafToMRCA[positionOfMRCA]])) } @@ -161,131 +163,135 @@ compute_pairwise_distance_of_leaves <- function(treeData, leaf1, leaf2, nCells, #' #' @return splittingFraction: The fraction of sampling events for which the pair of cells #' shows branching evolution -#' +#' #' @export #' #' @examples -produce_Distance_Posterior <- function(leaf1, leaf2,postSampling, treeName,nCells, +produce_Distance_Posterior <- function(leaf1, leaf2, postSampling, treeName, nCells, nMutations, nClusters, - alleleCount,ClusterID, - mutatedReadCounts, totalReadCounts,wbcStatus, nSamplingEvents = 20, clusterName = ""){ - + alleleCount, ClusterID, + mutatedReadCounts, totalReadCounts, wbcStatus, nSamplingEvents = 20, clusterName = "") { ## For each row in the posterior Sampling file, the distance of two leaves is computed - + print("Computing the posterior distribution") - + distance_statistics <- parallel::mclapply(postSampling, - FUN = computePairwiseDistanceOfLeavesGivenTree, leaf1,leaf2, - nCells, nMutations,nClusters, alleleCount, - ClusterID, mutatedReadCounts, totalReadCounts, wbcStatus, - nSamplingEvents) - - - dist_histogram <- lapply(distance_statistics, FUN = function(input_list_elements){ + FUN = computePairwiseDistanceOfLeavesGivenTree, leaf1, leaf2, + nCells, nMutations, nClusters, alleleCount, + ClusterID, mutatedReadCounts, totalReadCounts, wbcStatus, + nSamplingEvents + ) + + + dist_histogram <- lapply(distance_statistics, FUN = function(input_list_elements) { return(input_list_elements[1]) }) %>% unlist() - - totalNumberOfSplits <- lapply(distance_statistics, FUN = function(input_list_elements){ + + totalNumberOfSplits <- lapply(distance_statistics, FUN 
= function(input_list_elements) { return(input_list_elements[2]) - }) %>% unlist %>% sum() - - StatisticsOfMutationPlacement <- lapply(distance_statistics, FUN = function(input_list_elements){ + }) %>% + unlist() %>% + sum() + + StatisticsOfMutationPlacement <- lapply(distance_statistics, FUN = function(input_list_elements) { return(input_list_elements[3]) - }) %>% unlist - - + }) %>% unlist() + + totalNumberOfSamplingEvents <- nSamplingEvents * length(postSampling) - - - -# median_dist <- median(dist_histogram) - - - - -# plot( -# ggplot(data.frame(HammingDistance = dist_histogram), aes(x = HammingDistance)) + -# geom_histogram(binwidth = 1, fill = "skyblue", color = "skyblue", alpha = 0.7)+ -# xlab(sprintf("genetic distance between leaf %d and leaf %d", leaf1, leaf2)) + ylab("total count") + -# ggtitle("Posterior sampling of genetic distances") + -# geom_vline(xintercept = median_dist,color = "red", linetype = "dashed", linewidth = 1) + -# labs(subtitle = sprintf("Tree %s - %s", treeName, clusterName),caption = "median indicated by dashed red line") + -# theme_minimal() + -# theme( -# plot.title = element_text(size = 20, face = "bold"), -# axis.title.x = element_text(size = 18), -# axis.title.y = element_text(size = 18), -# plot.subtitle = element_text(size= 18), -# axis.text = element_text(size = 16) -# ) -# ) - - + + + + # median_dist <- median(dist_histogram) + + + + + # plot( + # ggplot(data.frame(HammingDistance = dist_histogram), aes(x = HammingDistance)) + + # geom_histogram(binwidth = 1, fill = "skyblue", color = "skyblue", alpha = 0.7)+ + # xlab(sprintf("genetic distance between leaf %d and leaf %d", leaf1, leaf2)) + ylab("total count") + + # ggtitle("Posterior sampling of genetic distances") + + # geom_vline(xintercept = median_dist,color = "red", linetype = "dashed", linewidth = 1) + + # labs(subtitle = sprintf("Tree %s - %s", treeName, clusterName),caption = "median indicated by dashed red line") + + # theme_minimal() + + # theme( + # plot.title = 
element_text(size = 20, face = "bold"), + # axis.title.x = element_text(size = 18), + # axis.title.y = element_text(size = 18), + # plot.subtitle = element_text(size= 18), + # axis.text = element_text(size = 16) + # ) + # ) + + data <- data.frame(StatisticsOfMutationPlacement = StatisticsOfMutationPlacement) - - + + sum(is.na(data$StatisticsOfMutationPlacement)) class(data$StatisticsOfMutationPlacement) - - + + ggplot(data = data, aes(x = StatisticsOfMutationPlacement, y = 1)) + geom_point() - - + + tryCatch( expr = { histo <- ggplot(data, aes(x = StatisticsOfMutationPlacement)) + - geom_histogram(bins = 10, fill = "skyblue", color = "skyblue", alpha = 0.7)+ - xlab("Splitting score") + ylab("total count") + + geom_histogram(bins = 10, fill = "skyblue", color = "skyblue", alpha = 0.7) + + xlab("Splitting score") + + ylab("total count") + ggtitle("Posterior sampling of branching probabilites") + - geom_vline(xintercept = mean(StatisticsOfMutationPlacement),color = "blue", linetype = "dashed", linewidth = 1) + + geom_vline(xintercept = mean(StatisticsOfMutationPlacement), color = "blue", linetype = "dashed", linewidth = 1) + labs(subtitle = sprintf("Tree %s - %s", treeName, clusterName)) + theme_minimal() + theme( plot.title = element_text(size = 20, face = "bold"), axis.title.x = element_text(size = 18), axis.title.y = element_text(size = 18), - plot.subtitle = element_text(size= 18), - axis.text = element_text(size = 16) + plot.subtitle = element_text(size = 18), + axis.text = element_text(size = 16) ) hist_data <- ggplot_build(histo)$data[[1]] max_y <- max(hist_data$count) - histo <- histo + annotate("text", x = mean(StatisticsOfMutationPlacement) + 0.08, y = 0.9 * max_y, label="mean", color = "blue", size = 7) + histo <- histo + annotate("text", x = mean(StatisticsOfMutationPlacement) + 0.08, y = 0.9 * max_y, label = "mean", color = "blue", size = 7) print(histo) }, - error = function(e){ + error = function(e) { histo <- ggplot(data, aes(x = 
log(StatisticsOfMutationPlacement))) + - geom_histogram(bins = 10, fill = "skyblue", color = "skyblue", alpha = 0.7)+ - xlab("log(Splitting Score") + ylab("total count") + + geom_histogram(bins = 10, fill = "skyblue", color = "skyblue", alpha = 0.7) + + xlab("log(Splitting Score") + + ylab("total count") + ggtitle("Posterior sampling of branching probabilites - Logarithmic Scale") + - geom_vline(xintercept = log(mean(StatisticsOfMutationPlacement)),color = "blue", linetype = "dashed", linewidth = 1) + - labs(subtitle = sprintf("Tree %s - %s", treeName, clusterName),caption = "mean indicated by dashed red line") + + geom_vline(xintercept = log(mean(StatisticsOfMutationPlacement)), color = "blue", linetype = "dashed", linewidth = 1) + + labs(subtitle = sprintf("Tree %s - %s", treeName, clusterName), caption = "mean indicated by dashed red line") + theme_minimal() + theme( plot.title = element_text(size = 20, face = "bold"), axis.title.x = element_text(size = 18), axis.title.y = element_text(size = 18), - plot.subtitle = element_text(size= 18), - axis.text = element_text(size = 16) + plot.subtitle = element_text(size = 18), + axis.text = element_text(size = 16) ) hist_data <- ggplot_build(histo)$data[[1]] max_y <- max(hist_data$count) - histo <- histo + annotate("text", x = log(mean(StatisticsOfMutationPlacement)) + 0.08, y = 0.9 * max_y, label="log(mean)", color = "blue", size = 7) + histo <- histo + annotate("text", x = log(mean(StatisticsOfMutationPlacement)) + 0.08, y = 0.9 * max_y, label = "log(mean)", color = "blue", size = 7) print(histo) } ) - - - - - return(list(splittingFraction = totalNumberOfSplits/totalNumberOfSamplingEvents, branchingStatistics = StatisticsOfMutationPlacement)) + + + + + return(list(splittingFraction = totalNumberOfSplits / totalNumberOfSamplingEvents, branchingStatistics = StatisticsOfMutationPlacement)) } -#' This function identifies cells that belong to the same CTC cluster - also -#' those which have been physically split. 
For each pair of tumour cells from the +#' This function identifies cells that belong to the same CTC cluster - also +#' those which have been physically split. For each pair of tumour cells from the #' same CTC cluster, the distnace postior is computed. #' #' @param sampleDescription A data frame with the description of each sample. @@ -304,14 +310,14 @@ produce_Distance_Posterior <- function(leaf1, leaf2,postSampling, treeName,nCell #' @param totalReadCounts A tibble containing the total read counts. #' @param nMutationSamplingEvents The number of mutation that should be sampled #' per tree. -#' @param nTreeSamplingEvents The number of trees that should be sampled. +#' @param nTreeSamplingEvents The number of trees that should be sampled. #' @param cellPairSelection An optional parameter that takes a list of -#' pairs of strings-valued names of cells that should be analysed (the names as in the +#' pairs of strings-valued names of cells that should be analysed (the names as in the #' samples_nodeDescription.tsv file). #' It can also take a character vector, in which case the entries should be the color coded #' names of the clusters. #' -#' @return splittinProbs a vector that gives for each pair of cells the fraction of +#' @return splittinProbs a vector that gives for each pair of cells the fraction of #' trees for which they split #' aggregatedBranchingProbabilities: a vector of aggregated probabilities for all considered #' pairs of leaves and all sampled trees. 
At the moment only implement if cellPairSelection @@ -324,117 +330,115 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName, nCel alleleCount, mutatedReadCounts, totalReadCounts, nMutationSamplingEvents = 1000, nTreeSamplingEvents = 500, - cellPairSelection = NA){ - + cellPairSelection = NA) { desired_values <- sample(1:length(postSampling), size = nTreeSamplingEvents, replace = FALSE) %>% sort() postSampling <- postSampling[desired_values] splittingProbs <- matrix(0, nrow = 0, ncol = 2) %>% as.data.frame() colnames(splittingProbs) <- c("Cluster", "Splitting_probability") aggregatedProbabilities <- vector() - if(class(cellPairSelection) == "list"){ + if (class(cellPairSelection) == "list") { counter <- 1 system.time( - for (it in cellPairSelection){ - leaf1 <- which(sampleDescription$ClusterName == it[1])-1 - leaf2 <- which(sampleDescription$ClusterName == it[2])-1 - - print(paste(paste("Computing genomic distances of leaves:", leaf1, sep = " "), leaf2, sep = " ")) - posterior <- produce_Distance_Posterior(leaf1,leaf2,postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount,sampleDescription$Cluster, - mutatedReadCounts, totalReadCounts,sampleDescription$WBC, nSamplingEvents = nMutationSamplingEvents) - splittingProbs <- rbind(splittingProbs, data.frame(Cluster = as.character(counter), Splitting_probability = posterior$splittingFraction)) - aggregatedProbabilities <- c(aggregatedProbabilities, posterior$branchingStatistics) - counter <- counter + 1 - } + for (it in cellPairSelection) { + leaf1 <- which(sampleDescription$ClusterName == it[1]) - 1 + leaf2 <- which(sampleDescription$ClusterName == it[2]) - 1 + + print(paste(paste("Computing genomic distances of leaves:", leaf1, sep = " "), leaf2, sep = " ")) + posterior <- produce_Distance_Posterior(leaf1, leaf2, postSampling, treeName, nCells, + nMutations, nClusters, + alleleCount, sampleDescription$Cluster, + mutatedReadCounts, totalReadCounts, sampleDescription$WBC, + 
nSamplingEvents = nMutationSamplingEvents + ) + splittingProbs <- rbind(splittingProbs, data.frame(Cluster = as.character(counter), Splitting_probability = posterior$splittingFraction)) + aggregatedProbabilities <- c(aggregatedProbabilities, posterior$branchingStatistics) + counter <- counter + 1 + } ) - } - - else if(class(cellPairSelection) == 'character'){ - CTCclusters <- unique(cellPairSelection) - CTCclusters <- CTCclusters[!(CTCclusters %in% c("ghostwhite","gray93"))] - system.time( - for(it in CTCclusters){ - cellsInCluster <- which(sampleDescription$color %in% it)-1 ## Make sure array indication is - ## compatible with cpp - cluster_done <- 0 - for(i in cellsInCluster){ - if(cluster_done == 1){ - cluster_done <- 0 + } else if (class(cellPairSelection) == "character") { + CTCclusters <- unique(cellPairSelection) + CTCclusters <- CTCclusters[!(CTCclusters %in% c("ghostwhite", "gray93"))] + system.time( + for (it in CTCclusters) { + cellsInCluster <- which(sampleDescription$color %in% it) - 1 ## Make sure array indication is + ## compatible with cpp + cluster_done <- 0 + for (i in cellsInCluster) { + if (cluster_done == 1) { + cluster_done <- 0 + break + } + if (sampleDescription$WBC[i + 1] == 1) next + j <- cellsInCluster[1] + while (j < i) { + if (cluster_done == 1) { break } - if(sampleDescription$WBC[i+1] == 1) next - j <- cellsInCluster[1] - while(j < i){ - if(cluster_done == 1){ - break - } - if(sampleDescription$WBC[j+1] == 1){ - j <- j + 1 - next - } - print(paste(paste("Computing genomic distances of leaves:", i, sep = " "), j, sep = " ")) - posterior <- produce_Distance_Posterior(i,j,postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount,sampleDescription$Cluster, - mutatedReadCounts, totalReadCounts,sampleDescription$WBC, nSamplingEvents = nMutationSamplingEvents, clusterName = it) - splittingProbs <- rbind(splittingProbs, data.frame(Cluster = it,Splitting_probability = posterior$splittingFraction)) - aggregatedProbabilities <- 
c(aggregatedProbabilities, posterior$branchingStatistics) + if (sampleDescription$WBC[j + 1] == 1) { j <- j + 1 - cluster_done <- 1 + next } + print(paste(paste("Computing genomic distances of leaves:", i, sep = " "), j, sep = " ")) + posterior <- produce_Distance_Posterior(i, j, postSampling, treeName, nCells, + nMutations, nClusters, + alleleCount, sampleDescription$Cluster, + mutatedReadCounts, totalReadCounts, sampleDescription$WBC, + nSamplingEvents = nMutationSamplingEvents, clusterName = it + ) + splittingProbs <- rbind(splittingProbs, data.frame(Cluster = it, Splitting_probability = posterior$splittingFraction)) + aggregatedProbabilities <- c(aggregatedProbabilities, posterior$branchingStatistics) + j <- j + 1 + cluster_done <- 1 } - } - ) - } - - - else{ + } + ) + } else { CTCclusters <- unique(sampleDescription$color) - CTCclusters <- CTCclusters[!(CTCclusters %in% c("ghostwhite","gray93"))] + CTCclusters <- CTCclusters[!(CTCclusters %in% c("ghostwhite", "gray93"))] system.time( - for(it in CTCclusters){ - cellsInCluster <- which(sampleDescription$color %in% it)-1 ## Make sure array indication is + for (it in CTCclusters) { + cellsInCluster <- which(sampleDescription$color %in% it) - 1 ## Make sure array indication is ## compatible with cpp - #cluster_done <- 0 - for(i in cellsInCluster){ + # cluster_done <- 0 + for (i in cellsInCluster) { # if(cluster_done == 1){ # cluster_done <- 0 # break - #} - if(sampleDescription$WBC[i+1] == 1) next + # } + if (sampleDescription$WBC[i + 1] == 1) next j <- cellsInCluster[1] - while(j < i){ - #if(cluster_done == 1){ + while (j < i) { + # if(cluster_done == 1){ # break - #} - if(sampleDescription$WBC[j+1] == 1){ + # } + if (sampleDescription$WBC[j + 1] == 1) { j <- j + 1 next } print(paste(paste("Computing genomic distances of leaves:", i, sep = " "), j, sep = " ")) - posterior <- produce_Distance_Posterior(i,j,postSampling, treeName, nCells, - nMutations, nClusters, - alleleCount,sampleDescription$Cluster, - 
mutatedReadCounts, totalReadCounts,sampleDescription$WBC, nSamplingEvents = nMutationSamplingEvents, clusterName = it) - splittingProbs <- rbind(splittingProbs, data.frame(Cluster = it,Splitting_probability = posterior$splittingFraction)) + posterior <- produce_Distance_Posterior(i, j, postSampling, treeName, nCells, + nMutations, nClusters, + alleleCount, sampleDescription$Cluster, + mutatedReadCounts, totalReadCounts, sampleDescription$WBC, + nSamplingEvents = nMutationSamplingEvents, clusterName = it + ) + splittingProbs <- rbind(splittingProbs, data.frame(Cluster = it, Splitting_probability = posterior$splittingFraction)) j <- j + 1 - #cluster_done <- 1 + # cluster_done <- 1 } } - } ) } - # plot( -# splittingProbs %>% group_by(Cluster) %>% summarize(meanSplittingProbability = mean(Splitting_probability)) %>% -# ggplot(aes(x = Cluster, y = meanSplittingProbability)) + -# geom_col() + -# theme_minimal() - # ) - + # plot( + # splittingProbs %>% group_by(Cluster) %>% summarize(meanSplittingProbability = mean(Splitting_probability)) %>% + # ggplot(aes(x = Cluster, y = meanSplittingProbability)) + + # geom_col() + + # theme_minimal() + # ) + return(list(splittingProbs = splittingProbs, aggregatedBranchingProbabilities = aggregatedProbabilities)) } @@ -447,124 +451,140 @@ computeClusterSplits <- function(sampleDescription, postSampling, treeName, nCel # Loads all necessary data for the CTC-project. # Specifically it return a named list as follows: -# postSampling: Loads the posterior sampling tsv as a list of named vectors with the +# postSampling: Loads the posterior sampling tsv as a list of named vectors with the # following columns: # the (unnormalised) LogScore, estimated sequencing error rate, -# the estimated dropout rate, logTau and the Tree in parent vector +# the estimated dropout rate, logTau and the Tree in parent vector # format meaning that the i'th entry of the vector -# is te parent node of the entry i. 
Nodes are counted from zero and the root is +# is te parent node of the entry i. Nodes are counted from zero and the root is # length(Tree) # nClusters: The total number of CTC-clusters # ClusterID: The Cluster-ID maps cells identities to the cell-clusters they belong to. -# The i-th entry having value x means that +# The i-th entry having value x means that # Cells i is in the j-th cluster (7th row in the description file) # nCells: total number of Cells in the experiment # nMutations: Total number of considered mutations in the dataset # nClusters: Total number of CTC-clusters in the experiment # alleleCount: Total number of cells per Cluster*2 -# ClusterID: Number +# ClusterID: Number # mutatedReadCounts: A vector containing the number of of mutated reads for each # cluster # total Read Counts: vector containingtotal read count for each cluster -# wbcStatus: A vector that has value 1 if cell i is a white blood cells and +# wbcStatus: A vector that has value 1 if cell i is a white blood cells and # 0 else -load_data <- function(inputFolder, treeName){ - ##Define paths - - posteriorSamplingFile <- sprintf("%s/%s/%s_postSampling.tsv", inputFolder, treeName,treeName) - - countFile <- sprintf("%s/%s/%s.txt", inputFolder, treeName,treeName) +load_data <- function(inputFolder, treeName) { + ## Define paths + + posteriorSamplingFile <- sprintf("%s/%s/%s_postSampling.tsv", inputFolder, treeName, treeName) + + countFile <- sprintf("%s/%s/%s.txt", inputFolder, treeName, treeName) descriptionFile <- sprintf("%s/%s/%s_samples_nodeDescription.tsv", inputFolder, treeName, treeName) - - - ##Load data - + + + ## Load data + postSampling <- read_delim(posteriorSamplingFile, - delim = "\t", col_names = c("LogScore", "SequencingErrorRate","DropoutRate", "LogTau", "Tree")) + delim = "\t", col_names = c("LogScore", "SequencingErrorRate", "DropoutRate", "LogTau", "Tree") + ) postSampling <- split(postSampling, seq(nrow(postSampling))) - - + + counts <- read_delim(countFile, - delim = 
"\t", col_names = FALSE) + delim = "\t", col_names = FALSE + ) description <- read_delim(descriptionFile, - delim = "\t", col_names = c("Cluster", "CellCount", "TCs", "WBCs", "Description")) + delim = "\t", col_names = c("Cluster", "CellCount", "TCs", "WBCs", "Description") + ) nCells <- sum(description$CellCount) - nClusters <- nrow(description) + nClusters <- nrow(description) nMutations <- nrow(counts) - alleleCount <- description$CellCount*2 - - + alleleCount <- description$CellCount * 2 + + description <- description %>% mutate(color = regmatches(Description, regexpr("color=([a-zA-Z]+[0-9]*)", Description)) %>% substr(start = 7, stop = (nchar(.)))) - - - + + + ClusterID <- vector() - for(i in 1:nClusters) ClusterID <- c(ClusterID, rep.int(i-1,description$CellCount[i])) + for (i in 1:nClusters) ClusterID <- c(ClusterID, rep.int(i - 1, description$CellCount[i])) ## Note that Cpp counts arrays from zero, so the cluster IDs are counted likewise ## in order to be compatible with Cpp code. 
- - ##Pull apart the count file into counts for mutated read and total counts respectively - mutatedReadCounts <- matrix(0,nrow = nMutations, ncol = 0) - for (j in 1:nClusters){ - mutatedReadCounts <- cbind(mutatedReadCounts,counts[,4+2*j]) + + ## Pull apart the count file into counts for mutated read and total counts respectively + mutatedReadCounts <- matrix(0, nrow = nMutations, ncol = 0) + for (j in 1:nClusters) { + mutatedReadCounts <- cbind(mutatedReadCounts, counts[, 4 + 2 * j]) } - - totalReadCounts <- matrix(0,nrow = nMutations, ncol = 0) - for (j in 1:nClusters){ - totalReadCounts <- cbind(totalReadCounts,counts[,4+2*j-1]) + + totalReadCounts <- matrix(0, nrow = nMutations, ncol = 0) + for (j in 1:nClusters) { + totalReadCounts <- cbind(totalReadCounts, counts[, 4 + 2 * j - 1]) } - - - wildtypeReadCounts <- totalReadCounts - mutatedReadCounts - - - mutatedReadCounts <- mutatedReadCounts %>% t() %>% as.data.frame() %>% as.list() - wildtypeReadCounts <- wildtypeReadCounts %>% t() %>% as.data.frame() %>% as.list() - totalReadCounts <- totalReadCounts %>% t() %>% as.data.frame() %>% as.list() - - - mutationDescription <- counts[,1:4] - - ##wbc status indicates which of the cells is a white blood cells and which one isn't. - ##So far, the cells are arbitrary, and I will assign the fist cells from a cluster to be WBCs. + + + wildtypeReadCounts <- totalReadCounts - mutatedReadCounts + + + mutatedReadCounts <- mutatedReadCounts %>% + t() %>% + as.data.frame() %>% + as.list() + wildtypeReadCounts <- wildtypeReadCounts %>% + t() %>% + as.data.frame() %>% + as.list() + totalReadCounts <- totalReadCounts %>% + t() %>% + as.data.frame() %>% + as.list() + + + mutationDescription <- counts[, 1:4] + + ## wbc status indicates which of the cells is a white blood cells and which one isn't. + ## So far, the cells are arbitrary, and I will assign the fist cells from a cluster to be WBCs. 
wbcStatus <- rep(0, nCells) - - for(i in 1:nClusters){ + + for (i in 1:nClusters) { j <- 1 - while(j <= description$WBCs[i]){ #Iterating over the number of White blood cells of a cluster - wbcStatus[which(ClusterID == i-1)[1] + j-1] <- 1 # and identifying the first cell + while (j <= description$WBCs[i]) { # Iterating over the number of White blood cells of a cluster + wbcStatus[which(ClusterID == i - 1)[1] + j - 1] <- 1 # and identifying the first cell # that belongs to a cluster and counting from then on - ## Note: The cluster IDs are counted from zero! - j<- j+1 + ## Note: The cluster IDs are counted from zero! + j <- j + 1 } } - - sample_description <- data.frame(Cluster = ClusterID, - ClusterName = description$Cluster[ClusterID + 1], - WBC = wbcStatus, - color = description$color[ClusterID + 1]) - + + sample_description <- data.frame( + Cluster = ClusterID, + ClusterName = description$Cluster[ClusterID + 1], + WBC = wbcStatus, + color = description$color[ClusterID + 1] + ) + sample_description <- sample_description %>% mutate(single_cell = !(duplicated(Cluster)) & !(duplicated(Cluster, fromLast = TRUE))) - - -# annotations <- read_delim('../../input_folder/filtered/CGI/LM2_cgi/alterations.tsv', delim = '\t') - - return(list("postSampling" = postSampling, "nClusters" = nClusters, - "clusterID" = ClusterID, "nCells" = nCells, - "nMutations" = nMutations, "nClusters" = nClusters, - "alleleCount" = alleleCount, - "mutatedReadCounts" = mutatedReadCounts, - "totalReadCounts" = totalReadCounts, "wbcStatus" = wbcStatus, - "sample_description" = sample_description, - "mutationDescription" = mutationDescription, -# "annotations" = annotations, - "sampleName" = treeName, "directory" = inputFolder)) + + + # annotations <- read_delim('../../input_folder/filtered/CGI/LM2_cgi/alterations.tsv', delim = '\t') + + return(list( + "postSampling" = postSampling, "nClusters" = nClusters, + "clusterID" = ClusterID, "nCells" = nCells, + "nMutations" = nMutations, "nClusters" = 
nClusters, + "alleleCount" = alleleCount, + "mutatedReadCounts" = mutatedReadCounts, + "totalReadCounts" = totalReadCounts, "wbcStatus" = wbcStatus, + "sample_description" = sample_description, + "mutationDescription" = mutationDescription, + # "annotations" = annotations, + "sampleName" = treeName, "directory" = inputFolder + )) } @@ -579,103 +599,105 @@ load_data <- function(inputFolder, treeName){ #' lies in the 1% quantile of the set of all pairwise genetic distances. #' As the distance the Hamming distance is chosen. #' -#' @param inputFolder -#' @param treeName +#' @param inputFolder +#' @param treeName #' #' @return #' monoclonal_pairs: A list of pairs of cell names that are similar to each other. #' distance_matrix: A matrix indicates all pairwise distnaces of suggested genotypes. #' full_distance_matrix: The full pairwise distance matrix of all genotypes. -#' -#' +#' +#' #' @export #' #' @examples -load_monoclonal_pairs <- function(inputFolder, treeName, cutoff = ""){ - data_file <- sprintf("%s/%s/%s_genotypes.ped", inputFolder, treeName,treeName) - - data <- read_delim(data_file, delim = '\t',col_names = FALSE) +load_monoclonal_pairs <- function(inputFolder, treeName, cutoff = "") { + data_file <- sprintf("%s/%s/%s_genotypes.ped", inputFolder, treeName, treeName) + + data <- read_delim(data_file, delim = "\t", col_names = FALSE) data2 <- data %>% select(!2:6) - + distance_matrix <- matrix(0, nrow = nrow(data2), ncol = nrow(data2)) - - - for(i in 1:nrow(data2)){ + + + for (i in 1:nrow(data2)) { j <- 1 - while(j < i){ - row_i <- data2 %>% select(!1) %>% slice(i) - row_j <- data2 %>% select(!1) %>% slice(j) - - distance_matrix[i,j] <- sum(!(row_i == row_j)) - j <- j+1 + while (j < i) { + row_i <- data2 %>% + select(!1) %>% + slice(i) + row_j <- data2 %>% + select(!1) %>% + slice(j) + + distance_matrix[i, j] <- sum(!(row_i == row_j)) + j <- j + 1 } } - - + + distance_vector <- as.vector(distance_matrix[lower.tri(distance_matrix)]) - - - - if(class(cutoff) == 
"numeric"){ - monoclonal_candidate_cutoff <- cutoff - } - else{ + + + + if (class(cutoff) == "numeric") { + monoclonal_candidate_cutoff <- cutoff + } else { monoclonal_candidate_cutoff <- quantile(distance_vector, probs = 0.01) } - - + + sum(distance_vector <= monoclonal_candidate_cutoff) which(distance_vector <= monoclonal_candidate_cutoff) - + print("1% quantile of genetic distances:") print(monoclonal_candidate_cutoff) - + plot( - ggplot(data.frame(x = distance_vector),aes(x = x))+ - geom_histogram(binwidth = 2) + - geom_vline(xintercept = monoclonal_candidate_cutoff, linetype = "dashed", color = "red") + ggplot(data.frame(x = distance_vector), aes(x = x)) + + geom_histogram(binwidth = 2) + + geom_vline(xintercept = monoclonal_candidate_cutoff, linetype = "dashed", color = "red") ) - - + + candidates <- list() candidate_index <- vector() iterator <- 0 number_of_output_pairs <- 15 - for (count in 0:monoclonal_candidate_cutoff){ + for (count in 0:monoclonal_candidate_cutoff) { all_elements <- which(distance_matrix == count) all_elements_list <- list() - for (it in all_elements){ - coordinates1 <- ((it-1) %% dim(distance_matrix)[2]) + 1 - coordinates2 <- ((it-1) %/% dim(distance_matrix)[2]) + 1 + for (it in all_elements) { + coordinates1 <- ((it - 1) %% dim(distance_matrix)[2]) + 1 + coordinates2 <- ((it - 1) %/% dim(distance_matrix)[2]) + 1 all_elements_list <- append(all_elements_list, list(c(coordinates1, coordinates2))) } - - for (it in all_elements_list){ - if(it[1] <= it[2]) next - - - #Check whether the candidate pair of cells consists of single tumour cells: - - - - candidates <- c(candidates,list(c(as.character(data2[it[1],1]),as.character(data2[it[2],1])))) + + for (it in all_elements_list) { + if (it[1] <= it[2]) next + + + # Check whether the candidate pair of cells consists of single tumour cells: + + + + candidates <- c(candidates, list(c(as.character(data2[it[1], 1]), as.character(data2[it[2], 1])))) candidate_index <- c(candidate_index, it[1], 
it[2]) iterator <- iterator + 1 - - if(iterator == number_of_output_pairs) break + + if (iterator == number_of_output_pairs) break } - if(iterator == number_of_output_pairs) break - } - if(length(unique(sort(candidate_index)))!=0){ - distance_matrix2 <- distance_matrix[unique(sort(candidate_index)),unique(sort(candidate_index))] - colnames(distance_matrix2) <- data2[unique(sort(candidate_index)),1]$X1 + if (iterator == number_of_output_pairs) break } - else{ + if (length(unique(sort(candidate_index))) != 0) { + distance_matrix2 <- distance_matrix[unique(sort(candidate_index)), unique(sort(candidate_index))] + colnames(distance_matrix2) <- data2[unique(sort(candidate_index)), 1]$X1 + } else { distance_matrix2 <- 0 } - - + + distance_matrix <- as.data.frame(distance_matrix) colnames(distance_matrix) <- data2$X1 rownames(distance_matrix) <- data2$X1 @@ -685,8 +707,8 @@ load_monoclonal_pairs <- function(inputFolder, treeName, cutoff = ""){ -callGenotypes <- function(){ - +callGenotypes <- function() { + } @@ -695,7 +717,7 @@ callGenotypes <- function(){ #################### -##Helper Functions## +## Helper Functions## #################### #' Forking-based parallelisation. @@ -715,7 +737,7 @@ callGenotypes <- function(){ #' array of “rank” (==length(dim(.))) one higher than the result of FUN(X[[i]]). #' @param USE.NAMES logical; if TRUE and if X is character, use X as names for #' the result unless it had names already. Since this argument follows ... its -#' name cannot be abbreviated. +#' name cannot be abbreviated. #' #' @returnFor sapply(simplify = TRUE) and replicate(simplify = TRUE): if X has #' length zero or n = 0, an empty list. 
Otherwise an atomic vector or matrix or @@ -727,12 +749,12 @@ callGenotypes <- function(){ #' @export #' #' @examples -#mcsapply <- function (X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE) { +# mcsapply <- function (X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE) { # FUN <- match.fun(FUN) # answer <- parallel::mclapply(X = X, FUN = FUN, ...) -# if (USE.NAMES && is.character(X) && is.null(names(answer))) +# if (USE.NAMES && is.character(X) && is.null(names(answer))) # names(answer) <- X -# if (!isFALSE(simplify) && length(answer)) +# if (!isFALSE(simplify) && length(answer)) # simplify2array(answer, higher = (simplify == "array")) # else answer -#} +# } diff --git a/experiments/workflow/rules/base.smk b/experiments/workflow/rules/base.smk index 4b8cc7f..4e28b5b 100644 --- a/experiments/workflow/rules/base.smk +++ b/experiments/workflow/rules/base.smk @@ -1,19 +1,19 @@ rule prepare_markdown_file: input: - PROJECT_DIR / 'workflow' / 'resources' / 'template.Rmd' + PROJECT_DIR / "workflow" / "resources" / "template.Rmd", output: - PROJECT_DIR / 'data' / 'markdowns' / '{SAMPLE}.Rmd', + PROJECT_DIR / "data" / "markdowns" / "{SAMPLE}.Rmd", resources: - mem_mb = 1024, - runtime = 10, + mem_mb=1024, + runtime=10, params: - sampling_depth = 1000, - script_dir = {markdown_helper_functions}, - author = config['author'], - input_dir = input_folder, - simulation_dir = simulations_folder, + sampling_depth=1000, + script_dir={markdown_helper_functions}, + author=config["author"], + input_dir=input_folder, + simulation_dir=simulations_folder, log: - PROJECT_DIR / 'logs' / 'prepare_markdown_file.{SAMPLE}.log', + PROJECT_DIR / "logs" / "prepare_markdown_file.{SAMPLE}.log", shell: """ ( sed -e "s/__tree__/{wildcards.SAMPLE}/g" \ @@ -26,33 +26,34 @@ rule prepare_markdown_file: {input} > {output} ) &> {log} """ + rule render_markdown_file: input: - PROJECT_DIR / 'data' / 'markdowns' / '{SAMPLE}.Rmd', + PROJECT_DIR / "data" / "markdowns" / "{SAMPLE}.Rmd", output: - PROJECT_DIR / 'data' / 
'htmls' / '{SAMPLE}.html', + PROJECT_DIR / "data" / "htmls" / "{SAMPLE}.html", threads: 16 conda: - Path(workflow.basedir) / 'envs' / 'R.yml', + Path(workflow.basedir) / "envs" / "R.yml" log: - PROJECT_DIR / 'logs' / 'render_markdown_file.{SAMPLE}.log', + PROJECT_DIR / "logs" / "render_markdown_file.{SAMPLE}.log", shell: """ ( Rscript -e "rmarkdown::render('{input}', output_file = '{output}')" ) &> {log} """ + rule render_topSeparators_markdowns: input: - PROJECT_DIR / 'data' / 'markdowns' / '{SAMPLE}_top_Separators.Rmd', + PROJECT_DIR / "data" / "markdowns" / "{SAMPLE}_top_Separators.Rmd", output: - PROJECT_DIR / 'data' / 'htmls' / '{SAMPLE}_top_Separators.html', + PROJECT_DIR / "data" / "htmls" / "{SAMPLE}_top_Separators.html", threads: 4 conda: - Path(workflow.basedir) / 'envs' / 'R.yml', + Path(workflow.basedir) / "envs" / "R.yml" log: - PROJECT_DIR / 'logs' / 'render_topSeparators_markdowns.{SAMPLE}.log', + PROJECT_DIR / "logs" / "render_topSeparators_markdowns.{SAMPLE}.log", shell: """ ( Rscript -e "rmarkdown::render('{input}', output_file = '{output}')" ) &> {log} """ -