Merge pull request #65 from FredHutch/cansavvy/separate-depmap

Make depmap annotation optional
FredHutch · Nov 18, 2024 · dfd8cb1 · dfd8cb1
2 parents 77aebef + 7cd2002
commit dfd8cb1
Show file tree

Hide file tree

Showing 7 changed files with 227 additions and 131 deletions.
diff --git a/R/03-annotate.R b/R/03-annotate.R
@@ -2,10 +2,10 @@
 #' @description In this function, a `gimap_dataset` is annotated as far as which genes should be used as controls.
 #' @param .data Data can be piped in with tidyverse pipes from function to function. But the data must still be a gimap_dataset
 #' @param gimap_dataset A special dataset structure that is setup using the `setup_data()` function.
-#' @param cell_line which cell line are you using? (e.g., HELA, PC9, etc.). Required argument
-#' @param cn_annotate TRUE or FALSE you'd also like to have Copy number annotation from DepMap. These data are optional
 #' @param annotation_file If no file is given, will attempt to use the design file from https://media.addgene.org/cms/filer_public/a9/9a/a99a9328-324b-42ff-8ccc-30c544b899e4/pgrna_library.xlsx
 #' @param control_genes A vector of gene symbols (e.g. AAMP) that should be labeled as control genes. These will be used for log fold change calculations. If no list is given then DepMap Public 23Q4 Achilles_common_essentials.csv is used https://depmap.org/portal/download/all/
+#' @param depmap_annotate TRUE or FALSE you'd also like to have DepMap annotation. These data are optional
+#' @param cell_line which cell line are you using? (e.g., HELA, PC9, etc.). Required argument if demap_annotate is TRUE.
 #' @importFrom stringr word
 #' @export
 #' @examples \dontrun{
@@ -24,14 +24,16 @@
 #' }
 gimap_annotate <- function(.data = NULL,
                            gimap_dataset,
-                           cell_line,
+                           annotation_file = NULL,
                            control_genes = NULL,
-                           cn_annotate = TRUE,
-                           annotation_file = NULL) {
+                           depmap_annotate = TRUE,
+                           cell_line = NULL) {
   if (!is.null(.data)) gimap_dataset <- .data
 
   if (!("gimap_dataset" %in% class(gimap_dataset))) stop("This function only works with gimap_dataset objects which can be made with the setup_data() function.")
 
+  if (depmap_annotate & is.null(cell_line)) stop("If you want DepMap annotation you must specify which cell line you are using")
+
   # Get the annotation data based on the pg construct design
   if (!is.null(annotation_file)) {
     if (!file.exists(annotation_file)) stop("The annotation_file specified cannot be found. Please double check the file path")
@@ -54,33 +56,33 @@ gimap_annotate <- function(.data = NULL,
   }
 
   ############################ Get TPM data ####################################
-  # This is not optional because its used to flag things
-  ## get TPM and CN information (w/ option for user to upload their own info)
-  depmap_metadata <- readr::read_csv("https://figshare.com/ndownloader/files/35020903", show_col_types = FALSE)
-
-  my_depmap_id <- depmap_metadata %>%
-    dplyr::filter(stripped_cell_line_name == toupper(cell_line)) %>%
-    dplyr::pull(DepMap_ID)
+  if (depmap_annotate) {
+    # This is used to flag things
+    ## get TPM and CN information (w/ option for user to upload their own info)
+    depmap_metadata <- readr::read_csv("https://figshare.com/ndownloader/files/35020903", show_col_types = FALSE)
 
-  if (length(my_depmap_id) == 0) stop("The cell line specified, ", cell_line, "was not found in the DepMap data. Run supported_cell_lines() to see the full list")
+    my_depmap_id <- depmap_metadata %>%
+      dplyr::filter(stripped_cell_line_name == toupper(cell_line)) %>%
+      dplyr::pull(DepMap_ID)
 
-  tpm_file <- file.path(system.file("extdata", package = "gimap"), "CCLE_expression.csv")
+    if (length(my_depmap_id) == 0) stop("The cell line specified, ", cell_line, "was not found in the DepMap data. Run supported_cell_lines() to see the full list")
 
-  if (!file.exists(tpm_file)) tpm_setup()
+    tpm_file <- file.path(system.file("extdata", package = "gimap"), "CCLE_expression.csv")
 
-  depmap_tpm <- readr::read_csv(tpm_file,
-    show_col_types = FALSE,
-    col_select = c("genes", dplyr::all_of(my_depmap_id))
-  ) %>%
-    dplyr::rename(log2_tpm = my_depmap_id) %>%
-    dplyr::mutate(expressed_flag = dplyr::case_when(
-      log2_tpm < 1 ~ FALSE,
-      log2_tpm >= 1 ~ TRUE,
-      is.na(log2_tpm) ~ NA
-    ))
+    if (!file.exists(tpm_file)) tpm_setup()
 
-  ############################ COPY NUMBER ANNOTATION ##########################
-  if (cn_annotate) {
+    depmap_tpm <- readr::read_csv(tpm_file,
+      show_col_types = FALSE,
+      col_select = c("genes", dplyr::all_of(my_depmap_id))
+    ) %>%
+      dplyr::rename(log2_tpm = my_depmap_id) %>%
+      dplyr::mutate(expressed_flag = dplyr::case_when(
+        log2_tpm < 1 ~ FALSE,
+        log2_tpm >= 1 ~ TRUE,
+        is.na(log2_tpm) ~ NA
+      ))
+
+    ############################ COPY NUMBER ANNOTATION ##########################
     cn_file <- file.path(system.file("extdata", package = "gimap"), "CCLE_gene_cn.csv")
     if (!file.exists(cn_file)) cn_setup()
 
@@ -102,32 +104,7 @@ gimap_annotate <- function(.data = NULL,
   annotation_df <- annotation_df %>%
     dplyr::mutate(
       gene1_essential_flag = gene1_symbol %in% control_genes,
-      gene2_essential_flag = gene2_symbol %in% control_genes
-    ) %>%
-    dplyr::left_join(depmap_tpm, by = c("gene1_symbol" = "genes")) %>%
-    dplyr::rename(gene1_expressed_flag = expressed_flag) %>%
-    dplyr::left_join(depmap_tpm, by = c("gene2_symbol" = "genes"), suffix = c("_gene1", "_gene2")) %>%
-    dplyr::rename(gene2_expressed_flag = expressed_flag) %>%
-    dplyr::mutate(norm_ctrl_flag = dplyr::case_when(
-      target_type == "gene_gene" ~ "double_targeting",
-      target_type == "gene_ctrl" & gene1_essential_flag == TRUE ~ "positive_control",
-      target_type == "ctrl_gene" & gene2_essential_flag == TRUE ~ "positive_control",
-      target_type == "gene_ctrl" & gene1_essential_flag != TRUE ~ "single_targeting",
-      target_type == "ctrl_gene" & gene2_essential_flag != TRUE ~ "single_targeting",
-      target_type == "ctrl_ctrl" ~ "negative_control"
-    )) %>%
-    dplyr::mutate(
-      norm_ctrl_flag = factor(norm_ctrl_flag, levels = c(
-        "negative_control",
-        "positive_control",
-        "single_targeting",
-        "double_targeting"
-      )),
-      unexpressed_ctrl_flag = dplyr::case_when(
-        norm_ctrl_flag == "double_targeting" & gene1_expressed_flag == FALSE & gene2_expressed_flag == FALSE ~ TRUE,
-        norm_ctrl_flag == "single_targeting" & (gene1_expressed_flag == FALSE | gene2_expressed_flag == FALSE) ~ TRUE,
-        TRUE ~ FALSE
-      ),
+      gene2_essential_flag = gene2_symbol %in% control_genes,
       pgRNA_target = dplyr::case_when(
         target_type == "gene_gene" ~ paste(gene1_symbol, gene2_symbol, sep = "_"),
         target_type == "gene_ctrl" ~ paste(gene1_symbol, "ctrl", sep = "_"),
@@ -136,6 +113,34 @@ gimap_annotate <- function(.data = NULL,
       )
     )
 
+  if (depmap_annotate) {
+    annotation_df <- annotation_df %>%
+      dplyr::left_join(depmap_tpm, by = c("gene1_symbol" = "genes")) %>%
+      dplyr::rename(gene1_expressed_flag = expressed_flag) %>%
+      dplyr::left_join(depmap_tpm, by = c("gene2_symbol" = "genes"), suffix = c("_gene1", "_gene2")) %>%
+      dplyr::rename(gene2_expressed_flag = expressed_flag) %>%
+      dplyr::mutate(norm_ctrl_flag = dplyr::case_when(
+        target_type == "gene_gene" ~ "double_targeting",
+        target_type == "gene_ctrl" & gene1_essential_flag == TRUE ~ "positive_control",
+        target_type == "ctrl_gene" & gene2_essential_flag == TRUE ~ "positive_control",
+        target_type == "gene_ctrl" & gene1_essential_flag != TRUE ~ "single_targeting",
+        target_type == "ctrl_gene" & gene2_essential_flag != TRUE ~ "single_targeting",
+        target_type == "ctrl_ctrl" ~ "negative_control"
+      )) %>%
+      dplyr::mutate(
+        norm_ctrl_flag = factor(norm_ctrl_flag, levels = c(
+          "negative_control",
+          "positive_control",
+          "single_targeting",
+          "double_targeting"
+        )),
+        unexpressed_ctrl_flag = dplyr::case_when(
+          norm_ctrl_flag == "double_targeting" & gene1_expressed_flag == FALSE & gene2_expressed_flag == FALSE ~ TRUE,
+          norm_ctrl_flag == "single_targeting" & (gene1_expressed_flag == FALSE | gene2_expressed_flag == FALSE) ~ TRUE,
+          TRUE ~ FALSE
+        )
+      )
+  }
   ################################ STORE IT ####################################
 
   if (gimap_dataset$filtered_data$filter_step_run) {

diff --git a/R/04-normalize.R b/R/04-normalize.R
@@ -155,7 +155,6 @@ gimap_normalize <- function(.data = NULL,
       message("The input data for the IDs which were not found in the annotation data will be kept throughout the analysis, but any data from the annotation won't be available for them.")
     }
   } else {
-
     missing_ids_file <- file.path(missing_ids_file)
     readr::write_csv(missing_ids, missing_ids_file)
   }

diff --git a/R/utils.R b/R/utils.R
@@ -69,21 +69,21 @@ get_example_data <- function(which_data) {
     )
     return(readr::read_tsv(file, show_col_types = FALSE))
   } else if (which_data == "final_treatment") {
-      file <- list.files(
-        pattern = "gimap_dataset_final_treatment.RDS",
-        recursive = TRUE,
-        system.file("extdata", package = "gimap"),
-        full.names = TRUE
-      )
-      return(readr::read_rds(file))
-   } else if (which_data == "final_treatment") {
-      file <- list.files(
-        pattern = "gimap_dataset_final_treatment.RDS",
-        recursive = TRUE,
-        system.file("extdata", package = "gimap"),
-        full.names = TRUE
-        )
-        return(readr::read_rds(file))
+    file <- list.files(
+      pattern = "gimap_dataset_final_treatment.RDS",
+      recursive = TRUE,
+      system.file("extdata", package = "gimap"),
+      full.names = TRUE
+    )
+    return(readr::read_rds(file))
+  } else if (which_data == "final_treatment") {
+    file <- list.files(
+      pattern = "gimap_dataset_final_treatment.RDS",
+      recursive = TRUE,
+      system.file("extdata", package = "gimap"),
+      full.names = TRUE
+    )
+    return(readr::read_rds(file))
   } else {
     stop("Specification for `which_data` not understood; Need to use 'gimap', 'count', 'meta', or 'annotation' ")
   }

diff --git a/inst/rmd/scratch_gimap_CRISPR_score_review.Rmd b/inst/rmd/scratch_gimap_CRISPR_score_review.Rmd
@@ -18,10 +18,9 @@ devtools::load_all()
 ```{r}
  gimap_dataset <- get_example_data("gimap") %>%
    gimap_filter() %>%
-   gimap_annotate() %>%
+   gimap_annotate(cell_line = "HELA") %>%
    gimap_normalize(
-     timepoints = "day",
-     replicates = "rep") %>%
+     timepoints = "day") %>%
   calc_crispr()
 ```
 
@@ -35,7 +34,7 @@ old_crispr_results <- read_tsv("~/Downloads/results/calculate_LFC/tables/tsv/HeL
 
 ```{r}
 old_crispr_results %<>% mutate(
-  rep = recode(rep, "A" = "late_RepA", "B" = "late_RepB", "C" = "late_RepC")
+  rep = recode(rep, "A" = "Day22_RepA_late", "B" = "Day22_RepB_late", "C" = "Day22_RepC_late")
 )
 ```
 

diff --git a/inst/rmd/scratch_gimap_CRISPR_score_review.html b/inst/rmd/scratch_gimap_CRISPR_score_review.html
diff --git a/inst/rmd/scratch_gimap_GI_review.Rmd b/inst/rmd/scratch_gimap_GI_review.Rmd
@@ -7,6 +7,7 @@ date: "`r Sys.Date()`"
 ```{r warning=FALSE, message=FALSE}
 library(tidyverse)
 library(devtools)
+library(magrittr)
 
 devtools::load_all()
 ```
@@ -16,10 +17,9 @@ devtools::load_all()
 ```{r}
  gimap_dataset <- get_example_data("gimap") %>%
    gimap_filter() %>%
-   gimap_annotate() %>%
+   gimap_annotate(cell_line = "HELA") %>%
    gimap_normalize(
-     timepoints = "day",
-     replicates = "rep") %>%
+     timepoints = "day") %>%
   calc_crispr() %>%
   calc_gi()
 ```
@@ -34,6 +34,14 @@ saveRDS(gimap_dataset, "gimap_dataset_calculated.RDS")
 old_gi_results <- readRDS("d.HeLa_GI_scores_target")
 ```
 
+#### sync the reps values
+
+```{r}
+old_gi_results %<>% mutate(
+  rep = recode(rep, "A" = "Day22_RepA_late", "B" = "Day22_RepB_late", "C" = "Day22_RepC_late")
+)
+```
+
 ## Compare results
 
 ### Columns
@@ -48,7 +56,7 @@ colnames(old_gi_results)
 
 Looks like we want to compare the pgRNA targets which should be the `pgRNA_target_double` column for the gimap results and the `pgRNA_target` column for the GI Mapping results
 
-Columns in the GI Mapping results not in the gimap results seem to include model results (`intercept`, `slope`) as well as some significance values (`p_val`, `fdr`), observed and expected CRISPR score (`mean_observed_CS` and `mean_expected_CS` (which I think I need for plotting)). And how does `mean_GI_score` relate to the three GI score columns in the gimap results? 
+Columns in the GI Mapping results not in the gimap results seem to include model results (`intercept`, `slope`) as well as some significance values (`p_val`, `fdr`), observed and expected CRISPR score (`mean_observed_CS` and `mean_expected_CS` (which I think I need for plotting)). And how does `mean_GI_score` relate to the three GI score columns in the gimap results? The three GI score columns relate to the different target types.  
 
 ### Target overlap
 
@@ -61,7 +69,7 @@ length(unique(gimap_dataset$gi_scores$pgRNA_target_double))
 length(intersect(unique(old_gi_results$pgRNA_target), unique(gimap_dataset$gi_scores$pgRNA_target_double)))
 ```
 
-Only 1/3 of the GI Mapping targets are represented in the gimap results
+Only 1/3 of the GI Mapping targets are represented in the gimap results...
 
 ### Number of observations/rows total and rep column overlap
 
@@ -70,18 +78,57 @@ nrow(old_gi_results)
 nrow(gimap_dataset$gi_scores)
 ```
 
-The gimap results has far more rows 
+The gimap results has far more rows...
 
 ```{r}
 table(old_gi_results$rep)
 
 table(gimap_dataset$gi_scores$rep)
 ```
 
-While the replicate values themselves can be synced like we've done in other reviews, this further shows the far greater number of observations (even if there are fewer represented pgRNA targets) in the gimap results
+While the replicate values themselves can't be completely synced like we've done in other reviews due to the gimap results including the Day05_RepA_early datapoints, this further shows the far greater number of observations (even if there are fewer represented pgRNA targets) in the gimap results....
 
 ```{r}
 head(old_gi_results)
 
 ```
 
+```{r}
+joindf <- dplyr::full_join(old_gi_results, gimap_dataset$gi_scores,
+                           by = c("pgRNA_target" = "pgRNA_target_double", "rep"),
+                           suffix = c("_old", "_new"))
+```
+
+## Split out the comparisons we want to make
+
+Have to drop NA's because the gimap results include Day05_RepA_early datapoints while the GI Mapping doesn't. Also have to drop NAs because there are GI Mapping targets that aren't represented in the gimap targets?
+
+```{r}
+joined_df <- rbind(
+  #join the gene_ctrl
+  full_join(
+    old_gi_results %>% filter(target_type == "gene_ctrl") %>% select(c("pgRNA_target", "paralog_pair", "mean_GI_score", "rep")),
+    gimap_dataset$gi_scores %>% separate(pgRNA_target_double, c("gene1", "gene2"), sep="_", remove = FALSE) %>% mutate(ctrl = "ctrl") %>% unite("pgRNA_target_summary", c("gene1", "ctrl"), sep="_") %>% select(c("pgRNA_target_summary", "pgRNA_target_double", "single_target_gi_score_1", "rep")),
+    by = c("pgRNA_target" = "pgRNA_target_summary", "paralog_pair"="pgRNA_target_double", "rep"),
+    suffix = c("_old", "_new")
+    ) %>% mutate(target_type = "gene_ctrl") %>% distinct() %>% select(c("pgRNA_target", "mean_GI_score", "rep", "single_target_gi_score_1", "target_type")) %>% `colnames<-`(c("pgRNA_target", "GI_Mapping_GI_score", "rep", "gimap_GI_score", "target_type")) %>% drop_na(),
+  # join the ctrl_gene
+  full_join(
+    old_gi_results %>% filter(target_type == "ctrl_gene") %>% select(c("pgRNA_target", "paralog_pair", "mean_GI_score", "rep")),
+    gimap_dataset$gi_scores %>% separate(pgRNA_target_double, c("gene1", "gene2"), sep="_", remove=FALSE) %>% mutate(ctrl = "ctrl") %>% unite("pgRNA_target_summary", c("ctrl", "gene2"), sep="_") %>% select(c("pgRNA_target_summary", "pgRNA_target_double", "single_target_gi_score_2", "rep")),
+    by = c("pgRNA_target" = "pgRNA_target_summary", "rep"),
+    suffix = c("_old", "_new")
+    ) %>% mutate(target_type = "ctrl_gene") %>% distinct() %>% select(c("pgRNA_target", "mean_GI_score", "rep", "single_target_gi_score_2", "target_type")) %>% `colnames<-`(c("pgRNA_target", "GI_Mapping_GI_score", "rep", "gimap_GI_score", "target_type")) %>% drop_na(),
+  #join the gene_gene
+  full_join(
+    old_gi_results %>% filter(target_type == "gene_gene") %>% select(c("pgRNA_target", "mean_GI_score", "rep")),
+    gimap_dataset$gi_scores %>% select(c("pgRNA_target_double", "double_target_gi_score", "rep")),
+    by = c("pgRNA_target" = "pgRNA_target_double", "rep"),
+    suffix = c("_old", "_new")
+  ) %>% mutate(target_type = "gene_gene") %>% `colnames<-`(c("pgRNA_target", "GI_Mapping_GI_score", "rep", "gimap_GI_score", "target_type")) %>% drop_na()
+)
+```
+
+```{r}
+joined_df %>% ggplot(aes(x=gimap_GI_score, y=GI_Mapping_GI_score, color=target_type)) + geom_point() + facet_wrap(rep~target_type) + theme(legend.position = "none")
+```