Merge branch 'main' into cansavvy/docker-updates

FredHutch · Jul 17, 2024 · 2d25d6b · 2d25d6b
2 parents 06ef77b + 73c9acd
commit 2d25d6b
Show file tree

Hide file tree

Showing 22 changed files with 467 additions and 149 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,11 +1,12 @@
 # Generated by roxygen2: do not edit by hand
 
+export(calc_crispr)
 export(calc_gi)
-export(calc_lfc)
 export(example_data_folder)
 export(get_example_data)
 export(gimap_annotate)
 export(gimap_filter)
+export(gimap_normalize)
 export(run_qc)
 export(setup_data)
 import(dplyr)

diff --git a/R/00-setup_data.R b/R/00-setup_data.R
@@ -17,7 +17,6 @@
 #'
 #' # You can see what an example dataset looks like by pulling the example gimap_dataset:
 #' gimap_dataset <- get_example_data("gimap")
-#'
 #' }
 setup_data <- function(counts = NULL,
                        pg_ids = NULL,
@@ -43,7 +42,8 @@ setup_data <- function(counts = NULL,
       all_reps_zerocount_ids = NULL
     ),
     annotation = NULL,
-    log_fc = NULL,
+    normalized_log_fc = NULL,
+    crispr_score = NULL,
     results = NULL
   )
 

diff --git a/R/02-filter.R → R/02-gimap_filter.R b/R/02-filter.R → R/02-gimap_filter.R
@@ -29,12 +29,12 @@
 #'
 #' # To see filtered data
 #' gimap_dataset$filtered_data
-#' 
+#'
 #' # If you want to only use a single filter or some subset, specify which using the filter_type parameter
-#' gimap_dataset <- gimap_filter(gimap_dataset, filter_type = "zero_count_only") 
-#' #or 
+#' gimap_dataset <- gimap_filter(gimap_dataset, filter_type = "zero_count_only")
+#' #or
 #' gimap_dataset <- gimap_filter(gimap_dataset, filter_type = "low_plasmid_cpm_only")
-#' 
+#'
 #' # If you want to use multiple filters and more than one to flag a pgRNA construct before it's filtered out, use the `min_n_filters` argument
 #' gimap_dataset <- gimap_filter(gimap_dataset, filter_type = "both", min_n_filters = 2)
 #' 
@@ -56,14 +56,14 @@ gimap_filter <- function(.data = NULL,
   if (!is.null(.data)) gimap_dataset <- .data
 
   if (!("gimap_dataset" %in% class(gimap_dataset))) stop("This function only works with gimap_dataset objects which can be made with the setup_data() function.")
-  
+
   #check filter type input to make sure that it is a supportable input
   if (!(filter_type %in% c("both", "zero_count_only", "low_plasmid_cpm_only"))) stop("Specification for `filter_type` not understood; Need to use 'both', 'zero_count_only', or 'low_plasmid_cpm_only'")
-  
+
   zc_filter <- NULL
   p_filter <- NULL
   #*ADD any new filters here* assigning it a NULL value
-  
+
   #This section calls the appropriate filtering functions and assigns results to the filter variables assigned NULL earlier (they will stay NULL if their filter wasn't selected to be run according to the input to the function)
   if (filter_type == "both"){
     zc_filter <- qc_filter_zerocounts(gimap_dataset, filter_zerocount_target_col = filter_zerocount_target_col)$filter
@@ -73,13 +73,13 @@ gimap_filter <- function(.data = NULL,
   } else if(filter_type == "low_plasmid_cpm_only"){
     p_filter <- qc_filter_plasmid(gimap_dataset, cutoff = cutoff, filter_plasmid_target_col = filter_plasmid_target_col)$filter
   }
-  
-  
+
+
   possible_filters <- list(zc_filter, p_filter)
   #*ADD any new filters here* within the list of `possible_filters`
-  
+
   #this first cbinds each filter enumerated in possible_filters together (no matter how many there are, and ignores the NULLs) using the reduce function
-  #then it finds the row sum (how many are filters flagged each construct e.g., number of TRUE in each row), 
+  #then it finds the row sum (how many filters flagged each construct, e.g., the number of TRUE in each row),
   #and finally compares the row sum to the `min_n_filters` parameter to report TRUEs and FALSEs according to whether each construct is flagged by the minimum number of required filters
   #TRUE means it should be filtered, FALSE means it shouldn't be filtered
   one_filter_df <- reduce(possible_filters, cbind) %>% 
@@ -143,7 +143,7 @@ gimap_filter <- function(.data = NULL,
 #' @examples \dontrun{
 #'   gimap_dataset <- get_example_data("gimap")
 #'   qc_filter_zerocounts(gimap_dataset)
-#'   
+#'
 #'   #or to specify a different column (or set of columns to select)
 #'   qc_filter_zerocounts(gimap_dataset, filter_zerocount_target_col = 1:2)
 #' }
@@ -154,94 +154,94 @@ qc_filter_zerocounts <- function(gimap_dataset, filter_zerocount_target_col = NU
   if (is.null(filter_zerocount_target_col)) {filter_zerocount_target_col <- c(1:ncol(gimap_dataset$raw_counts))}
 
   if (!all(filter_zerocount_target_col %in% 1:ncol(gimap_dataset$raw_counts))) {
-    stop("The columns selected do not exist. `filter_zerocount_target_col` needs to correspond to the index of the columns in `gimap_dataset$raw_counts` that you need to filter by") 
+    stop("The columns selected do not exist. `filter_zerocount_target_col` needs to correspond to the index of the columns in `gimap_dataset$raw_counts` that you need to filter by")
    }
-  
+
   counts_filter <- data.frame(gimap_dataset$raw_counts[,filter_zerocount_target_col]) %>% map(~.x %in% c(0)) %>% reduce(`|`)
 
   zerocount_df <- data.frame("RawCount0" = c(FALSE, TRUE), n = c(sum(!counts_filter), sum(counts_filter))) %>%
     mutate(percent = round(((n/sum(n))*100),2))
 
-  return(list(filter = counts_filter, 
+  return(list(filter = counts_filter,
               reportdf = zerocount_df))
 
 }
 
 #' Create a filter for pgRNAs which have a low log2 CPM value for the plasmid/Day 0 sample/time point
-#' @description This function flags and reports which and how many pgRNAs have low log2 CPM values for the plasmid/Day 0 sample/time point. If more than one column is specified as the plasmid sample, 
+#' @description This function flags and reports which and how many pgRNAs have low log2 CPM values for the plasmid/Day 0 sample/time point. If more than one column is specified as the plasmid sample,
 #' we pool all the replicate samples to find the lower outlier and flag constructs for which any plasmid replicate has a log2 CPM value below the cutoff
 #' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the log2 CPM transformed data
 #' @param cutoff default is NULL, the cutoff for low log2 CPM values for the plasmid time period; if not specified, the lower outlier (defined as the lower quartile minus 1.5 * the interquartile range) is used
 #' @param filter_plasmid_target_col default is NULL, and if NULL, will select the first column only; this parameter specifically should be used to specify the plasmid column(s) that will be selected
 #' @importFrom magrittr %>%
 #' @importFrom dplyr mutate across if_any
-#' @importFrom tidyr pivot_wider pivot_longer 
+#' @importFrom tidyr pivot_wider pivot_longer
 #' @importFrom janitor clean_names
 #' @return a named list with the filter `filter` specifying which pgRNAs have low plasmid log2 CPM (column of interest is `plasmid_cpm_filter`) and a report df `reportdf` for the number and percent of pgRNA which have a low plasmid log2 CPM
 #' @examples \dontrun{
 #'   gimap_dataset <- get_example_data("gimap")
-#' 
+#'
 #'   qc_filter_plasmid(gimap_dataset)
-#'   
+#'
 #'   #or to specify a cutoff value to be used in the filter rather than the lower outlier default
 #'   qc_filter_plasmid(gimap_dataset, cutoff=2)
-#'   
+#'
 #'   #or to specify a different column (or set of columns to select)
 #'   qc_filter_plasmid(gimap_dataset, filter_plasmid_target_col = 1:2)
 #'
 #'   # or to specify a cutoff value that will be used in the filter rather than the lower outlier default as well as to specify a different column (or set of columns) to select
 #'   qc_filter_plasmid(gimap_dataset, cutoff=1.75, filter_plasmid_target_col=1:2)
-#' 
+#'
 #' }
 #'
 
 qc_filter_plasmid <- function(gimap_dataset, cutoff = NULL, filter_plasmid_target_col = NULL){
-  
+
   if (is.null(filter_plasmid_target_col)) {filter_plasmid_target_col <- c(1)}
-  
+
   if (!all(filter_plasmid_target_col %in% 1:ncol(gimap_dataset$transformed_data$log2_cpm))) {
-    stop("The columns selected do not exist. `filter_plasmid_target_col` needs to correspond to the index of the columns in `gimap_dataset$transformed_data$log2_cpm` that you need to filter by") 
+    stop("The columns selected do not exist. `filter_plasmid_target_col` needs to correspond to the index of the columns in `gimap_dataset$transformed_data$log2_cpm` that you need to filter by")
   }
-  
+
   plasmid_data <- data.frame(gimap_dataset$transformed_data$log2_cpm[, filter_plasmid_target_col]) %>% `colnames<-`(rep(c("plasmid_log2_cpm"), length(filter_plasmid_target_col))) %>% clean_names()
-  
+
   if (length(filter_plasmid_target_col) > 1){ #if more than one column was selected, collapse all of the columns into the same vector using pivot_longer to store in a df with the name of the rep and number for row/construct
     plasmid_data <- plasmid_data %>% mutate(construct = rownames(plasmid_data)) %>%
-      pivot_longer(starts_with("plasmid_log2_cpm"), 
-                   values_to = "plasmid_log2_cpm", 
+      pivot_longer(starts_with("plasmid_log2_cpm"),
+                   values_to = "plasmid_log2_cpm",
                    names_to = "rep")
   }
-  
+
   if (is.null(cutoff)) {
-    # if cutoff is null, use lower outlier 
+    # if cutoff is null, use lower outlier
     quantile_info <- quantile(plasmid_data$plasmid_log2_cpm)
-    
+
     cutoff <- quantile_info["25%"] - (1.5 * (quantile_info["75%"] - quantile_info["25%"])) #later step make a function for this in utils since it's used more than once
   }
-  
+
   if (length(filter_plasmid_target_col) > 1){ #if more than one column was selected, take collapsed/pooled data and compare it to the cutoff
                                             #then pivot_wider so that the constructs are in the same row and we can use if_any to report if any of the replicates were flagged by the cutoff
                                             #return just that summary column (reporting if any are TRUE) as the filter
-    plasmid_data <- plasmid_data %>% 
+    plasmid_data <- plasmid_data %>%
       mutate(filterFlag = plasmid_log2_cpm < cutoff) %>%
       pivot_wider(id_cols = construct, names_from = rep, values_from = filterFlag)
-    plasmid_cpm_filter <- plasmid_data %>% 
+    plasmid_cpm_filter <- plasmid_data %>%
       mutate(plasmid_cpm_filter=  if_any(.cols = starts_with('plasmid_log2_cpm'))) %>%
       select(plasmid_cpm_filter)
-    
+
   } else {
-  
+
     plasmid_cpm_filter <- as.data.frame(plasmid_data$plasmid_log2_cpm < cutoff) %>%`colnames<-`("plasmid_cpm_filter")
-  
+
   }
-    
-    
+
+
   plasmid_filter_df <- data.frame("Plasmid_log2cpmBelowCutoff" = c(FALSE, TRUE), n = c(sum(!plasmid_cpm_filter), sum(plasmid_cpm_filter))) %>%
     mutate(percent = round(((n / sum(n)) * 100), 2)) #later step make a function for this in utils since it's used more than once
-  
+
   return(list(
     filter = plasmid_cpm_filter,
     reportdf = plasmid_filter_df
   ))
 
-}
+}
diff --git a/R/03-annotate.R b/R/03-annotate.R
@@ -40,6 +40,8 @@ gimap_annotate <- function(.data = NULL,
     annotation_df <- get_example_data("annotation")
   }
 
+  message("Annotating Data")
+
   ############################ CONTROL GENE ANNOTATION #########################
   # If control genes aren't provided then we get some from DepMap
   if (!is.null(control_genes)) {
@@ -120,7 +122,18 @@ gimap_annotate <- function(.data = NULL,
       "positive_control",
       "single_targeting",
       "double_targeting"
-    )))
+    )),
+    unexpressed_ctrl_flag = dplyr::case_when(
+      norm_ctrl_flag == "double_targeting" & gene1_expressed_flag == FALSE & gene2_expressed_flag == FALSE ~ TRUE,
+      norm_ctrl_flag == "single_targeting" & (gene1_expressed_flag == FALSE | gene2_expressed_flag == FALSE) ~ TRUE,
+      TRUE ~ FALSE
+    ),
+    pgRNA_target = dplyr::case_when(
+      target_type == "gene_gene" ~ paste(gene1_symbol, gene2_symbol, sep = "_"),
+      target_type == "gene_ctrl" ~ paste(gene1_symbol, "ctrl", sep = "_"),
+      target_type == "ctrl_gene" ~ paste("ctrl", gene2_symbol, sep = "_"),
+      TRUE ~ target_type
+    ))
 
   if (gimap_dataset$filtered_data$filter_step_run){
     keep_for_annotdf <- annotation_df$pgRNA_id %in% unlist(gimap_dataset$filtered_data$metadata_pg_ids)

diff --git a/R/04-foldchange.R b/R/04-foldchange.R