-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Filtering step runs and stores result #40
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,7 +36,12 @@ setup_data <- function(counts = NULL, | |
pg_metadata = NULL, | ||
sample_metadata = NULL | ||
), | ||
filtered_data = NULL, | ||
filtered_data = list( | ||
filter_step_run = FALSE, #adding a way to know if the filter step was run since it's optional | ||
metadata_pg_ids = NULL, | ||
pg_metadata = NULL, | ||
transformed_log2_cpm = NULL | ||
), | ||
Comment on lines
+39
to
+44
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is part of description (a) |
||
annotation = NULL, | ||
log_fc = NULL, | ||
results = NULL | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -64,8 +64,8 @@ run_qc <- function(gimap_dataset, | |
dataset = gimap_dataset, | ||
plots_dir = plots_dir, | ||
filter_zerocount_target_col = filter_zerocount_target_col, | ||
filter_plasmid_target_col = filter_zerocount_target_col, | ||
filter_replicates_target_col = filter_zerocount_target_col | ||
filter_plasmid_target_col = filter_plasmid_target_col, | ||
filter_replicates_target_col = filter_replicates_target_col | ||
Comment on lines
+67
to
+68
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This corresponds to description (b) |
||
), | ||
... | ||
) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,7 +29,11 @@ | |
#' #or | ||
#' gimap_dataset <- gimap_filter(gimap_dataset, filter_type = "low_plasmid_cpm_only") | ||
#' | ||
#' # | ||
#' # If you want to use multiple filters and require more than one of them to flag a pgRNA construct before it's filtered out, use the `min_n_filters` argument | ||
#' gimap_dataset <- gimap_filter(gimap_dataset, filter_type = "both", min_n_filters = 2) | ||
#' | ||
#' # You can also specify which columns the filters will be applied to | ||
#' gimap_dataset <- gimap_filter(gimap_dataset, filter_type = "zero_count_only", filter_zerocount_target_col = c(1,2)) | ||
#' | ||
#' } | ||
#' | ||
|
@@ -56,11 +60,11 @@ gimap_filter <- function(.data = NULL, | |
#This section calls the appropriate filtering functions and assigns results to the filter variables assigned NULL earlier (they will stay NULL if their filter wasn't selected to be run according to the input to the function) | ||
if (filter_type == "both"){ | ||
zc_filter <- qc_filter_zerocounts(gimap_dataset, filter_zerocount_target_col = filter_zerocount_target_col)$filter | ||
p_filter <- qc_filter_plasmid(gimap_dataset, cutoff = cutoff, filter_plasmid_target_col = filter_plasmid_target_col)$plasmid_filter | ||
p_filter <- qc_filter_plasmid(gimap_dataset, cutoff = cutoff, filter_plasmid_target_col = filter_plasmid_target_col)$filter | ||
} else if (filter_type == "zero_count_only"){ | ||
zc_filter <- qc_filter_zerocounts(gimap_dataset, filter_zerocount_target_col = filter_zerocount_target_col)$filter | ||
} else if(filter_type == "low_plasmid_cpm_only"){ | ||
p_filter <- qc_filter_plasmid(gimap_dataset, cutoff = cutoff, filter_plasmid_target_col = filter_plasmid_target_col)$plasmid_filter | ||
p_filter <- qc_filter_plasmid(gimap_dataset, cutoff = cutoff, filter_plasmid_target_col = filter_plasmid_target_col)$filter | ||
Comment on lines
-59
to
+67
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is part of description (c) |
||
} | ||
|
||
|
||
|
@@ -71,10 +75,13 @@ gimap_filter <- function(.data = NULL, | |
#then it finds the row sum (how many filters flagged each construct, e.g., the number of TRUE in each row), | ||
#and finally compares the row sum to the `min_n_filters` parameter to report TRUEs and FALSEs according to whether each construct is flagged by the minimum number of required filters | ||
#TRUE means it should be filtered, FALSE means it shouldn't be filtered | ||
combined_filter <- rowSums(reduce(possible_filters, cbind)) >= min_n_filters | ||
|
||
combined_filter <- rowSums(reduce(possible_filters, cbind)) >= min_n_filters | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This corresponds to description (e) |
||
#within `combined_filter` TRUE means that the filtering steps flagged the pgRNA construct for removal, therefore, we'll want to use the opposite FALSE values for the filtered data, keeping those that weren't flagged by filtering steps | ||
|
||
gimap_dataset$filtered <- NULL #TODO: Filtered version of the data can be stored here | ||
gimap_dataset$filtered_data$filter_step_run <- TRUE #adding a way to know if the filter step was run since it's optional | ||
gimap_dataset$filtered_data$metadata_pg_ids <- gimap_dataset$metadata$pg_ids[!combined_filter,] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Reminder to myself: I'm going to have to go change the next steps of this code (the normalization and calc_crispr steps) to use the filtered data instead. |
||
gimap_dataset$filtered_data$pg_metadata <- gimap_dataset$metadata$pg_metadata[!combined_filter,] | ||
gimap_dataset$filtered_data$transformed_log2_cpm <- gimap_dataset$transformed_data$log2_cpm[!combined_filter,] | ||
Comment on lines
-77
to
+84
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This corresponds to description (f) |
||
|
||
return(gimap_dataset) | ||
} | ||
|
@@ -111,7 +118,8 @@ qc_filter_zerocounts <- function(gimap_dataset, filter_zerocount_target_col = NU | |
zerocount_df <- data.frame("RawCount0" = c(FALSE, TRUE), n = c(sum(!counts_filter), sum(counts_filter))) %>% | ||
mutate(percent = round(((n/sum(n))*100),2)) | ||
|
||
return(list(filter = counts_filter, reportdf = zerocount_df)) | ||
return(list(filter = counts_filter, | ||
reportdf = zerocount_df)) | ||
|
||
} | ||
|
||
|
@@ -125,7 +133,7 @@ qc_filter_zerocounts <- function(gimap_dataset, filter_zerocount_target_col = NU | |
#' @importFrom dplyr mutate across if_any | ||
#' @importFrom tidyr pivot_wider pivot_longer | ||
#' @importFrom janitor clean_names | ||
#' @return a named list with the filter `plasmid_filter` specifying which pgRNAs have low plasmid log2 CPM (column of interest is `plasmid_cpm_filter`) and a report df `plasmid_filter_report` for the number and percent of pgRNA which have a low plasmid log2 CPM | ||
#' @return a named list with the filter `filter` specifying which pgRNAs have low plasmid log2 CPM (column of interest is `plasmid_cpm_filter`) and a report df `reportdf` for the number and percent of pgRNA which have a low plasmid log2 CPM | ||
Comment on lines
-128
to
+136
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is part of description (c) |
||
#' @examples \dontrun{ | ||
#' gimap_dataset <- get_example_data("gimap") | ||
#' | ||
|
@@ -188,8 +196,8 @@ qc_filter_plasmid <- function(gimap_dataset, cutoff = NULL, filter_plasmid_targe | |
mutate(percent = round(((n / sum(n)) * 100), 2)) #later step make a function for this in utils since it's used more than once | ||
|
||
return(list( | ||
plasmid_filter = plasmid_cpm_filter, | ||
plasmid_filter_report = plasmid_filter_df | ||
filter = plasmid_cpm_filter, | ||
reportdf = plasmid_filter_df | ||
Comment on lines
-191
to
+200
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is part of description (c) |
||
)) | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,6 +33,7 @@ knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE) | |
knitr::opts_knit$set(progress = TRUE, verbose = FALSE) | ||
library(gimap) | ||
library(ggplot2) | ||
library(purrr) | ||
``` | ||
|
||
# Unfiltered/raw data | ||
|
@@ -81,17 +82,35 @@ qc_constructs_countzero_bar(gimap_dataset, filter_zerocount_target_col= filter_z | |
If this filter is applied, this is the number of pgRNAs that would be filtered out | ||
|
||
```{r} | ||
qc_filter_zerocounts(gimap_dataset, filter_zerocount_target_col = filter_zerocount_target_col)$reportdf | ||
potentialFilter1 <- qc_filter_zerocounts(gimap_dataset, filter_zerocount_target_col = filter_zerocount_target_col) | ||
|
||
potentialFilter1$reportdf | ||
Comment on lines
-84
to
+87
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is part of description (g) |
||
``` | ||
|
||
### Filter pgRNAs where there is a low log2 CPM value for the plasmid sample/time point | ||
|
||
If this filter is applied, this is the number of pgRNAs that would be filtered out | ||
|
||
```{r} | ||
qc_filter_plasmid(gimap_dataset, filter_plasmid_target_col = filter_plasmid_target_col)$plasmid_filter_report | ||
potentialFilter2 <- qc_filter_plasmid(gimap_dataset, filter_plasmid_target_col = filter_plasmid_target_col) | ||
|
||
potentialFilter2$reportdf | ||
``` | ||
|
||
### If both filters are applied | ||
|
||
```{r} | ||
combined_filters <- reduce(list(potentialFilter1$filter, potentialFilter2$filter), cbind) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is part of description (g) "but then combined the potential filters themselves later on" |
||
``` | ||
|
||
| Which Filter(s) | Number of pgRNAs flagged for removal | Percent of total pgRNA constructs | | ||
|:---------------|:------------------------------------|:----------| | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Love this table. |
||
| Zero count, but not low plasmid CPM | `r sum(combined_filters[,1] == TRUE & combined_filters[,2] == FALSE)` | `r round(sum(combined_filters[,1] == TRUE & combined_filters[,2] == FALSE) / nrow(combined_filters) * 100, 2)`| | ||
| Low plasmid CPM, but not zero count | `r sum(combined_filters[,2] == TRUE & combined_filters[,1] == FALSE)` | `r round(sum(combined_filters[,2] == TRUE & combined_filters[,1] == FALSE) / nrow(combined_filters) * 100, 2)` | | ||
| Either Zero count or Low plasmid CPM or both | `r sum(rowSums(combined_filters) >= 1)`| `r round(sum(rowSums(combined_filters) >= 1) / nrow(combined_filters) * 100, 2)` | | ||
| Both Zero count and Low plasmid CPM | `r sum(rowSums(combined_filters) == 2)` | `r round(sum(rowSums(combined_filters) == 2) / nrow(combined_filters) * 100, 2)` | | ||
| Remaining pgRNAs flagged by no filters | `r sum(rowSums(combined_filters) == 0)` | `r round(sum(rowSums(combined_filters) == 0) / nrow(combined_filters) * 100, 2)` | | ||
Comment on lines
+106
to
+112
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This corresponds to description (h) |
||
|
||
# Session Info | ||
|
||
```{r} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -158,6 +158,14 @@ run_qc(gimap_dataset, | |
quiet = TRUE) | ||
``` | ||
|
||
```{r} | ||
gimap_dataset <- gimap_dataset %>% | ||
gimap_filter() | ||
|
||
nrow(gimap_dataset$filtered_data$transformed_log2_cpm) | ||
|
||
``` | ||
|
||
Comment on lines
+161
to
+168
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This corresponds to description (i) |
||
```{r} | ||
sessionInfo() | ||
``` |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This was still NULL after I ran through setup, qc, and filtering steps. Unnecessary because the info that would normally be associated with it is handled in the annotate function?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah we could probably remove it. I don't think it ends up being used???