Skip to content

Commit

Permalink
Merge pull request #65 from FredHutch/cansavvy/separate-depmap
Browse files Browse the repository at this point in the history
Make depmap annotation optional
  • Loading branch information
cansavvy authored Nov 18, 2024
2 parents 77aebef + 7cd2002 commit dfd8cb1
Show file tree
Hide file tree
Showing 7 changed files with 227 additions and 131 deletions.
111 changes: 58 additions & 53 deletions R/03-annotate.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
#' @description In this function, a `gimap_dataset` is annotated as far as which genes should be used as controls.
#' @param .data Data can be piped in with tidyverse pipes from function to function. But the data must still be a gimap_dataset
#' @param gimap_dataset A special dataset structure that is setup using the `setup_data()` function.
#' @param cell_line which cell line are you using? (e.g., HELA, PC9, etc.). Required argument
#' @param cn_annotate TRUE or FALSE you'd also like to have Copy number annotation from DepMap. These data are optional
#' @param annotation_file If no file is given, will attempt to use the design file from https://media.addgene.org/cms/filer_public/a9/9a/a99a9328-324b-42ff-8ccc-30c544b899e4/pgrna_library.xlsx
#' @param control_genes A vector of gene symbols (e.g. AAMP) that should be labeled as control genes. These will be used for log fold change calculations. If no list is given then DepMap Public 23Q4 Achilles_common_essentials.csv is used https://depmap.org/portal/download/all/
#' @param depmap_annotate TRUE or FALSE you'd also like to have DepMap annotation. These data are optional
#' @param cell_line which cell line are you using? (e.g., HELA, PC9, etc.). Required argument if demap_annotate is TRUE.
#' @importFrom stringr word
#' @export
#' @examples \dontrun{
Expand All @@ -24,14 +24,16 @@
#' }
gimap_annotate <- function(.data = NULL,
gimap_dataset,
cell_line,
annotation_file = NULL,
control_genes = NULL,
cn_annotate = TRUE,
annotation_file = NULL) {
depmap_annotate = TRUE,
cell_line = NULL) {
if (!is.null(.data)) gimap_dataset <- .data

if (!("gimap_dataset" %in% class(gimap_dataset))) stop("This function only works with gimap_dataset objects which can be made with the setup_data() function.")

if (depmap_annotate & is.null(cell_line)) stop("If you want DepMap annotation you must specify which cell line you are using")

# Get the annotation data based on the pg construct design
if (!is.null(annotation_file)) {
if (!file.exists(annotation_file)) stop("The annotation_file specified cannot be found. Please double check the file path")
Expand All @@ -54,33 +56,33 @@ gimap_annotate <- function(.data = NULL,
}

############################ Get TPM data ####################################
# This is not optional because its used to flag things
## get TPM and CN information (w/ option for user to upload their own info)
depmap_metadata <- readr::read_csv("https://figshare.com/ndownloader/files/35020903", show_col_types = FALSE)

my_depmap_id <- depmap_metadata %>%
dplyr::filter(stripped_cell_line_name == toupper(cell_line)) %>%
dplyr::pull(DepMap_ID)
if (depmap_annotate) {
# This is used to flag things
## get TPM and CN information (w/ option for user to upload their own info)
depmap_metadata <- readr::read_csv("https://figshare.com/ndownloader/files/35020903", show_col_types = FALSE)

if (length(my_depmap_id) == 0) stop("The cell line specified, ", cell_line, "was not found in the DepMap data. Run supported_cell_lines() to see the full list")
my_depmap_id <- depmap_metadata %>%
dplyr::filter(stripped_cell_line_name == toupper(cell_line)) %>%
dplyr::pull(DepMap_ID)

tpm_file <- file.path(system.file("extdata", package = "gimap"), "CCLE_expression.csv")
if (length(my_depmap_id) == 0) stop("The cell line specified, ", cell_line, "was not found in the DepMap data. Run supported_cell_lines() to see the full list")

if (!file.exists(tpm_file)) tpm_setup()
tpm_file <- file.path(system.file("extdata", package = "gimap"), "CCLE_expression.csv")

depmap_tpm <- readr::read_csv(tpm_file,
show_col_types = FALSE,
col_select = c("genes", dplyr::all_of(my_depmap_id))
) %>%
dplyr::rename(log2_tpm = my_depmap_id) %>%
dplyr::mutate(expressed_flag = dplyr::case_when(
log2_tpm < 1 ~ FALSE,
log2_tpm >= 1 ~ TRUE,
is.na(log2_tpm) ~ NA
))
if (!file.exists(tpm_file)) tpm_setup()

############################ COPY NUMBER ANNOTATION ##########################
if (cn_annotate) {
depmap_tpm <- readr::read_csv(tpm_file,
show_col_types = FALSE,
col_select = c("genes", dplyr::all_of(my_depmap_id))
) %>%
dplyr::rename(log2_tpm = my_depmap_id) %>%
dplyr::mutate(expressed_flag = dplyr::case_when(
log2_tpm < 1 ~ FALSE,
log2_tpm >= 1 ~ TRUE,
is.na(log2_tpm) ~ NA
))

############################ COPY NUMBER ANNOTATION ##########################
cn_file <- file.path(system.file("extdata", package = "gimap"), "CCLE_gene_cn.csv")
if (!file.exists(cn_file)) cn_setup()

Expand All @@ -102,32 +104,7 @@ gimap_annotate <- function(.data = NULL,
annotation_df <- annotation_df %>%
dplyr::mutate(
gene1_essential_flag = gene1_symbol %in% control_genes,
gene2_essential_flag = gene2_symbol %in% control_genes
) %>%
dplyr::left_join(depmap_tpm, by = c("gene1_symbol" = "genes")) %>%
dplyr::rename(gene1_expressed_flag = expressed_flag) %>%
dplyr::left_join(depmap_tpm, by = c("gene2_symbol" = "genes"), suffix = c("_gene1", "_gene2")) %>%
dplyr::rename(gene2_expressed_flag = expressed_flag) %>%
dplyr::mutate(norm_ctrl_flag = dplyr::case_when(
target_type == "gene_gene" ~ "double_targeting",
target_type == "gene_ctrl" & gene1_essential_flag == TRUE ~ "positive_control",
target_type == "ctrl_gene" & gene2_essential_flag == TRUE ~ "positive_control",
target_type == "gene_ctrl" & gene1_essential_flag != TRUE ~ "single_targeting",
target_type == "ctrl_gene" & gene2_essential_flag != TRUE ~ "single_targeting",
target_type == "ctrl_ctrl" ~ "negative_control"
)) %>%
dplyr::mutate(
norm_ctrl_flag = factor(norm_ctrl_flag, levels = c(
"negative_control",
"positive_control",
"single_targeting",
"double_targeting"
)),
unexpressed_ctrl_flag = dplyr::case_when(
norm_ctrl_flag == "double_targeting" & gene1_expressed_flag == FALSE & gene2_expressed_flag == FALSE ~ TRUE,
norm_ctrl_flag == "single_targeting" & (gene1_expressed_flag == FALSE | gene2_expressed_flag == FALSE) ~ TRUE,
TRUE ~ FALSE
),
gene2_essential_flag = gene2_symbol %in% control_genes,
pgRNA_target = dplyr::case_when(
target_type == "gene_gene" ~ paste(gene1_symbol, gene2_symbol, sep = "_"),
target_type == "gene_ctrl" ~ paste(gene1_symbol, "ctrl", sep = "_"),
Expand All @@ -136,6 +113,34 @@ gimap_annotate <- function(.data = NULL,
)
)

if (depmap_annotate) {
annotation_df <- annotation_df %>%
dplyr::left_join(depmap_tpm, by = c("gene1_symbol" = "genes")) %>%
dplyr::rename(gene1_expressed_flag = expressed_flag) %>%
dplyr::left_join(depmap_tpm, by = c("gene2_symbol" = "genes"), suffix = c("_gene1", "_gene2")) %>%
dplyr::rename(gene2_expressed_flag = expressed_flag) %>%
dplyr::mutate(norm_ctrl_flag = dplyr::case_when(
target_type == "gene_gene" ~ "double_targeting",
target_type == "gene_ctrl" & gene1_essential_flag == TRUE ~ "positive_control",
target_type == "ctrl_gene" & gene2_essential_flag == TRUE ~ "positive_control",
target_type == "gene_ctrl" & gene1_essential_flag != TRUE ~ "single_targeting",
target_type == "ctrl_gene" & gene2_essential_flag != TRUE ~ "single_targeting",
target_type == "ctrl_ctrl" ~ "negative_control"
)) %>%
dplyr::mutate(
norm_ctrl_flag = factor(norm_ctrl_flag, levels = c(
"negative_control",
"positive_control",
"single_targeting",
"double_targeting"
)),
unexpressed_ctrl_flag = dplyr::case_when(
norm_ctrl_flag == "double_targeting" & gene1_expressed_flag == FALSE & gene2_expressed_flag == FALSE ~ TRUE,
norm_ctrl_flag == "single_targeting" & (gene1_expressed_flag == FALSE | gene2_expressed_flag == FALSE) ~ TRUE,
TRUE ~ FALSE
)
)
}
################################ STORE IT ####################################

if (gimap_dataset$filtered_data$filter_step_run) {
Expand Down
1 change: 0 additions & 1 deletion R/04-normalize.R
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,6 @@ gimap_normalize <- function(.data = NULL,
message("The input data for the IDs which were not found in the annotation data will be kept throughout the analysis, but any data from the annotation won't be available for them.")
}
} else {

missing_ids_file <- file.path(missing_ids_file)
readr::write_csv(missing_ids, missing_ids_file)
}
Expand Down
30 changes: 15 additions & 15 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -69,21 +69,21 @@ get_example_data <- function(which_data) {
)
return(readr::read_tsv(file, show_col_types = FALSE))
} else if (which_data == "final_treatment") {
file <- list.files(
pattern = "gimap_dataset_final_treatment.RDS",
recursive = TRUE,
system.file("extdata", package = "gimap"),
full.names = TRUE
)
return(readr::read_rds(file))
} else if (which_data == "final_treatment") {
file <- list.files(
pattern = "gimap_dataset_final_treatment.RDS",
recursive = TRUE,
system.file("extdata", package = "gimap"),
full.names = TRUE
)
return(readr::read_rds(file))
file <- list.files(
pattern = "gimap_dataset_final_treatment.RDS",
recursive = TRUE,
system.file("extdata", package = "gimap"),
full.names = TRUE
)
return(readr::read_rds(file))
} else if (which_data == "final_treatment") {
file <- list.files(
pattern = "gimap_dataset_final_treatment.RDS",
recursive = TRUE,
system.file("extdata", package = "gimap"),
full.names = TRUE
)
return(readr::read_rds(file))
} else {
stop("Specification for `which_data` not understood; Need to use 'gimap', 'count', 'meta', or 'annotation' ")
}
Expand Down
7 changes: 3 additions & 4 deletions inst/rmd/scratch_gimap_CRISPR_score_review.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,9 @@ devtools::load_all()
```{r}
gimap_dataset <- get_example_data("gimap") %>%
gimap_filter() %>%
gimap_annotate() %>%
gimap_annotate(cell_line = "HELA") %>%
gimap_normalize(
timepoints = "day",
replicates = "rep") %>%
timepoints = "day") %>%
calc_crispr()
```

Expand All @@ -35,7 +34,7 @@ old_crispr_results <- read_tsv("~/Downloads/results/calculate_LFC/tables/tsv/HeL

```{r}
old_crispr_results %<>% mutate(
rep = recode(rep, "A" = "late_RepA", "B" = "late_RepB", "C" = "late_RepC")
rep = recode(rep, "A" = "Day22_RepA_late", "B" = "Day22_RepB_late", "C" = "Day22_RepC_late")
)
```

Expand Down
45 changes: 22 additions & 23 deletions inst/rmd/scratch_gimap_CRISPR_score_review.html

Large diffs are not rendered by default.

61 changes: 54 additions & 7 deletions inst/rmd/scratch_gimap_GI_review.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ date: "`r Sys.Date()`"
```{r warning=FALSE, message=FALSE}
library(tidyverse)
library(devtools)
library(magrittr)
devtools::load_all()
```
Expand All @@ -16,10 +17,9 @@ devtools::load_all()
```{r}
gimap_dataset <- get_example_data("gimap") %>%
gimap_filter() %>%
gimap_annotate() %>%
gimap_annotate(cell_line = "HELA") %>%
gimap_normalize(
timepoints = "day",
replicates = "rep") %>%
timepoints = "day") %>%
calc_crispr() %>%
calc_gi()
```
Expand All @@ -34,6 +34,14 @@ saveRDS(gimap_dataset, "gimap_dataset_calculated.RDS")
old_gi_results <- readRDS("d.HeLa_GI_scores_target")
```

#### sync the reps values

```{r}
old_gi_results %<>% mutate(
rep = recode(rep, "A" = "Day22_RepA_late", "B" = "Day22_RepB_late", "C" = "Day22_RepC_late")
)
```

## Compare results

### Columns
Expand All @@ -48,7 +56,7 @@ colnames(old_gi_results)

Looks like we want to compare the pgRNA targets which should be the `pgRNA_target_double` column for the gimap results and the `pgRNA_target` column for the GI Mapping results

Columns in the GI Mapping results not in the gimap results seem to include model results (`intercept`, `slope`) as well as some significance values (`p_val`, `fdr`), observed and expected CRISPR score (`mean_observed_CS` and `mean_expected_CS` (which I think I need for plotting)). And how does `mean_GI_score` relate to the three GI score columns in the gimap results?
Columns in the GI Mapping results not in the gimap results seem to include model results (`intercept`, `slope`) as well as some significance values (`p_val`, `fdr`), observed and expected CRISPR score (`mean_observed_CS` and `mean_expected_CS` (which I think I need for plotting)). And how does `mean_GI_score` relate to the three GI score columns in the gimap results? The three GI score columns relate to the different target types.

### Target overlap

Expand All @@ -61,7 +69,7 @@ length(unique(gimap_dataset$gi_scores$pgRNA_target_double))
length(intersect(unique(old_gi_results$pgRNA_target), unique(gimap_dataset$gi_scores$pgRNA_target_double)))
```

Only 1/3 of the GI Mapping targets are represented in the gimap results
Only 1/3 of the GI Mapping targets are represented in the gimap results...

### Number of observations/rows total and rep column overlap

Expand All @@ -70,18 +78,57 @@ nrow(old_gi_results)
nrow(gimap_dataset$gi_scores)
```

The gimap results has far more rows
The gimap results has far more rows...

```{r}
table(old_gi_results$rep)
table(gimap_dataset$gi_scores$rep)
```

While the replicate values themselves can be synced like we've done in other reviews, this further shows the far greater number of observations (even if there are fewer represented pgRNA targets) in the gimap results
While the replicate values themselves can't be completely synced like we've done in other reviews due to the gimap results including the Day05_RepA_early datapoints, this further shows the far greater number of observations (even if there are fewer represented pgRNA targets) in the gimap results....

```{r}
head(old_gi_results)
```

```{r}
joindf <- dplyr::full_join(old_gi_results, gimap_dataset$gi_scores,
by = c("pgRNA_target" = "pgRNA_target_double", "rep"),
suffix = c("_old", "_new"))
```

## Split out the comparisons we want to make

Have to drop NA's because the gimap results include Day05_RepA_early datapoints while the GI Mapping doesn't. Also have to drop NAs because there are GI Mapping targets that aren't represented in the gimap targets?

```{r}
joined_df <- rbind(
#join the gene_ctrl
full_join(
old_gi_results %>% filter(target_type == "gene_ctrl") %>% select(c("pgRNA_target", "paralog_pair", "mean_GI_score", "rep")),
gimap_dataset$gi_scores %>% separate(pgRNA_target_double, c("gene1", "gene2"), sep="_", remove = FALSE) %>% mutate(ctrl = "ctrl") %>% unite("pgRNA_target_summary", c("gene1", "ctrl"), sep="_") %>% select(c("pgRNA_target_summary", "pgRNA_target_double", "single_target_gi_score_1", "rep")),
by = c("pgRNA_target" = "pgRNA_target_summary", "paralog_pair"="pgRNA_target_double", "rep"),
suffix = c("_old", "_new")
) %>% mutate(target_type = "gene_ctrl") %>% distinct() %>% select(c("pgRNA_target", "mean_GI_score", "rep", "single_target_gi_score_1", "target_type")) %>% `colnames<-`(c("pgRNA_target", "GI_Mapping_GI_score", "rep", "gimap_GI_score", "target_type")) %>% drop_na(),
# join the ctrl_gene
full_join(
old_gi_results %>% filter(target_type == "ctrl_gene") %>% select(c("pgRNA_target", "paralog_pair", "mean_GI_score", "rep")),
gimap_dataset$gi_scores %>% separate(pgRNA_target_double, c("gene1", "gene2"), sep="_", remove=FALSE) %>% mutate(ctrl = "ctrl") %>% unite("pgRNA_target_summary", c("ctrl", "gene2"), sep="_") %>% select(c("pgRNA_target_summary", "pgRNA_target_double", "single_target_gi_score_2", "rep")),
by = c("pgRNA_target" = "pgRNA_target_summary", "rep"),
suffix = c("_old", "_new")
) %>% mutate(target_type = "ctrl_gene") %>% distinct() %>% select(c("pgRNA_target", "mean_GI_score", "rep", "single_target_gi_score_2", "target_type")) %>% `colnames<-`(c("pgRNA_target", "GI_Mapping_GI_score", "rep", "gimap_GI_score", "target_type")) %>% drop_na(),
#join the gene_gene
full_join(
old_gi_results %>% filter(target_type == "gene_gene") %>% select(c("pgRNA_target", "mean_GI_score", "rep")),
gimap_dataset$gi_scores %>% select(c("pgRNA_target_double", "double_target_gi_score", "rep")),
by = c("pgRNA_target" = "pgRNA_target_double", "rep"),
suffix = c("_old", "_new")
) %>% mutate(target_type = "gene_gene") %>% `colnames<-`(c("pgRNA_target", "GI_Mapping_GI_score", "rep", "gimap_GI_score", "target_type")) %>% drop_na()
)
```

```{r}
joined_df %>% ggplot(aes(x=gimap_GI_score, y=GI_Mapping_GI_score, color=target_type)) + geom_point() + facet_wrap(rep~target_type) + theme(legend.position = "none")
```
Loading

0 comments on commit dfd8cb1

Please sign in to comment.