diff --git a/03-composition_analysis/bter_composition_glm.Rmd b/03-composition_analysis/bter_composition_glm.Rmd index 58a5477..5e6259c 100644 --- a/03-composition_analysis/bter_composition_glm.Rmd +++ b/03-composition_analysis/bter_composition_glm.Rmd @@ -34,67 +34,69 @@ load_pkgs <- function(pkg) { } cran_pkgs <- c( - "BiocManager", "tidyverse", "RColorBrewer", "styler", "here" + "BiocManager", "tidyverse", "GGally", "ggtext", "ggh4x", "ggforce", + "RColorBrewer", "styler", "patchwork", "here" ) + load_pkgs(cran_pkgs) bioconductor_pkgs <- c( - "biomaRt", "rhdf5", "tximport", "DESeq2" + "biomaRt", "rhdf5", "tximport", "DESeq2", "vsn", "hexbin" ) + load_pkgs(bioconductor_pkgs) ``` -# Set a custom ggplot theme +# Set a custom `ggplot` theme ```{r} # Set a custom ggplot theme # devtools::install_github("cttobin/ggthemr") library(ggthemr) + +# Diverging custom_palette <- c( - "#9E9E9E", "#59BDEF", "#EBCC2A", "#E1AF00", "#F27C8D", "#7B73D1", "#7B9FE0", - "#F5CC7F", "#66C2A5", "#28A7B6", "#F2CB57", "#F2A057", "#F2845C" + "#9E9E9E", "#88CCEE", "#18913D", "#E9C83A", "#882255", "#CC6677", + "#332288", "#7B73D1", "#2857BA", "#28A7B6", "#F2CB57", "#F2845C" ) + # ggthemr::colour_plot(custom_palette) custom_theme <- ggthemr::define_palette( swatch = custom_palette, - gradient = c(lower = "#59BDEF", upper = "#FF6B5A") + gradient = c(lower = "#E9C83A", upper = "#882255") ) + ggthemr::ggthemr(custom_theme) ``` -# Create directory for results ```{r} # here::here() # Create directory for results -dir.create(here::here("results", "2020-07-28-gene_expression"), +dir.create(here::here("results", "2022-07-20-gene_expression", "supplementary"), recursive = TRUE ) ``` # Associate transcripts to genes ```{r} -# Remove all biomaRt cached files -# biomaRt::biomartCacheInfo() -# biomaRt::biomartCacheClear() - -# Retrieve Ensembl gene identifiers/names and create transcript-to-gene table -biomaRt::listMarts(host = "metazoa.ensembl.org") - -bter_mart <- biomaRt::useMart( +mart_bter <- biomaRt::useMart( biomart = "metazoa_mart", dataset = "bterrestris_eg_gene", - host = "metazoa.ensembl.org" # version 48 + host = "sep2019-metazoa.ensembl.org" ) -bter_tx2gene <- biomaRt::getBM( +tx2gene_bter <- biomaRt::getBM( attributes = c("ensembl_transcript_id", "ensembl_gene_id"), - mart = bter_mart + mart = mart_bter ) + +# tx2gene_bter <- read.csv(file = "bter1.0tx2gene.csv", +# header = TRUE) ``` # Create vector of nAChR gene identifiers ```{r} -nachrs <- c( +nachrs_bter <- c( "LOC100643274", "LOC100643282", "LOC100645032", "LOC100646787", "LOC100647301", "LOC100647350", "LOC100647624", "LOC100648987", "LOC100649515", "LOC100649612", "LOC100649796" @@ -105,83 +107,83 @@ nachrs <- c( ## Read table of samples and covariates and `kallisto` results ```{r} # Set sample identifiers using the names of the kallisto output directories -castes_ids <- dir(here::here( - "2019-colgan_queen_worker_head", "2020-07-14-kallisto", "results" +ids_castes_bter <- dir(here::here( + "results", "2019-colgan_queen_worker_head", "2020-07-14-kallisto", "results" )) # Read metadata table -castes_table <- read.csv( +table_castes_bter <- read.csv( here::here( - "bter_metadata", + "metadata_bter", "2019-colgan_queen_worker_head_metadata.csv" ), header = TRUE, stringsAsFactors = TRUE ) -castes_table$caste <- relevel( - castes_table$caste, +table_castes_bter$caste <- relevel( + table_castes_bter$caste, ref = "worker" ) -castes_files <- here::here( - "2019-colgan_queen_worker_head", "2020-07-14-kallisto", "results", - castes_table$sample, +files_castes_bter <- here::here( + "results", "2019-colgan_queen_worker_head", "2020-07-14-kallisto", "results", + table_castes_bter$sample, "abundance.h5" ) -names(castes_files) <- castes_table$sample +names(files_castes_bter) <- table_castes_bter$sample # Check that sample names match -all(castes_ids %in% castes_table$sample) +all(ids_castes_bter %in% table_castes_bter$sample) # Check that all files exist -all(file.exists(castes_files)) +all(file.exists(files_castes_bter)) ``` ## Create a `DESeqDataSet` object ```{r} # Import kallisto quantifications -castes_txi <- tximport::tximport(castes_files, +txi_castes_bter <- tximport::tximport(files_castes_bter, type = "kallisto", - tx2gene = bter_tx2gene, + tx2gene = tx2gene_bter, txOut = FALSE ) # Create a DESeqDataSet object # The design below includes the batch factors "block" and "room" -castes_dds <- DESeq2::DESeqDataSetFromTximport(castes_txi, - colData = castes_table, +dds_castes_bter <- DESeq2::DESeqDataSetFromTximport(txi_castes_bter, + colData = table_castes_bter, design = ~ block + room + caste ) # Choose the reference level for caste factor -castes_dds$caste <- relevel(castes_dds$caste, ref = "worker") +dds_castes_bter$caste <- relevel(dds_castes_bter$caste, ref = "worker") ``` ## Pre-filter the low-count genes ```{r} # Estimate the size factors -castes_dds <- DESeq2::estimateSizeFactors(castes_dds) +dds_castes_bter <- DESeq2::estimateSizeFactors(dds_castes_bter) # Pre-filter the dataset -dim(castes_dds) +dim(dds_castes_bter) # 10661 # At least 4 samples with a count of 10 or higher # The number of samples would be set to the smallest group size -keep <- rowSums(DESeq2::counts(castes_dds) >= 10) >= 4 -castes_dds <- castes_dds[keep, ] -dim(castes_dds) +keep <- rowSums(DESeq2::counts(dds_castes_bter) >= 10) >= 4 +dds_castes_bter <- dds_castes_bter[keep, ] +dim(dds_castes_bter) # 9139 ``` ## Use normalised counts to calculate the subunit proportions per sample ```{r} # Normalise counts -castes_counts <- DESeq2::counts(castes_dds, normalized = TRUE) +counts_castes_bter <- DESeq2::counts(dds_castes_bter, normalized = TRUE) # Examine counts -# castes_counts %>% +# counts_castes_bter %>% # as.data.frame() %>% # tibble::rownames_to_column(var = "ens_gene") %>% # dplyr::filter(ens_gene %in% nachrs) @@ -189,11 +191,11 @@ castes_counts <- DESeq2::counts(castes_dds, normalized = TRUE) # "LOC100645032 / alpha-4" (exons 1-4) was discarded in filtering step # Very low expression of "LOC100643274 / alpha6" -castes_counts_nachrs <- castes_counts %>% +counts_nachrs_castes_bter <- counts_castes_bter %>% as.data.frame() %>% tibble::rownames_to_column(var = "ens_gene") %>% tidyr::gather(key = "sample", value = "norm_count", -ens_gene) %>% - dplyr::filter(ens_gene %in% nachrs) %>% + dplyr::filter(ens_gene %in% nachrs_bter) %>% dplyr::filter(ens_gene != "LOC100645032") %>% dplyr::group_by(sample) %>% dplyr::mutate(proportion = norm_count / sum(norm_count)) %>% @@ -219,26 +221,26 @@ castes_counts_nachrs <- castes_counts %>% ## Fit glm with subunit proportion as the response variable ```{r} # Fit glm -castes_glmfit <- glm( +glmfit_castes_bter <- glm( formula = proportion ~ ens_gene + caste + ens_gene * caste, - family = quasibinomial(link = "logit"), data = castes_counts_nachrs + family = quasibinomial(link = "logit"), data = counts_nachrs_castes_bter ) # Save summary sink(here::here( - "results", "2020-07-28-gene_expression", + "results", "2022-07-20-gene_expression", "bter_castes_proportions_glm_summary.txt" )) -print(summary(castes_glmfit)) +print(summary(glmfit_castes_bter)) sink() # returns output to the console # Save glm results as a tidy tibble -castes_glmfit_tidy <- broom::tidy(castes_glmfit) %>% +glmfit_castes_tidy_bter <- broom::tidy(glmfit_castes_bter) %>% dplyr::mutate_if(is.numeric, round, digits = 4) -write.csv(castes_glmfit_tidy, +write.csv(glmfit_castes_tidy_bter, file = here::here( - "results", "2020-07-28-gene_expression", + "results", "2022-07-20-gene_expression", "bter_castes_proportions_glm_tidy.csv" ), row.names = FALSE @@ -249,85 +251,85 @@ write.csv(castes_glmfit_tidy, ## Read `kallisto` results and table of samples and covariates. ```{r} # Set sample identifiers using the names of the kallisto output directories -lifestages_ids <- dir(here::here( - "2015-harrison_queen_worker_whole_body", "results", +ids_lifestages_bter <- dir(here::here( + "results", "2015-harrison_queen_worker_whole_body", "results", "2020-07-14-kallisto" )) # Read metadata table -lifestages_table <- read.csv( +table_lifestages_bter <- read.csv( here::here( - "bter_metadata", + "metadata_bter", "2015-harrison_queen_worker_whole_body_metadata.csv" ), header = TRUE, stringsAsFactors = TRUE ) -lifestages_table$condition <- relevel( - lifestages_table$condition, +table_lifestages_bter$condition <- relevel( + table_lifestages_bter$condition, ref = "worker_larva" ) -lifestages_files <- here::here( - "2015-harrison_queen_worker_whole_body", +files_lifestages_bter <- here::here( + "results", "2015-harrison_queen_worker_whole_body", "2020-07-14-kallisto", "results", - lifestages_table$sample, + table_lifestages_bter$sample, "abundance.h5" ) -names(lifestages_files) <- lifestages_table$sample +names(files_lifestages_bter) <- table_lifestages_bter$sample # Check that sample names match -all(lifestages_ids %in% lifestages_table$sample) +all(ids_lifestages_bter %in% table_lifestages_bter$sample) # Check that all files exist -all(file.exists(lifestages_files)) +all(file.exists(files_lifestages_bter)) ``` ## Create a `DESeqDataSet` object ```{r} # Import kallisto quantifications -lifestages_txi <- tximport::tximport(lifestages_files, +txi_lifestages_bter <- tximport::tximport(files_lifestages_bter, type = "kallisto", - tx2gene = bter_tx2gene, + tx2gene = tx2gene_bter, txOut = FALSE ) # Create a DESeqDataSet object -lifestages_dds <- DESeq2::DESeqDataSetFromTximport(lifestages_txi, - colData = lifestages_table, +dds_lifestages_bter <- DESeq2::DESeqDataSetFromTximport(txi_lifestages_bter, + colData = table_lifestages_bter, design = ~condition ) # Choose the reference level for condition factor -lifestages_dds$condition <- relevel(lifestages_dds$condition, +dds_lifestages_bter$condition <- relevel(dds_lifestages_bter$condition, ref = "worker_larva") ``` ## Pre-filter the low-count genes ```{r} # Estimate the size factors -lifestages_dds <- DESeq2::estimateSizeFactors(lifestages_dds) +dds_lifestages_bter <- DESeq2::estimateSizeFactors(dds_lifestages_bter) # Pre-filter the dataset -dim(lifestages_dds) +dim(dds_lifestages_bter) # 10661 # At least 3 samples with a count of 10 or higher # The number of samples would be set to the smallest group size -keep <- rowSums(DESeq2::counts(lifestages_dds) >= 10) >= 3 -lifestages_dds <- lifestages_dds[keep, ] -dim(lifestages_dds) +keep <- rowSums(DESeq2::counts(dds_lifestages_bter) >= 10) >= 3 +dds_lifestages_bter <- dds_lifestages_bter[keep, ] +dim(dds_lifestages_bter) # 9892 ``` ## Use normalised counts to calculate the subunit proportions per sample ```{r} # Normalise counts -lifestages_counts <- DESeq2::counts(lifestages_dds, normalized = TRUE) +counts_lifestages_bter <- DESeq2::counts(dds_lifestages_bter, normalized = TRUE) # Examine counts -# lifestages_counts %>% +# counts_lifestages_bter %>% # as.data.frame() %>% # tibble::rownames_to_column(var = "ens_gene") %>% # dplyr::filter(ens_gene %in% nachrs) @@ -335,11 +337,11 @@ lifestages_counts <- DESeq2::counts(lifestages_dds, normalized = TRUE) # "LOC100645032 / alpha-4" (exons 1-4) was discarded in filtering step # Very low expression of "LOC100643274 / alpha6" -lifestages_counts_nachrs <- lifestages_counts %>% +counts_nachrs_lifestages_bter <- counts_lifestages_bter %>% as.data.frame() %>% tibble::rownames_to_column(var = "ens_gene") %>% tidyr::gather(key = "sample", value = "norm_count", -ens_gene) %>% - dplyr::filter(ens_gene %in% nachrs) %>% + dplyr::filter(ens_gene %in% nachrs_bter) %>% dplyr::filter(ens_gene != "LOC100645032") %>% dplyr::group_by(sample) %>% dplyr::mutate(proportion = norm_count / sum(norm_count)) %>% @@ -367,26 +369,26 @@ lifestages_counts_nachrs <- lifestages_counts %>% ## Fit glm with subunit proportion as the response variable ```{r} # Fit glm -lifestages_glmfit <- glm( +glmfit_lifestages_bter <- glm( formula = proportion ~ ens_gene + caste_stage + ens_gene * caste_stage, - family = quasibinomial(link = "logit"), data = lifestages_counts_nachrs + family = quasibinomial(link = "logit"), data = counts_nachrs_lifestages_bter ) # Save summary sink(here::here( - "results", "2020-07-28-gene_expression", + "results", "2022-07-20-gene_expression", "bter_life_stages_proportions_glm_summary.txt" )) -print(summary(lifestages_glmfit)) +print(summary(glmfit_lifestages_bter)) sink() # returns output to the console # Save glm results as a tidy tibble -lifestages_glmfit_tidy <- broom::tidy(lifestages_glmfit) %>% +glmfit_lifestages_tidy_bter <- broom::tidy(glmfit_lifestages_bter) %>% dplyr::mutate_if(is.numeric, round, digits = 4) -write.csv(lifestages_glmfit_tidy, +write.csv(glmfit_lifestages_tidy_bter, file = here::here( - "results", "2020-07-28-gene_expression", + "results", "2022-07-20-gene_expression", "bter_life_stages_proportions_glm_tidy.csv" ), row.names = FALSE