From 784365d20655f7ffec573c20834a6670e3242ef9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Federico=20L=C3=B3pez-Osorio?= Date: Tue, 13 Sep 2022 17:13:59 +0100 Subject: [PATCH] Updates in response to MEC-21-1370 reviews --- .../amel_composition_glm.Rmd | 261 ++++++++++-------- 1 file changed, 139 insertions(+), 122 deletions(-) diff --git a/03-composition_analysis/amel_composition_glm.Rmd b/03-composition_analysis/amel_composition_glm.Rmd index b4a36da..a710044 100755 --- a/03-composition_analysis/amel_composition_glm.Rmd +++ b/03-composition_analysis/amel_composition_glm.Rmd @@ -33,228 +33,230 @@ load_pkgs <- function(pkg) { } cran_pkgs <- c( - "BiocManager", "tidyverse", "RColorBrewer", "styler", "here" + "BiocManager", "tidyverse", "GGally", "ggtext", "ggh4x", "ggforce", + "RColorBrewer", "styler", "patchwork", "here" ) + load_pkgs(cran_pkgs) bioconductor_pkgs <- c( - "biomaRt", "rhdf5", "tximport", "DESeq2" + "biomaRt", "rhdf5", "tximport", "DESeq2", "vsn", "hexbin" ) + load_pkgs(bioconductor_pkgs) ``` -# Set a custom ggplot theme +# Set a custom `ggplot` theme ```{r} # Set a custom ggplot theme # devtools::install_github("cttobin/ggthemr") library(ggthemr) + +# Diverging custom_palette <- c( - "#9E9E9E", "#59BDEF", "#EBCC2A", "#E1AF00", "#F27C8D", "#7B73D1", "#7B9FE0", - "#F5CC7F", "#66C2A5", "#28A7B6", "#F2CB57", "#F2A057", "#F2845C" + "#9E9E9E", "#88CCEE", "#18913D", "#E9C83A", "#882255", "#CC6677", + "#332288", "#7B73D1", "#2857BA", "#28A7B6", "#F2CB57", "#F2845C" ) + # ggthemr::colour_plot(custom_palette) custom_theme <- ggthemr::define_palette( swatch = custom_palette, - gradient = c(lower = "#59BDEF", upper = "#FF6B5A") + gradient = c(lower = "#E9C83A", upper = "#882255") ) + ggthemr::ggthemr(custom_theme) ``` -# Create directory for results ```{r} # here::here() # Create directory for results -dir.create(here::here("results", "2020-07-28-gene_expression"), +dir.create(here::here("results", "2022-07-20-gene_expression"), recursive = TRUE ) ``` # Read tables of samples and covariates ```{r} -# Jasper et al. 2015 +# Jasper et al., 2015 # Set sample identifiers using the names of the kallisto output directories -jasper_ids <- dir(here::here( - "2015-jasper_worker_tissues", "2020-06-16-kallisto", "results" +ids_jasper <- dir(here::here( + "results", "2015-jasper_worker_tissues", "2020-06-16-kallisto", "results" )) # Read metadata table -jasper_table <- read.csv( +table_jasper <- read.csv( here::here( - "amel_metadata", + "metadata_amel", "2015-jasper_worker_tissues_metadata.csv" ), header = TRUE, stringsAsFactors = TRUE ) -jasper_files <- here::here( - "2015-jasper_worker_tissues", "2020-06-16-kallisto", "results", - jasper_table$sample, +files_jasper <- here::here( + "results", "2015-jasper_worker_tissues", "2020-06-16-kallisto", "results", + table_jasper$sample, "abundance.h5" ) -names(jasper_files) <- jasper_table$sample +names(files_jasper) <- table_jasper$sample # Check that sample names match -all(jasper_ids %in% jasper_table$sample) +all(ids_jasper %in% table_jasper$sample) # Check that all files exist -all(file.exists(jasper_files)) +all(file.exists(files_jasper)) # Select the worker "brain" samples -jasper_brain_table <- jasper_table %>% +jasper_brain_table <- table_jasper %>% dplyr::filter(tissue == "brain") -jasper_brain_files <- jasper_files[grep("brain", jasper_files)] +jasper_brain_files <- files_jasper[grep("brain", files_jasper)] -# Manfredini et al. 2015 +# Manfredini et al., 2015 # Set sample identifiers using the names of the kallisto output directories -amel_manfredini_ids <- dir(here::here( - "2015-manfredini_queen_brains", "2020-06-18-kallisto", "results" +ids_manfredini_amel <- dir(here::here( + "results", "2015-manfredini_queen_brains", "2020-06-18-kallisto", "results" )) # Read metadata table -amel_manfredini_table <- read.csv( +table_manfredini_amel <- read.csv( here::here( - "amel_metadata", + "metadata_amel", "2015-manfredini_queen_brains_metadata.csv" ), header = TRUE, stringsAsFactors = TRUE ) -amel_manfredini_files <- here::here( - "2015-manfredini_queen_brains", "2020-06-18-kallisto", "results", - amel_manfredini_table$sample, +files_manfredini_amel <- here::here( + "results", "2015-manfredini_queen_brains", "2020-06-18-kallisto", "results", + table_manfredini_amel$sample, "abundance.h5" ) -names(amel_manfredini_files) <- amel_manfredini_table$sample +names(files_manfredini_amel) <- table_manfredini_amel$sample # Check that sample names match -all(amel_manfredini_ids %in% amel_manfredini_table$sample) +all(ids_manfredini_amel %in% table_manfredini_amel$sample) # Check that all files exist -all(file.exists(amel_manfredini_files)) +all(file.exists(files_manfredini_amel)) -# Liberti et al. 2015 +# Liberti et al., 2019 # Set sample identifiers using the names of the kallisto output directories -liberti_ids <- dir(here::here( - "2015-liberti_queen_brains", "2020-06-18-kallisto", "results" +ids_liberti <- dir(here::here( + "results", "2019-liberti_queen_brains", "2020-06-18-kallisto", "results" )) # Read metadata table -liberti_table <- read.csv( +table_liberti <- read.csv( here::here( - "amel_metadata", - "2015-liberti_queen_brains_metadata.csv" + "metadata_amel", + "2019-liberti_queen_brains_metadata.csv" ), header = TRUE, stringsAsFactors = TRUE ) -liberti_files <- here::here( - "2015-liberti_queen_brains", "2020-06-18-kallisto", "results", - liberti_table$sample, +files_liberti <- here::here( + "results", "2019-liberti_queen_brains", "2020-06-18-kallisto", "results", + table_liberti$sample, "abundance.h5" ) -names(liberti_files) <- liberti_table$sample +names(files_liberti) <- table_liberti$sample # Check that sample names match -all(liberti_ids %in% liberti_table$sample) +all(ids_liberti %in% table_liberti$sample) # Check that all files exist -all(file.exists(liberti_files)) +all(file.exists(files_liberti)) -# Christen et al. 2015 +# Christen et al., 2015 # Set sample identifiers using the names of the kallisto output directories -christen_ids <- dir(here::here( - "2018-christen_worker_brains", "2020-06-18-kallisto", "results" +ids_christen <- dir(here::here( + "results", "2018-christen_worker_brains", "2020-06-18-kallisto", "results" )) # Read metadata table -christen_table <- read.csv( +table_christen <- read.csv( here::here( - "amel_metadata", + "metadata_amel", "2018-christen_worker_brains_metadata.csv" ), header = TRUE, stringsAsFactors = TRUE ) -christen_files <- here::here( - "2018-christen_worker_brains", "2020-06-18-kallisto", "results", - christen_table$sample, +files_christen <- here::here( + "results", "2018-christen_worker_brains", "2020-06-18-kallisto", "results", + table_christen$sample, "abundance.h5" ) -names(christen_files) <- christen_table$sample +names(files_christen) <- table_christen$sample # Check that sample names match -all(christen_ids %in% christen_table$sample) +all(ids_christen %in% table_christen$sample) # Check that all files exist -all(file.exists(christen_files)) +all(file.exists(files_christen)) ``` # Combine input files and information about the samples ```{r} # Castes -castes_files <- c( - jasper_brain_files, amel_manfredini_files, liberti_files, christen_files +files_castes_amel <- c( + jasper_brain_files, files_manfredini_amel, files_liberti, files_christen ) -castes_table <- dplyr::bind_rows( - jasper_brain_table, amel_manfredini_table, liberti_table, christen_table +table_castes_amel <- dplyr::bind_rows( + jasper_brain_table, table_manfredini_amel, table_liberti, table_christen ) %>% dplyr::select(sample, caste, tissue, study, species) # Tissues -tissues_files <- jasper_files -tissues_table <- jasper_table +files_tissues_amel <- files_jasper +table_tissues_amel <- table_jasper ``` # Associate transcripts to genes ```{r} -biomaRt::listMarts(host = "metazoa.ensembl.org") -# biomaRt::listMarts(host = "eg40-metazoa.ensembl.org") # version 40 -# biomaRt::listEnsemblArchives() - -amel_mart <- biomaRt::useMart( +mart_amel <- biomaRt::useMart( biomart = "metazoa_mart", dataset = "amellifera_eg_gene", - host = "metazoa.ensembl.org" # version 47 + host = "sep2019-metazoa.ensembl.org" # version 45 ) -amel_tx2gene <- biomaRt::getBM( +tx2gene_amel <- biomaRt::getBM( attributes = c("ensembl_transcript_id", "ensembl_gene_id"), - mart = amel_mart + mart = mart_amel ) ``` # Import `kallisto` quantifications using `tximport` ```{r} # Castes -castes_txi <- tximport::tximport( - castes_files, +txi_castes_amel <- tximport::tximport( + files_castes_amel, type = "kallisto", - tx2gene = amel_tx2gene, + tx2gene = tx2gene_amel, txOut = FALSE ) # Tissues -tissues_txi <- tximport::tximport( - tissues_files, +txi_tissues_amel <- tximport::tximport( + files_tissues_amel, type = "kallisto", - tx2gene = amel_tx2gene, + tx2gene = tx2gene_amel, txOut = FALSE ) ``` # Create vector of nAChR gene identifiers ```{r} -amel_nachrs <- c( +nachrs_amel <- c( "GB42850", "GB42644", "GB47845", "GB43275", "GB43064", "GB43416", "GB53053", "GB40923", "GB53427", "GB53055", "GB53428" ) @@ -263,32 +265,35 @@ amel_nachrs <- c( # Analysis of castes ## Create a `DESeqDataSet` object ```{r} -castes_dds <- DESeq2::DESeqDataSetFromTximport(castes_txi, - colData = castes_table, - design = ~caste +dds_castes_amel <- DESeq2::DESeqDataSetFromTximport(txi_castes_amel, + colData = table_castes_amel, + design = ~ caste ) # Choose the reference level for caste factor -castes_dds$caste <- relevel(castes_dds$caste, ref = "worker") +dds_castes_amel$caste <- relevel(dds_castes_amel$caste, ref = "worker") # Estimate the size factors -castes_dds <- DESeq2::estimateSizeFactors(castes_dds) +dds_castes_amel <- DESeq2::estimateSizeFactors(dds_castes_amel) + # Pre-filter the low-count genes -dim(castes_dds) -keep <- rowSums(DESeq2::counts(castes_dds) >= 10) >= 3 -dds <- castes_dds[keep, ] -dim(castes_dds) +dim(dds_castes_amel) +# 15314 18 +keep <- rowSums(DESeq2::counts(dds_castes_amel) >= 10) >= 3 +dds_castes_amel <- dds_castes_amel[keep, ] +dim(dds_castes_amel) +# 12232 18 ``` ## Use normalised counts to calculate the subunit proportions per sample ```{r} # Normalise counts -castes_counts <- DESeq2::counts(castes_dds, normalized = TRUE) +counts_castes_amel <- DESeq2::counts(dds_castes_amel, normalized = TRUE) -castes_counts_nachrs <- castes_counts %>% +counts_nachrs_castes_amel <- counts_castes_amel %>% as.data.frame() %>% tibble::rownames_to_column(var = "ens_gene") %>% - dplyr::filter(ens_gene %in% amel_nachrs) %>% + dplyr::filter(ens_gene %in% nachrs_amel) %>% tidyr::gather(key = "sample", value = "norm_count", -ens_gene) %>% dplyr::group_by(sample) %>% dplyr::mutate(proportion = norm_count / sum(norm_count)) %>% @@ -310,32 +315,33 @@ castes_counts_nachrs <- castes_counts %>% grepl("worker", sample) ~ "worker", grepl("queen", sample) ~ "queen" )) %>% - dplyr::mutate_if(is.numeric, round, digits = 4) + dplyr::mutate_if(is.numeric, round, digits = 4) %>% + dplyr::ungroup() ``` ## Fit glm with subunit proportion as the response variable ```{r} # Fit glm -castes_glmfit <- glm( +glmfit_castes_amel <- glm( formula = proportion ~ ens_gene + caste + ens_gene * caste, - family = quasibinomial(link = "logit"), data = castes_counts_nachrs + family = quasibinomial(link = "logit"), data = counts_nachrs_castes_amel ) # Save summary sink(here::here( - "results", "2020-07-28-gene_expression", + "results", "2022-07-20-gene_expression", "amel_castes_proportions_glm_summary.txt" )) -print(summary(castes_glmfit)) +print(summary(glmfit_castes_amel)) sink() # returns output to the console # Save glm results as a tidy tibble -castes_glmfit_tidy <- broom::tidy(castes_glmfit) %>% +glmfit_castes_tidy_amel <- broom::tidy(glmfit_castes_amel) %>% dplyr::mutate_if(is.numeric, round, digits = 4) -write.csv(castes_glmfit_tidy, +write.csv(glmfit_castes_tidy_amel, file = here::here( - "results", "2020-07-28-gene_expression", + "results", "2022-07-20-gene_expression", "bter_life_stages_proportions_glm_tidy.csv" ), row.names = FALSE @@ -345,38 +351,38 @@ write.csv(castes_glmfit_tidy, # Analysis of tissues ## Create a `DESeqDataSet` object ```{r} -tissues_dds <- DESeq2::DESeqDataSetFromTximport(tissues_txi, - colData = tissues_table, - design = ~tissue +dds_tissues_amel <- DESeq2::DESeqDataSetFromTximport(txi_tissues_amel, + colData = table_tissues_amel, + design = ~ tissue ) # Choose the reference level for tissue factor -tissues_dds$tissue <- relevel(tissues_dds$tissue, ref = "midgut") +dds_tissues_amel$tissue <- relevel(dds_tissues_amel$tissue, ref = "midgut") ``` ## Pre-filter the low-count genes ```{r} # Estimate the size factors -tissues_dds <- DESeq2::estimateSizeFactors(tissues_dds) +dds_tissues_amel <- DESeq2::estimateSizeFactors(dds_tissues_amel) # Pre-filter the dataset -dim(tissues_dds) +dim(dds_tissues_amel) # 15314 48 -keep <- rowSums(DESeq2::counts(tissues_dds) >= 10) >= 3 -tissues_dds <- tissues_dds[keep, ] -dim(tissues_dds) +keep <- rowSums(DESeq2::counts(dds_tissues_amel) >= 10) >= 3 +dds_tissues_amel <- dds_tissues_amel[keep, ] +dim(dds_tissues_amel) # 13114 48 ``` ## Use normalised counts to calculate the subunit proportions per sample ```{r} # Normalise counts -tissues_counts <- DESeq2::counts(tissues_dds, normalized = TRUE) +counts_tissues_amel <- DESeq2::counts(dds_tissues_amel, normalized = TRUE) -tissues_counts_nachrs <- tissues_counts %>% +counts_nachrs_tissues_amel <- counts_tissues_amel %>% as.data.frame() %>% tibble::rownames_to_column(var = "ens_gene") %>% - dplyr::filter(ens_gene %in% amel_nachrs) %>% + dplyr::filter(ens_gene %in% nachrs_amel) %>% tidyr::gather(key = "sample", value = "norm_count", -ens_gene) %>% dplyr::group_by(sample) %>% dplyr::mutate(proportion = norm_count / sum(norm_count)) %>% @@ -411,26 +417,26 @@ tissues_counts_nachrs <- tissues_counts %>% ## Fit glm with subunit proportion as the response variable ```{r} # Fit glm -tissues_glmfit <- glm( +glmfit_tissues_amel <- glm( formula = proportion ~ ens_gene + tissue + ens_gene * tissue, - family = quasibinomial(link = "logit"), data = tissues_counts_nachrs + family = quasibinomial(link = "logit"), data = counts_nachrs_tissues_amel ) # Save summary sink(here::here( - "results", "2020-07-28-gene_expression", + "results", "2022-07-20-gene_expression", "amel_tissues_proportions_glm_summary.txt" )) -print(summary(tissues_glmfit)) +print(summary(glmfit_tissues_amel)) sink() # returns output to the console # Save glm results as a tidy tibble -tissues_glmfit_tidy <- broom::tidy(tissues_glmfit) %>% +glmfit_tissues_tidy_amel <- broom::tidy(glmfit_tissues_amel) %>% dplyr::mutate_if(is.numeric, round, digits = 4) -write.csv(tissues_glmfit_tidy, +write.csv(glmfit_tissues_tidy_amel, file = here::here( - "results", "2020-07-28-gene_expression", + "results", "2022-07-20-gene_expression", "amel_tissues_proportions_glm_tidy.csv" ), row.names = FALSE @@ -440,22 +446,33 @@ write.csv(tissues_glmfit_tidy, # Supplementary materials: data visualisation ## Stacked bar chart of subunit proportions in tissues ```{r} -tissues_levels <- c( +levels_tissues_amel <- c( "brain", "thoracic_ganglion", "antenna", "hypopharyngeal_gland", "mandibular_gland", "muscle", "midgut", "malpighian_tubule", "nasonov_gland" ) -subunit_levels <- c( +levels_subunits <- c( "alpha1", "alpha2", "alpha3", "alpha4", "alpha5", "alpha6", "alpha7", "alpha8", "alpha9", "beta1", "beta2" ) -ggplot(tissues_counts_nachrs, aes( - x = factor(tissue, levels = tissues_levels), y = proportion, - fill = factor(subunit, levels = subunit_levels), - color = factor(subunit, levels = subunit_levels) -)) + +counts_nachrs_tissues_amel %>% + dplyr::mutate( + tissue = stringr::str_to_sentence(tissue) + ) %>% + dplyr::mutate( + tissue = stringr::str_replace(tissue, "_", " ") + ) %>% + ggplot(aes( + x = factor(tissue, + levels = stringr::str_to_sentence(levels_tissues_amel) %>% + stringr::str_replace("_", " ") + ), + y = proportion, + fill = factor(subunit, levels = levels_subunits), + color = factor(subunit, levels = levels_subunits) + )) + geom_bar(position = "fill", stat = "identity", width = 0.5, alpha = 0.9) + theme( plot.title = element_text(hjust = 0.5, size = 14, face = "plain"),