From 784365d20655f7ffec573c20834a6670e3242ef9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Federico=20L=C3=B3pez-Osorio?= <flopezo84@gmail.com>
Date: Tue, 13 Sep 2022 17:13:59 +0100
Subject: [PATCH] Updates in response to MEC-21-1370 reviews

---
 .../amel_composition_glm.Rmd                  | 261 ++++++++++--------
 1 file changed, 139 insertions(+), 122 deletions(-)

diff --git a/03-composition_analysis/amel_composition_glm.Rmd b/03-composition_analysis/amel_composition_glm.Rmd
index b4a36da..a710044 100755
--- a/03-composition_analysis/amel_composition_glm.Rmd
+++ b/03-composition_analysis/amel_composition_glm.Rmd
@@ -33,228 +33,230 @@ load_pkgs <- function(pkg) {
 }
 
 cran_pkgs <- c(
-  "BiocManager", "tidyverse", "RColorBrewer", "styler", "here"
+  "BiocManager", "tidyverse", "GGally", "ggtext", "ggh4x", "ggforce",
+  "RColorBrewer", "styler", "patchwork", "here"
 )
+
 load_pkgs(cran_pkgs)
 
 bioconductor_pkgs <- c(
-  "biomaRt", "rhdf5", "tximport", "DESeq2"
+  "biomaRt", "rhdf5", "tximport", "DESeq2", "vsn", "hexbin"
 )
+
 load_pkgs(bioconductor_pkgs)
 ```
 
-# Set a custom ggplot theme
+# Set a custom `ggplot` theme
 ```{r}
 # Set a custom ggplot theme
 # devtools::install_github("cttobin/ggthemr")
 library(ggthemr)
+
+# Diverging
 custom_palette <- c(
-  "#9E9E9E", "#59BDEF", "#EBCC2A", "#E1AF00", "#F27C8D", "#7B73D1", "#7B9FE0",
-  "#F5CC7F", "#66C2A5", "#28A7B6", "#F2CB57", "#F2A057", "#F2845C"
+  "#9E9E9E", "#88CCEE", "#18913D", "#E9C83A", "#882255", "#CC6677",
+  "#332288", "#7B73D1", "#2857BA", "#28A7B6", "#F2CB57", "#F2845C"
 )
+
 # ggthemr::colour_plot(custom_palette)
 
 custom_theme <- ggthemr::define_palette(
   swatch = custom_palette,
-  gradient = c(lower = "#59BDEF", upper = "#FF6B5A")
+  gradient = c(lower = "#E9C83A", upper = "#882255")
 )
+
 ggthemr::ggthemr(custom_theme)
 ```
 
-# Create directory for results
 ```{r}
 # here::here()
 # Create directory for results
-dir.create(here::here("results", "2020-07-28-gene_expression"),
+dir.create(here::here("results", "2022-07-20-gene_expression"),
   recursive = TRUE
 )
 ```
 
 # Read tables of samples and covariates
 ```{r}
-# Jasper et al. 2015
+# Jasper et al., 2015
 # Set sample identifiers using the names of the kallisto output directories
-jasper_ids <- dir(here::here(
-  "2015-jasper_worker_tissues", "2020-06-16-kallisto", "results"
+ids_jasper <- dir(here::here(
+  "results", "2015-jasper_worker_tissues", "2020-06-16-kallisto", "results"
 ))
 
 # Read metadata table
-jasper_table <- read.csv(
+table_jasper <- read.csv(
   here::here(
-    "amel_metadata",
+    "metadata_amel",
     "2015-jasper_worker_tissues_metadata.csv"
   ),
   header = TRUE,
   stringsAsFactors = TRUE
 )
 
-jasper_files <- here::here(
-  "2015-jasper_worker_tissues", "2020-06-16-kallisto", "results",
-  jasper_table$sample,
+files_jasper <- here::here(
+  "results", "2015-jasper_worker_tissues", "2020-06-16-kallisto", "results",
+  table_jasper$sample,
   "abundance.h5"
 )
 
-names(jasper_files) <- jasper_table$sample
+names(files_jasper) <- table_jasper$sample
 
 # Check that sample names match
-all(jasper_ids %in% jasper_table$sample)
+all(ids_jasper %in% table_jasper$sample)
 
 # Check that all files exist
-all(file.exists(jasper_files))
+all(file.exists(files_jasper))
 
 # Select the worker "brain" samples
-jasper_brain_table <- jasper_table %>%
+jasper_brain_table <- table_jasper %>%
   dplyr::filter(tissue == "brain")
 
-jasper_brain_files <- jasper_files[grep("brain", jasper_files)]
+jasper_brain_files <- files_jasper[grep("brain", files_jasper)]
 
-# Manfredini et al. 2015
+# Manfredini et al., 2015
 # Set sample identifiers using the names of the kallisto output directories
-amel_manfredini_ids <- dir(here::here(
-  "2015-manfredini_queen_brains", "2020-06-18-kallisto", "results"
+ids_manfredini_amel <- dir(here::here(
+  "results", "2015-manfredini_queen_brains", "2020-06-18-kallisto", "results"
 ))
 
 # Read metadata table
-amel_manfredini_table <- read.csv(
+table_manfredini_amel <- read.csv(
   here::here(
-    "amel_metadata",
+    "metadata_amel",
     "2015-manfredini_queen_brains_metadata.csv"
   ),
   header = TRUE,
   stringsAsFactors = TRUE
 )
 
-amel_manfredini_files <- here::here(
-  "2015-manfredini_queen_brains", "2020-06-18-kallisto", "results",
-  amel_manfredini_table$sample,
+files_manfredini_amel <- here::here(
+  "results", "2015-manfredini_queen_brains", "2020-06-18-kallisto", "results",
+  table_manfredini_amel$sample,
   "abundance.h5"
 )
 
-names(amel_manfredini_files) <- amel_manfredini_table$sample
+names(files_manfredini_amel) <- table_manfredini_amel$sample
 
 # Check that sample names match
-all(amel_manfredini_ids %in% amel_manfredini_table$sample)
+all(ids_manfredini_amel %in% table_manfredini_amel$sample)
 
 # Check that all files exist
-all(file.exists(amel_manfredini_files))
+all(file.exists(files_manfredini_amel))
 
-# Liberti et al. 2015
+# Liberti et al., 2019
 # Set sample identifiers using the names of the kallisto output directories
-liberti_ids <- dir(here::here(
-  "2015-liberti_queen_brains", "2020-06-18-kallisto", "results"
+ids_liberti <- dir(here::here(
+  "results", "2019-liberti_queen_brains", "2020-06-18-kallisto", "results"
 ))
 
 # Read metadata table
-liberti_table <- read.csv(
+table_liberti <- read.csv(
   here::here(
-    "amel_metadata",
-    "2015-liberti_queen_brains_metadata.csv"
+    "metadata_amel",
+    "2019-liberti_queen_brains_metadata.csv"
   ),
   header = TRUE,
   stringsAsFactors = TRUE
 )
 
-liberti_files <- here::here(
-  "2015-liberti_queen_brains", "2020-06-18-kallisto", "results",
-  liberti_table$sample,
+files_liberti <- here::here(
+  "results", "2019-liberti_queen_brains", "2020-06-18-kallisto", "results",
+  table_liberti$sample,
   "abundance.h5"
 )
 
-names(liberti_files) <- liberti_table$sample
+names(files_liberti) <- table_liberti$sample
 
 # Check that sample names match
-all(liberti_ids %in% liberti_table$sample)
+all(ids_liberti %in% table_liberti$sample)
 
 # Check that all files exist
-all(file.exists(liberti_files))
+all(file.exists(files_liberti))
 
-# Christen et al. 2015
+# Christen et al., 2015
 # Set sample identifiers using the names of the kallisto output directories
-christen_ids <- dir(here::here(
-  "2018-christen_worker_brains", "2020-06-18-kallisto", "results"
+ids_christen <- dir(here::here(
+  "results", "2018-christen_worker_brains", "2020-06-18-kallisto", "results"
 ))
 
 # Read metadata table
-christen_table <- read.csv(
+table_christen <- read.csv(
   here::here(
-    "amel_metadata",
+    "metadata_amel",
     "2018-christen_worker_brains_metadata.csv"
   ),
   header = TRUE,
   stringsAsFactors = TRUE
 )
 
-christen_files <- here::here(
-  "2018-christen_worker_brains", "2020-06-18-kallisto", "results",
-  christen_table$sample,
+files_christen <- here::here(
+  "results", "2018-christen_worker_brains", "2020-06-18-kallisto", "results",
+  table_christen$sample,
   "abundance.h5"
 )
 
-names(christen_files) <- christen_table$sample
+names(files_christen) <- table_christen$sample
 
 # Check that sample names match
-all(christen_ids %in% christen_table$sample)
+all(ids_christen %in% table_christen$sample)
 
 # Check that all files exist
-all(file.exists(christen_files))
+all(file.exists(files_christen))
 ```
 
 # Combine input files and information about the samples
 ```{r}
 # Castes
-castes_files <- c(
-  jasper_brain_files, amel_manfredini_files, liberti_files, christen_files
+files_castes_amel <- c(
+  jasper_brain_files, files_manfredini_amel, files_liberti, files_christen
 )
-castes_table <- dplyr::bind_rows(
-  jasper_brain_table, amel_manfredini_table, liberti_table, christen_table
+table_castes_amel <- dplyr::bind_rows(
+  jasper_brain_table, table_manfredini_amel, table_liberti, table_christen
 ) %>%
   dplyr::select(sample, caste, tissue, study, species)
 
 # Tissues
-tissues_files <- jasper_files
-tissues_table <- jasper_table
+files_tissues_amel <- files_jasper
+table_tissues_amel <- table_jasper
 ```
 
 # Associate transcripts to genes
 ```{r}
-biomaRt::listMarts(host = "metazoa.ensembl.org")
-# biomaRt::listMarts(host = "eg40-metazoa.ensembl.org") # version 40
-# biomaRt::listEnsemblArchives()
-
-amel_mart <- biomaRt::useMart(
+mart_amel <- biomaRt::useMart(
   biomart = "metazoa_mart",
   dataset = "amellifera_eg_gene",
-  host = "metazoa.ensembl.org" # version 47
+  host = "sep2019-metazoa.ensembl.org" # version 45
 )
 
-amel_tx2gene <- biomaRt::getBM(
+tx2gene_amel <- biomaRt::getBM(
   attributes = c("ensembl_transcript_id", "ensembl_gene_id"),
-  mart = amel_mart
+  mart = mart_amel
 )
 ```
 
 # Import `kallisto` quantifications using `tximport`
 ```{r}
 # Castes
-castes_txi <- tximport::tximport(
-  castes_files,
+txi_castes_amel <- tximport::tximport(
+  files_castes_amel,
   type = "kallisto",
-  tx2gene = amel_tx2gene,
+  tx2gene = tx2gene_amel,
   txOut = FALSE
 )
 
 # Tissues
-tissues_txi <- tximport::tximport(
-  tissues_files,
+txi_tissues_amel <- tximport::tximport(
+  files_tissues_amel,
   type = "kallisto",
-  tx2gene = amel_tx2gene,
+  tx2gene = tx2gene_amel,
   txOut = FALSE
 )
 ```
 
 # Create vector of nAChR gene identifiers
 ```{r}
-amel_nachrs <- c(
+nachrs_amel <- c(
   "GB42850", "GB42644", "GB47845", "GB43275", "GB43064", "GB43416",
   "GB53053", "GB40923", "GB53427", "GB53055", "GB53428"
 )
@@ -263,32 +265,35 @@ amel_nachrs <- c(
 # Analysis of castes
 ## Create a `DESeqDataSet` object
 ```{r}
-castes_dds <- DESeq2::DESeqDataSetFromTximport(castes_txi,
-  colData = castes_table,
-  design = ~caste
+dds_castes_amel <- DESeq2::DESeqDataSetFromTximport(txi_castes_amel,
+  colData = table_castes_amel,
+  design = ~ caste
 )
 
 # Choose the reference level for caste factor
-castes_dds$caste <- relevel(castes_dds$caste, ref = "worker")
+dds_castes_amel$caste <- relevel(dds_castes_amel$caste, ref = "worker")
 
 # Estimate the size factors
-castes_dds <- DESeq2::estimateSizeFactors(castes_dds)
+dds_castes_amel <- DESeq2::estimateSizeFactors(dds_castes_amel)
+
 # Pre-filter the low-count genes
-dim(castes_dds)
-keep <- rowSums(DESeq2::counts(castes_dds) >= 10) >= 3
-dds <- castes_dds[keep, ]
-dim(castes_dds)
+dim(dds_castes_amel)
+# 15314    18
+keep <- rowSums(DESeq2::counts(dds_castes_amel) >= 10) >= 3
+dds_castes_amel <- dds_castes_amel[keep, ]
+dim(dds_castes_amel)
+# 12232    18
 ```
 
 ## Use normalised counts to calculate the subunit proportions per sample
 ```{r}
 # Normalise counts
-castes_counts <- DESeq2::counts(castes_dds, normalized = TRUE)
+counts_castes_amel <- DESeq2::counts(dds_castes_amel, normalized = TRUE)
 
-castes_counts_nachrs <- castes_counts %>%
+counts_nachrs_castes_amel <- counts_castes_amel %>%
   as.data.frame() %>%
   tibble::rownames_to_column(var = "ens_gene") %>%
-  dplyr::filter(ens_gene %in% amel_nachrs) %>%
+  dplyr::filter(ens_gene %in% nachrs_amel) %>%
   tidyr::gather(key = "sample", value = "norm_count", -ens_gene) %>%
   dplyr::group_by(sample) %>%
   dplyr::mutate(proportion = norm_count / sum(norm_count)) %>%
@@ -310,32 +315,33 @@ castes_counts_nachrs <- castes_counts %>%
     grepl("worker", sample) ~ "worker",
     grepl("queen", sample) ~ "queen"
   )) %>%
-  dplyr::mutate_if(is.numeric, round, digits = 4)
+  dplyr::mutate_if(is.numeric, round, digits = 4) %>%
+  dplyr::ungroup()
 ```
 
 ## Fit glm with subunit proportion as the response variable
 ```{r}
 # Fit glm
-castes_glmfit <- glm(
+glmfit_castes_amel <- glm(
   formula = proportion ~ ens_gene + caste + ens_gene * caste,
-  family = quasibinomial(link = "logit"), data = castes_counts_nachrs
+  family = quasibinomial(link = "logit"), data = counts_nachrs_castes_amel
 )
 
 # Save summary
 sink(here::here(
-  "results", "2020-07-28-gene_expression",
+  "results", "2022-07-20-gene_expression",
   "amel_castes_proportions_glm_summary.txt"
 ))
-print(summary(castes_glmfit))
+print(summary(glmfit_castes_amel))
 sink() # returns output to the console
 
 # Save glm results as a tidy tibble
-castes_glmfit_tidy <- broom::tidy(castes_glmfit) %>%
+glmfit_castes_tidy_amel <- broom::tidy(glmfit_castes_amel) %>%
   dplyr::mutate_if(is.numeric, round, digits = 4)
 
-write.csv(castes_glmfit_tidy,
+write.csv(glmfit_castes_tidy_amel,
   file = here::here(
-    "results", "2020-07-28-gene_expression",
+    "results", "2022-07-20-gene_expression",
     "bter_life_stages_proportions_glm_tidy.csv"
   ),
   row.names = FALSE
@@ -345,38 +351,38 @@ write.csv(castes_glmfit_tidy,
 # Analysis of tissues
 ## Create a `DESeqDataSet` object
 ```{r}
-tissues_dds <- DESeq2::DESeqDataSetFromTximport(tissues_txi,
-  colData = tissues_table,
-  design = ~tissue
+dds_tissues_amel <- DESeq2::DESeqDataSetFromTximport(txi_tissues_amel,
+  colData = table_tissues_amel,
+  design = ~ tissue
 )
 
 # Choose the reference level for tissue factor
-tissues_dds$tissue <- relevel(tissues_dds$tissue, ref = "midgut")
+dds_tissues_amel$tissue <- relevel(dds_tissues_amel$tissue, ref = "midgut")
 ```
 
 ## Pre-filter the low-count genes
 ```{r}
 # Estimate the size factors
-tissues_dds <- DESeq2::estimateSizeFactors(tissues_dds)
+dds_tissues_amel <- DESeq2::estimateSizeFactors(dds_tissues_amel)
 
 # Pre-filter the dataset
-dim(tissues_dds)
+dim(dds_tissues_amel)
 # 15314    48
-keep <- rowSums(DESeq2::counts(tissues_dds) >= 10) >= 3
-tissues_dds <- tissues_dds[keep, ]
-dim(tissues_dds)
+keep <- rowSums(DESeq2::counts(dds_tissues_amel) >= 10) >= 3
+dds_tissues_amel <- dds_tissues_amel[keep, ]
+dim(dds_tissues_amel)
 # 13114    48
 ```
 
 ## Use normalised counts to calculate the subunit proportions per sample
 ```{r}
 # Normalise counts
-tissues_counts <- DESeq2::counts(tissues_dds, normalized = TRUE)
+counts_tissues_amel <- DESeq2::counts(dds_tissues_amel, normalized = TRUE)
 
-tissues_counts_nachrs <- tissues_counts %>%
+counts_nachrs_tissues_amel <- counts_tissues_amel %>%
   as.data.frame() %>%
   tibble::rownames_to_column(var = "ens_gene") %>%
-  dplyr::filter(ens_gene %in% amel_nachrs) %>%
+  dplyr::filter(ens_gene %in% nachrs_amel) %>%
   tidyr::gather(key = "sample", value = "norm_count", -ens_gene) %>%
   dplyr::group_by(sample) %>%
   dplyr::mutate(proportion = norm_count / sum(norm_count)) %>%
@@ -411,26 +417,26 @@ tissues_counts_nachrs <- tissues_counts %>%
 ## Fit glm with subunit proportion as the response variable
 ```{r}
 # Fit glm
-tissues_glmfit <- glm(
+glmfit_tissues_amel <- glm(
   formula = proportion ~ ens_gene + tissue + ens_gene * tissue,
-  family = quasibinomial(link = "logit"), data = tissues_counts_nachrs
+  family = quasibinomial(link = "logit"), data = counts_nachrs_tissues_amel
 )
 
 # Save summary
 sink(here::here(
-  "results", "2020-07-28-gene_expression",
+  "results", "2022-07-20-gene_expression",
   "amel_tissues_proportions_glm_summary.txt"
 ))
-print(summary(tissues_glmfit))
+print(summary(glmfit_tissues_amel))
 sink() # returns output to the console
 
 # Save glm results as a tidy tibble
-tissues_glmfit_tidy <- broom::tidy(tissues_glmfit) %>%
+glmfit_tissues_tidy_amel <- broom::tidy(glmfit_tissues_amel) %>%
   dplyr::mutate_if(is.numeric, round, digits = 4)
 
-write.csv(tissues_glmfit_tidy,
+write.csv(glmfit_tissues_tidy_amel,
   file = here::here(
-    "results", "2020-07-28-gene_expression",
+    "results", "2022-07-20-gene_expression",
     "amel_tissues_proportions_glm_tidy.csv"
   ),
   row.names = FALSE
@@ -440,22 +446,33 @@ write.csv(tissues_glmfit_tidy,
 # Supplementary materials: data visualisation
 ## Stacked bar chart of subunit proportions in tissues
 ```{r}
-tissues_levels <- c(
+levels_tissues_amel <- c(
   "brain", "thoracic_ganglion", "antenna",
   "hypopharyngeal_gland", "mandibular_gland", "muscle",
   "midgut", "malpighian_tubule", "nasonov_gland"
 )
 
-subunit_levels <- c(
+levels_subunits <- c(
   "alpha1", "alpha2", "alpha3", "alpha4", "alpha5", "alpha6",
   "alpha7", "alpha8", "alpha9", "beta1", "beta2"
 )
 
-ggplot(tissues_counts_nachrs, aes(
-  x = factor(tissue, levels = tissues_levels), y = proportion,
-  fill = factor(subunit, levels = subunit_levels),
-  color = factor(subunit, levels = subunit_levels)
-)) +
+counts_nachrs_tissues_amel %>%
+  dplyr::mutate(
+    tissue = stringr::str_to_sentence(tissue)
+  ) %>%
+  dplyr::mutate(
+    tissue = stringr::str_replace(tissue, "_", " ")
+  ) %>%
+  ggplot(aes(
+    x = factor(tissue,
+      levels = stringr::str_to_sentence(levels_tissues_amel) %>%
+        stringr::str_replace("_", " ")
+    ),
+    y = proportion,
+    fill = factor(subunit, levels = levels_subunits),
+    color = factor(subunit, levels = levels_subunits)
+  )) +
   geom_bar(position = "fill", stat = "identity", width = 0.5, alpha = 0.9) +
   theme(
     plot.title = element_text(hjust = 0.5, size = 14, face = "plain"),