From e0da1d64c9a51e8826c02a2505ebacb380161e29 Mon Sep 17 00:00:00 2001 From: Federico Lopez Date: Sat, 19 Sep 2020 11:35:40 +0100 Subject: [PATCH] Delete amel_bter_queen_brains_correlation.Rmd --- .../amel_bter_queen_brains_correlation.Rmd | 410 ------------------ 1 file changed, 410 deletions(-) delete mode 100644 04-species_comparison/amel_bter_queen_brains_correlation.Rmd diff --git a/04-species_comparison/amel_bter_queen_brains_correlation.Rmd b/04-species_comparison/amel_bter_queen_brains_correlation.Rmd deleted file mode 100644 index 0fe0011..0000000 --- a/04-species_comparison/amel_bter_queen_brains_correlation.Rmd +++ /dev/null @@ -1,410 +0,0 @@ ---- -title: "Comparison of _A. mellifera_ and _B. terrestris_ queen brains" -author: "Alicja Witwicka, Federico Lopez" -date: '`r Sys.Date()`' -output: - github_document: - toc: yes - pdf_document: - fig_caption: yes - toc: yes - html_document: - toc: yes -editor_options: - chunk_output_type: console -geometry: margin = 1cm ---- - -```{r setup, echo = FALSE} -knitr::opts_chunk$set( - echo = TRUE, - message = FALSE, - warning = FALSE, - cache.lazy = FALSE, - include = FALSE, - out.height = "\textheight", - out.width = "\textwidth" -) -``` - -# Load libraries -```{r} -load_cran_pkgs <- function(pkg) { - sapply(pkg, require, character.only = TRUE) -} -cran_pkgs <- c( - "BiocManager", "tidyverse", "GGally", "RColorBrewer", "pheatmap", "styler", - "here" -) -load_cran_pkgs(cran_pkgs) - -load_bioconductor_pkgs <- function(pkg) { - sapply(pkg, require, character.only = TRUE) -} -bioconductor_pkgs <- c( - "biomaRt", "rhdf5", "tximport", "DESeq2", "vsn", "hexbin" -) -load_bioconductor_pkgs(bioconductor_pkgs) -``` - -# Set a custom `ggplot` theme -```{r} -# Set a custom ggplot theme -library(ggthemr) -custom_palette <- c( - "#9E9E9E", "#59BDEF", "#EBCC2A", "#E1AF00", "#F27C8D", "#7B73D1", "#7B9FE0", - "#F5CC7F", "#66C2A5", "#28A7B6", "#F2CB57", "#F2A057", "#F2845C" -) -# ggthemr::colour_plot(custom_palette) - -custom_theme <- ggthemr::define_palette( - swatch = custom_palette, - gradient = c(lower = "#59BDEF", upper = "#FF6B5A") -) -ggthemr::ggthemr(custom_theme) -``` - -# Associate transcripts to genes -I used biomaRt to associate transcripts to genes (from Ensembl Metazoa) -seperately for both species. -```{r} -biomaRt::listMarts(host = "metazoa.ensembl.org") -# biomaRt::listMarts(host = "eg40-metazoa.ensembl.org") # version 40 - -amel_mart <- biomaRt::useMart( - biomart = "metazoa_mart", - dataset = "amellifera_eg_gene", - host = "metazoa.ensembl.org" -) - -bter_mart <- biomaRt::useMart( - biomart = "metazoa_mart", - dataset = "bterrestris_eg_gene", - host = "metazoa.ensembl.org" -) - -amel_transcript_to_gene <- biomaRt::getBM( - attributes = c("ensembl_transcript_id", "ensembl_gene_id"), - mart = amel_mart -) - -bter_transcript_to_gene <- biomaRt::getBM( - attributes = c("ensembl_transcript_id", "ensembl_gene_id"), - mart = bter_mart -) -``` - -# Create vectors of nAChR gene identifiers -```{r} -amel_nachrs <- c( - "GB42850", "GB42644", "GB47845", "GB43275", "GB50159", "GB43416", "GB53053", - "GB40923", "GB53427", "GB53055", "GB53428" -) - -bter_nachrs <- c( - "LOC100643274", "LOC100643282", "LOC100645032", "LOC100646787", - "LOC100647301", "LOC100647350", "LOC100647624", "LOC100648987", - "LOC100649515", "LOC100649612", "LOC100649796" -) -``` - -# Introduction -## Questions -1. Are there differences in the use of nAChR subunits in the queen brains -of _A. mellifera_ and _B. terrestris_? - -Datasets used: -Manfredini et al. 2015 (amel) -Manfredini et al. 2018 (bter) - -# Import `kallisto` quantifications -I imported the `kallisto` quantifications using `tximport`. Then, I used the -accompanying metadata to generate `DESeqDataSet` objects. - -## _Apis mellifera_ -```{r} -# Set sample identifiers using the names of the kallisto output directories -amel_sample_ids <- dir(here::here( - "2015-manfredini_queen_brains", "2020-06-18-kallisto", "results" -)) - -# Read metadata table -amel_samples_table <- read.csv( - here::here("amel_metadata", "2015-manfredini_queen_brains_metadata.csv"), - header = TRUE, - stringsAsFactors = TRUE -) - -# Read .h5 files in -amel_input_files <- here::here( - "2015-manfredini_queen_brains", "2020-06-18-kallisto", "results", - amel_samples_table$sample, - "abundance.h5" -) - -# Assign names to files -names(amel_input_files) <- amel_samples_table$sample - -# Check that sample names match -all(amel_sample_ids %in% amel_samples_table$sample) - -# Check that all files exist -all(file.exists(amel_input_files)) - -# Import kallisto quantifications -txi_kallisto_amel <- tximport::tximport(amel_input_files, - type = "kallisto", - tx2gene = amel_transcript_to_gene, - txOut = FALSE -) -``` - -## _Bombus terrestris_ -```{r} -# Set sample identifiers using the names of the kallisto output directories -bter_sample_ids <- dir(here::here( - "2017-manfredini_queen_brain", "2020-08-08-kallisto", "results" -)) - -# Read metadata table -bter_samples_table <- read.csv( - here::here("bter_metadata", "2017-manfredini_queen_brain_metadata.csv"), - header = TRUE, - stringsAsFactors = TRUE -) - -# Read .h5 files in -bter_input_files <- here::here( - "2017-manfredini_queen_brain", "2020-08-08-kallisto", "results", - bter_samples_table$sample, - "abundance.h5" -) - -# Assign names to files -names(bter_input_files) <- bter_samples_table$sample - -# Check that sample names match -all(bter_sample_ids %in% bter_samples_table$sample) - -# Check that all files exist -all(file.exists(bter_input_files)) - -# Import kallisto quantifications -bter_txi_kallisto <- tximport::tximport(bter_input_files, - type = "kallisto", - tx2gene = bter_transcript_to_gene, - txOut = FALSE -) -``` - -# Create `DESeqDataSet` objects -```{r} -amel_dds <- DESeq2::DESeqDataSetFromTximport(txi_kallisto_amel, - colData = amel_samples_table, - design = ~1 -) - -bter_dds <- DESeq2::DESeqDataSetFromTximport(bter_txi_kallisto, - colData = bter_samples_table, - design = ~1 -) -``` - -# Normalise counts and calculate expression proportions -```{r} -# Pre-filter low count genes -amel_dds <- DESeq2::estimateSizeFactors(amel_dds) -dim(amel_dds) -keep <- rowSums(DESeq2::counts(amel_dds) >= 10) >= 4 -amel_dds <- amel_dds[keep, ] -dim(amel_dds) - -bter_dds <- DESeq2::estimateSizeFactors(bter_dds) -dim(bter_dds) -keep <- rowSums(DESeq2::counts(bter_dds) >= 10) >= 4 -bter_dds <- bter_dds[keep, ] -dim(bter_dds) - -# Create data frames of normalised counts and calculate expression proportions -# Discard data from divergent subunits -divergent_subunits <- c("alpha9", "beta2") - -# Apis mellifera -amel_counts <- DESeq2::counts(amel_dds, normalized = TRUE) -amel_counts <- amel_counts %>% - as.data.frame() %>% - tibble::rownames_to_column(var = "ens_gene") %>% - tidyr::gather(key = "sample", value = "norm_count", -ens_gene) %>% - dplyr::mutate_if(is.numeric, round, digits = 4) %>% - dplyr::filter(ens_gene %in% amel_nachrs) %>% - dplyr::group_by(sample) %>% - dplyr::mutate(proportion = norm_count / sum(norm_count)) %>% - dplyr::mutate(subunit = case_when( - ens_gene == "GB42850" ~ "alpha1", - ens_gene == "GB42644" ~ "alpha2", - ens_gene == "GB47845" ~ "alpha3", - ens_gene == "GB43275" ~ "alpha4", - ens_gene == "GB50159" ~ "alpha5", - ens_gene == "GB43416" ~ "alpha6", - ens_gene == "GB53053" ~ "alpha7", - ens_gene == "GB40923" ~ "alpha8", - ens_gene == "GB53427" ~ "alpha9", - ens_gene == "GB53055" ~ "beta1", - ens_gene == "GB53428" ~ "beta2" - )) %>% - dplyr::mutate(species = "amel") %>% - dplyr::filter(!subunit %in% divergent_subunits) # Remove divergent subunits - -bter_counts <- DESeq2::counts(bter_dds, normalized = TRUE) -bter_counts <- bter_counts %>% - as.data.frame() %>% - tibble::rownames_to_column(var = "ens_gene") %>% - tidyr::gather(key = "sample", value = "norm_count", -ens_gene) %>% - dplyr::mutate_if(is.numeric, round, digits = 4) %>% - dplyr::filter(ens_gene %in% bter_nachrs) %>% - dplyr::filter(ens_gene != "LOC100645032") %>% # Exclude low-expression subunit - dplyr::group_by(sample) %>% - dplyr::mutate(proportion = norm_count / sum(norm_count)) %>% - dplyr::mutate(subunit = case_when( - ens_gene == "LOC100647624" ~ "alpha1", - ens_gene == "LOC100647301" ~ "alpha2", - ens_gene == "LOC100647350" ~ "alpha3", - ens_gene == "LOC100648987" ~ "alpha4e5", - ens_gene == "LOC100649515" ~ "alpha5", - ens_gene == "LOC100643274" ~ "alpha6", - ens_gene == "LOC100649796" ~ "alpha7", - ens_gene == "LOC100643282" ~ "alpha8", - ens_gene == "LOC100649612" ~ "beta1", - ens_gene == "LOC100646787" ~ "beta2" - )) %>% - dplyr::mutate(species = "bter") %>% - dplyr::filter(!subunit %in% divergent_subunits) # Remove divergent subunits - -# Join the data frames -counts_nachrs <- dplyr::bind_rows(amel_counts, bter_counts) - -# Recode subunit "alpha4" in Bombus terrestris -counts_nachrs$subunit <- dplyr::recode(counts_nachrs$subunit, - "alpha4e5" = "alpha4" -) - -# Plot proportion densities -ggplot(counts_nachrs, aes(x = proportion, color = species)) + - geom_density() - -# Plot densities of log counts -# Add pseudocount equal to 1 -ggplot(counts_nachrs, aes(x = norm_count + 1, color = species)) + - geom_density() + - scale_x_continuous(trans = "log10") - -# Plot the proportions to compare each subunit between species -# Match the colors assigned to subunits in other figures (Fig. 1) -# Create vector of colors for subunits -# Although included here, the colors of divergent subunits will not be plotted -subunit_breaks <- c( - "alpha1", "alpha2", "alpha3", "alpha4", "alpha5", "alpha6", "alpha7", - "alpha8", "alpha9", "beta1", "beta2" -) -subunit_colors <- c( - "#59BDEF", "#EBCC2A", "#E1AF00", "#F27C8D", "#7B73D1", "#7B9FE0", - "#F5CC7F", "#66C2A5", "#28A7B6", "#F2CB57", "#F2A057" -) - -ggplot(counts_nachrs, aes( - x = species, y = proportion, color = subunit, shape = subunit -)) + - geom_point(size = 2, stroke = 1) + - scale_shape_manual(values = 0:10) + - scale_color_manual(breaks = subunit_breaks, values = subunit_colors) + - theme( - plot.title = element_text(hjust = 0.5, size = 16, face = "plain"), - strip.text = element_text(size = 12, face = "plain", hjust = 0.5), - legend.position = "bottom", - axis.title = element_text(size = 14), - axis.text = element_text(size = 12), - axis.text.x = element_text(angle = 45, hjust = 1), - legend.text = element_text(size = 12) - ) + - facet_wrap(~subunit, ncol = 10) - -ggsave( - filename = here::here("results", "amel_bter_queens_brains_stripchart.pdf"), - width = 8, - height = 5, - units = "in" -) -``` - -# Use the median of subunit proportions to test for correlation between species -For each subunit per species, calculate the median over replicates. Use the -median values to test for the association in subunit expression between species. -```{r} -median_proportions <- counts_nachrs %>% - dplyr::ungroup() %>% # Remove grouping variable previously created - dplyr::select(species, subunit, proportion) %>% - dplyr::group_by(species, subunit) %>% - dplyr::summarise_at(vars(-group_cols()), ~ median(.)) %>% - dplyr::rename(median_proportion = proportion) - -# Verify that the median values match the expected results -counts_nachrs %>% - dplyr::filter(subunit == "alpha1" & species == "amel") %>% - dplyr::pull(var = proportion) -median(c(0.0213, 0.0264, 0.0161, 0.0310)) -# 0.02385 - -counts_nachrs %>% - dplyr::filter(subunit == "alpha2" & species == "amel") %>% - dplyr::pull(var = proportion) -median(c(0.123, 0.187, 0.205, 0.201)) -# 0.194 - -# Change to wide format to run the correlation test -median_proportions_wide <- median_proportions %>% - tidyr::spread(key = species, value = median_proportion) - -# Run correlation test using the Spearman method -cor.test( - median_proportions_wide$amel, - median_proportions_wide$bter, - method = "spearman" -) - -# Create a scatterplot with the median values -ggplot(median_proportions_wide, aes( - x = amel, y = bter, color = subunit, fill = subunit, shape = subunit -)) + - geom_point(size = 4, stroke = 2) + - scale_shape_manual(values = c(3, 4, 7:9, 12, 21:25)) + - scale_color_manual(breaks = subunit_breaks, values = subunit_colors) + - scale_fill_manual(breaks = subunit_breaks, values = subunit_colors) + - theme( - plot.title = element_text(hjust = 0.5, size = 16, face = "plain"), - strip.text = element_text(size = 12, face = "plain", hjust = 0.5), - legend.position = "bottom", - axis.title = element_text(size = 14), - axis.text = element_text(size = 12), - legend.text = element_text(size = 12) - ) + - labs( - title = "Proportion of nAChR subunit expression in queen brains", - x = expression(paste( - "Proportion of expression in ", - italic("A. mellifera") - )), - y = expression(paste( - "Proportion of expression in ", - italic("B. terrestris") - )) - ) - -ggsave( - filename = here::here( - "results", "amel_bter_queens_brains_scatterplot_median_proportions.pdf" - ), - width = 7, - height = 5, - units = "in" -) -```