diff --git a/docs/assets/bibliography.bib b/docs/assets/bibliography.bib index f1d69d4e..dace52d8 100644 --- a/docs/assets/bibliography.bib +++ b/docs/assets/bibliography.bib @@ -753,6 +753,21 @@ @article{fuller_PopulationGeneticsCoral_2020 publisher = {{American Association for the Advancement of Science}}, doi = {10.1126/science.aba4674} } +@article{garrison_HaplotypebasedVariantDetection_2012, + title = {Haplotype-Based Variant Detection from Short-Read Sequencing}, + author = {Garrison, Erik and Marth, Gabor}, + year = {2012}, + month = jul, + journal = {arXiv:1207.3907 [q-bio]}, + eprint = {1207.3907}, + primaryclass = {q-bio}, + url = {http://arxiv.org/abs/1207.3907}, + urldate = {2018-04-05}, + abstract = {The direct detection of haplotypes from short-read DNA sequencing data requires changes to existing small-variant detection methods. Here, we develop a Bayesian statistical framework which is capable of modeling multiallelic loci in sets of individuals with non-uniform copy number. We then describe our implementation of this framework in a haplotype-based variant detector, FreeBayes.}, + archiveprefix = {arxiv}, + keywords = {Quantitative Biology - Genomics,Quantitative Biology - Quantitative Methods}, + note = {Comment: 9 pages, partial draft}, +} @article{garrison_SpectrumFreeSoftware_2022, title = {A Spectrum of Free Software Tools for Processing the {{VCF}} Variant Call Format: Vcflib, Bio-Vcf, Cyvcf2, Hts-Nim and Slivar}, shorttitle = {A Spectrum of Free Software Tools for Processing the {{VCF}} Variant Call Format}, @@ -949,6 +964,7 @@ @article{hou_BalancingEfficientAnalysis_2021 langid = {english}, keywords = {Data processing,Genome informatics,Software} } + @incollection{hubisz_InferenceAncestralRecombination_2020, title = {Inference of {{Ancestral Recombination Graphs Using ARGweaver}}}, booktitle = {Statistical {{Population Genomics}}}, @@ -967,7 +983,6 @@ @incollection{hubisz_InferenceAncestralRecombination_2020 langid = {english}, keywords = {Ancestral recombination graph,Local ancestry,Markov chain Monte Carlo,Sequentially Markov coalescent}, } - @article{hurst_GeneticsUnderstandingSelection_2009, title = {Genetics and the Understanding of Selection}, author = {Hurst, Laurence D.}, @@ -1063,6 +1078,23 @@ @article{kimura_ProteinPolymorphismPhase_1971 langid = {english}, keywords = {Journal club EBC,Population Genetics Gillespie book}, } +@article{korneliussen_ANGSDAnalysisNext_2014, + title = {{{ANGSD}}: {{Analysis}} of {{Next Generation Sequencing Data}}}, + shorttitle = {{{ANGSD}}}, + author = {Korneliussen, Thorfinn Sand and Albrechtsen, Anders and Nielsen, Rasmus}, + year = {2014}, + month = nov, + journal = {BMC Bioinformatics}, + volume = {15}, + number = {1}, + pages = {356}, + issn = {1471-2105}, + doi = {10.1186/s12859-014-0356-4}, + url = {https://doi.org/10.1186/s12859-014-0356-4}, + urldate = {2023-10-22}, + abstract = {High-throughput DNA sequencing technologies are generating vast amounts of data. Fast, flexible and memory efficient implementations are needed in order to facilitate analyses of thousands of samples simultaneously.}, + keywords = {Association studies,Bioinformatics,Next-generation sequencing,Population genetics}, +} @article{korunes_PixyUnbiasedEstimation_2021, title = {Pixy: {{Unbiased}} Estimation of Nucleotide Diversity and Divergence in the Presence of Missing Data}, shorttitle = {Pixy}, @@ -1483,6 +1515,7 @@ @article{nielsen_GenotypeSNPCalling_2011 langid = {english}, keywords = {Bioinformatics,Genomics,Next-generation sequencing,Population genetics,Technology}, } + @article{nielsen_MolecularSignaturesNatural_2005, title = {Molecular {{Signatures}} of {{Natural Selection}}}, author = {Nielsen, Rasmus}, @@ -1515,7 +1548,6 @@ @article{nielsen_SNPCallingGenotype_2012 langid = {english}, keywords = {Algorithms,Heterozygosity,Next-generation sequencing,Nucleotides,Population genetics,Simulation and modeling,Single nucleotide polymorphisms,Variant genotypes}, } - @article{ohta_SlightlyDeleteriousMutant_1973, title = {Slightly {{Deleterious Mutant Substitutions}} in {{Evolution}}}, author = {Ohta, Tomoko}, diff --git a/docs/exercises/genetic_diversity/index.qmd b/docs/exercises/genetic_diversity/index.qmd index bc6da367..a8b73844 100644 --- a/docs/exercises/genetic_diversity/index.qmd +++ b/docs/exercises/genetic_diversity/index.qmd @@ -1,6 +1,6 @@ --- -title: Genetic diversity -subtitle: Diversity calculations and genomic scans +title: Genetic diversity landscapes +subtitle: Investigating the diversity landscape in Monkeyflower author: - Per Unneberg format: html @@ -128,8 +128,8 @@ dependencies: #| echo: false #| eval: true mkdir -p vcftools -OUT=vcftools/allsites -VCF=allsites.vcf.gz +OUT=vcftools/all.allsites +VCF=all.allsites.vcf.gz MISS=0.75 QUAL=30 MIN_DEPTH=260 @@ -147,7 +147,7 @@ vcftools --gzvcf $VCF \ ## Nucleotide diversity {#sec-ex-nucleotide-diversity} -::: {.callout-warning} +::: {.hidden} FIXME: pen-and-paper exercise to calculate diversity based on "fixed" genotype calls and genotype likelihoods to highlight differences; the @@ -218,8 +218,8 @@ easier to type: #| label: variant-setup #| echo: true #| eval: true -VCF=allsites.vcf.gz -REDYLW=redyellow.vcf.gz +VCF=all.allsites.vcf.gz +REDYLW=redyellow.allsites.vcf.gz POPS=populations.txt ``` @@ -228,8 +228,8 @@ POPS=populations.txt #| echo: false #| eval: true Sys.setenv( - VCF = "allsites.vcf.gz", - REDYLW = "redyellow.vcf.gz", + VCF = "all.allsites.vcf.gz", + REDYLW = "redyellow.allsites.vcf.gz", POPS = "populations.txt" ) ``` @@ -247,7 +247,7 @@ creating an output directory for the first exercise `e1`: #| eval: true mkdir -p e1 OUTDIR=e1 -OUT=e1/allsites +OUT=e1/all.allsites ``` ```{r } @@ -255,7 +255,7 @@ OUT=e1/allsites #| echo: false #| eval: true Sys.setenv( - OUT = "e1/allsites", + OUT = "e1/all.allsites", OUTDIR = "e1" ) ``` @@ -278,11 +278,15 @@ pixy --vcf $VCF --populations populations.ALL.txt --stats pi \ cat ${OUTDIR}/ALL.pixy_pi.txt ``` +:::{.hidden} + Compare pixy with vcftools to show that filtering is important FIXME: comparison of pixy and coverage-based filtering approach -::: {.callout-note} +::: + +::: {.hidden} - use sequence masks (0/1) encoded gz files to enable quick calculations of feature-based stats (e.g. 4d-degenerate sites) @@ -298,15 +302,19 @@ Calculate pi, Fst and dxy for: - all sites filtered - all sites variant mask -Compare - ### Combining masks with genome features +:::{.hidden} + Combine / provide gff features (better to generate from gff), combine with coverage mask to get, e.g., gene / exon / intron measures +::: + ## Windowed statistics and genome scans +::: {.hidden} + Windows and genome scans. Provide toolkit to work with data. Goal: prepare students for the problem of finding the region of interest. @@ -339,3 +347,5 @@ them to compile genomic landscape. Provide little instructions. Final touch: add msprime / slim simulation to compare with observed landscape somehow + +::: diff --git a/docs/slides/variant_calling/assets/images/gatk-best-practice.png b/docs/slides/variant_calling/assets/images/gatk-best-practice.png new file mode 100644 index 00000000..9b4a9184 Binary files /dev/null and b/docs/slides/variant_calling/assets/images/gatk-best-practice.png differ diff --git a/docs/slides/variant_calling/index.qmd b/docs/slides/variant_calling/index.qmd index 172b188e..e8c97c1d 100644 --- a/docs/slides/variant_calling/index.qmd +++ b/docs/slides/variant_calling/index.qmd @@ -733,11 +733,17 @@ After preprocessing, reads are mapped to a reference. Observations: ::: -## Potential pitfalls +## Potential error corrections and pitfalls + +#### Instrument + +- PCR duplicates -> MarkDuplicates +- systematic errors from sequencing machine -> BQSR + +#### Reference - quality of reference sequence! - repetitive sequence -- PCR duplicates # Variant calling and genotyping @@ -1219,21 +1225,111 @@ samtools tview -p LG4:6885 -d H -w 3 \ ## GATK best practice -Point out: not optimal for non-model organims +:::: {.columns} + +::: {.column width="70%"} + +![](assets/images/gatk-best-practice.png){width=90% fig-align=center} + +::: {.flushright .smallest} + + + +::: + +::: + +::: {.column width="30%"} + +
Pros
+ +- Best practices +- Large documentation +- Variant quality score recalibration + +
Cons
+ +- Human-centric - very slow runtime on genomes with many sequences +- Complicated setup + +::: + +:::: ## Alternative variant callers +:::: {.columns} + +::: {.column width="33%"} + +##### [freebayes](https://github.com/freebayes/freebayes) + +Bayesian genetic variant detector. Simpler setup. + +May struggle in high-coverage regions. + +::: {.flushright .smallr} + +[@garrison_HaplotypebasedVariantDetection_2012] + +::: + +::: + +::: {.column width="33%"} + +##### [bcftools](https://github.com/samtools/bcftools) + +Utilities for variant calling and manipulating VCFs and BCFs. + +::: {.flushright .smallr} + +[@danecek_TwelveYearsSAMtools_2021] + +::: + +::: + +::: {.column width="33%"} + +##### [ANGSD](http://www.popgen.dk/angsd/index.php/ANGSD) + +For low-coverage sequencing. Doesn't do explicit genotyping; most +methods take genotype uncertainty into account. + +::: {.flushright .smallr} + +[@korneliussen_ANGSDAnalysisNext_2014] + +::: + +::: + +:::: + +:::{.hidden} + Reference bias: plot no. hets vs coverage for real data, e.g., conifer -### freebayes +::: -### bcftools +::: {.notes} -### ANGSd +::: # Exercise on variant calling -## The monkeyflower system +## The monkeyflower system {.smallr} + +:::{} + + + +![](https://www.science.org/cms/10.1126/science.365.6456.854/asset/db271bee-8914-4712-9965-dcf1615b6a43/assets/graphic/365_854_f1.jpeg){fig-alt="The allure of monkeyflowers (DOI: 10.1126/science.365.6456.854)" width=50% fig-align=center} + + + +::: From @@ -1243,4 +1339,19 @@ From > experimental tractability, rapidly growing research community, and > the JGI-generated reference genome for M. guttatus. +## Learning outcomes + +Follow GATK best practices to + +- Perform qc on sequencing reads and interpret results +- Prepare reference for read mapping +- Map reads to reference +- Mark duplicates +- Perform raw variant calling to generate a set of sites to exclude + from recalibration +- Perform base quality score recalibration +- Perform variant calling on base recalibrated data +- Do genotyping on all samples and combine results to a raw variant + call set + ## Bibliography