From 4410e3da55030ef28cf90773b304d27afe71c1be Mon Sep 17 00:00:00 2001
From: Per Unneberg
Date: Wed, 18 Oct 2023 16:54:30 +0200
Subject: [PATCH] Update variant filtering exercise

---
 .pre-commit-config.yaml  |   2 +-
 docs/_data.yml           |   2 +-
 docs/_quarto.yml         |   2 +-
 docs/exercises/index.qmd |   2 +-
 .../index.qmd            | 346 ++++++++++++------
 5 files changed, 246 insertions(+), 108 deletions(-)
 rename docs/exercises/{data_filtering => variant_filtering}/index.qmd (60%)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index acb353aa..9ae5269d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -85,7 +85,7 @@ repos:
 docs/slides/pgip/index\.qmd|
 docs/slides/datageneration/index\.qmd|
 docs/slides/foundations/index\.qmd|
-docs/exercises/data_filtering/index\.qmd|
+docs/exercises/variant_filtering/index\.qmd|
 docs/slides/simulation/index\.qmd
 )$
 minimum_pre_commit_version: "2.13.0"
diff --git a/docs/_data.yml b/docs/_data.yml
index 56e42dce..1dc5f9cd 100644
--- a/docs/_data.yml
+++ b/docs/_data.yml
@@ -106,7 +106,7 @@ exercises/variant_calling:
 - __variants__
 - __redyellow_variants__

-exercises/data_filtering:
+exercises/variant_filtering:
 __data__:
 - __red_recal_bam__
 - __yellow_recal_bam__
diff --git a/docs/_quarto.yml b/docs/_quarto.yml
index 260965d1..d5ab23cc 100644
--- a/docs/_quarto.yml
+++ b/docs/_quarto.yml
@@ -22,7 +22,7 @@ project:
 - "exercises/index.qmd"
 - "exercises/simulation/index.qmd"
 - "exercises/variant_calling/index.qmd"
-- "exercises/data_filtering/index.qmd"
+- "exercises/variant_filtering/index.qmd"
 # Slides
 - "slides/index.qmd"
 - "slides/pgip/index.qmd"
diff --git a/docs/exercises/index.qmd b/docs/exercises/index.qmd
index 1df5dce5..a4614da6 100644
--- a/docs/exercises/index.qmd
+++ b/docs/exercises/index.qmd
@@ -13,7 +13,7 @@ listing:
 - datasets/monkeyflowers.qmd
 - simulation/index.qmd
 - variant_calling/index.qmd
-- data_filtering/index.qmd
+- variant_filtering/index.qmd
 date: ""
 toc: false
 sidebar: false
diff --git a/docs/exercises/data_filtering/index.qmd b/docs/exercises/variant_filtering/index.qmd
similarity index 60%
rename from docs/exercises/data_filtering/index.qmd
rename to docs/exercises/variant_filtering/index.qmd
index be6506dc..695889d5 100644
--- a/docs/exercises/data_filtering/index.qmd
+++ b/docs/exercises/variant_filtering/index.qmd
@@ -1,10 +1,10 @@
 ---
-title: Filtering of resequencing data
+title: Variant filtering
 author:
 - Per Unneberg
 format: html
 execute:
-  cache: true
+  cache: false
 ---


-In this exercise we will look at ways of filtering data, mainly based
-on coverage.
+In this exercise, we will look at ways of filtering variant data. We
+will begin by applying filters to the variant file containing data for
+all sites, followed by a more general approach based on sequencing
+coverage that also works when the variant file lacks information about
+monomorphic sites.

::: {.callout-tip collapse=true}



channels:
  - conda-forge
  - bioconda
  - default
dependencies:
  - bedtools=2.31.0
-  - csvtk=0.25.0
+  - bcftools=1.15.1
+  - csvtk=0.28.0
  - mosdepth=0.3.3
+  - samtools=1.15.1
+  - vcftools=0.1.16
```

### UPPMAX modules

Execute the following command to load modules:

```{bash }
#| label: uppmax-load-modules
#| echo: true
#| eval: false
module load uppmax bioinfo-tools mosdepth/0.3.3 \
    BEDTools/2.29.2 samtools/1.17 bcftools/1.17 vcftools/0.1.16
```

-Note that `csvtk` may need to be manually installed.
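To verify that the environment is complete before starting, you can
check that the main tools are available (a quick sanity check; the
exact version strings will depend on your setup):

```{bash }
#| label: check-installed-tools
#| echo: true
#| eval: false
mosdepth --version
bedtools --version
samtools --version | head -n 1
bcftools --version | head -n 1
vcftools --version
csvtk version
```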
+Note that `csvtk` may require [manual
+installation](https://bioinf.shenwei.me/csvtk/#installation)^[18-Oct-2023:
+We will aim to make csvtk available via the module system].

:::



in practice leads to an underrepresentation of heterozygote calls.
To improve the accuracy of downstream inference, a number of
analysis-dependent quality control filters can be applied to the raw
variant call set (for a concise summary, see
-@lou_BeginnerGuideLowcoverage_2021). In this exercise, we will focus
-on coverage-based filters, with the aim of generating *sequence masks*
-to denote regions of a reference sequence that contain sufficient
-information across individuals and populations. Furthermore, the masks
-will be applied in the context of genetic diversity calculations, in
-which case specific filters on polymorphic sites (e.g., *p*-value or
-minimum minor allele frequency (MAF)) should **not** be applied (all
-sites contain information).
+@lou_BeginnerGuideLowcoverage_2021). In this exercise, we will begin
+by applying filters to the variant file containing data for all sites,
+followed by a more general approach based on sequencing coverage that
+also works when the variant file lacks information about monomorphic
+sites.

```{r, engine='tikz', fig.ext="svg"}
#| label: fig-coverage-filters



Minimum and maximum depth filters could be applied to the aggregate
coverages of all samples, or to samples grouped by population, to
eliminate sites with questionable data support.

-The vcf we have produced contains all sites; that is, both monomorphic
+The VCF we have produced contains all sites; that is, both monomorphic
and polymorphic sites are present. Every site contains information
about depth and other metadata, which makes it possible to apply
coverage filters directly to the variant file itself. We will make use
of direct filters in the first section below (@sec-basic-filtering).

-However, it may not always be possible to generate a vcf with all
+However, it may not always be possible to generate a VCF with all
sites. Species with large genomes will produce files so large that
they prevent efficient downstream processing. Under these
circumstances, *ad hoc* coverage filters can be applied to the bam



## Basic filtering {#sec-basic-filtering}

FIXME: add basic filtering commands with `vcftools`

## Advanced filtering based on coverage analyses {#sec-advanced-filtering}

+In this section, we will focus on coverage-based filters, with the aim
+of generating *sequence masks* to denote regions of a reference
+sequence that contain sufficient information across individuals and
+populations. Furthermore, the masks will be applied in the context of
+genetic diversity calculations, in which case specific filters on
+polymorphic sites (e.g., *p*-value or minimum minor allele frequency
+(MAF)) should **not** be applied (all sites contain information).
+
Mapped reads provide information about how well a given genomic region
has been represented during sequencing, and this information is
usually summarized as the sequencing coverage. For any locus, this is



sufficient.
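To make the notion of per-locus coverage concrete, you can tabulate
depths position by position directly from a recalibrated BAM file with
`samtools depth` (an illustrative sketch, using the same sample and
minimum mapping quality as in the `mosdepth` runs below):

```{bash }
#| label: samtools-depth-example
#| echo: true
#| eval: false
# Columns: chromosome, position, read depth; -Q 20 restricts the count
# to reads with mapping quality of at least 20
samtools depth -Q 20 PUN-Y-INJ.sort.dup.recal.bam | head -n 5
```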
We start by indexing the reference sequence:

```{bash }
#| label: samtools-faidx-ref
#| echo: true
#| eval: true
export REF=M_aurantiacus_v1.fasta
samtools faidx ${REF}

```

```{r }
#| label: r-read-fasta-index


```



exclude reads with a mapping quality less than 20:

```{bash }
#| label: mosdepth-one-sample
#| echo: true
#| eval: true
mosdepth -Q 20 PUN-Y-INJ PUN-Y-INJ.sort.dup.recal.bam
```

The per-base coverage output file will be named
`PUN-Y-INJ.per-base.bed.gz` and can be viewed with `bgzip`:

```{bash }
#| label: bgzip-view-sample
#| echo: true
#| eval: true
bgzip -c -d PUN-Y-INJ.per-base.bed.gz | head -n 5
```

To get an idea of what the coverage looks like over the chromosome, we
will make use of two very versatile programs for dealing with genomic
interval data and delimited text files. Both programs operate on
streams, enabling the construction of powerful piped commands at the
command line (see below).

The first is `bedtools`, which consists of a set of tools to perform
*genome arithmetic* on `bed`-like file formats. The
[`bed`](https://genome.ucsc.edu/FAQ/FAQformat.html#format1) format is
a tab-delimited format that at its simplest consists of the three
columns `chrom` (the chromosome name), `chromStart`, and `chromEnd`,
the start and end coordinates of a region.

The second is `csvtk`, a program that provides tools for dealing with
tab- or comma-separated text files. Avid `R` users will note that many
of the subcommands are similar to those of the `tidyverse` ecosystem.

We can use these programs in a one-liner to generate a simple coverage
plot (@fig-plot-coverage)^[The one-liner combines the results of
several commands in a pipe stream. Also, [Bash
redirections](https://www.gnu.org/software/bash/manual/html_node/Redirections.html)
are used to pass the output of `bedtools makewindows` to `bedtools
intersect`. The intersection command collects coverage data in 1kb
windows that are then summarized by `bedtools groupby`.]:

```{bash }
#| echo: true
#| eval: true
bedtools intersect -a <(bedtools makewindows -g ${REF}.fai -w 1000) \
    -b PUN-Y-INJ.per-base.bed.gz -wa -wb | \
    bedtools groupby -i - -g 1,2,3 -c 7 -o mean | \
    csvtk plot -t line -x 2 -y 4 --point-size 0.01 --xlab Position \
    --ylab Coverage --width 9.0 --height 3.5 > fig-plot-coverage.png
```

::: {#fig-plot-coverage attr-output='.details summary="Output"'}

![](fig-plot-coverage.png)

Coverage for sample PUN-Y-INJ in 1kb windows. Experiment with changing
the window size (`-w`) parameter to change smoothing.

:::



for more information.]
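As a quick cross-check of the mosdepth numbers, you can recompute the
sample's mean coverage directly from the per-base bed file (a small
awk sketch; the result should agree with the `total` row of
`PUN-Y-INJ.mosdepth.summary.txt`):

```{bash }
#| label: awk-mean-coverage-check
#| echo: true
#| eval: false
# Length-weighted mean: sum(depth * region length) / total length
bgzip -c -d PUN-Y-INJ.per-base.bed.gz | \
    awk -v FS="\t" '{len=$3-$2; sum+=$4*len; tot+=len} END {print sum/tot}'
```

We now run `mosdepth` on all samples and compile the per-sample total
coverages into a single summary file: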
```{bash }
#| label: mosdepth-compile-coverage-data
#| echo: true
#| eval: true
# [A-Z] matches characters A-Z, * is a wildcard character that matches
# anything
for f in [A-Z]*.sort.dup.recal.bam; do
    # Extract the prefix by removing .sort.dup.recal.bam
    prefix=${f%.sort.dup.recal.bam}
    mosdepth -Q 20 $prefix $f
    # Print sample name
    echo -e -n "$prefix\t"
    # Get the summary line containing the total coverage
    cat $prefix.mosdepth.summary.txt | grep total
done > ALL.mosdepth.summary.txt
# View the file
cat ALL.mosdepth.summary.txt
```

We can calculate the total coverage by summing the values of the fifth
column with `csvtk` as follows:

```{bash }
#| label: mosdepth-compile-coverage-data-sum
#| echo: true
#| eval: true
csvtk summary -H -t ALL.mosdepth.summary.txt -f 5:sum
```

```{r }
#| label: r-total-coverage
#| echo: false
#| eval: true
x <- read.table("ALL.mosdepth.summary.txt")
total_coverage <- sum(x$V5)
```

to get the total coverage `r total_coverage`, which gives a hint at
where the diploid coverage peak should be.

### Sample set coverages

In this section, we will summarize coverage information for different
sample sets, the motivation being that different filters may be
warranted depending on which samples are being analysed. For instance,
the coverage cutoffs for all samples will most likely be different
from those applied to the subpopulations.

We can combine the coverage output from different samples with
`bedtools unionbedg`. We begin by generating a coverage file for all
samples, where the output columns will correspond to individual
samples. To save typing, we collect the sample names and generate
matching bed file names to pass as arguments to options `-names` and
`-i`, respectively. Also, we include positions with no coverage
(`-empty`), which requires the use of a genome file (option `-g`). The
bed output is piped to `bgzip` for compression before finally being
indexed with `tabix`:

```{bash }
#| label: bedtools-unionbedg-all
#| echo: true
#| eval: true
SAMPLES=$(csvtk grep -f Taxon -r -p "yellow" -r -p "red" sampleinfo.csv | csvtk cut -f SampleAlias | grep -v SampleAlias | tr "\n" " ")
BEDGZ=$(for sm in $SAMPLES; do echo -e -n "${sm}.per-base.bed.gz "; done)
bedtools unionbedg -header -names $SAMPLES -g ${REF}.fai -empty -i $BEDGZ | bgzip > ALL.bg.gz
tabix -f -p bed -S 1 ALL.bg.gz
```

::: {.callout-note collapse=true}

## {{< fa brands linux >}} Command line magic

The code above works as follows. We first use `csvtk` to `grep`
(search) for the population names red and yellow in the `Taxon` column
in `sampleinfo.csv`, thereby filtering the output to lines where
`Taxon` matches the population names.
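To see what this first filtering step produces, you can run it on its
own (an illustrative peek; the output is truncated to three lines):

```{bash }
#| label: csvtk-grep-taxon-peek
#| echo: true
#| eval: false
csvtk grep -f Taxon -r -p "yellow" -r -p "red" sampleinfo.csv | head -n 3
```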
Then, we cut out the column of interest, `SampleAlias`, and remove the
header line (`grep -v SampleAlias` matches anything but
`SampleAlias`). Finally, `tr` translates the newline character `\n` to
a space. The output is stored in the `SAMPLES` variable through the
command substitution (`$()`) syntax.

We then iterate through `$SAMPLES` to generate the input file names
with the `echo` command, storing the output in `$BEDGZ`. These
variables are passed on to `bedtools unionbedg` to generate a
[bedgraph file](https://genome.ucsc.edu/goldenPath/help/bedgraph.html)
combining all samples.

:::



As mentioned previously, we also need to combine coverages for the
yellow and red populations separately.

::: {.callout-exercise}

Using the previous command as a template, try to generate per
population coverage files.

::: {.callout-hint}

You only need to modify the code that generates `SAMPLES` by grepping
for each population separately (`csvtk grep -f Taxon -r -p red` and so
on). Remember also to modify the output file name (e.g.,
`red.bg.gz`).

:::

::: {.callout-answer}

An example using a for loop is shown here. Alternatively, you could
copy-paste the code above and write out the population labels
explicitly.

```{bash }
#| label: bedtools-unionbedg-per-population
#| echo: true
#| eval: true
for pop in red yellow; do
    SAMPLES=$(csvtk grep -f Taxon -r -p $pop sampleinfo.csv | csvtk cut -f SampleAlias | grep -v SampleAlias | tr "\n" " ")
    BEDGZ=$(for sm in $SAMPLES; do echo -e -n "${sm}.per-base.bed.gz "; done)
    bedtools unionbedg -header -names $SAMPLES -g ${REF}.fai -empty -i $BEDGZ | bgzip > $pop.bg.gz
    tabix -f -p bed -S 1 $pop.bg.gz
done
```

:::

:::

### Total coverage

Since we eventually want to filter on total coverage, we sum the
per-sample coverages for each sample set with `awk`^[Unfortunately,
`csvtk` does not seem to support calculating column margins out of the
box, which is why we resort to this somewhat complicated construct]:

```{bash }
#| label: awk-sum-ALL-coverage
#| echo: true
#| eval: true
bgzip -c -d ALL.bg.gz | \
    awk -v FS="\t" -v OFS="\t" 'NR > 1 {sum=0; for (i=4; i<=NF; i++) sum+=$i; print $1, $2, $3, sum}' | \
    bgzip > ALL.sum.bed.gz
tabix -f -p bed ALL.sum.bed.gz
```

Here we use `awk` to sum the values from column 4 onwards (`NF` holds
the number of the last column).

For illustration, we plot the total coverage:

```{bash }
#| label: total-coverage
#| echo: true
#| eval: true
#| fig-show: asis
#| fig-cap: Coverage for ALL samples in 1kb windows. Experiment with changing the window size (`-w`) parameter to change smoothing.
bedtools intersect -a <(bedtools makewindows -g ${REF}.fai -w 1000) \



Total coverage in 1kb windows.

:::



In order to define thresholds for subsequent filtering, we need to
know the total coverage distribution.
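A first numerical impression can be had by computing summary
statistics of the coverage column (a sketch; note that these
statistics are computed per bed interval, not weighted by interval
length):

```{bash }
#| label: csvtk-coverage-summary
#| echo: true
#| eval: false
bgzip -c -d ALL.sum.bed.gz | \
    csvtk summary -t -H -f 4:min,4:q1,4:median,4:q3,4:max
```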
To see the full distribution, we plot the proportion of the genome
coverage versus depth of coverage (similar to k-mer plots in sequence
assembly projects). To do so, we must summarize our coverage file such
that we count how many bases have a given coverage. This can be
achieved by noting that each row in the bed file consists of the
columns `CHROM`, `START`, `END`, and `COVERAGE`. We can generate a
histogram table by, for each value of `COVERAGE`, summing the lengths
of the regions (`END` - `START`).



```{bash }
#| label: compile-genome-coverage-for-plot
#| echo: true
#| eval: true
# Add column containing length of region (end - start)
csvtk mutate2 -t -H -w 0 -e '$3-$2' ALL.sum.bed.gz | \
    # Sum the regions and *group* by the coverage (fourth column);
    # this gives the total number of bases with a given coverage
    csvtk summary -t -H -g 4 -f 5:sum -w 0 | \
    csvtk sort -t -k 1:n | \
    awk 'BEGIN {OFS=","; cumsum=0} {cumsum += $2; print $1,$2,cumsum}' > ALL.sum.bed.csv
```



We plot the coverage distribution below, along with a plot of the
cumulative coverage.

```{bash }
#| label: total-depth-of-coverage-distribution
#| echo: true
#| eval: true
csvtk plot line -H ALL.sum.bed.csv -x 1 -y 2 --point-size 0.01 \
    --xlab "Depth of coverage (X)" --ylab "Genome coverage (bp)" \
    --width 9.0 --height 3.5 > fig-plot-total-coverage-distribution.png
csvtk plot line -H ALL.sum.bed.csv -x 1 -y 3 --point-size 0.01 \
    --xlab "Depth of coverage (X)" --ylab "Cumulative genome coverage (bp)" \
    --width 9.0 --height 3.5 > fig-plot-total-coverage-distribution-cumulative.png
```

::: {#fig-plot-total-coverage-distribution attr-output='.details summary="Output"' layout-nrow=2}



Genome coverage vs depth of coverage.
:::

In @fig-plot-total-coverage-distribution (a), a diploid peak is
evident at around coverage X=100; we zoom in on that region to get a
better view:

```{bash }
#| label: total-depth-of-coverage-zoom-in
#| echo: true
#| eval: false
csvtk plot line -H ALL.sum.bed.csv -x 1 -y 2 --point-size 0.01 \
    --xlab "Depth of coverage (X)" --ylab "Genome coverage (bp)" \
    --width 9.0 --height 3.5 --x-min 40 --x-max 140
```

```{bash }
#| label: plot-total-depth-of-coverage-zoom-in
#| echo: false
#| eval: true
csvtk plot line -H ALL.sum.bed.csv -x 1 -y 2 --point-size 0.01 \
    --xlab "Depth of coverage (X)" --ylab "Genome coverage (bp)" \
    --width 9.0 --height 3.5 --x-min 40 --x-max 140 > \
    fig-plot-total-coverage-distribution-hist-zoom-in.png
```

::: {#fig-plot-total-coverage-distribution-zoom-in attr-output='.details summary="Output"'}

![](fig-plot-total-coverage-distribution-hist-zoom-in.png)

Genome coverage vs depth of coverage.

:::

@lou_BeginnerGuideLowcoverage_2021 point out that appropriate
thresholds depend on the data set, but as a general rule recommend a
minimum depth threshold at <0.8X average coverage, and a maximum depth
threshold at mean coverage plus one or two standard deviations. For
the sake of simplicity, you can infer cutoffs simply by inspecting
@fig-plot-total-coverage-distribution-zoom-in manually; here, we will
use the range 50-110.

We then use these thresholds to generate a bed file containing regions
that are accessible, i.e., have sufficient coverage for downstream



filtering criteria.

```{bash }
#| label: filter-all-bed-file-on-coverage
#| echo: true
#| eval: true
#| results: hide
csvtk filter -t -H ALL.sum.bed.gz -f '4>50' | \
    csvtk filter -t -H -f '4<110' | \
    bgzip -c > ALL.sum.depth.bed.gz
bedtools genomecov -i ALL.sum.depth.bed.gz -g ${REF}.fai | grep genome
```

```{r }
#| label: r-compute-genome-coverage
#| echo: false
#| eval: true
x <- read.table("ALL.sum.depth.bed.gz", header=FALSE)
cov <- format(sum(x$V3-x$V2) / 1e5 * 100, digits=3)
```

Consequently, `r cov`% of the genome is accessible by depth.

::: {.callout-note}

## Exercise

Generate coverage sums for the red and yellow sample sets, and from
these determine coverage thresholds and apply them to generate bed
files with accessible regions.

::: {.callout-answer}

We base the answer on the previous code.
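If you prefer not to read the cutoffs off the plots, you can also
compute the rule-of-thumb thresholds quoted above (<0.8X mean; mean
plus two standard deviations) directly from a summed bedgraph. A
sketch is shown here for `ALL.sum.bed.gz`; the same command applies to
the per-population files generated below:

```{bash }
#| label: awk-coverage-thresholds
#| echo: true
#| eval: false
# Length-weighted mean and standard deviation of the total coverage,
# turned into min/max depth thresholds
bgzip -c -d ALL.sum.bed.gz | \
    awk -v FS="\t" '{len=$3-$2; n+=len; s+=$4*len; ss+=$4*$4*len}
        END {m=s/n; sd=sqrt(ss/n-m*m);
             printf "mean=%.1f sd=%.1f lower=%.1f upper=%.1f\n",
                 m, sd, 0.8*m, m+2*sd}'
```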

```{bash }
#| label: awk-sum-sample-set-coverage
#| echo: true
#| eval: true
for pop in red yellow; do
    bgzip -c -d $pop.bg.gz | \
        awk -v FS="\t" -v OFS="\t" 'NR > 1 {sum=0; for (i=4; i<=NF; i++) sum+=$i; print $1, $2, $3, sum}' | \
        bgzip > $pop.sum.bed.gz
    tabix -f -p bed $pop.sum.bed.gz
    csvtk mutate2 -t -H -w 0 -e '$3-$2' $pop.sum.bed.gz | \
        csvtk summary -t -H -g 4 -f 5:sum -w 0 | \
        csvtk sort -t -k 1:n | \
        awk 'BEGIN {OFS=","; cumsum=0} {cumsum += $2; print $1,$2,cumsum}' > $pop.sum.bed.csv
done
```

```{bash }
#| label: plot-sum-sample-set-coverage
#| echo: true
#| eval: true
for pop in red yellow; do
    cat ${pop}.sum.bed.csv | \
        csvtk plot line -x 1 -y 2 --point-size 0.01 \
        --xlab "Depth of coverage (X)" --ylab "Genome coverage (bp)" \
        --width 9.0 --height 3.5 --x-min 0 --x-max 100 > \
        fig-plot-total-coverage-distribution-hist-zoom-in-$pop.png
done
```

::: {#fig-plot-population-coverage-distribution attr-output='.details summary="Output"' layout-nrow=2}

![Zoomed in genome coverage, red](fig-plot-total-coverage-distribution-hist-zoom-in-red.png){#fig-plot-total-coverage-distribution-hist-zoom-in-red}

![Zoomed in genome coverage, yellow](fig-plot-total-coverage-distribution-hist-zoom-in-yellow.png){#fig-plot-total-coverage-distribution-hist-zoom-in-yellow}

Zoomed-in coverage distribution for the red and yellow ecotypes.
:::

:::

:::

Now we have combined the total per-sample coverages for ALL samples,
and for the red and yellow sample sets. The upcoming task will be to
generate sequence masks from the total coverage and the minimum number
of individuals with coverage greater than zero.

### Filter on minimum number of individuals

In addition to filtering on total coverage, we will also filter on the
minimum number of individuals with non-zero coverage. This is to
account for cases where regions that pass the minimum coverage filter
originate from just a few samples with unusually high coverage. Here,
we will remove sites where more than 50% of individuals have zero
coverage.

```{bash }
#| label: awk-filter-50pct-ALL
#| echo: true
#| eval: true
bgzip -c -d ALL.bg.gz | \
    awk -v FS="\t" 'BEGIN {OFS="\t"} NR > 1 {count=0; for (i=4; i<=NF; i++) {if ($i>0) count+=1}; if (count>=((NF-3)*0.5)) {print $1, $2, $3}}' | \
    bgzip > ALL.ind.bed.gz
tabix -f -p bed ALL.ind.bed.gz
```

::: {.callout-exercise}

## Exercise

Apply the same filter on the minimum number of individuals to the red
and yellow sample sets.
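::: {.callout-hint}

Reuse the `ALL.bg.gz` command above, but loop over the population
bedgraph files (`red.bg.gz` and `yellow.bg.gz`) and name the outputs
accordingly (e.g., `red.ind.bed.gz`).

:::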

::: {.callout-answer}

```{bash }
#| label: awk-filter-50pct-sample-sets
#| echo: true
#| eval: true
for pop in red yellow; do
    bgzip -c -d $pop.bg.gz | \
        awk -v FS="\t" 'BEGIN {OFS="\t"} NR > 1 {count=0; for (i=4; i<=NF; i++) {if ($i>0) count+=1}; if (count>=((NF-3)*0.5)) {print $1, $2, $3}}' | \
        bgzip > $pop.ind.bed.gz
    tabix -f -p bed $pop.ind.bed.gz
done
```

:::

:::

### Sequence masks

Finally, for each sample set, we will use `bedtools intersect` to
generate combined bed files for total sum coverages and the filter on
the minimum number of individuals.



bedtools maskfasta -fi ${REF} -mc 1 -fo ${REF}.mask.fa -bed ${REF}.bed
```

+
+## References