From 4410e3da55030ef28cf90773b304d27afe71c1be Mon Sep 17 00:00:00 2001
From: Per Unneberg
Date: Wed, 18 Oct 2023 16:54:30 +0200
Subject: [PATCH] Update variant filtering exercise

---
 .pre-commit-config.yaml  |   2 +-
 docs/_data.yml           |   2 +-
 docs/_quarto.yml         |   2 +-
 docs/exercises/index.qmd |   2 +-
 .../index.qmd            | 346 ++++++++++++------
 5 files changed, 246 insertions(+), 108 deletions(-)
 rename docs/exercises/{data_filtering => variant_filtering}/index.qmd (60%)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index acb353aa..9ae5269d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -85,7 +85,7 @@ repos:
 docs/slides/pgip/index\.qmd|
 docs/slides/datageneration/index\.qmd|
 docs/slides/foundations/index\.qmd|
-docs/exercises/data_filtering/index\.qmd|
+docs/exercises/variant_filtering/index\.qmd|
 docs/slides/simulation/index\.qmd
 )$
 minimum_pre_commit_version: "2.13.0"
diff --git a/docs/_data.yml b/docs/_data.yml
index 56e42dce..1dc5f9cd 100644
--- a/docs/_data.yml
+++ b/docs/_data.yml
@@ -106,7 +106,7 @@ exercises/variant_calling:
 - __variants__
 - __redyellow_variants__

-exercises/data_filtering:
+exercises/variant_filtering:
 __data__:
 - __red_recal_bam__
 - __yellow_recal_bam__
diff --git a/docs/_quarto.yml b/docs/_quarto.yml
index 260965d1..d5ab23cc 100644
--- a/docs/_quarto.yml
+++ b/docs/_quarto.yml
@@ -22,7 +22,7 @@ project:
 - "exercises/index.qmd"
 - "exercises/simulation/index.qmd"
 - "exercises/variant_calling/index.qmd"
-- "exercises/data_filtering/index.qmd"
+- "exercises/variant_filtering/index.qmd"
 # Slides
 - "slides/index.qmd"
 - "slides/pgip/index.qmd"
diff --git a/docs/exercises/index.qmd b/docs/exercises/index.qmd
index 1df5dce5..a4614da6 100644
--- a/docs/exercises/index.qmd
+++ b/docs/exercises/index.qmd
@@ -13,7 +13,7 @@ listing:
 - datasets/monkeyflowers.qmd
 - simulation/index.qmd
 - variant_calling/index.qmd
-- data_filtering/index.qmd
+- variant_filtering/index.qmd
 date: ""
 toc: false
 sidebar: false
diff --git a/docs/exercises/data_filtering/index.qmd b/docs/exercises/variant_filtering/index.qmd
similarity index 60%
rename from docs/exercises/data_filtering/index.qmd
rename to docs/exercises/variant_filtering/index.qmd
index be6506dc..695889d5 100644
--- a/docs/exercises/data_filtering/index.qmd
+++ b/docs/exercises/variant_filtering/index.qmd
@@ -1,10 +1,10 @@
 ---
-title: Filtering of resequencing data
+title: Variant filtering
 author:
 - Per Unneberg
 format: html
 execute:
-  cache: true
+  cache: false
 ---


-In this exercise we will look at ways of filtering data, mainly based
-on coverage.
+In this exercise, we will look at ways of filtering variant data. We
+will begin by applying filters to the variant file containing data for
+all sites, followed by a more general approach based on sequencing
+coverage that also works when the variant file lacks information about
+monomorphic sites.

::: {.callout-tip collapse=true}



channels:
  - conda-forge
  - bioconda
  - default
dependencies:
  - bedtools=2.31.0
-  - csvtk=0.25.0
+  - bcftools=1.15.1
+  - csvtk=0.28.0
  - mosdepth=0.3.3
+  - samtools=1.15.1
+  - vcftools=0.1.16
```

### UPPMAX modules

Execute the following command to load modules:

```{bash }
#| label: uppmax-load-modules
#| echo: true
#| eval: false
module load uppmax bioinfo-tools mosdepth/0.3.3 \
    BEDTools/2.29.2 samtools/1.17 bcftools/1.17 vcftools/0.1.16
```

-Note that `csvtk` may need to be manually installed.
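To verify that the environment is complete before starting, you can
check that the main tools are available (a quick sanity check; the
exact version strings will depend on your setup):

```{bash }
#| label: check-installed-tools
#| echo: true
#| eval: false
mosdepth --version
bedtools --version
samtools --version | head -n 1
bcftools --version | head -n 1
vcftools --version
csvtk version
```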
+Note that `csvtk` may require [manual
+installation](https://bioinf.shenwei.me/csvtk/#installation)^[18-Oct-2023:
+We will aim to make csvtk available via the module system].

:::



in practice leads to an underrepresentation of heterozygote calls.
To improve the accuracy of downstream inference, a number of
analysis-dependent quality control filters can be applied to the raw
variant call set (for a concise summary, see
-@lou_BeginnerGuideLowcoverage_2021). In this exercise, we will focus
-on coverage-based filters, with the aim of generating *sequence masks*
-to denote regions of a reference sequence that contain sufficient
-information across individuals and populations. Furthermore, the masks
-will be applied in the context of genetic diversity calculations, in
-which case specific filters on polymorphic sites (e.g., *p*-value or
-minimum minor allele frequency (MAF)) should **not** be applied (all
-sites contain information).
+@lou_BeginnerGuideLowcoverage_2021). In this exercise, we will begin
+by applying filters to the variant file containing data for all sites,
+followed by a more general approach based on sequencing coverage that
+also works when the variant file lacks information about monomorphic
+sites.

```{r, engine='tikz', fig.ext="svg"}
#| label: fig-coverage-filters



Minimum and maximum depth filters could be applied to the aggregate
coverages of all samples, or to samples grouped by population, to
eliminate sites with questionable data support.

-The vcf we have produced contains all sites; that is, both monomorphic
+The VCF we have produced contains all sites; that is, both monomorphic
and polymorphic sites are present. Every site contains information
about depth and other metadata, which makes it possible to apply
coverage filters directly to the variant file itself. We will make use
of direct filters in the first section below (@sec-basic-filtering).

-However, it may not always be possible to generate a vcf with all
+However, it may not always be possible to generate a VCF with all
sites. Species with large genomes will produce files so large that
they prevent efficient downstream processing. Under these
circumstances, *ad hoc* coverage filters can be applied to the bam



## Basic filtering {#sec-basic-filtering}

FIXME: add basic filtering commands with `vcftools`

## Advanced filtering based on coverage analyses {#sec-advanced-filtering}

+In this section, we will focus on coverage-based filters, with the aim
+of generating *sequence masks* to denote regions of a reference
+sequence that contain sufficient information across individuals and
+populations. Furthermore, the masks will be applied in the context of
+genetic diversity calculations, in which case specific filters on
+polymorphic sites (e.g., *p*-value or minimum minor allele frequency
+(MAF)) should **not** be applied (all sites contain information).
+
Mapped reads provide information about how well a given genomic region
has been represented during sequencing, and this information is
usually summarized as the sequencing coverage. For any locus, this is



sufficient.
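To make the notion of per-locus coverage concrete, you can tabulate
depths position by position directly from a recalibrated BAM file with
`samtools depth` (an illustrative sketch, using the same sample and
minimum mapping quality as in the `mosdepth` runs below):

```{bash }
#| label: samtools-depth-example
#| echo: true
#| eval: false
# Columns: chromosome, position, read depth; -Q 20 restricts the count
# to reads with mapping quality of at least 20
samtools depth -Q 20 PUN-Y-INJ.sort.dup.recal.bam | head -n 5
```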
We start by indexing the reference sequence:

```{bash }
#| label: samtools-faidx-ref
#| echo: true
#| eval: true
export REF=M_aurantiacus_v1.fasta
samtools faidx ${REF}

```

```{r }
#| label: r-read-fasta-index


```



exclude reads with a mapping quality less than 20:

```{bash }
#| label: mosdepth-one-sample
#| echo: true
#| eval: true
mosdepth -Q 20 PUN-Y-INJ PUN-Y-INJ.sort.dup.recal.bam
```

The per-base coverage output file will be named
`PUN-Y-INJ.per-base.bed.gz` and can be viewed with `bgzip`:

```{bash }
#| label: bgzip-view-sample
#| echo: true
#| eval: true
bgzip -c -d PUN-Y-INJ.per-base.bed.gz | head -n 5
```

To get an idea of what the coverage looks like over the chromosome, we
will make use of two very versatile programs for dealing with genomic
interval data and delimited text files. Both programs operate on
streams, enabling the construction of powerful piped commands at the
command line (see below).

The first is `bedtools`, which consists of a set of tools to perform
*genome arithmetic* on `bed`-like file formats. The
[`bed`](https://genome.ucsc.edu/FAQ/FAQformat.html#format1) format is
a tab-delimited format that at its simplest consists of the three
columns `chrom` (the chromosome name), `chromStart`, and `chromEnd`,
the start and end coordinates of a region.

The second is `csvtk`, a program that provides tools for dealing with
tab- or comma-separated text files. Avid `R` users will note that many
of the subcommands are similar to those of the `tidyverse` ecosystem.

We can use these programs in a one-liner to generate a simple coverage
plot (@fig-plot-coverage)^[The one-liner combines the results of
several commands in a pipe stream. Also, [Bash
redirections](https://www.gnu.org/software/bash/manual/html_node/Redirections.html)
are used to pass the output of `bedtools makewindows` to `bedtools
intersect`. The intersection command collects coverage data in 1kb
windows that are then summarized by `bedtools groupby`.]:

```{bash }
#| echo: true
#| eval: true
bedtools intersect -a <(bedtools makewindows -g ${REF}.fai -w 1000) \
    -b PUN-Y-INJ.per-base.bed.gz -wa -wb | \
    bedtools groupby -i - -g 1,2,3 -c 7 -o mean | \
    csvtk plot -t line -x 2 -y 4 --point-size 0.01 --xlab Position \
    --ylab Coverage --width 9.0 --height 3.5 > fig-plot-coverage.png
```

::: {#fig-plot-coverage attr-output='.details summary="Output"'}

![](fig-plot-coverage.png)

Coverage for sample PUN-Y-INJ in 1kb windows. Experiment with changing
the window size (`-w`) parameter to change smoothing.

:::



for more information.]
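As a quick cross-check of the mosdepth numbers, you can recompute the
sample's mean coverage directly from the per-base bed file (a small
awk sketch; the result should agree with the `total` row of
`PUN-Y-INJ.mosdepth.summary.txt`):

```{bash }
#| label: awk-mean-coverage-check
#| echo: true
#| eval: false
# Length-weighted mean: sum(depth * region length) / total length
bgzip -c -d PUN-Y-INJ.per-base.bed.gz | \
    awk -v FS="\t" '{len=$3-$2; sum+=$4*len; tot+=len} END {print sum/tot}'
```

We now run `mosdepth` on all samples and compile the per-sample total
coverages into a single summary file: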
```{bash }
#| label: mosdepth-compile-coverage-data
#| echo: true
#| eval: true
# [A-Z] matches characters A-Z, * is a wildcard character that matches
# anything
for f in [A-Z]*.sort.dup.recal.bam; do
    # Extract the prefix by removing .sort.dup.recal.bam
    prefix=${f%.sort.dup.recal.bam}
    mosdepth -Q 20 $prefix $f
    # Print sample name
    echo -e -n "$prefix\t"
    # Get the summary line containing the total coverage
    cat $prefix.mosdepth.summary.txt | grep total
done > ALL.mosdepth.summary.txt
# View the file
cat ALL.mosdepth.summary.txt
```

We can calculate the total coverage by summing the values of the fifth
column with `csvtk` as follows:

```{bash }
#| label: mosdepth-compile-coverage-data-sum
#| echo: true
#| eval: true
csvtk summary -H -t ALL.mosdepth.summary.txt -f 5:sum
```

```{r }
#| label: r-total-coverage
#| echo: false
#| eval: true
x <- read.table("ALL.mosdepth.summary.txt")
total_coverage <- sum(x$V5)
```

to get the total coverage `r total_coverage`, which gives a hint at
where the diploid coverage peak should be.

### Sample set coverages

In this section, we will summarize coverage information for different
sample sets, the motivation being that different filters may be
warranted depending on which samples are being analysed. For instance,
the coverage cutoffs for all samples will most likely be different
from those applied to the subpopulations.

We can combine the coverage output from different samples with
`bedtools unionbedg`. We begin by generating a coverage file for all
samples, where the output columns will correspond to individual
samples. To save typing, we collect the sample names and generate
matching bed file names to pass as arguments to options `-names` and
`-i`, respectively. Also, we include positions with no coverage
(`-empty`), which requires the use of a genome file (option `-g`). The
bed output is piped to `bgzip` for compression before finally being
indexed with `tabix`:

```{bash }
#| label: bedtools-unionbedg-all
#| echo: true
#| eval: true
SAMPLES=$(csvtk grep -f Taxon -r -p "yellow" -r -p "red" sampleinfo.csv | csvtk cut -f SampleAlias | grep -v SampleAlias | tr "\n" " ")
BEDGZ=$(for sm in $SAMPLES; do echo -e -n "${sm}.per-base.bed.gz "; done)
bedtools unionbedg -header -names $SAMPLES -g ${REF}.fai -empty -i $BEDGZ | bgzip > ALL.bg.gz
tabix -f -p bed -S 1 ALL.bg.gz
```

::: {.callout-note collapse=true}

## {{< fa brands linux >}} Command line magic

The code above works as follows. We first use `csvtk` to `grep`
(search) for the population names red and yellow in the `Taxon` column
in `sampleinfo.csv`, thereby filtering the output to lines where
`Taxon` matches the population names.
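To see what this first filtering step produces, you can run it on its
own (an illustrative peek; the output is truncated to three lines):

```{bash }
#| label: csvtk-grep-taxon-peek
#| echo: true
#| eval: false
csvtk grep -f Taxon -r -p "yellow" -r -p "red" sampleinfo.csv | head -n 3
```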
Then, we cut out the column of interest, `SampleAlias`, and remove the
header line (`grep -v SampleAlias` matches anything but
`SampleAlias`). Finally, `tr` translates the newline character `\n` to
a space. The output is stored in the `SAMPLES` variable through the
command substitution (`$()`) syntax.

We then iterate through `$SAMPLES` to generate the input file names
with the `echo` command, storing the output in `$BEDGZ`. These
variables are passed on to `bedtools unionbedg` to generate a
[bedgraph file](https://genome.ucsc.edu/goldenPath/help/bedgraph.html)
combining all samples.

:::



As mentioned previously, we also need to combine coverages for the
yellow and red populations separately.

::: {.callout-exercise}

Using the previous command as a template, try to generate per
population coverage files.

::: {.callout-hint}

You only need to modify the code that generates `SAMPLES` by grepping
for each population separately (`csvtk grep -f Taxon -r -p red` and so
on). Remember also to modify the output file name (e.g.,
`red.bg.gz`).

:::

::: {.callout-answer}

An example using a for loop is shown here. Alternatively, you could
copy-paste the code above and write out the population labels
explicitly.

```{bash }
#| label: bedtools-unionbedg-per-population
#| echo: true
#| eval: true
for pop in red yellow; do
    SAMPLES=$(csvtk grep -f Taxon -r -p $pop sampleinfo.csv | csvtk cut -f SampleAlias | grep -v SampleAlias | tr "\n" " ")
    BEDGZ=$(for sm in $SAMPLES; do echo -e -n "${sm}.per-base.bed.gz "; done)
    bedtools unionbedg -header -names $SAMPLES -g ${REF}.fai -empty -i $BEDGZ | bgzip > $pop.bg.gz
    tabix -f -p bed -S 1 $pop.bg.gz
done
```

:::

:::

### Total coverage

Since we eventually want to filter on total coverage, we sum the
per-sample coverages for each sample set with `awk`^[Unfortunately,
`csvtk` does not seem to support calculating column margins out of the
box, which is why we resort to this somewhat complicated construct]:

```{bash }
#| label: awk-sum-ALL-coverage
#| echo: true
#| eval: true
bgzip -c -d ALL.bg.gz | \
    awk -v FS="\t" -v OFS="\t" 'NR > 1 {sum=0; for (i=4; i<=NF; i++) sum+=$i; print $1, $2, $3, sum}' | \
    bgzip > ALL.sum.bed.gz
tabix -f -p bed ALL.sum.bed.gz
```

Here we use `awk` to sum the values from column 4 onwards (`NF` holds
the number of the last column).

For illustration, we plot the total coverage:

```{bash }
#| label: total-coverage
#| echo: true
#| eval: true
#| fig-show: asis
#| fig-cap: Coverage for ALL samples in 1kb windows. Experiment with changing the window size (`-w`) parameter to change smoothing.
bedtools intersect -a <(bedtools makewindows -g ${REF}.fai -w 1000) \



Total coverage in 1kb windows.

:::



In order to define thresholds for subsequent filtering, we need to
know the total coverage distribution.
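A first numerical impression can be had by computing summary
statistics of the coverage column (a sketch; note that these
statistics are computed per bed interval, not weighted by interval
length):

```{bash }
#| label: csvtk-coverage-summary
#| echo: true
#| eval: false
bgzip -c -d ALL.sum.bed.gz | \
    csvtk summary -t -H -f 4:min,4:q1,4:median,4:q3,4:max
```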
To see the full distribution, we plot the proportion of the genome
coverage versus depth of coverage (similar to k-mer plots in sequence
assembly projects). To do so, we must summarize our coverage file such
that we count how many bases have a given coverage. This can be
achieved by noting that each row in the bed file consists of the
columns `CHROM`, `START`, `END`, and `COVERAGE`. We can generate a
histogram table by, for each value of `COVERAGE`, summing the lengths
of the regions (`END` - `START`).



```{bash }
#| label: compile-genome-coverage-for-plot
#| echo: true
#| eval: true
# Add column containing length of region (end - start)
csvtk mutate2 -t -H -w 0 -e '$3-$2' ALL.sum.bed.gz | \
    # Sum the regions and *group* by the coverage (fourth column);
    # this gives the total number of bases with a given coverage
    csvtk summary -t -H -g 4 -f 5:sum -w 0 | \
    csvtk sort -t -k 1:n | \
    awk 'BEGIN {OFS=","; cumsum=0} {cumsum += $2; print $1,$2,cumsum}' > ALL.sum.bed.csv
```



We plot the coverage distribution below, along with a plot of the
cumulative coverage.

```{bash }
#| label: total-depth-of-coverage-distribution
#| echo: true
#| eval: true
csvtk plot line -H ALL.sum.bed.csv -x 1 -y 2 --point-size 0.01 \
    --xlab "Depth of coverage (X)" --ylab "Genome coverage (bp)" \
    --width 9.0 --height 3.5 > fig-plot-total-coverage-distribution.png
csvtk plot line -H ALL.sum.bed.csv -x 1 -y 3 --point-size 0.01 \
    --xlab "Depth of coverage (X)" --ylab "Cumulative genome coverage (bp)" \
    --width 9.0 --height 3.5 > fig-plot-total-coverage-distribution-cumulative.png
```

::: {#fig-plot-total-coverage-distribution attr-output='.details summary="Output"' layout-nrow=2}



Genome coverage vs depth of coverage.
:::

In @fig-plot-total-coverage-distribution (a), a diploid peak is
evident at around coverage X=100; we zoom in on that region to get a
better view:

```{bash }
#| label: total-depth-of-coverage-zoom-in
#| echo: true
#| eval: false
csvtk plot line -H ALL.sum.bed.csv -x 1 -y 2 --point-size 0.01 \
    --xlab "Depth of coverage (X)" --ylab "Genome coverage (bp)" \
    --width 9.0 --height 3.5 --x-min 40 --x-max 140
```

```{bash }
#| label: plot-total-depth-of-coverage-zoom-in
#| echo: false
#| eval: true
csvtk plot line -H ALL.sum.bed.csv -x 1 -y 2 --point-size 0.01 \
    --xlab "Depth of coverage (X)" --ylab "Genome coverage (bp)" \
    --width 9.0 --height 3.5 --x-min 40 --x-max 140 > \
    fig-plot-total-coverage-distribution-hist-zoom-in.png
```

::: {#fig-plot-total-coverage-distribution-zoom-in attr-output='.details summary="Output"'}

![](fig-plot-total-coverage-distribution-hist-zoom-in.png)

Genome coverage vs depth of coverage.

:::

@lou_BeginnerGuideLowcoverage_2021 point out that appropriate
thresholds depend on the data set, but as a general rule recommend a
minimum depth threshold at <0.8X average coverage, and a maximum depth
threshold at mean coverage plus one or two standard deviations. For
the sake of simplicity, you can infer cutoffs simply by inspecting
@fig-plot-total-coverage-distribution-zoom-in manually; here, we will
use the range 50-110.

We then use these thresholds to generate a bed file containing regions
that are accessible, i.e., have sufficient coverage for downstream



filtering criteria.

```{bash }
#| label: filter-all-bed-file-on-coverage
#| echo: true
#| eval: true
#| results: hide
csvtk filter -t -H ALL.sum.bed.gz -f '4>50' | \
    csvtk filter -t -H -f '4<110' | \
    bgzip -c > ALL.sum.depth.bed.gz
bedtools genomecov -i ALL.sum.depth.bed.gz -g ${REF}.fai | grep genome
```

```{r }
#| label: r-compute-genome-coverage
#| echo: false
#| eval: true
x <- read.table("ALL.sum.depth.bed.gz", header=FALSE)
cov <- format(sum(x$V3-x$V2) / 1e5 * 100, digits=3)
```

Consequently, `r cov`% of the genome is accessible by depth.

::: {.callout-note}

## Exercise

Generate coverage sums for the red and yellow sample sets, and from
these determine coverage thresholds and apply them to generate bed
files with accessible regions.

::: {.callout-answer}

We base the answer on the previous code.
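If you prefer not to read the cutoffs off the plots, you can also
compute the rule-of-thumb thresholds quoted above (<0.8X mean; mean
plus two standard deviations) directly from a summed bedgraph. A
sketch is shown here for `ALL.sum.bed.gz`; the same command applies to
the per-population files generated below:

```{bash }
#| label: awk-coverage-thresholds
#| echo: true
#| eval: false
# Length-weighted mean and standard deviation of the total coverage,
# turned into min/max depth thresholds
bgzip -c -d ALL.sum.bed.gz | \
    awk -v FS="\t" '{len=$3-$2; n+=len; s+=$4*len; ss+=$4*$4*len}
        END {m=s/n; sd=sqrt(ss/n-m*m);
             printf "mean=%.1f sd=%.1f lower=%.1f upper=%.1f\n",
                 m, sd, 0.8*m, m+2*sd}'
```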

```{bash }
#| label: awk-sum-sample-set-coverage
#| echo: true
#| eval: true
for pop in red yellow; do
    bgzip -c -d $pop.bg.gz | \
        awk -v FS="\t" -v OFS="\t" 'NR > 1 {sum=0; for (i=4; i<=NF; i++) sum+=$i; print $1, $2, $3, sum}' | \
        bgzip > $pop.sum.bed.gz
    tabix -f -p bed $pop.sum.bed.gz
    csvtk mutate2 -t -H -w 0 -e '$3-$2' $pop.sum.bed.gz | \
        csvtk summary -t -H -g 4 -f 5:sum -w 0 | \
        csvtk sort -t -k 1:n | \
        awk 'BEGIN {OFS=","; cumsum=0} {cumsum += $2; print $1,$2,cumsum}' > $pop.sum.bed.csv
done
```

```{bash }
#| label: plot-sum-sample-set-coverage
#| echo: true
#| eval: true
for pop in red yellow; do
    cat ${pop}.sum.bed.csv | \
        csvtk plot line -x 1 -y 2 --point-size 0.01 \
        --xlab "Depth of coverage (X)" --ylab "Genome coverage (bp)" \
        --width 9.0 --height 3.5 --x-min 0 --x-max 100 > \
        fig-plot-total-coverage-distribution-hist-zoom-in-$pop.png
done
```

::: {#fig-plot-population-coverage-distribution attr-output='.details summary="Output"' layout-nrow=2}

![Zoomed in genome coverage, red](fig-plot-total-coverage-distribution-hist-zoom-in-red.png){#fig-plot-total-coverage-distribution-hist-zoom-in-red}

![Zoomed in genome coverage, yellow](fig-plot-total-coverage-distribution-hist-zoom-in-yellow.png){#fig-plot-total-coverage-distribution-hist-zoom-in-yellow}

Zoomed-in coverage distribution for the red and yellow ecotypes.
:::

:::

:::

Now we have combined the total per-sample coverages for ALL samples,
and for the red and yellow sample sets. The upcoming task will be to
generate sequence masks from the total coverage and the minimum number
of individuals with coverage greater than zero.

### Filter on minimum number of individuals

In addition to filtering on total coverage, we will also filter on the
minimum number of individuals with non-zero coverage. This is to
account for cases where regions that pass the minimum coverage filter
originate from just a few samples with unusually high coverage. Here,
we will remove sites where more than 50% of individuals have zero
coverage.

```{bash }
#| label: awk-filter-50pct-ALL
#| echo: true
#| eval: true
bgzip -c -d ALL.bg.gz | \
    awk -v FS="\t" 'BEGIN {OFS="\t"} NR > 1 {count=0; for (i=4; i<=NF; i++) {if ($i>0) count+=1}; if (count>=((NF-3)*0.5)) {print $1, $2, $3}}' | \
    bgzip > ALL.ind.bed.gz
tabix -f -p bed ALL.ind.bed.gz
```

::: {.callout-exercise}

## Exercise

Apply the same filter on the minimum number of individuals to the red
and yellow sample sets.
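::: {.callout-hint}

Reuse the `ALL.bg.gz` command above, but loop over the population
bedgraph files (`red.bg.gz` and `yellow.bg.gz`) and name the outputs
accordingly (e.g., `red.ind.bed.gz`).

:::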

::: {.callout-answer}

```{bash }
#| label: awk-filter-50pct-sample-sets
#| echo: true
#| eval: true
for pop in red yellow; do
    bgzip -c -d $pop.bg.gz | \
        awk -v FS="\t" 'BEGIN {OFS="\t"} NR > 1 {count=0; for (i=4; i<=NF; i++) {if ($i>0) count+=1}; if (count>=((NF-3)*0.5)) {print $1, $2, $3}}' | \
        bgzip > $pop.ind.bed.gz
    tabix -f -p bed $pop.ind.bed.gz
done
```

:::

:::

### Sequence masks

Finally, for each sample set, we will use `bedtools intersect` to
generate combined bed files for total sum coverages and the filter on
the minimum number of individuals.



bedtools maskfasta -fi ${REF} -mc 1 -fo ${REF}.mask.fa -bed ${REF}.bed
```

+
+## References