Skip to content
This repository has been archived by the owner on Dec 18, 2023. It is now read-only.

Commit

Permalink
Merge branch 'report' into 'master'
Browse files Browse the repository at this point in the history
Add more details on read count to the report

See merge request pipelines/hydra!22
  • Loading branch information
mdehollander committed Sep 19, 2018
2 parents bdebe18 + 065d89f commit 9274a52
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 68 deletions.
102 changes: 87 additions & 15 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import os

#min_version("3.12") # R Markdown reports have been introduced in Snakemake 3.12

report: "report/workflow.rst"

if os.path.isfile("config.yaml"):
configfile: "config.yaml"

Expand Down Expand Up @@ -51,6 +53,34 @@ rule rename:
shell("zcat {input.reverse} | awk '{{print (NR%4 == 1) ? \"@{params.prefix}_\" substr($0,2) : $0}}' | gzip -c > {output.reverse}")
shell("md5sum {output.reverse} > {output.reverse_md5}")

# Count the raw reads of each sample from its gzipped forward FASTQ.
# readstats.py (khmer) writes a small CSV; its data row's second column is the
# read count. The shell then builds a one-line "<sample><TAB><count>" file
# (the \t in the triple-quoted string is expanded to a real tab by Python
# before the shell sees it) that readstat_raw_merge concatenates per project.
# Both outputs are temporary(): they are deleted once the merge rule has run.
rule readstat_raw:
input:
"{project}/gunzip/{data}_R1.fastq.gz",
output:
readstats = temporary("{project}/stats/raw/readstat.{data}.csv"),
readcount = temporary("{project}/stats/raw/readcount.{data}.csv")
params:
sample="{data}"
log:
"{project}/stats/raw/readstat.{data}.log"
conda:
"envs/khmer.yaml"
threads: 1
shell: """
readstats.py {input} --csv -o {output.readstats} 2> {log}
printf "%s\t" {params.sample} > {output.readcount}
tail -n +2 {output.readstats} | cut -d, -f 2 >> {output.readcount}
"""

# Concatenate the per-sample raw read counts into one table per project,
# headed by "#SampleID<TAB>raw" (\t becomes a literal tab via Python string
# escaping). Output is protected() so the summary is not accidentally removed.
rule readstat_raw_merge:
input:
expand("{project}/stats/raw/readcount.{data}.csv", project=config['project'], data=config["data"])
output:
protected("{project}/stats/readstat_raw.csv")
shell: """ echo "#SampleID\traw" > {output}
cat {input} >> {output}"""


rule filter_primers:
input:
forward="{project}/gunzip/{data}_R1.fastq.gz",
Expand Down Expand Up @@ -116,24 +146,50 @@ if config["barcode_in_header"]:
shell: """extract_barcodes.py -f {input.forward} -s{params.sep} -l {params.length} -o {params.outdir} -c barcode_in_label && fastq_to_fasta < {output.barcodes} > {output.barcodes_fasta} && \
trimmomatic PE -threads {threads} -phred33 {input.forward} {input.reverse} {output.forward} {output.forward_unpaired} {output.reverse} {output.reverse_unpaired} ILLUMINACLIP:{output.barcodes_fasta}:0:0:{params.threshold} 2> {log}"""

rule readstat_reverse:
# Count reads remaining in the forward reads after primer filtering.
# Same recipe as readstat_raw: readstats.py produces a CSV, and a one-line
# "<sample><TAB><count>" file is derived from its second column for merging.
# NOTE(review): unlike readstat_filter_reverse, this reads only
# {project}/filter/ and does not handle the barcode_in_header case — confirm
# that is intended for forward reads.
rule readstat_filter:
input:
"{project}/filter/{data}_R1.fastq",
output:
readstats = temporary("{project}/stats/filter/readstat.{data}_R1.csv"),
readcount = temporary("{project}/stats/filter/readcount.{data}_R1.csv")
params:
sample="{data}"
log:
"{project}/stats/filter/readstat.{data}_R1.log"
conda:
"envs/khmer.yaml"
threads: 1
shell: """
readstats.py {input} --csv -o {output.readstats} 2> {log}
printf "%s\t" {params.sample} > {output.readcount}
tail -n +2 {output.readstats} | cut -d, -f 2 >> {output.readcount}
"""

# Merge the per-sample post-filter read counts (forward reads) into one
# project-level table headed "#SampleID<TAB>filter"; mirrors
# readstat_raw_merge for the filtering step.
rule readstat_filter_merge:
input:
expand("{project}/stats/filter/readcount.{data}_R1.csv", project=config['project'], data=config["data"])
output:
protected("{project}/stats/readstat_filter_R1.csv")
shell: """ echo "#SampleID\tfilter" > {output}
cat {input} >> {output}"""

rule readstat_filter_reverse:
input:
"{project}/barcode/{data}_R2.fastq" if config["barcode_in_header"] else\
"{project}/filter/{data}_R2.fastq",
output:
temporary("{project}/stats/reverse/readstat.{data}.csv")
temporary("{project}/stats/filter/readstat.{data}_R2.csv")
log:
"{project}/stats/reverse/readstat.{data}.log"
"{project}/stats/filter/readstat.{data}_R2.log"
conda:
"envs/khmer.yaml"
threads: 1
shell: "readstats.py {input} --csv -o {output} 2> {log}"

rule readstat_reverse_merge:
input:
expand("{project}/stats/reverse/readstat.{data}.csv", project=config['project'], data=config["data"])
expand("{project}/stats/filter/readstat.{data}_R2.csv", project=config['project'], data=config["data"])
output:
protected("{project}/stats/readstat_R2.csv")
protected("{project}/stats/readstat_filter_R2.csv")
shell: "cat {input[0]} | head -n 1 > {output} && for file in {input}; do tail -n +2 $file >> {output}; done;"


Expand Down Expand Up @@ -200,20 +256,29 @@ rule readstat_mergepairs:
input:
fasta = "{project}/mergepairs/{data}.fasta"
output:
temporary("{project}/stats/readstat.{data}.csv")
readstats = temporary("{project}/stats/mergepairs/readstat_mergepairs.{data}.csv"),
readcount = temporary("{project}/stats/mergepairs/readcount_mergepairs.{data}.csv")
params:
sample="{data}"
log:
"{project}/stats/readstat.{data}.log"
"{project}/stats/readstat_mergepairs.{data}.log"
conda:
"envs/khmer.yaml"
threads: 1
shell: "readstats.py {input} --csv -o {output} 2> {log}"
shell: """
readstats.py {input} --csv -o {output.readstats} 2> {log}
printf "%s\t" {params.sample} > {output.readcount}
tail -n +2 {output.readstats} | cut -d, -f 2 >> {output.readcount}
"""

rule readstat_all:
rule readstat_mergepairs_merge:
input:
expand("{project}/stats/readstat.{data}.csv", project=config['project'], data=config["data"])
expand("{project}/stats/mergepairs/readcount_mergepairs.{data}.csv", project=config['project'], data=config["data"])
output:
protected("{project}/stats/readstat.csv")
shell: "cat {input[0]} | head -n 1 > {output} && for file in {input}; do tail -n +2 $file >> {output}; done;"
protected("{project}/stats/readstat_mergepairs.csv")
shell: """ echo "#SampleID\tmerged" > {output}
cat {input} >> {output}"""


if config['mergepairs'] == 'vsearch':
rule vsearch_merge:
Expand Down Expand Up @@ -728,12 +793,19 @@ rule workflow_graph:
conda: "envs/rulegraph.yaml"
shell: "snakemake --rulegraph | dot -Tsvg > {output}"

# Combine the per-step count tables (raw, post-filter forward, merged pairs)
# side by side for the report. paste yields six tab-separated columns
# (SampleID,raw, SampleID,filter, SampleID,merged); cut keeps the first
# SampleID column plus the three count columns (fields 1,2,4,6).
rule combine_readcount:
input:
"{project}/stats/readstat_raw.csv",
"{project}/stats/readstat_filter_R1.csv",
"{project}/stats/readstat_mergepairs.csv"
output:
"{project}/stats/readcount.csv"
shell: "paste -d '\\t' {input} | cut -f 1,2,4,6 > {output}"

rule report:
input:
workflow = "{project}/report/workflow.svg",
readstat = "{project}/stats/readstat.csv",
readstat_reverse = "{project}/stats/readstat_R2.csv",
readstat = "{project}/stats/readcount.csv",
biom = expand("{{project}}/{prog}/{ds}.minsize{minsize}.{clmethod}.taxonomy.biom", prog=["vsearch"],ds=config['project'],minsize=2,clmethod=config['clustering']),
otutable = expand("{{project}}/{prog}/{ds}.minsize{minsize}.{clmethod}.taxonomy.otutable.txt", prog=["vsearch"],ds=config['project'],minsize=2,clmethod=config['clustering']),
otus= expand("{{project}}/{prog}/otus/{ds}.minsize{minsize}.{clmethod}.fasta", prog=["vsearch"],ds=config['project'],minsize=2,clmethod=config['clustering']),
Expand Down
7 changes: 3 additions & 4 deletions envs/report.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,11 @@ channels:
- conda-forge
- bioconda
- r
- ericmjl
dependencies:
- r-base==3.3.1 # needed for bioconductor-phyloseq
- rpy2
- r-rmarkdown
- pandoc==1.19.2
- pandoc
# - pandoc==1.19.2
- r-plotly
- bioconductor-phyloseq
- r-ampvis
- pandoc-fignos
53 changes: 4 additions & 49 deletions report.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -41,56 +41,11 @@ library(knitr)
kable(snakemake@config$data)
```

## Summary

```{r set_plot_theme,echo=F}
library(ggplot2)
plot.theme <- theme(axis.text.x=element_text(size=12,angle=0,colour="black"),
axis.text.y=element_text(size=12,angle=0,colour="black"),
axis.title=element_text(size=18,angle=0,colour="black"),
plot.title=element_text(size=18,angle=0,colour="black"),
text=element_text(margin=margin(1,1,1,1,"mm"),debug=F),
panel.grid.minor=element_line(color="grey95"),
strip.text=element_text(size=16),
panel.border=element_rect(linetype="solid",colour="grey")
)
```


## Plot
```{r readstat, message=FALSE, warning=FALSE}
library(plotly)
readstat = read.csv(snakemake@input$readstat)
p <- ggplot(readstat, aes(y=readstat$avg_len, x = seq(1, length(readstat$avg_len)))) + geom_point()
ggplotly(p)
```

## Sequence length reverse reads
```{r readstat_reverse, message=FALSE, warning=FALSE}
library(plotly)
readstat = read.csv(snakemake@input$readstat_reverse)
p <- ggplot(readstat, aes(y=readstat$avg_len, x = seq(1, length(readstat$avg_len)))) + geom_point()
### Read counts

ggplotly(p)
```

## phyloseq
```{r phyloseq, message=FALSE, warning=FALSE}
library(phyloseq)
import_biom(snakemake@input$biom)
```

## ampvis plot
```{r amvpis, message=FALSE, warning=FALSE}
library(phyloseq)
library(ampvis)
ds <- import_biom(snakemake@input$biom, parseFunction=parse_taxonomy_greengenes)
dsn <- transform_sample_counts(ds, function(x) x / sum(x) * 100)
amp_core(data = dsn,
plot.type = "frequency",
weight = T,
scale.seq = 100)
```{r readstats, message=FALSE, warning=FALSE}
library(knitr)
# Render the combined per-sample read-count table passed in by Snakemake
# (tab-separated, with a "#SampleID" header line) as a Markdown table.
kable(read.delim(snakemake@input$readstat))
```

## Downloads
Expand Down
1 change: 1 addition & 0 deletions report/workflow.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is the workflow description.

0 comments on commit 9274a52

Please sign in to comment.