diff --git a/Snakefile b/Snakefile index a1c1a47..9154f35 100644 --- a/Snakefile +++ b/Snakefile @@ -4,6 +4,8 @@ import os #min_version("3.12") # R Markdown reports have been introduced in Snakemake 3.12 +report: "report/workflow.rst" + if os.path.isfile("config.yaml"): configfile: "config.yaml" @@ -51,6 +53,34 @@ rule rename: shell("zcat {input.reverse} | awk '{{print (NR%4 == 1) ? \"@{params.prefix}_\" substr($0,2) : $0}}' | gzip -c > {output.reverse}") shell("md5sum {output.reverse} > {output.reverse_md5}") +rule readstat_raw: + input: + "{project}/gunzip/{data}_R1.fastq.gz", + output: + readstats = temporary("{project}/stats/raw/readstat.{data}.csv"), + readcount = temporary("{project}/stats/raw/readcount.{data}.csv") + params: + sample="{data}" + log: + "{project}/stats/raw/readstat.{data}.log" + conda: + "envs/khmer.yaml" + threads: 1 + shell: """ + readstats.py {input} --csv -o {output.readstats} 2> {log} + printf "%s\t" {params.sample} > {output.readcount} + tail -n +2 {output.readstats} | cut -d, -f 2 >> {output.readcount} + """ + +rule readstat_raw_merge: + input: + expand("{project}/stats/raw/readcount.{data}.csv", project=config['project'], data=config["data"]) + output: + protected("{project}/stats/readstat_raw.csv") + shell: """ echo "#SampleID\traw" > {output} + cat {input} >> {output}""" + + rule filter_primers: input: forward="{project}/gunzip/{data}_R1.fastq.gz", @@ -116,14 +146,40 @@ if config["barcode_in_header"]: shell: """extract_barcodes.py -f {input.forward} -s{params.sep} -l {params.length} -o {params.outdir} -c barcode_in_label && fastq_to_fasta < {output.barcodes} > {output.barcodes_fasta} && \ trimmomatic PE -threads {threads} -phred33 {input.forward} {input.reverse} {output.forward} {output.forward_unpaired} {output.reverse} {output.reverse_unpaired} ILLUMINACLIP:{output.barcodes_fasta}:0:0:{params.threshold} 2> {log}""" -rule readstat_reverse: +rule readstat_filter: + input: + "{project}/filter/{data}_R1.fastq", + output: + readstats = temporary("{project}/stats/filter/readstat.{data}_R1.csv"), + readcount = temporary("{project}/stats/filter/readcount.{data}_R1.csv") + params: + sample="{data}" + log: + "{project}/stats/filter/readstat.{data}_R1.log" + conda: + "envs/khmer.yaml" + threads: 1 + shell: """ + readstats.py {input} --csv -o {output.readstats} 2> {log} + printf "%s\t" {params.sample} > {output.readcount} + tail -n +2 {output.readstats} | cut -d, -f 2 >> {output.readcount} + """ + +rule readstat_filter_merge: + input: + expand("{project}/stats/filter/readcount.{data}_R1.csv", project=config['project'], data=config["data"]) + output: + protected("{project}/stats/readstat_filter_R1.csv") + shell: """ echo "#SampleID\tfilter" > {output} + cat {input} >> {output}""" + +rule readstat_filter_reverse: input: - "{project}/barcode/{data}_R2.fastq" if config["barcode_in_header"] else\ "{project}/filter/{data}_R2.fastq", output: - temporary("{project}/stats/reverse/readstat.{data}.csv") + temporary("{project}/stats/filter/readstat.{data}_R2.csv") log: - "{project}/stats/reverse/readstat.{data}.log" + "{project}/stats/filter/readstat.{data}_R2.log" conda: "envs/khmer.yaml" threads: 1 @@ -131,9 +187,9 @@ rule readstat_reverse: rule readstat_reverse_merge: input: - expand("{project}/stats/reverse/readstat.{data}.csv", project=config['project'], data=config["data"]) + expand("{project}/stats/filter/readstat.{data}_R2.csv", project=config['project'], data=config["data"]) output: - protected("{project}/stats/readstat_R2.csv") + protected("{project}/stats/readstat_filter_R2.csv") shell: "cat {input[0]} | head -n 1 > {output} && for file in {input}; do tail -n +2 $file >> {output}; done;" @@ -200,20 +256,29 @@ rule readstat_mergepairs: input: fasta = "{project}/mergepairs/{data}.fasta" output: - temporary("{project}/stats/readstat.{data}.csv") + readstats = temporary("{project}/stats/mergepairs/readstat_mergepairs.{data}.csv"), + readcount = temporary("{project}/stats/mergepairs/readcount_mergepairs.{data}.csv") + params: + sample="{data}" log: - "{project}/stats/readstat.{data}.log" + "{project}/stats/readstat_mergepairs.{data}.log" conda: "envs/khmer.yaml" threads: 1 - shell: "readstats.py {input} --csv -o {output} 2> {log}" + shell: """ + readstats.py {input} --csv -o {output.readstats} 2> {log} + printf "%s\t" {params.sample} > {output.readcount} + tail -n +2 {output.readstats} | cut -d, -f 2 >> {output.readcount} + """ -rule readstat_all: +rule readstat_mergepairs_merge: input: - expand("{project}/stats/readstat.{data}.csv", project=config['project'], data=config["data"]) + expand("{project}/stats/mergepairs/readcount_mergepairs.{data}.csv", project=config['project'], data=config["data"]) output: - protected("{project}/stats/readstat.csv") - shell: "cat {input[0]} | head -n 1 > {output} && for file in {input}; do tail -n +2 $file >> {output}; done;" + protected("{project}/stats/readstat_mergepairs.csv") + shell: """ echo "#SampleID\tmerged" > {output} + cat {input} >> {output}""" + if config['mergepairs'] == 'vsearch': rule vsearch_merge: @@ -728,12 +793,19 @@ rule workflow_graph: conda: "envs/rulegraph.yaml" shell: "snakemake --rulegraph | dot -Tsvg > {output}" +rule combine_readcount: + input: + "{project}/stats/readstat_raw.csv", + "{project}/stats/readstat_filter_R1.csv", + "{project}/stats/readstat_mergepairs.csv" + output: + "{project}/stats/readcount.csv" + shell: "paste -d '\\t' {input} | cut -f 1,2,4,6 > {output}" rule report: input: workflow = "{project}/report/workflow.svg", - readstat = "{project}/stats/readstat.csv", - readstat_reverse = "{project}/stats/readstat_R2.csv", + readstat = "{project}/stats/readcount.csv", biom = expand("{{project}}/{prog}/{ds}.minsize{minsize}.{clmethod}.taxonomy.biom", prog=["vsearch"],ds=config['project'],minsize=2,clmethod=config['clustering']), otutable = expand("{{project}}/{prog}/{ds}.minsize{minsize}.{clmethod}.taxonomy.otutable.txt", prog=["vsearch"],ds=config['project'],minsize=2,clmethod=config['clustering']), otus= expand("{{project}}/{prog}/otus/{ds}.minsize{minsize}.{clmethod}.fasta", prog=["vsearch"],ds=config['project'],minsize=2,clmethod=config['clustering']), diff --git a/envs/report.yaml b/envs/report.yaml index 850aebd..37d90c0 100644 --- a/envs/report.yaml +++ b/envs/report.yaml @@ -2,12 +2,11 @@ channels: - conda-forge - bioconda - r - - ericmjl dependencies: + - r-base==3.3.1 # needed for bioconductor-phyloseq - rpy2 - r-rmarkdown - - pandoc==1.19.2 + - pandoc +# - pandoc==1.19.2 - r-plotly - bioconductor-phyloseq - - r-ampvis - - pandoc-fignos diff --git a/report.Rmd b/report.Rmd index 3c39554..15ca23f 100644 --- a/report.Rmd +++ b/report.Rmd @@ -41,56 +41,11 @@ library(knitr) kable(snakemake@config$data) ``` -## Summary - -```{r set_plot_theme,echo=F} -library(ggplot2) -plot.theme <- theme(axis.text.x=element_text(size=12,angle=0,colour="black"), - axis.text.y=element_text(size=12,angle=0,colour="black"), - axis.title=element_text(size=18,angle=0,colour="black"), - plot.title=element_text(size=18,angle=0,colour="black"), - text=element_text(margin=margin(1,1,1,1,"mm"),debug=F), - panel.grid.minor=element_line(color="grey95"), - strip.text=element_text(size=16), - panel.border=element_rect(linetype="solid",colour="grey") - ) -``` - - -## Plot -```{r readstat, message=FALSE, warning=FALSE} -library(plotly) -readstat = read.csv(snakemake@input$readstat) -p <- ggplot(readstat, aes(y=readstat$avg_len, x = seq(1, length(readstat$avg_len)))) + geom_point() - -ggplotly(p) -``` - -## Sequence length reverse reads -```{r readstat_reverse, message=FALSE, warning=FALSE} -library(plotly) -readstat = read.csv(snakemake@input$readstat_reverse) -p <- ggplot(readstat, aes(y=readstat$avg_len, x = seq(1, length(readstat$avg_len)))) + geom_point() +### Read counts -ggplotly(p) -``` - -## phyloseq -```{r phyloseq, message=FALSE, warning=FALSE} -library(phyloseq) -import_biom(snakemake@input$biom) -``` - -## ampvis plot -```{r amvpis, message=FALSE, warning=FALSE} -library(phyloseq) -library(ampvis) -ds <- import_biom(snakemake@input$biom, parseFunction=parse_taxonomy_greengenes) -dsn <- transform_sample_counts(ds, function(x) x / sum(x) * 100) -amp_core(data = dsn, - plot.type = "frequency", - weight = T, - scale.seq = 100) +```{r readstats, message=FALSE, warning=FALSE} +library(knitr) +kable(read.delim(snakemake@input$readstat)) ``` ## Downloads diff --git a/report/workflow.rst b/report/workflow.rst new file mode 100644 index 0000000..6b06b41 --- /dev/null +++ b/report/workflow.rst @@ -0,0 +1 @@ +This is the workflow description.