From 00e11ba643754d657c82b91bf4627b278b8f5b10 Mon Sep 17 00:00:00 2001
From: Mattias
Date: Mon, 10 Sep 2018 14:49:55 +0200
Subject: [PATCH 1/6] Also do stats for forward filtered reads

---
 Snakefile | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/Snakefile b/Snakefile
index a1c1a47..b30033e 100644
--- a/Snakefile
+++ b/Snakefile
@@ -116,14 +116,33 @@ if config["barcode_in_header"]:
         shell: """extract_barcodes.py -f {input.forward} -s{params.sep} -l {params.length} -o {params.outdir} -c barcode_in_label && fastq_to_fasta < {output.barcodes} > {output.barcodes_fasta} && \
                   trimmomatic PE -threads {threads} -phred33 {input.forward} {input.reverse} {output.forward} {output.forward_unpaired} {output.reverse} {output.reverse_unpaired} ILLUMINACLIP:{output.barcodes_fasta}:0:0:{params.threshold} 2> {log}"""
 
-rule readstat_reverse:
+rule readstat_filter:
+    input:
+        "{project}/filter/{data}_R1.fastq",
+    output:
+        temporary("{project}/stats/filter/readstat.{data}_R1.csv")
+    log:
+        "{project}/stats/filter/readstat.{data}_R1.log"
+    conda:
+        "envs/khmer.yaml"
+    threads: 1
+    shell: "readstats.py {input} --csv -o {output} 2> {log}"
+
+rule readstat_filter_merge:
+    input:
+        expand("{project}/stats/filter/readstat.{data}_R1.csv", project=config['project'], data=config["data"])
+    output:
+        protected("{project}/stats/readstat_filter_R1.csv")
+    shell: "cat {input[0]} | head -n 1 > {output} && for file in {input}; do tail -n +2 $file >> {output}; done;"
+
+
+rule readstat_filter_reverse:
     input:
-        "{project}/barcode/{data}_R2.fastq" if config["barcode_in_header"] else\
         "{project}/filter/{data}_R2.fastq",
     output:
-        temporary("{project}/stats/reverse/readstat.{data}.csv")
+        temporary("{project}/stats/filter/readstat.{data}_R2.csv")
     log:
-        "{project}/stats/reverse/readstat.{data}.log"
+        "{project}/stats/filter/readstat.{data}_R2.log"
     conda:
         "envs/khmer.yaml"
     threads: 1
@@ -131,9 +150,9 @@ rule readstat_reverse:
 
 rule readstat_reverse_merge:
     input:
-        expand("{project}/stats/reverse/readstat.{data}.csv", project=config['project'], data=config["data"])
+        expand("{project}/stats/filter/readstat.{data}_R2.csv", project=config['project'], data=config["data"])
     output:
-        protected("{project}/stats/readstat_R2.csv")
+        protected("{project}/stats/readstat_filter_R2.csv")
     shell: "cat {input[0]} | head -n 1 > {output} && for file in {input}; do tail -n +2 $file >> {output}; done;"
 
 
@@ -733,7 +752,8 @@ rule report:
     input:
         workflow = "{project}/report/workflow.svg",
         readstat = "{project}/stats/readstat.csv",
-        readstat_reverse = "{project}/stats/readstat_R2.csv",
+        readstat_filter = "{project}/stats/readstat_filter_R1.csv",
+        readstat_reverse = "{project}/stats/readstat_filter_R2.csv",
         biom = expand("{{project}}/{prog}/{ds}.minsize{minsize}.{clmethod}.taxonomy.biom", prog=["vsearch"],ds=config['project'],minsize=2,clmethod=config['clustering']),
         otutable = expand("{{project}}/{prog}/{ds}.minsize{minsize}.{clmethod}.taxonomy.otutable.txt", prog=["vsearch"],ds=config['project'],minsize=2,clmethod=config['clustering']),
         otus= expand("{{project}}/{prog}/otus/{ds}.minsize{minsize}.{clmethod}.fasta", prog=["vsearch"],ds=config['project'],minsize=2,clmethod=config['clustering']),
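Note on the merge rules above: they all reuse the same header-preserving concatenation one-liner, `cat {input[0]} | head -n 1` for the header followed by `tail -n +2` for every file's data rows. A minimal sketch of that pattern outside Snakemake, with made-up file names and counts (the `readstats.py --csv` column layout is assumed here, not taken from the patch):

```bash
# Hypothetical per-sample CSVs shaped like `readstats.py --csv` output
# (column order bp,seqs,avg_len,filename is an assumption).
printf 'bp,seqs,avg_len,filename\n150000,1000,150.0,sampleA_R1.fastq\n' > sampleA.csv
printf 'bp,seqs,avg_len,filename\n126000,900,140.0,sampleB_R1.fastq\n'  > sampleB.csv

# Same idea as the rule's one-liner: keep the header of the first file,
# then append only the data rows (line 2 onwards) of every file.
head -n 1 sampleA.csv > merged.csv    # the rule pipes through `cat {input[0]}` first; equivalent
for file in sampleA.csv sampleB.csv; do
    tail -n +2 "$file" >> merged.csv
done
cat merged.csv
# bp,seqs,avg_len,filename
# 150000,1000,150.0,sampleA_R1.fastq
# 126000,900,140.0,sampleB_R1.fastq
```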
From 3db42586c11056c50370b521e419aa2fc077d5fe Mon Sep 17 00:00:00 2001
From: Mattias
Date: Mon, 10 Sep 2018 15:04:30 +0200
Subject: [PATCH 2/6] Add readstat raw

---
 Snakefile | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/Snakefile b/Snakefile
index b30033e..a8cda96 100644
--- a/Snakefile
+++ b/Snakefile
@@ -51,6 +51,26 @@ rule rename:
         shell("zcat {input.reverse} | awk '{{print (NR%4 == 1) ? \"@{params.prefix}_\" substr($0,2) : $0}}' | gzip -c > {output.reverse}")
         shell("md5sum {output.reverse} > {output.reverse_md5}")
 
+rule readstat_raw:
+    input:
+        "{project}/gunzip/{data}_R1.fastq.gz",
+    output:
+        temporary("{project}/stats/raw/readstat.{data}.csv")
+    log:
+        "{project}/stats/raw/readstat.{data}.log"
+    conda:
+        "envs/khmer.yaml"
+    threads: 1
+    shell: "readstats.py {input} --csv -o {output} 2> {log}"
+
+rule readstat_raw_merge:
+    input:
+        expand("{project}/stats/raw/readstat.{data}.csv", project=config['project'], data=config["data"])
+    output:
+        protected("{project}/stats/readstat_raw.csv")
+    shell: "cat {input[0]} | head -n 1 > {output} && for file in {input}; do tail -n +2 $file >> {output}; done;"
+
+
 rule filter_primers:
     input:
         forward="{project}/gunzip/{data}_R1.fastq.gz",
@@ -751,6 +771,7 @@ rule workflow_graph:
 rule report:
     input:
         workflow = "{project}/report/workflow.svg",
+        readstat_raw = "{project}/stats/readstat_raw.csv",
         readstat = "{project}/stats/readstat.csv",
         readstat_filter = "{project}/stats/readstat_filter_R1.csv",
         readstat_reverse = "{project}/stats/readstat_filter_R2.csv",

From 790e0796752b5e64e5c181788878456ac560e08f Mon Sep 17 00:00:00 2001
From: Mattias
Date: Mon, 10 Sep 2018 15:17:38 +0200
Subject: [PATCH 3/6] Rename mergepairs stats

---
 Snakefile | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/Snakefile b/Snakefile
index a8cda96..fcf7429 100644
--- a/Snakefile
+++ b/Snakefile
@@ -239,9 +239,9 @@ rule readstat_mergepairs:
     input:
         fasta = "{project}/mergepairs/{data}.fasta"
     output:
-        temporary("{project}/stats/readstat.{data}.csv")
+        temporary("{project}/stats/readstat_mergepairs.{data}.csv")
     log:
-        "{project}/stats/readstat.{data}.log"
+        "{project}/stats/readstat_mergepairs.{data}.log"
     conda:
         "envs/khmer.yaml"
     threads: 1
@@ -249,9 +249,9 @@ rule readstat_mergepairs:
 
 rule readstat_all:
     input:
-        expand("{project}/stats/readstat.{data}.csv", project=config['project'], data=config["data"])
+        expand("{project}/stats/readstat_mergepairs.{data}.csv", project=config['project'], data=config["data"])
     output:
-        protected("{project}/stats/readstat.csv")
+        protected("{project}/stats/readstat_mergepairs.csv")
     shell: "cat {input[0]} | head -n 1 > {output} && for file in {input}; do tail -n +2 $file >> {output}; done;"
 
 if config['mergepairs'] == 'vsearch':
@@ -772,7 +772,7 @@ rule report:
     input:
         workflow = "{project}/report/workflow.svg",
         readstat_raw = "{project}/stats/readstat_raw.csv",
-        readstat = "{project}/stats/readstat.csv",
+        readstat_mergepairs = "{project}/stats/readstat_mergepairs.csv",
         readstat_filter = "{project}/stats/readstat_filter_R1.csv",
         readstat_reverse = "{project}/stats/readstat_filter_R2.csv",
         biom = expand("{{project}}/{prog}/{ds}.minsize{minsize}.{clmethod}.taxonomy.biom", prog=["vsearch"],ds=config['project'],minsize=2,clmethod=config['clustering']),
From fb399eb74518de8c57ea3aecb278fedc7882640d Mon Sep 17 00:00:00 2001
From: Mattias
Date: Mon, 10 Sep 2018 16:21:59 +0200
Subject: [PATCH 4/6] Get readstats in a single file with sample name

---
 Snakefile | 52 ++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 38 insertions(+), 14 deletions(-)

diff --git a/Snakefile b/Snakefile
index fcf7429..1483db5 100644
--- a/Snakefile
+++ b/Snakefile
@@ -55,20 +55,28 @@ rule readstat_raw:
     input:
         "{project}/gunzip/{data}_R1.fastq.gz",
     output:
-        temporary("{project}/stats/raw/readstat.{data}.csv")
+        readstats = temporary("{project}/stats/raw/readstat.{data}.csv"),
+        readcount = temporary("{project}/stats/raw/readcount.{data}.csv")
+    params:
+        sample="{data}"
     log:
         "{project}/stats/raw/readstat.{data}.log"
     conda:
         "envs/khmer.yaml"
     threads: 1
-    shell: "readstats.py {input} --csv -o {output} 2> {log}"
+    shell: """
+           readstats.py {input} --csv -o {output.readstats} 2> {log}
+           printf "%s\t" {params.sample} > {output.readcount}
+           tail -n +2 {output.readstats} | cut -d, -f 2 >> {output.readcount}
+           """
 
 rule readstat_raw_merge:
     input:
-        expand("{project}/stats/raw/readstat.{data}.csv", project=config['project'], data=config["data"])
+        expand("{project}/stats/raw/readcount.{data}.csv", project=config['project'], data=config["data"])
     output:
         protected("{project}/stats/readstat_raw.csv")
-    shell: "cat {input[0]} | head -n 1 > {output} && for file in {input}; do tail -n +2 $file >> {output}; done;"
+    shell: """ echo "#SampleID\traw" > {output}
+               cat {input} >> {output}"""
 
 
 rule filter_primers:
@@ -140,21 +148,28 @@ rule readstat_filter:
     input:
         "{project}/filter/{data}_R1.fastq",
     output:
-        temporary("{project}/stats/filter/readstat.{data}_R1.csv")
+        readstats = temporary("{project}/stats/filter/readstat.{data}_R1.csv"),
+        readcount = temporary("{project}/stats/filter/readcount.{data}_R1.csv")
+    params:
+        sample="{data}"
     log:
         "{project}/stats/filter/readstat.{data}_R1.log"
     conda:
         "envs/khmer.yaml"
     threads: 1
-    shell: "readstats.py {input} --csv -o {output} 2> {log}"
+    shell: """
+           readstats.py {input} --csv -o {output.readstats} 2> {log}
+           printf "%s\t" {params.sample} > {output.readcount}
+           tail -n +2 {output.readstats} | cut -d, -f 2 >> {output.readcount}
+           """
 
 rule readstat_filter_merge:
     input:
-        expand("{project}/stats/filter/readstat.{data}_R1.csv", project=config['project'], data=config["data"])
+        expand("{project}/stats/filter/readcount.{data}_R1.csv", project=config['project'], data=config["data"])
     output:
         protected("{project}/stats/readstat_filter_R1.csv")
-    shell: "cat {input[0]} | head -n 1 > {output} && for file in {input}; do tail -n +2 $file >> {output}; done;"
-
+    shell: """ echo "#SampleID\tfilter" > {output}
+               cat {input} >> {output}"""
 
 rule readstat_filter_reverse:
     input:
@@ -239,20 +254,29 @@ rule readstat_mergepairs:
     input:
         fasta = "{project}/mergepairs/{data}.fasta"
     output:
-        temporary("{project}/stats/readstat_mergepairs.{data}.csv")
+        readstats = temporary("{project}/stats/mergepairs/readstat_mergepairs.{data}.csv"),
+        readcount = temporary("{project}/stats/mergepairs/readcount_mergepairs.{data}.csv")
+    params:
+        sample="{data}"
     log:
         "{project}/stats/readstat_mergepairs.{data}.log"
     conda:
         "envs/khmer.yaml"
     threads: 1
-    shell: "readstats.py {input} --csv -o {output} 2> {log}"
+    shell: """
+           readstats.py {input} --csv -o {output.readstats} 2> {log}
+           printf "%s\t" {params.sample} > {output.readcount}
+           tail -n +2 {output.readstats} | cut -d, -f 2 >> {output.readcount}
+           """
 
-rule readstat_all:
+rule readstat_mergepairs_merge:
     input:
-        expand("{project}/stats/readstat_mergepairs.{data}.csv", project=config['project'], data=config["data"])
+        expand("{project}/stats/mergepairs/readcount_mergepairs.{data}.csv", project=config['project'], data=config["data"])
    output:
         protected("{project}/stats/readstat_mergepairs.csv")
-    shell: "cat {input[0]} | head -n 1 > {output} && for file in {input}; do tail -n +2 $file >> {output}; done;"
+    shell: """ echo "#SampleID\tmerged" > {output}
+               cat {input} >> {output}"""
+
 
 if config['mergepairs'] == 'vsearch':
     rule vsearch_merge:
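Note on PATCH 4: each readstat rule now also writes a one-line `sample<TAB>count` file next to the full readstats CSV, and the *_merge rules stack those under a stage-specific header instead of concatenating whole readstats CSVs. A rough sketch of the same shell steps outside Snakemake, with an invented sample name and count (the `readstats.py --csv` column order is an assumption):

```bash
# Hypothetical readstats.py --csv output for one sample (column order assumed).
printf 'bp,seqs,avg_len,filename\n150000,1000,150.0,sampleA_R1.fastq\n' > readstat.sampleA.csv

# Same steps as the rule's shell block: write "sampleA<TAB>", then append the
# read count taken from column 2 of the readstats CSV (skipping its header).
printf '%s\t' sampleA > readcount.sampleA.csv
tail -n +2 readstat.sampleA.csv | cut -d, -f 2 >> readcount.sampleA.csv
cat readcount.sampleA.csv
# sampleA  1000            (tab-separated)

# The *_merge rules then put a per-stage header on top and concatenate the
# per-sample readcount files; in the Snakefile the \t inside echo is already a
# literal tab, because Python expands it in the triple-quoted shell string.
printf '#SampleID\traw\n' > readstat_raw.csv
cat readcount.sampleA.csv >> readstat_raw.csv
```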
From 1d72c5789734fd5a40c51abc7027081dc36edfd0 Mon Sep 17 00:00:00 2001
From: Mattias
Date: Mon, 10 Sep 2018 16:28:32 +0200
Subject: [PATCH 5/6] Create a single file with read counts

---
 Snakefile | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Snakefile b/Snakefile
index 1483db5..1b50890 100644
--- a/Snakefile
+++ b/Snakefile
@@ -791,6 +791,14 @@ rule workflow_graph:
     conda: "envs/rulegraph.yaml"
     shell: "snakemake --rulegraph | dot -Tsvg > {output}"
 
+rule combine_readcount:
+    input:
+        "{project}/stats/readstat_raw.csv",
+        "{project}/stats/readstat_filter_R1.csv",
+        "{project}/stats/readstat_mergepairs.csv"
+    output:
+        "{project}/stats/readcount.csv"
+    shell: "paste -d '\\t' {input} | cut -f 1,2,4,6 > {output}"
 
 rule report:
     input:
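Note on PATCH 5: `combine_readcount` works because every per-stage table lists the samples in the same order (they are all expanded from the same `config["data"]`), each with its own leading `#SampleID` column. `paste` glues the three tables side by side and `cut -f 1,2,4,6` keeps one sample column plus the three count columns, dropping the duplicated sample columns 3 and 5. A small illustration with invented counts:

```bash
# Hypothetical per-stage count tables (tab-separated), shaped like the output
# of the *_merge rules from the previous patch.
printf '#SampleID\traw\nsampleA\t1000\nsampleB\t900\n'   > readstat_raw.csv
printf '#SampleID\tfilter\nsampleA\t950\nsampleB\t870\n' > readstat_filter_R1.csv
printf '#SampleID\tmerged\nsampleA\t910\nsampleB\t850\n' > readstat_mergepairs.csv

# paste yields six columns: SampleID raw SampleID filter SampleID merged;
# cut keeps columns 1, 2, 4 and 6.
paste -d '\t' readstat_raw.csv readstat_filter_R1.csv readstat_mergepairs.csv |
    cut -f 1,2,4,6 > readcount.csv
cat readcount.csv
# #SampleID  raw   filter  merged     (tab-separated)
# sampleA    1000  950     910
# sampleB    900   870     850
```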
From 065d89f3cb8c5ff54e4c06255c18eb0c9b7b3039 Mon Sep 17 00:00:00 2001
From: Mattias
Date: Wed, 19 Sep 2018 15:13:44 +0200
Subject: [PATCH 6/6] Add read counts to report and minimalize report

---
 Snakefile           |  7 +++---
 envs/report.yaml    |  7 +++---
 report.Rmd          | 53 ++++-----------------------------------------
 report/workflow.rst |  1 +
 4 files changed, 11 insertions(+), 57 deletions(-)
 create mode 100644 report/workflow.rst

diff --git a/Snakefile b/Snakefile
index 1b50890..9154f35 100644
--- a/Snakefile
+++ b/Snakefile
@@ -4,6 +4,8 @@ import os
 
 #min_version("3.12") # R Markdown reports have been introduced in Snakemake 3.12
 
+report: "report/workflow.rst"
+
 if os.path.isfile("config.yaml"):
     configfile: "config.yaml"
 
@@ -803,10 +805,7 @@ rule combine_readcount:
 rule report:
     input:
         workflow = "{project}/report/workflow.svg",
-        readstat_raw = "{project}/stats/readstat_raw.csv",
-        readstat_mergepairs = "{project}/stats/readstat_mergepairs.csv",
-        readstat_filter = "{project}/stats/readstat_filter_R1.csv",
-        readstat_reverse = "{project}/stats/readstat_filter_R2.csv",
+        readstat = "{project}/stats/readcount.csv",
         biom = expand("{{project}}/{prog}/{ds}.minsize{minsize}.{clmethod}.taxonomy.biom", prog=["vsearch"],ds=config['project'],minsize=2,clmethod=config['clustering']),
         otutable = expand("{{project}}/{prog}/{ds}.minsize{minsize}.{clmethod}.taxonomy.otutable.txt", prog=["vsearch"],ds=config['project'],minsize=2,clmethod=config['clustering']),
         otus= expand("{{project}}/{prog}/otus/{ds}.minsize{minsize}.{clmethod}.fasta", prog=["vsearch"],ds=config['project'],minsize=2,clmethod=config['clustering']),
diff --git a/envs/report.yaml b/envs/report.yaml
index 850aebd..37d90c0 100644
--- a/envs/report.yaml
+++ b/envs/report.yaml
@@ -2,12 +2,11 @@ channels:
   - conda-forge
   - bioconda
   - r
-  - ericmjl
 dependencies:
+  - r-base==3.3.1 # needed for bioconductor-phyloseq
   - rpy2
   - r-rmarkdown
-  - pandoc==1.19.2
+  - pandoc
+#  - pandoc==1.19.2
   - r-plotly
   - bioconductor-phyloseq
-  - r-ampvis
-  - pandoc-fignos
diff --git a/report.Rmd b/report.Rmd
index 3c39554..15ca23f 100644
--- a/report.Rmd
+++ b/report.Rmd
@@ -41,56 +41,11 @@ library(knitr)
 kable(snakemake@config$data)
 ```
 
-## Summary
-
-```{r set_plot_theme,echo=F}
-library(ggplot2)
-plot.theme <- theme(axis.text.x=element_text(size=12,angle=0,colour="black"),
-                    axis.text.y=element_text(size=12,angle=0,colour="black"),
-                    axis.title=element_text(size=18,angle=0,colour="black"),
-                    plot.title=element_text(size=18,angle=0,colour="black"),
-                    text=element_text(margin=margin(1,1,1,1,"mm"),debug=F),
-                    panel.grid.minor=element_line(color="grey95"),
-                    strip.text=element_text(size=16),
-                    panel.border=element_rect(linetype="solid",colour="grey")
-                    )
-```
-
-
-## Plot
-```{r readstat, message=FALSE, warning=FALSE}
-library(plotly)
-readstat = read.csv(snakemake@input$readstat)
-p <- ggplot(readstat, aes(y=readstat$avg_len, x = seq(1, length(readstat$avg_len)))) + geom_point()
-
-ggplotly(p)
-```
-
-## Sequence length reverse reads
-```{r readstat_reverse, message=FALSE, warning=FALSE}
-library(plotly)
-readstat = read.csv(snakemake@input$readstat_reverse)
-p <- ggplot(readstat, aes(y=readstat$avg_len, x = seq(1, length(readstat$avg_len)))) + geom_point()
+### Read counts
 
-ggplotly(p)
-```
-
-## phyloseq
-```{r phyloseq, message=FALSE, warning=FALSE}
-library(phyloseq)
-import_biom(snakemake@input$biom)
-```
-
-## ampvis plot
-```{r amvpis, message=FALSE, warning=FALSE}
-library(phyloseq)
-library(ampvis)
-ds <- import_biom(snakemake@input$biom, parseFunction=parse_taxonomy_greengenes)
-dsn <- transform_sample_counts(ds, function(x) x / sum(x) * 100)
-amp_core(data = dsn,
-         plot.type = "frequency",
-         weight = T,
-         scale.seq = 100)
+```{r readstats, message=FALSE, warning=FALSE}
+library(knitr)
+kable(read.delim(snakemake@input$readstat))
 ```
 
 ## Downloads
diff --git a/report/workflow.rst b/report/workflow.rst
new file mode 100644
index 0000000..6b06b41
--- /dev/null
+++ b/report/workflow.rst
@@ -0,0 +1 @@
+This is the workflow description.