From 0b7b0e0c89b16b1704368898374c61fdc091e6a6 Mon Sep 17 00:00:00 2001 From: Sanjay Nagi Date: Tue, 18 Jun 2024 10:45:12 +0100 Subject: [PATCH 01/12] add local db support, untested --- .test/config/config_paired_end.yaml | 5 +++-- .test/config/config_single_end.yaml | 5 +++-- config/exampleconfig.yaml | 7 ++++--- workflow/rules/filterAnnotate.smk | 6 +++--- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/.test/config/config_paired_end.yaml b/.test/config/config_paired_end.yaml index ce0a522..f95b7cc 100644 --- a/.test/config/config_paired_end.yaml +++ b/.test/config/config_paired_end.yaml @@ -18,8 +18,9 @@ reference: "resources/reference/Anopheles-gambiae-PEST_TRANSCRIPTS_AgamP4.12-X.fa.gz" gff: "resources/reference/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12-X.gff3" - snpeffdb: - "Anopheles_gambiae" + snpeff: + local: False + dbpath: Anopheles_gambiae genes2transcripts: "resources/Gene2TranscriptMap.tsv" diff --git a/.test/config/config_single_end.yaml b/.test/config/config_single_end.yaml index c8d3417..c21096c 100644 --- a/.test/config/config_single_end.yaml +++ b/.test/config/config_single_end.yaml @@ -20,8 +20,9 @@ reference: "resources/reference/Anopheles-gambiae-PEST_TRANSCRIPTS_AgamP4.12-X.fa.gz" gff: "resources/reference/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12-X.gff3" - snpeffdb: - "Anopheles_gambiae" + snpeff: + local: False + dbpath: Anopheles_gambiae genes2transcripts: "resources/Gene2TranscriptMap.tsv" diff --git a/config/exampleconfig.yaml b/config/exampleconfig.yaml index d6d20a9..e775493 100755 --- a/config/exampleconfig.yaml +++ b/config/exampleconfig.yaml @@ -27,10 +27,11 @@ reference: "resources/reference/Anopheles-gambiae-PEST_TRANSCRIPTS_AgamP4.12.fa" # Path to transcriptome reference FASTA file gff: "resources/reference/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12.gff3" # Path to GFF annotation file - snpeffdb: - "Anopheles_gambiae" # SNPeff database name + snpeff: + local: False + dbpath: Anopheles_gambiae # SNPeff database name genes2transcripts: - "resources/exampleGene2TranscriptMap.tsv" # gene names file with gene and transcript names + resources/exampleGene2TranscriptMap.tsv # gene names file with gene and transcript names # Chromosome names for the appropriate species. # Please ensure that these correspond exactly to the reference fasta/gff files. Extra unwanted chromosomes (unplaced contigs) can be ignored. 
diff --git a/workflow/rules/filterAnnotate.smk b/workflow/rules/filterAnnotate.smk index db10ae7..b14aa4f 100644 --- a/workflow/rules/filterAnnotate.smk +++ b/workflow/rules/filterAnnotate.smk @@ -47,7 +47,7 @@ rule snpEffDbDownload: conda: "../envs/snpeff.yaml" params: - ref=config["reference"]["snpeffdb"], + ref=config["reference"]["snpeff"]['dbpath'], dir="resources/snpEffdb", shell: "snpEff download {params.ref} -dataDir {params.dir} 2> {log}" @@ -59,7 +59,7 @@ rule snpEff: """ input: calls="results/variantAnalysis/vcfs/variants.{contig}.vcf", - dl="workflow/scripts/snpEff/db.dl", + db=config['reference']['snpeff']['dbpath'] if config['snpeff']['local'] else "workflow/scripts/snpEff/db.dl", output: calls=expand( "results/variantAnalysis/vcfs/{dataset}.{{contig}}.vcf.gz", @@ -71,7 +71,7 @@ rule snpEff: conda: "../envs/snpeff.yaml" params: - db=config["reference"]["snpeffdb"], + db=config["reference"]["snpeff"]['dbpath'], prefix=lambda w, output: os.path.splitext(output[0])[0], dir="resources/snpEffdb", shell: From f635ddf9e8e7a1a5d4e5261387d3244de2fde392 Mon Sep 17 00:00:00 2001 From: Sanjay Nagi Date: Tue, 18 Jun 2024 10:50:45 +0100 Subject: [PATCH 02/12] fix --- workflow/rules/filterAnnotate.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/filterAnnotate.smk b/workflow/rules/filterAnnotate.smk index b14aa4f..cc0540c 100644 --- a/workflow/rules/filterAnnotate.smk +++ b/workflow/rules/filterAnnotate.smk @@ -59,7 +59,7 @@ rule snpEff: """ input: calls="results/variantAnalysis/vcfs/variants.{contig}.vcf", - db=config['reference']['snpeff']['dbpath'] if config['snpeff']['local'] else "workflow/scripts/snpEff/db.dl", + db=config['reference']['snpeff']['dbpath'] if config['reference']['snpeff']['local'] else "workflow/scripts/snpEff/db.dl", output: calls=expand( "results/variantAnalysis/vcfs/{dataset}.{{contig}}.vcf.gz", From eb0902cbf317cf7df467f442bb8927d7b3dc1ec0 Mon Sep 17 00:00:00 2001 From: Sanjay Nagi Date: Tue, 18 Jun 2024 10:57:49 +0100 Subject: [PATCH 03/12] edit config naming --- .test/config/config_paired_end.yaml | 8 ++++---- .test/config/config_single_end.yaml | 8 ++++---- workflow/rules/common.smk | 8 ++++---- workflow/rules/jupyter-book.smk | 2 +- workflow/rules/variantAnalysis.smk | 16 ++++++++-------- 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/.test/config/config_paired_end.yaml b/.test/config/config_paired_end.yaml index f95b7cc..d82038a 100644 --- a/.test/config/config_paired_end.yaml +++ b/.test/config/config_paired_end.yaml @@ -3,7 +3,7 @@ metadata: config/samples.tsv # Dataset name dataset: 'Test-GithubActionsCI' -jupyter-book: +results-jupyterbook: activate: True fastq: @@ -36,7 +36,7 @@ contrasts: QualityControl: fastp-trim: activate: True - mosdepth: + coverage: activate: True multiqc: activate: True @@ -69,7 +69,7 @@ VariantAnalysis: activate: True missingness: 0.4 - summaryStatistics: + geneticDiversity: activate: True missingness: 0.4 @@ -77,7 +77,7 @@ VariantAnalysis: activate: True missingness: 0.5 # Do we want to run pbs (Needs three conditions, two more closely related and a slight outgroup) - pbs: + population-branch-statistic: activate: False contrasts: ['PiriTia_ContTia_Kisumu'] diff --git a/.test/config/config_single_end.yaml b/.test/config/config_single_end.yaml index c21096c..847a94a 100644 --- a/.test/config/config_single_end.yaml +++ b/.test/config/config_single_end.yaml @@ -3,7 +3,7 @@ metadata: config/samples.tsv # Dataset name dataset: 'Test-GithubActionsCI' -jupyter-book: 
+results-jupyterbook: activate: True fastq: @@ -39,7 +39,7 @@ contrasts: QualityControl: fastp-trim: activate: True - mosdepth: + coverage: activate: True multiqc: activate: True @@ -74,7 +74,7 @@ VariantAnalysis: activate: True missingness: 0.4 - summaryStatistics: + geneticDiversity: activate: True missingness: 0.4 @@ -82,7 +82,7 @@ VariantAnalysis: activate: True missingness: 0.5 # Do we want to run pbs (Needs three conditions, two more closely related and a slight outgroup) - pbs: + population-branch-statistic: activate: False contrasts: - 'PiriTia_ContTia_Kisumu' diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index dcf5e66..400621d 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -1,4 +1,4 @@ -if config["VariantAnalysis"]["selection"]["pbs"]["activate"]: +if config["VariantAnalysis"]["selection"]["population-branch-statistic"]["activate"]: windowedStats = ["Fst", "Pbs"] else: windowedStats = ["Fst"] @@ -39,7 +39,7 @@ rule remove_input_param_cell_nbs: gsea = "docs/rna-seq-pop-results/notebooks/gene-set-enrichment-analysis.ipynb" if config['DifferentialExpression']['GSEA']['activate'] else [], gene_families = "docs/rna-seq-pop-results/notebooks/gene-families-heatmap.ipynb" if config['miscellaneous']['GeneFamiliesHeatmap']['activate'] else [], pca = "docs/rna-seq-pop-results/notebooks/principal-components-analysis.ipynb" if config['VariantAnalysis']['pca']['activate'] else [], - genetic_diversity = "docs/rna-seq-pop-results/notebooks/genetic-diversity.ipynb" if config['VariantAnalysis']['summaryStatistics']['activate'] else [], + genetic_diversity = "docs/rna-seq-pop-results/notebooks/genetic-diversity.ipynb" if config['VariantAnalysis']['geneticDiversity']['activate'] else [], selection = "docs/rna-seq-pop-results/notebooks/windowed-selection.ipynb" if config['VariantAnalysis']['selection']['activate'] else [], voi = "docs/rna-seq-pop-results/notebooks/variants-of-interest.ipynb" if config['miscellaneous']['VariantsOfInterest']['activate'] else [], karyo = "docs/rna-seq-pop-results/notebooks/karyotype.ipynb" if config['VariantAnalysis']['karyotype']['activate'] else [], @@ -192,7 +192,7 @@ def GetDesiredOutputs(wildcards): ) ) - if config['QualityControl']['mosdepth']['activate']: + if config['QualityControl']['coverage']['activate']: wanted_input.extend( expand( [ @@ -293,7 +293,7 @@ def GetDesiredOutputs(wildcards): ) ) - if config['jupyter-book']['activate']: + if config['results-jupyterbook']['activate']: wanted_input.extend( [ "results/rna-seq-pop-results/_build/html/index.html" diff --git a/workflow/rules/jupyter-book.smk b/workflow/rules/jupyter-book.smk index a4c4345..45aafe1 100644 --- a/workflow/rules/jupyter-book.smk +++ b/workflow/rules/jupyter-book.smk @@ -7,7 +7,7 @@ rule jupyterbook: gsea = "docs/rna-seq-pop-results/notebooks/gene-set-enrichment-analysis.ipynb" if config['DifferentialExpression']['GSEA']['activate'] else [], gene_families = "docs/rna-seq-pop-results/notebooks/gene-families-heatmap.ipynb" if config['miscellaneous']['GeneFamiliesHeatmap']['activate'] else [], pca = "docs/rna-seq-pop-results/notebooks/principal-components-analysis.ipynb" if config['VariantAnalysis']['pca']['activate'] else [], - genetic_diversity = "docs/rna-seq-pop-results/notebooks/genetic-diversity.ipynb" if config['VariantAnalysis']['summaryStatistics']['activate'] else [], + genetic_diversity = "docs/rna-seq-pop-results/notebooks/genetic-diversity.ipynb" if config['VariantAnalysis']['geneticDiversity']['activate'] else [], selection = 
"docs/rna-seq-pop-results/notebooks/windowed-selection.ipynb" if config['VariantAnalysis']['selection']['activate'] else [], voi = "docs/rna-seq-pop-results/notebooks/variants-of-interest.ipynb" if config['miscellaneous']['VariantsOfInterest']['activate'] else [], karyo = "docs/rna-seq-pop-results/notebooks/karyotype.ipynb" if config['VariantAnalysis']['karyotype']['activate'] else [], diff --git a/workflow/rules/variantAnalysis.smk b/workflow/rules/variantAnalysis.smk index 358b974..8f56267 100644 --- a/workflow/rules/variantAnalysis.smk +++ b/workflow/rules/variantAnalysis.smk @@ -103,7 +103,7 @@ rule pca_notebook: cp {output.nb} {output.docs_nb} 2>> {log} """ -rule SummaryStatistics_notebook: +rule geneticDiversity_notebook: """ Calculate population genetic summary statistics on genotype data """ @@ -132,7 +132,7 @@ rule SummaryStatistics_notebook: dataset=config["dataset"], config_path = configpath, ploidy=config["VariantAnalysis"]["ploidy"], - missingprop=config["VariantAnalysis"]["summaryStatistics"]["missingness"], + missingprop=config["VariantAnalysis"]["geneticDiversity"]["missingness"], qualflt=30, shell: """ @@ -166,11 +166,11 @@ rule WindowedFstPBS_notebook: PBS=( expand( "results/variantAnalysis/selection/pbs/{wsize}/{pbscomp}.PBS.{contig}.svg", - pbscomp=config["VariantAnalysis"]["selection"]["pbs"]["contrasts"], + pbscomp=config["VariantAnalysis"]["selection"]["population-branch-statistic"]["contrasts"], contig=config["contigs"], wsize=['1000snp_window', '2000snp_window', '5000snp_window'], ) - if config["VariantAnalysis"]["selection"]["pbs"]["activate"] + if config["VariantAnalysis"]["selection"]["population-branch-statistic"]["activate"] else [] ), log: @@ -180,8 +180,8 @@ rule WindowedFstPBS_notebook: params: dataset=config["dataset"], config = configpath, - pbs=config["VariantAnalysis"]["selection"]["pbs"]["activate"], - pbscomps=config["VariantAnalysis"]["selection"]["pbs"]["contrasts"], + pbs=config["VariantAnalysis"]["selection"]["population-branch-statistic"]["activate"], + pbscomps=config["VariantAnalysis"]["selection"]["population-branch-statistic"]["contrasts"], ploidy=config["VariantAnalysis"]["ploidy"], missingprop=config["VariantAnalysis"]["selection"]["missingness"], qualflt=30, @@ -219,8 +219,8 @@ rule PerGeneFstPBSDxyPi: params: dataset=config["dataset"], DEcontrasts=config["contrasts"], - pbs=config["VariantAnalysis"]["selection"]["pbs"]["activate"], - pbscomps=config["VariantAnalysis"]["selection"]["pbs"]["contrasts"], + pbs=config["VariantAnalysis"]["selection"]["population-branch-statistic"]["activate"], + pbscomps=config["VariantAnalysis"]["selection"]["population-branch-statistic"]["contrasts"], contigs=config["contigs"], ploidy=config["VariantAnalysis"]["ploidy"], missingprop=config["VariantAnalysis"]["selection"]["missingness"], From a2fde750c58ac891f6340ae298bd8d36e3a4558e Mon Sep 17 00:00:00 2001 From: Sanjay Nagi Date: Tue, 18 Jun 2024 11:51:19 +0100 Subject: [PATCH 04/12] fix --- workflow/notebooks/windowed-selection.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/notebooks/windowed-selection.ipynb b/workflow/notebooks/windowed-selection.ipynb index 9c8c5f6..4f842e9 100644 --- a/workflow/notebooks/windowed-selection.ipynb +++ b/workflow/notebooks/windowed-selection.ipynb @@ -109,7 +109,7 @@ "comparisons = comparisons.contrast.str.split(\"_\", expand=True)\n", "comparisons.columns = ['sus', 'res']\n", "comparisons = [list(row) for i,row in comparisons.iterrows()]\n", - "pbscomps = 
config_params[\"VariantAnalysis\"]['selection']['pbs']['contrasts']\n", + "pbscomps = config_params[\"VariantAnalysis\"]['selection']['population-branch-statistic']['contrasts']\n", "\n", "for i, contig in enumerate(contigs):\n", "\n", From 1e19229d4e391f03e8411ef446c5d0c33046053c Mon Sep 17 00:00:00 2001 From: Sanjay Nagi Date: Thu, 27 Jun 2024 10:06:32 +0100 Subject: [PATCH 05/12] customsnpeffdb_support --- .test/config/config_paired_end.yaml | 6 ++--- .test/config/config_single_end.yaml | 4 +-- config/exampleconfig.yaml | 10 ++++---- workflow/rules/filterAnnotate.smk | 40 ++++++++++++++++++++++++----- 4 files changed, 43 insertions(+), 17 deletions(-) diff --git a/.test/config/config_paired_end.yaml b/.test/config/config_paired_end.yaml index d82038a..41db665 100644 --- a/.test/config/config_paired_end.yaml +++ b/.test/config/config_paired_end.yaml @@ -19,8 +19,8 @@ reference: gff: "resources/reference/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12-X.gff3" snpeff: - local: False - dbpath: Anopheles_gambiae + customdb: True + dbname: Anopheles_gambiae genes2transcripts: "resources/Gene2TranscriptMap.tsv" @@ -46,7 +46,7 @@ DifferentialExpression: # Activate differential expression gene-level: activate: True isoform-level: - activate: True + activate: False progressiveGenes: activate: True diff --git a/.test/config/config_single_end.yaml b/.test/config/config_single_end.yaml index 847a94a..598cf95 100644 --- a/.test/config/config_single_end.yaml +++ b/.test/config/config_single_end.yaml @@ -21,8 +21,8 @@ reference: gff: "resources/reference/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12-X.gff3" snpeff: - local: False - dbpath: Anopheles_gambiae + customdb: False + dbname: Anopheles_gambiae genes2transcripts: "resources/Gene2TranscriptMap.tsv" diff --git a/config/exampleconfig.yaml b/config/exampleconfig.yaml index e775493..95fc3d5 100755 --- a/config/exampleconfig.yaml +++ b/config/exampleconfig.yaml @@ -28,8 +28,8 @@ reference: gff: "resources/reference/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12.gff3" # Path to GFF annotation file snpeff: - local: False - dbpath: Anopheles_gambiae # SNPeff database name + customdb: False + dbname: Anopheles_gambiae # SNPeff database name genes2transcripts: resources/exampleGene2TranscriptMap.tsv # gene names file with gene and transcript names @@ -49,7 +49,7 @@ contrasts: QualityControl: fastp-trim: activate: False - mosdepth: + coverage: activate: True multiqc: activate: True @@ -83,7 +83,7 @@ VariantAnalysis: activate: True missingness: 1 - summaryStatistics: # Estimate Population Genetic Summary Statistics such as Dxy, Pi + geneticDiversity: # Estimate Population Genetic Summary Statistics such as Dxy, Pi activate: True missingness: 1 @@ -91,7 +91,7 @@ VariantAnalysis: activate: True missingness: 1 - pbs: + population-branch-statistic: activate: True # Activate Population Branch Statistic analysis (Needs three conditions, two closely related and an outgroup) for resistance, do survivors_unexposed_susceptible contrasts: - 'gambiaePM_gambiaeCont_Kisumu' diff --git a/workflow/rules/filterAnnotate.smk b/workflow/rules/filterAnnotate.smk index cc0540c..45611f8 100644 --- a/workflow/rules/filterAnnotate.smk +++ b/workflow/rules/filterAnnotate.smk @@ -41,25 +41,51 @@ rule snpEffDbDownload: Download the snpEff database for your species """ output: - touch("workflow/scripts/snpEff/db.dl"), + touch("resources/reference/mysnpeffdb/db.dl"), log: "logs/snpEff/snpEffDbDownload.log", conda: "../envs/snpeff.yaml" params: - ref=config["reference"]["snpeff"]['dbpath'], 
- dir="resources/snpEffdb", + ref=config["reference"]["snpeff"]['dbname'], + dir="resources/reference/mysnpeffdb", shell: "snpEff download {params.ref} -dataDir {params.dir} 2> {log}" +rule createCustomSnpEffDb: + """ + Create a custom SnpEff database from a reference genome and GFF file + """ + input: + fa=config['reference']['genome'].rstrip(".gz"), + gff=config['reference']['gff'], + output: + "resources/reference/mysnpeffdb/sequences.fa", + "resources/reference/mysnpeffdb/genes.gff", + "resources/reference/mysnpeffdb/snpEffectPredictor.bin" + log: + "logs/snpEff/createCustomSnpEffDb.log", + conda: + "../envs/snpeff.yaml" + params: + dataDir=lambda x: wkdir + "/resources/reference", + wkdir=wkdir + shell: + """ + ln -s {params.wkdir}/{input.fa} {params.dataDir}/mysnpeffdb/sequences.fa 2> {log} + ln -s {params.wkdir}/{input.gff} {params.dataDir}/mysnpeffdb/genes.gff 2> {log} + snpEff build -gff3 -v -dataDir {params.dataDir} -configOption mysnpeffdb.genome=mysnpeffdb mysnpeffdb -noCheckCds -noCheckProtein 2>> {log} + """ + + rule snpEff: """ Run snpEff on the VCFs """ input: calls="results/variantAnalysis/vcfs/variants.{contig}.vcf", - db=config['reference']['snpeff']['dbpath'] if config['reference']['snpeff']['local'] else "workflow/scripts/snpEff/db.dl", + db="resources/reference/mysnpeffdb/snpEffectPredictor.bin" if config['reference']['snpeff']['customdb'] else "resources/reference/mysnpeffdb/db.dl", output: calls=expand( "results/variantAnalysis/vcfs/{dataset}.{{contig}}.vcf.gz", @@ -71,12 +97,12 @@ rule snpEff: conda: "../envs/snpeff.yaml" params: - db=config["reference"]["snpeff"]['dbpath'], + db=config["reference"]["snpeff"]['dbname'], prefix=lambda w, output: os.path.splitext(output[0])[0], - dir="resources/snpEffdb", + dataDir=lambda x: wkdir + "resources/reference/mysnpeffdb", shell: """ - snpEff eff {params.db} -dataDir {params.dir} -csvStats {output.csvStats} {input.calls} > {params.prefix} 2> {log} + snpEff eff {params.db} -dataDir {params.dataDir} -csvStats {output.csvStats} {input.calls} > {params.prefix} 2> {log} bgzip {params.prefix} """ From d876ac8a5da35aa36c71b3595228c2920fa5f6e1 Mon Sep 17 00:00:00 2001 From: Sanjay Nagi Date: Thu, 27 Jun 2024 10:25:53 +0100 Subject: [PATCH 06/12] restructure --- .test/config/config_paired_end.yaml | 2 +- .test/config/config_single_end.yaml | 2 +- config/exampleconfig.yaml | 4 +- workflow/Snakefile | 10 +- workflow/rules/alignment.smk | 127 ------------------ workflow/rules/common.smk | 46 +------ workflow/rules/diffexp.smk | 39 ++++++ workflow/rules/hisat2-freebayes.smk | 122 +++++++++++++++++ workflow/rules/jupyter-book.smk | 31 ++++- workflow/rules/qc.smk | 8 +- .../rules/{filterAnnotate.smk => snpEff.smk} | 64 --------- ...ricks-gpu.smk => star-haplotypecaller.smk} | 0 workflow/rules/utilities.smk | 69 ++++++++++ workflow/rules/variantCalling.smk | 49 ------- workflow/rules/variantsOfInterest.smk | 4 +- 15 files changed, 277 insertions(+), 300 deletions(-) delete mode 100644 workflow/rules/alignment.smk create mode 100644 workflow/rules/hisat2-freebayes.smk rename workflow/rules/{filterAnnotate.smk => snpEff.smk} (56%) rename workflow/rules/{parabricks-gpu.smk => star-haplotypecaller.smk} (100%) create mode 100644 workflow/rules/utilities.smk delete mode 100644 workflow/rules/variantCalling.smk diff --git a/.test/config/config_paired_end.yaml b/.test/config/config_paired_end.yaml index 6464f1a..4cf72c3 100644 --- a/.test/config/config_paired_end.yaml +++ b/.test/config/config_paired_end.yaml @@ -1,4 +1,5 @@ metadata: 
config/samples.tsv +pipeline: cpu # Dataset name dataset: 'Test-GithubActionsCI' @@ -61,7 +62,6 @@ DifferentialExpression: # Activate differential expression VariantAnalysis: activate: True - caller: freebayes #haplotypecaller # Options: haplotypecaller, freebayes ploidy: 10 chunks: 9 # Number of chunks to split the genome into when parallelising freebayes # Number of chunks to split the genome into when parallelising freebayes diff --git a/.test/config/config_single_end.yaml b/.test/config/config_single_end.yaml index 1f78d53..8da2aae 100644 --- a/.test/config/config_single_end.yaml +++ b/.test/config/config_single_end.yaml @@ -1,4 +1,5 @@ metadata: config/samples.tsv +pipeline: cpu # Dataset name dataset: 'Test-GithubActionsCI' @@ -66,7 +67,6 @@ DifferentialExpression: # Activate differential expression VariantAnalysis: activate: True - caller: freebayes ploidy: 10 chunks: 9 # Number of chunks to split the genome into when parallelising freebayes # Number of chunks to split the genome into when parallelising freebayes diff --git a/config/exampleconfig.yaml b/config/exampleconfig.yaml index a440aac..a854f12 100755 --- a/config/exampleconfig.yaml +++ b/config/exampleconfig.yaml @@ -1,5 +1,5 @@ # RNA-Seq-Pop - +pipeline: parabricks #parabricks or cpu metadata: config/samples.tsv # samplesheet metadata file dataset: 'Ag_Bouake' # Dataset name: Can be anything, will be used to name some main output files @@ -76,7 +76,6 @@ DifferentialExpression: VariantAnalysis: activate: True - caller: freebayes ploidy: 10 # Ploidy level for freebayes to call at (Generally we are using pooled samples).For diploid organisms, this should be 2 * number of individuals in each pool chunks: 9 # Number of chunks to split each chromosome into when parallelising freebayes. 9 or less is recommended. @@ -115,7 +114,6 @@ VariantAnalysis: # - "2Ru" - miscellaneous: VariantsOfInterest: activate: True diff --git a/workflow/Snakefile b/workflow/Snakefile index 37b4487..7092b91 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -23,14 +23,14 @@ welcome(version="v2.0.2") include: "rules/qc.smk" include: "rules/diffexp.smk" -include: "rules/alignment.smk" -include: "rules/variantCalling.smk" -include: "rules/filterAnnotate.smk" +include: "rules/utilities.smk" +include: "rules/hisat2-freebayes.smk" +include: "rules/snpEff.smk" include: "rules/variantAnalysis.smk" include: "rules/jupyter-book.smk" -if config['VariantAnalysis']['caller'] == 'haplotypecaller': - include: "rules/parabricks-gpu.smk" +if config['pipeline'] == 'parabricks': + include: "rules/star-haplotypecaller.smk" rule all: input: diff --git a/workflow/rules/alignment.smk b/workflow/rules/alignment.smk deleted file mode 100644 index 6a713be..0000000 --- a/workflow/rules/alignment.smk +++ /dev/null @@ -1,127 +0,0 @@ - -rule KallistoIndex: - """ - Create a kallisto index of the reference transcriptome - """ - input: - fasta=config["reference"]["transcriptome"], - output: - index="resources/reference/kallisto.idx", - group: - "diffexp" - log: - "logs/kallisto/index.log", - wrapper: - "v1.15.0/bio/kallisto/index" - - -rule KallistoQuant: - """ - Pseudo-align reads for each sample to the reference transcriptome. - Bootstrap to allow for isoform differential expression. 
- """ - input: - fastq=lambda wildcards: getFASTQs(wildcards=wildcards, rules="KallistoQuant"), - index="resources/reference/kallisto.idx", - output: - directory("results/counts/{sample}"), - group: - "diffexp" - log: - "logs/kallisto/quant_{sample}.log", - params: - extra="-b 100" if config['fastq']['paired'] is True else "-b 100 --single -l 75 -s 5", - threads: 24 - wrapper: - "v1.15.0/bio/kallisto/quant" - - -rule GenomeUnzip: - """ - Index the reference genome with samtools - """ - input: - config["reference"]["genome"], - output: - config["reference"]["genome"].rstrip(".gz"), - log: - "logs/GenomeUnzip.log", - shell: - "gzip -d -c {input} > {output} 2> {log}" - - -rule GenomeIndex: - """ - Index the reference genome with samtools - """ - input: - config["reference"]["genome"].rstrip(".gz"), - output: - config["reference"]["genome"].rstrip(".gz") + ".fai", - log: - "logs/GenomeIndex.log", - wrapper: - "v1.15.0/bio/samtools/faidx" - - -rule HISAT2index: - """ - Make a HISAT2 index of the reference genome - """ - input: - fasta=config["reference"]["genome"].rstrip(".gz"), - output: - "resources/reference/ht2index/idx.1.ht2", - touch("resources/reference/ht2index/.complete"), - log: - "logs/HISAT2/HISAT2index.log", - conda: - "../envs/variants.yaml" - params: - prefix=lambda w, output: output[0].split(os.extsep)[0], - threads: 8 - shell: - "hisat2-build -p {threads} {input.fasta} {params.prefix} 2> {log}" - - -rule HISAT2align: - """ - Align reads to the genome with HISAT2, mark duplicates with samblaster and sort with samtools - """ - input: - reads=lambda wildcards: getFASTQs( - wildcards=wildcards, rules="HISAT2align_input" - ), - idx="resources/reference/ht2index/.complete", - output: - "results/alignments/{sample}.hisat2.bam", - log: - align="logs/HISAT2/{sample}_align.log", - sort="logs/samtoolsSort/{sample}.log", - conda: - "../envs/variants.yaml" - params: - readflags=lambda wildcards: getFASTQs(wildcards=wildcards, rules="HISAT2align"), - extra="--dta -q --rg-id {sample} --rg SM:{sample} --rg PL:ILLUMINA --new-summary", - idx="resources/reference/ht2index/idx", - samblaster="" if config['fastq']['paired'] is True else "--ignoreUnmated" - threads: 12 - shell: - """ - hisat2 {params.extra} --threads {threads} -x {params.idx} {params.readflags} 2> {log.align} | - samblaster {params.samblaster} 2> {log.sort} | samtools sort -@{threads} - -o {output} 2>> {log.sort} - """ - - -rule IndexBams: - """ - Index bams with samtools - """ - input: - "results/alignments/{sample}.hisat2.bam" if config['VariantAnalysis']['caller'] == 'freebayes' else "results/alignments/{sample}.star.bam" - output: - "results/alignments/{sample}.hisat2.bam.bai" if config['VariantAnalysis']['caller'] == 'freebayes' else "results/alignments/{sample}.star.bam.bai" - log: - "logs/IndexBams/{sample}.log", - wrapper: - "v1.15.0/bio/samtools/index" diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 7c8eb21..9c231f4 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -16,46 +16,6 @@ def load_metadata(metadata_path): raise ValueError("Metadata file must be .xlsx or .csv") return metadata - -rule set_kernel: - input: - f'{workflow.basedir}/envs/pythonGenomics.yaml' - output: - touch("results/.kernel.set") - conda: f'{workflow.basedir}/envs/pythonGenomics.yaml' - log: - "logs/notebooks/set_kernel.log" - shell: - """ - python -m ipykernel install --user --name=pythonGenomics 2> {log} - """ - -rule remove_input_param_cell_nbs: - input: - input_nb = 
f"{workflow.basedir}/notebooks/process-notebooks.ipynb", - counts_qc = "docs/rna-seq-pop-results/notebooks/counts-qc.ipynb", - diffexp = "docs/rna-seq-pop-results/notebooks/differential-expression.ipynb", - qc = "docs/rna-seq-pop-results/notebooks/quality-control.ipynb" if config['QualityControl']['multiqc']['activate'] else [], - gsea = "docs/rna-seq-pop-results/notebooks/gene-set-enrichment-analysis.ipynb" if config['DifferentialExpression']['GSEA']['activate'] else [], - gene_families = "docs/rna-seq-pop-results/notebooks/gene-families-heatmap.ipynb" if config['miscellaneous']['GeneFamiliesHeatmap']['activate'] else [], - pca = "docs/rna-seq-pop-results/notebooks/principal-components-analysis.ipynb" if config['VariantAnalysis']['pca']['activate'] else [], - genetic_diversity = "docs/rna-seq-pop-results/notebooks/genetic-diversity.ipynb" if config['VariantAnalysis']['geneticDiversity']['activate'] else [], - selection = "docs/rna-seq-pop-results/notebooks/windowed-selection.ipynb" if config['VariantAnalysis']['selection']['activate'] else [], - voi = "docs/rna-seq-pop-results/notebooks/variants-of-interest.ipynb" if config['miscellaneous']['VariantsOfInterest']['activate'] else [], - karyo = "docs/rna-seq-pop-results/notebooks/karyotype.ipynb" if config['VariantAnalysis']['karyotype']['activate'] else [], - output: - out_nb = "results/notebooks/process-notebooks.ipynb", - process = touch("results/rna-seq-pop-results/.processed_nbs") - conda: - f'{workflow.basedir}/envs/pythonGenomics.yaml' - log: - "logs/notebooks/remove_input_param_cell_nbs.log" - params: - wkdir = wkdir - shell: - """ - papermill -k pythonGenomics {input.input_nb} {output.out_nb} -p wkdir {params.wkdir} 2> {log} - """ def getFASTQs(wildcards, rules=None): @@ -109,10 +69,10 @@ def getBAM(wildcards): """ Get BAM files depending on aligner """ - if config['variantAnalysis']['caller'] == 'freebayes': - bam = "results/alignments/{sample}.hisat2.bam" - else: + if config['pipeline'] == 'parabricks': bam = "results/alignments/{sample}.star.bam" + else: + bam = "results/alignments/{sample}.hisat2.bam" return bam def GetDesiredOutputs(wildcards): diff --git a/workflow/rules/diffexp.smk b/workflow/rules/diffexp.smk index e62bcc7..f009605 100644 --- a/workflow/rules/diffexp.smk +++ b/workflow/rules/diffexp.smk @@ -1,3 +1,42 @@ + +rule KallistoIndex: + """ + Create a kallisto index of the reference transcriptome + """ + input: + fasta=config["reference"]["transcriptome"], + output: + index="resources/reference/kallisto.idx", + group: + "diffexp" + log: + "logs/kallisto/index.log", + wrapper: + "v1.15.0/bio/kallisto/index" + + +rule KallistoQuant: + """ + Pseudo-align reads for each sample to the reference transcriptome. + Bootstrap to allow for isoform differential expression. 
+ """ + input: + fastq=lambda wildcards: getFASTQs(wildcards=wildcards, rules="KallistoQuant"), + index="resources/reference/kallisto.idx", + output: + directory("results/counts/{sample}"), + group: + "diffexp" + log: + "logs/kallisto/quant_{sample}.log", + params: + extra="-b 100" if config['fastq']['paired'] is True else "-b 100 --single -l 75 -s 5", + threads: 24 + wrapper: + "v1.15.0/bio/kallisto/quant" + + + rule DifferentialGeneExpression: """ Perform differential expression analysis at the gene-level with DESeq2 diff --git a/workflow/rules/hisat2-freebayes.smk b/workflow/rules/hisat2-freebayes.smk new file mode 100644 index 0000000..c4767dd --- /dev/null +++ b/workflow/rules/hisat2-freebayes.smk @@ -0,0 +1,122 @@ + +rule HISAT2index: + """ + Make a HISAT2 index of the reference genome + """ + input: + fasta=config["reference"]["genome"].rstrip(".gz"), + output: + "resources/reference/ht2index/idx.1.ht2", + touch("resources/reference/ht2index/.complete"), + log: + "logs/HISAT2/HISAT2index.log", + conda: + "../envs/variants.yaml" + params: + prefix=lambda w, output: output[0].split(os.extsep)[0], + threads: 8 + shell: + "hisat2-build -p {threads} {input.fasta} {params.prefix} 2> {log}" + + +rule HISAT2align: + """ + Align reads to the genome with HISAT2, mark duplicates with samblaster and sort with samtools + """ + input: + reads=lambda wildcards: getFASTQs( + wildcards=wildcards, rules="HISAT2align_input" + ), + idx="resources/reference/ht2index/.complete", + output: + "results/alignments/{sample}.hisat2.bam", + log: + align="logs/HISAT2/{sample}_align.log", + sort="logs/samtoolsSort/{sample}.log", + conda: + "../envs/variants.yaml" + params: + readflags=lambda wildcards: getFASTQs(wildcards=wildcards, rules="HISAT2align"), + extra="--dta -q --rg-id {sample} --rg SM:{sample} --rg PL:ILLUMINA --new-summary", + idx="resources/reference/ht2index/idx", + samblaster="" if config['fastq']['paired'] is True else "--ignoreUnmated" + threads: 12 + shell: + """ + hisat2 {params.extra} --threads {threads} -x {params.idx} {params.readflags} 2> {log.align} | + samblaster {params.samblaster} 2> {log.sort} | samtools sort -@{threads} - -o {output} 2>> {log.sort} + """ + + +chunks = np.arange(1, config["VariantAnalysis"]["chunks"]) + +rule GenerateFreebayesParams: + input: + ref_idx=config["reference"]["genome"].rstrip(".gz"), + index=config["reference"]["genome"].rstrip(".gz") + ".fai", + bams=expand("results/alignments/{sample}.hisat2.bam", sample=samples), + output: + bamlist="results/alignments/bam.list", + pops="results/alignments/populations.tsv", + regions=expand( + "resources/regions/genome.{contig}.region.{i}.bed", + contig=config["contigs"], + i=chunks, + ), + log: + "logs/GenerateFreebayesParams.log", + params: + metadata=config["metadata"], + contigs=config["contigs"], + chunks=config["VariantAnalysis"]["chunks"], + conda: + "../envs/diffexp.yaml" + script: + "../scripts/GenerateFreebayesParams.R" + + +rule VariantCallingFreebayes: + """ + Run freebayes on chunks of the genome, splitting the samples by population (strain) + """ + input: + bams=expand("results/alignments/{sample}.hisat2.bam", sample=samples), + index=expand("results/alignments/{sample}.hisat2.bam.bai", sample=samples), + ref=config["reference"]["genome"].rstrip(".gz"), + samples=ancient("results/alignments/bam.list"), + pops=ancient("results/alignments/populations.tsv"), + regions=ancient("resources/regions/genome.{contig}.region.{i}.bed"), + output: + 
temp("results/variantAnalysis/vcfs/freebayes/{contig}/variants.{i}.vcf"), + log: + "logs/VariantCallingFreebayes/{contig}.{i}.log", + params: + ploidy=config["VariantAnalysis"]["ploidy"], + conda: + "../envs/variants.yaml" + threads: 1 + shell: + "freebayes -f {input.ref} -t {input.regions} --ploidy {params.ploidy} --populations {input.pops} --pooled-discrete --use-best-n-alleles 5 -L {input.samples} > {output} 2> {log}" + +chunks = np.arange(1, config["VariantAnalysis"]["chunks"]) + +rule ConcatVCFs: + """ + Concatenate VCFs together + """ + input: + calls=expand( + "results/variantAnalysis/vcfs/freebayes/{{contig}}/variants.{i}.vcf", + i=chunks, + ), + output: + temp("results/variantAnalysis/vcfs/freebayes/variants.{contig}.vcf"), + log: + "logs/ConcatVCFs/{contig}.log", + conda: + "../envs/variants.yaml" + threads: 4 + shell: + "bcftools concat {input.calls} | vcfuniq > {output} 2> {log}" + + diff --git a/workflow/rules/jupyter-book.smk b/workflow/rules/jupyter-book.smk index 45aafe1..d7502c0 100644 --- a/workflow/rules/jupyter-book.smk +++ b/workflow/rules/jupyter-book.smk @@ -24,4 +24,33 @@ rule jupyterbook: """ jupyter-book build --all docs/rna-seq-pop-results --path-output results/rna-seq-pop-results 2> {log} ln -sf docs/rna-seq-pop-results/_build/html/index.html {params.dataset}-results.html 2>> {log} - """ \ No newline at end of file + """ + + + +rule remove_input_param_cell_nbs: + input: + input_nb = f"{workflow.basedir}/notebooks/process-notebooks.ipynb", + counts_qc = "docs/rna-seq-pop-results/notebooks/counts-qc.ipynb", + diffexp = "docs/rna-seq-pop-results/notebooks/differential-expression.ipynb", + qc = "docs/rna-seq-pop-results/notebooks/quality-control.ipynb" if config['QualityControl']['multiqc']['activate'] else [], + gsea = "docs/rna-seq-pop-results/notebooks/gene-set-enrichment-analysis.ipynb" if config['DifferentialExpression']['GSEA']['activate'] else [], + gene_families = "docs/rna-seq-pop-results/notebooks/gene-families-heatmap.ipynb" if config['miscellaneous']['GeneFamiliesHeatmap']['activate'] else [], + pca = "docs/rna-seq-pop-results/notebooks/principal-components-analysis.ipynb" if config['VariantAnalysis']['pca']['activate'] else [], + genetic_diversity = "docs/rna-seq-pop-results/notebooks/genetic-diversity.ipynb" if config['VariantAnalysis']['geneticDiversity']['activate'] else [], + selection = "docs/rna-seq-pop-results/notebooks/windowed-selection.ipynb" if config['VariantAnalysis']['selection']['activate'] else [], + voi = "docs/rna-seq-pop-results/notebooks/variants-of-interest.ipynb" if config['miscellaneous']['VariantsOfInterest']['activate'] else [], + karyo = "docs/rna-seq-pop-results/notebooks/karyotype.ipynb" if config['VariantAnalysis']['karyotype']['activate'] else [], + output: + out_nb = "results/notebooks/process-notebooks.ipynb", + process = touch("results/rna-seq-pop-results/.processed_nbs") + conda: + f'{workflow.basedir}/envs/pythonGenomics.yaml' + log: + "logs/notebooks/remove_input_param_cell_nbs.log" + params: + wkdir = wkdir + shell: + """ + papermill -k pythonGenomics {input.input_nb} {output.out_nb} -p wkdir {params.wkdir} 2> {log} + """ diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 782ac5e..c415458 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -44,8 +44,8 @@ rule BamStats: QC alignment statistics """ input: - bam="results/alignments/{sample}.hisat2.bam" if config['VariantAnalysis']['caller'] == 'freebayes' else "results/alignments/{sample}.star.bam", - 
idx="results/alignments/{sample}.hisat2.bam.bai" if config['VariantAnalysis']['caller'] == 'freebayes' else "results/alignments/{sample}.star.bam.bai", + bam="results/alignments/{sample}.star.bam" if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam", + idx="results/alignments/{sample}.star.bam.bai" if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam.bai", output: stats="results/qc/alignments/{sample}.flagstat", log: @@ -59,8 +59,8 @@ rule Coverage: Calculate coverage with mosdepth """ input: - bam="results/alignments/{sample}.hisat2.bam" if config['VariantAnalysis']['caller'] == 'freebayes' else "results/alignments/{sample}.star.bam", - idx="results/alignments/{sample}.hisat2.bam.bai" if config['VariantAnalysis']['caller'] == 'freebayes' else "results/alignments/{sample}.star.bam.bai", + bam="results/alignments/{sample}.star.bam" if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam", + idx="results/alignments/{sample}.star.bam.bai" if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam.bai", output: "results/qc/coverage/{sample}.mosdepth.summary.txt", log: diff --git a/workflow/rules/filterAnnotate.smk b/workflow/rules/snpEff.smk similarity index 56% rename from workflow/rules/filterAnnotate.smk rename to workflow/rules/snpEff.smk index 3722f6b..148ae91 100644 --- a/workflow/rules/filterAnnotate.smk +++ b/workflow/rules/snpEff.smk @@ -1,40 +1,3 @@ -chunks = np.arange(1, config["VariantAnalysis"]["chunks"]) - -rule ConcatVCFs: - """ - Concatenate VCFs together - """ - input: - calls=expand( - "results/variantAnalysis/vcfs/freebayes/{{contig}}/variants.{i}.vcf", - i=chunks, - ), - output: - temp("results/variantAnalysis/vcfs/freebayes/variants.{contig}.vcf"), - log: - "logs/ConcatVCFs/{contig}.log", - conda: - "../envs/variants.yaml" - threads: 4 - shell: - "bcftools concat {input.calls} | vcfuniq > {output} 2> {log}" - - -rule RestrictToSNPs: - """" - Filter out indels - """ - input: - "results/variantAnalysis/vcfs/freebayes/variants.{contig}.vcf" if config['VariantAnalysis']['caller'] == 'freebayes' else "results/variantAnalysis/vcfs/haplotypecaller/variants.{contig}.vcf", - output: - temp("results/variantAnalysis/vcfs/variants.{contig}.vcf"), - log: - "logs/bcftoolsView/{contig}.log", - params: - extra="-v snps", - wrapper: - "v1.15.0/bio/bcftools/view" - rule snpEffDbDownload: """ @@ -52,32 +15,6 @@ rule snpEffDbDownload: shell: "snpEff download {params.ref} -dataDir {params.dir} 2> {log}" -rule createCustomSnpEffDb: - """ - Create a custom SnpEff database from a reference genome and GFF file - """ - input: - fa="path/to/your_organism.fa", - gff="path/to/your_organism.gff", - output: - touch("workflow/scripts/snpEff/custom_db.built"), - log: - "logs/snpEff/createCustomSnpEffDb.log", - conda: - "../envs/snpeff.yaml" - params: - db_name="my_organism", - data_dir="resources/snpEffdb/my_organism", - shell: - """ - mkdir -p {params.data_dir} - cp {input.fa} {params.data_dir}/sequences.fa - cp {input.gff} {params.data_dir}/genes.gff - echo "{params.db_name}.genome : My Organism" >> snpEff.config - snpEff build -gff3 -v {params.db_name} 2> {log} - touch {output} - """ - rule createCustomSnpEffDb: """ @@ -104,7 +41,6 @@ rule createCustomSnpEffDb: snpEff build -gff3 -v -dataDir {params.dataDir} -configOption mysnpeffdb.genome=mysnpeffdb mysnpeffdb -noCheckCds -noCheckProtein 2>> {log} """ - rule snpEff: """ Run snpEff on the VCFs diff --git a/workflow/rules/parabricks-gpu.smk 
b/workflow/rules/star-haplotypecaller.smk similarity index 100% rename from workflow/rules/parabricks-gpu.smk rename to workflow/rules/star-haplotypecaller.smk diff --git a/workflow/rules/utilities.smk b/workflow/rules/utilities.smk new file mode 100644 index 0000000..49eca67 --- /dev/null +++ b/workflow/rules/utilities.smk @@ -0,0 +1,69 @@ +rule set_kernel: + input: + f'{workflow.basedir}/envs/pythonGenomics.yaml' + output: + touch("results/.kernel.set") + conda: f'{workflow.basedir}/envs/pythonGenomics.yaml' + log: + "logs/notebooks/set_kernel.log" + shell: + """ + python -m ipykernel install --user --name=pythonGenomics 2> {log} + """ + +rule GenomeUnzip: + """ + Index the reference genome with samtools + """ + input: + config["reference"]["genome"], + output: + config["reference"]["genome"].rstrip(".gz"), + log: + "logs/GenomeUnzip.log", + shell: + "gzip -d -c {input} > {output} 2> {log}" + + +rule GenomeIndex: + """ + Index the reference genome with samtools + """ + input: + config["reference"]["genome"].rstrip(".gz"), + output: + config["reference"]["genome"].rstrip(".gz") + ".fai", + log: + "logs/GenomeIndex.log", + wrapper: + "v1.15.0/bio/samtools/faidx" + + +rule IndexBams: + """ + Index bams with samtools + """ + input: + bam="results/alignments/{sample}.star.bam" if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam", + output: + idx="results/alignments/{sample}.star.bam.bai" if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam.bai", + log: + "logs/IndexBams/{sample}.log", + wrapper: + "v1.15.0/bio/samtools/index" + + +rule RestrictToSNPs: + """" + Filter out indels + """ + input: + "results/variantAnalysis/vcfs/haplotypecaller/variants.{contig}.vcf" if config['pipeline'] == 'parabricks' else "results/variantAnalysis/vcfs/freebayes/variants.{contig}.vcf", + output: + temp("results/variantAnalysis/vcfs/variants.{contig}.vcf"), + log: + "logs/bcftoolsView/{contig}.log", + params: + extra="-v snps", + wrapper: + "v1.15.0/bio/bcftools/view" diff --git a/workflow/rules/variantCalling.smk b/workflow/rules/variantCalling.smk deleted file mode 100644 index 76cc956..0000000 --- a/workflow/rules/variantCalling.smk +++ /dev/null @@ -1,49 +0,0 @@ -chunks = np.arange(1, config["VariantAnalysis"]["chunks"]) - -rule GenerateFreebayesParams: - input: - ref_idx=config["reference"]["genome"].rstrip(".gz"), - index=config["reference"]["genome"].rstrip(".gz") + ".fai", - bams=expand("results/alignments/{sample}.hisat2.bam", sample=samples), - output: - bamlist="results/alignments/bam.list", - pops="results/alignments/populations.tsv", - regions=expand( - "resources/regions/genome.{contig}.region.{i}.bed", - contig=config["contigs"], - i=chunks, - ), - log: - "logs/GenerateFreebayesParams.log", - params: - metadata=config["metadata"], - contigs=config["contigs"], - chunks=config["VariantAnalysis"]["chunks"], - conda: - "../envs/diffexp.yaml" - script: - "../scripts/GenerateFreebayesParams.R" - - -rule VariantCallingFreebayes: - """ - Run freebayes on chunks of the genome, splitting the samples by population (strain) - """ - input: - bams=expand("results/alignments/{sample}.hisat2.bam", sample=samples), - index=expand("results/alignments/{sample}.hisat2.bam.bai", sample=samples), - ref=config["reference"]["genome"].rstrip(".gz"), - samples=ancient("results/alignments/bam.list"), - pops=ancient("results/alignments/populations.tsv"), - regions=ancient("resources/regions/genome.{contig}.region.{i}.bed"), - output: - 
temp("results/variantAnalysis/vcfs/freebayes/{contig}/variants.{i}.vcf"), - log: - "logs/VariantCallingFreebayes/{contig}.{i}.log", - params: - ploidy=config["VariantAnalysis"]["ploidy"], - conda: - "../envs/variants.yaml" - threads: 1 - shell: - "freebayes -f {input.ref} -t {input.regions} --ploidy {params.ploidy} --populations {input.pops} --pooled-discrete --use-best-n-alleles 5 -L {input.samples} > {output} 2> {log}" diff --git a/workflow/rules/variantsOfInterest.smk b/workflow/rules/variantsOfInterest.smk index 4f96fab..36c3ea6 100644 --- a/workflow/rules/variantsOfInterest.smk +++ b/workflow/rules/variantsOfInterest.smk @@ -3,8 +3,8 @@ rule mpileupVariantsOfInterest: Get allele count tables of variants of choice (specified in config file ("IRmutations.tsv")) """ input: - bam="results/alignments/{sample}.hisat2.bam" if config['VariantAnalysis']['caller'] == 'freebayes' else "results/alignments/{sample}.star.bam", - idx="results/alignments/{sample}.hisat2.bam.bai" if config['VariantAnalysis']['caller'] == 'freebayes' else "results/alignments/{sample}.star.bam.bai", + bam="results/alignments/{sample}.star.bam" if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam", + idx="results/alignments/{sample}.star.bam.bai" if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam.bai", output: "results/variantAnalysis/variantsOfInterest/counts/{sample}_{mut}_allele_counts.tsv", conda: From a90fec41790bc3605c9659011f5493c2063de58e Mon Sep 17 00:00:00 2001 From: Sanjay Nagi Date: Thu, 27 Jun 2024 10:55:13 +0100 Subject: [PATCH 07/12] try to prevent qc notebook needing fastp --- workflow/rules/qc.smk | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index c415458..309b268 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -99,7 +99,9 @@ rule multiQC: Integrate QC statistics from other tools into a final .html report """ input: - expand("results/qc/{sample}.json", sample=samples), + expand("results/qc/{sample}.json", sample=samples) + if config['QualityControl']['fastp-trim']['activate'] + else [], expand( "results/qc/vcfs/{contig}.txt", contig=config["contigs"] ) From f30d8279ababef82c67729ae1688236e1a694ac7 Mon Sep 17 00:00:00 2001 From: Sanjay Nagi Date: Thu, 27 Jun 2024 10:55:39 +0100 Subject: [PATCH 08/12] try to prevent qc notebook needing fastp --- workflow/rules/qc.smk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 309b268..149748c 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -109,7 +109,8 @@ rule multiQC: else [], expand( "results/qc/coverage/{sample}.mosdepth.summary.txt", sample=samples - ), + ) if config['QualityControl']['coverage'] + else [], expand("results/qc/alignments/{sample}.flagstat", sample=samples), expand("results/counts/{sample}", sample=samples), output: From 1586a071af90e67775abeb42820e60b8754c8b9c Mon Sep 17 00:00:00 2001 From: Sanjay Nagi Date: Thu, 27 Jun 2024 11:54:23 +0100 Subject: [PATCH 09/12] fix --- workflow/rules/snpEff.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/snpEff.smk b/workflow/rules/snpEff.smk index 148ae91..92acb63 100644 --- a/workflow/rules/snpEff.smk +++ b/workflow/rules/snpEff.smk @@ -59,9 +59,9 @@ rule snpEff: conda: "../envs/snpeff.yaml" params: - db=config["reference"]["snpeff"]['dbname'], + db=config["reference"]["snpeff"]['dbname'] if config['reference']['snpeff']['customdb'] 
else "mysnpeffdb", prefix=lambda w, output: os.path.splitext(output[0])[0], - dataDir=lambda x: wkdir + "resources/reference/mysnpeffdb", + dataDir=lambda x: wkdir + "resources/reference/", shell: """ snpEff eff {params.db} -dataDir {params.dataDir} -csvStats {output.csvStats} {input.calls} > {params.prefix} 2> {log} From 59afb198926ac7a7ba9ca410e8b919e23b7f3b87 Mon Sep 17 00:00:00 2001 From: Sanjay Nagi Date: Thu, 27 Jun 2024 12:26:45 +0100 Subject: [PATCH 10/12] fix --- workflow/rules/snpEff.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/snpEff.smk b/workflow/rules/snpEff.smk index 92acb63..92f50d3 100644 --- a/workflow/rules/snpEff.smk +++ b/workflow/rules/snpEff.smk @@ -59,7 +59,7 @@ rule snpEff: conda: "../envs/snpeff.yaml" params: - db=config["reference"]["snpeff"]['dbname'] if config['reference']['snpeff']['customdb'] else "mysnpeffdb", + db=config["reference"]["snpeff"]['dbname'] if not config['reference']['snpeff']['customdb'] else "mysnpeffdb", prefix=lambda w, output: os.path.splitext(output[0])[0], dataDir=lambda x: wkdir + "resources/reference/", shell: From 416112ac63f134c29c580db6ad0dd44df4076c53 Mon Sep 17 00:00:00 2001 From: Sanjay Nagi Date: Thu, 27 Jun 2024 12:41:59 +0100 Subject: [PATCH 11/12] fix --- workflow/rules/snpEff.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/snpEff.smk b/workflow/rules/snpEff.smk index 92f50d3..91ebbcd 100644 --- a/workflow/rules/snpEff.smk +++ b/workflow/rules/snpEff.smk @@ -61,7 +61,7 @@ rule snpEff: params: db=config["reference"]["snpeff"]['dbname'] if not config['reference']['snpeff']['customdb'] else "mysnpeffdb", prefix=lambda w, output: os.path.splitext(output[0])[0], - dataDir=lambda x: wkdir + "resources/reference/", + dataDir=lambda x: wkdir + "/resources/reference/", shell: """ snpEff eff {params.db} -dataDir {params.dataDir} -csvStats {output.csvStats} {input.calls} > {params.prefix} 2> {log} From 89d648c6e3b8ddcddf703da60320c100217781f9 Mon Sep 17 00:00:00 2001 From: Sanjay Nagi Date: Thu, 27 Jun 2024 13:04:45 +0100 Subject: [PATCH 12/12] fix --- workflow/rules/snpEff.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/snpEff.smk b/workflow/rules/snpEff.smk index 91ebbcd..497d764 100644 --- a/workflow/rules/snpEff.smk +++ b/workflow/rules/snpEff.smk @@ -64,7 +64,7 @@ rule snpEff: dataDir=lambda x: wkdir + "/resources/reference/", shell: """ - snpEff eff {params.db} -dataDir {params.dataDir} -csvStats {output.csvStats} {input.calls} > {params.prefix} 2> {log} + snpEff eff {params.db} -dataDir {params.dataDir} -configOption mysnpeffdb.genome=mysnpeffdb -csvStats {output.csvStats} {input.calls} > {params.prefix} 2> {log} bgzip {params.prefix} """