diff --git a/.test/config/config_paired_end.yaml b/.test/config/config_paired_end.yaml index c0041fa..4cf72c3 100644 --- a/.test/config/config_paired_end.yaml +++ b/.test/config/config_paired_end.yaml @@ -1,9 +1,10 @@ metadata: config/samples.tsv +pipeline: cpu # Dataset name dataset: 'Test-GithubActionsCI' -jupyter-book: +results-jupyterbook: activate: True fastq: @@ -18,8 +19,9 @@ reference: "resources/reference/Anopheles-gambiae-PEST_TRANSCRIPTS_AgamP4.12-X.fa.gz" gff: "resources/reference/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12-X.gff3" - snpeffdb: - "Anopheles_gambiae" + snpeff: + customdb: True + dbname: Anopheles_gambiae genes2transcripts: "resources/Gene2TranscriptMap.tsv" @@ -35,7 +37,7 @@ contrasts: QualityControl: fastp-trim: activate: True - mosdepth: + coverage: activate: True multiqc: activate: True @@ -45,7 +47,7 @@ DifferentialExpression: # Activate differential expression gene-level: activate: True isoform-level: - activate: True + activate: False progressiveGenes: activate: True @@ -60,7 +62,6 @@ DifferentialExpression: # Activate differential expression VariantAnalysis: activate: True - caller: freebayes #haplotypecaller # Options: haplotypecaller, freebayes ploidy: 10 chunks: 9 # Number of chunks to split the genome into when parallelising freebayes # Number of chunks to split the genome into when parallelising freebayes @@ -69,7 +70,7 @@ VariantAnalysis: activate: True missingness: 0.4 - summaryStatistics: + geneticDiversity: activate: True missingness: 0.4 @@ -77,7 +78,7 @@ VariantAnalysis: activate: True missingness: 0.5 # Do we want to run pbs (Needs three conditions, two more closely related and a slight outgroup) - pbs: + population-branch-statistic: activate: False contrasts: ['PiriTia_ContTia_Kisumu'] diff --git a/.test/config/config_single_end.yaml b/.test/config/config_single_end.yaml index e095191..8da2aae 100644 --- a/.test/config/config_single_end.yaml +++ b/.test/config/config_single_end.yaml @@ -1,9 +1,10 @@ metadata: config/samples.tsv +pipeline: cpu # Dataset name dataset: 'Test-GithubActionsCI' -jupyter-book: +results-jupyterbook: activate: True fastq: @@ -20,8 +21,9 @@ reference: "resources/reference/Anopheles-gambiae-PEST_TRANSCRIPTS_AgamP4.12-X.fa.gz" gff: "resources/reference/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12-X.gff3" - snpeffdb: - "Anopheles_gambiae" + snpeff: + customdb: False + dbname: Anopheles_gambiae genes2transcripts: "resources/Gene2TranscriptMap.tsv" @@ -38,7 +40,7 @@ contrasts: QualityControl: fastp-trim: activate: True - mosdepth: + coverage: activate: True multiqc: activate: True @@ -65,7 +67,6 @@ DifferentialExpression: # Activate differential expression VariantAnalysis: activate: True - caller: freebayes ploidy: 10 chunks: 9 # Number of chunks to split the genome into when parallelising freebayes # Number of chunks to split the genome into when parallelising freebayes @@ -74,7 +75,7 @@ VariantAnalysis: activate: True missingness: 0.4 - summaryStatistics: + geneticDiversity: activate: True missingness: 0.4 @@ -82,7 +83,7 @@ VariantAnalysis: activate: True missingness: 0.5 # Do we want to run pbs (Needs three conditions, two more closely related and a slight outgroup) - pbs: + population-branch-statistic: activate: False contrasts: - 'PiriTia_ContTia_Kisumu' diff --git a/config/exampleconfig.yaml b/config/exampleconfig.yaml index e0fb0eb..a854f12 100755 --- a/config/exampleconfig.yaml +++ b/config/exampleconfig.yaml @@ -1,5 +1,5 @@ # RNA-Seq-Pop - +pipeline: parabricks #parabricks or cpu metadata: config/samples.tsv # samplesheet metadata file dataset: 'Ag_Bouake' # Dataset name: Can be anything, will be used to name some main output files @@ -27,10 +27,11 @@ reference: "resources/reference/Anopheles-gambiae-PEST_TRANSCRIPTS_AgamP4.12.fa" # Path to transcriptome reference FASTA file gff: "resources/reference/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12.gff3" # Path to GFF annotation file - snpeffdb: - "Anopheles_gambiae" # SNPeff database name + snpeff: + customdb: False + dbname: Anopheles_gambiae # SNPeff database name genes2transcripts: - "resources/exampleGene2TranscriptMap.tsv" # gene names file with gene and transcript names + resources/exampleGene2TranscriptMap.tsv # gene names file with gene and transcript names # Chromosome names for the appropriate species. # Please ensure that these correspond exactly to the reference fasta/gff files. Extra unwanted chromosomes (unplaced contigs) can be ignored. @@ -48,7 +49,7 @@ contrasts: QualityControl: fastp-trim: activate: False - mosdepth: + coverage: activate: True multiqc: activate: True @@ -75,7 +76,6 @@ DifferentialExpression: VariantAnalysis: activate: True - caller: freebayes ploidy: 10 # Ploidy level for freebayes to call at (Generally we are using pooled samples).For diploid organisms, this should be 2 * number of individuals in each pool chunks: 9 # Number of chunks to split each chromosome into when parallelising freebayes. 9 or less is recommended. @@ -83,7 +83,7 @@ VariantAnalysis: activate: True missingness: 1 - summaryStatistics: # Estimate Population Genetic Summary Statistics such as Dxy, Pi + geneticDiversity: # Estimate Population Genetic Summary Statistics such as Dxy, Pi activate: True missingness: 1 @@ -91,7 +91,7 @@ VariantAnalysis: activate: True missingness: 1 - pbs: + population-branch-statistic: activate: True # Activate Population Branch Statistic analysis (Needs three conditions, two closely related and an outgroup) for resistance, do survivors_unexposed_susceptible contrasts: - 'gambiaePM_gambiaeCont_Kisumu' @@ -114,7 +114,6 @@ VariantAnalysis: # - "2Ru" - miscellaneous: VariantsOfInterest: activate: True diff --git a/workflow/Snakefile b/workflow/Snakefile index 37b4487..7092b91 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -23,14 +23,14 @@ welcome(version="v2.0.2") include: "rules/qc.smk" include: "rules/diffexp.smk" -include: "rules/alignment.smk" -include: "rules/variantCalling.smk" -include: "rules/filterAnnotate.smk" +include: "rules/utilities.smk" +include: "rules/hisat2-freebayes.smk" +include: "rules/snpEff.smk" include: "rules/variantAnalysis.smk" include: "rules/jupyter-book.smk" -if config['VariantAnalysis']['caller'] == 'haplotypecaller': - include: "rules/parabricks-gpu.smk" +if config['pipeline'] == 'parabricks': + include: "rules/star-haplotypecaller.smk" rule all: input: diff --git a/workflow/notebooks/windowed-selection.ipynb b/workflow/notebooks/windowed-selection.ipynb index 9c8c5f6..4f842e9 100644 --- a/workflow/notebooks/windowed-selection.ipynb +++ b/workflow/notebooks/windowed-selection.ipynb @@ -109,7 +109,7 @@ "comparisons = comparisons.contrast.str.split(\"_\", expand=True)\n", "comparisons.columns = ['sus', 'res']\n", "comparisons = [list(row) for i,row in comparisons.iterrows()]\n", - "pbscomps = config_params[\"VariantAnalysis\"]['selection']['pbs']['contrasts']\n", + "pbscomps = config_params[\"VariantAnalysis\"]['selection']['population-branch-statistic']['contrasts']\n", "\n", "for i, contig in enumerate(contigs):\n", "\n", diff --git a/workflow/rules/alignment.smk b/workflow/rules/alignment.smk deleted file mode 100644 index 6a713be..0000000 --- a/workflow/rules/alignment.smk +++ /dev/null @@ -1,127 +0,0 @@ - -rule KallistoIndex: - """ - Create a kallisto index of the reference transcriptome - """ - input: - fasta=config["reference"]["transcriptome"], - output: - index="resources/reference/kallisto.idx", - group: - "diffexp" - log: - "logs/kallisto/index.log", - wrapper: - "v1.15.0/bio/kallisto/index" - - -rule KallistoQuant: - """ - Pseudo-align reads for each sample to the reference transcriptome. - Bootstrap to allow for isoform differential expression. - """ - input: - fastq=lambda wildcards: getFASTQs(wildcards=wildcards, rules="KallistoQuant"), - index="resources/reference/kallisto.idx", - output: - directory("results/counts/{sample}"), - group: - "diffexp" - log: - "logs/kallisto/quant_{sample}.log", - params: - extra="-b 100" if config['fastq']['paired'] is True else "-b 100 --single -l 75 -s 5", - threads: 24 - wrapper: - "v1.15.0/bio/kallisto/quant" - - -rule GenomeUnzip: - """ - Index the reference genome with samtools - """ - input: - config["reference"]["genome"], - output: - config["reference"]["genome"].rstrip(".gz"), - log: - "logs/GenomeUnzip.log", - shell: - "gzip -d -c {input} > {output} 2> {log}" - - -rule GenomeIndex: - """ - Index the reference genome with samtools - """ - input: - config["reference"]["genome"].rstrip(".gz"), - output: - config["reference"]["genome"].rstrip(".gz") + ".fai", - log: - "logs/GenomeIndex.log", - wrapper: - "v1.15.0/bio/samtools/faidx" - - -rule HISAT2index: - """ - Make a HISAT2 index of the reference genome - """ - input: - fasta=config["reference"]["genome"].rstrip(".gz"), - output: - "resources/reference/ht2index/idx.1.ht2", - touch("resources/reference/ht2index/.complete"), - log: - "logs/HISAT2/HISAT2index.log", - conda: - "../envs/variants.yaml" - params: - prefix=lambda w, output: output[0].split(os.extsep)[0], - threads: 8 - shell: - "hisat2-build -p {threads} {input.fasta} {params.prefix} 2> {log}" - - -rule HISAT2align: - """ - Align reads to the genome with HISAT2, mark duplicates with samblaster and sort with samtools - """ - input: - reads=lambda wildcards: getFASTQs( - wildcards=wildcards, rules="HISAT2align_input" - ), - idx="resources/reference/ht2index/.complete", - output: - "results/alignments/{sample}.hisat2.bam", - log: - align="logs/HISAT2/{sample}_align.log", - sort="logs/samtoolsSort/{sample}.log", - conda: - "../envs/variants.yaml" - params: - readflags=lambda wildcards: getFASTQs(wildcards=wildcards, rules="HISAT2align"), - extra="--dta -q --rg-id {sample} --rg SM:{sample} --rg PL:ILLUMINA --new-summary", - idx="resources/reference/ht2index/idx", - samblaster="" if config['fastq']['paired'] is True else "--ignoreUnmated" - threads: 12 - shell: - """ - hisat2 {params.extra} --threads {threads} -x {params.idx} {params.readflags} 2> {log.align} | - samblaster {params.samblaster} 2> {log.sort} | samtools sort -@{threads} - -o {output} 2>> {log.sort} - """ - - -rule IndexBams: - """ - Index bams with samtools - """ - input: - "results/alignments/{sample}.hisat2.bam" if config['VariantAnalysis']['caller'] == 'freebayes' else "results/alignments/{sample}.star.bam" - output: - "results/alignments/{sample}.hisat2.bam.bai" if config['VariantAnalysis']['caller'] == 'freebayes' else "results/alignments/{sample}.star.bam.bai" - log: - "logs/IndexBams/{sample}.log", - wrapper: - "v1.15.0/bio/samtools/index" diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index d52f56e..9c231f4 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -1,4 +1,4 @@ -if config["VariantAnalysis"]["selection"]["pbs"]["activate"]: +if config["VariantAnalysis"]["selection"]["population-branch-statistic"]["activate"]: windowedStats = ["Fst", "Pbs"] else: windowedStats = ["Fst"] @@ -16,46 +16,6 @@ def load_metadata(metadata_path): raise ValueError("Metadata file must be .xlsx or .csv") return metadata - -rule set_kernel: - input: - f'{workflow.basedir}/envs/pythonGenomics.yaml' - output: - touch("results/.kernel.set") - conda: f'{workflow.basedir}/envs/pythonGenomics.yaml' - log: - "logs/notebooks/set_kernel.log" - shell: - """ - python -m ipykernel install --user --name=pythonGenomics 2> {log} - """ - -rule remove_input_param_cell_nbs: - input: - input_nb = f"{workflow.basedir}/notebooks/process-notebooks.ipynb", - counts_qc = "docs/rna-seq-pop-results/notebooks/counts-qc.ipynb", - diffexp = "docs/rna-seq-pop-results/notebooks/differential-expression.ipynb", - qc = "docs/rna-seq-pop-results/notebooks/quality-control.ipynb" if config['QualityControl']['multiqc']['activate'] else [], - gsea = "docs/rna-seq-pop-results/notebooks/gene-set-enrichment-analysis.ipynb" if config['DifferentialExpression']['GSEA']['activate'] else [], - gene_families = "docs/rna-seq-pop-results/notebooks/gene-families-heatmap.ipynb" if config['miscellaneous']['GeneFamiliesHeatmap']['activate'] else [], - pca = "docs/rna-seq-pop-results/notebooks/principal-components-analysis.ipynb" if config['VariantAnalysis']['pca']['activate'] else [], - genetic_diversity = "docs/rna-seq-pop-results/notebooks/genetic-diversity.ipynb" if config['VariantAnalysis']['summaryStatistics']['activate'] else [], - selection = "docs/rna-seq-pop-results/notebooks/windowed-selection.ipynb" if config['VariantAnalysis']['selection']['activate'] else [], - voi = "docs/rna-seq-pop-results/notebooks/variants-of-interest.ipynb" if config['miscellaneous']['VariantsOfInterest']['activate'] else [], - karyo = "docs/rna-seq-pop-results/notebooks/karyotype.ipynb" if config['VariantAnalysis']['karyotype']['activate'] else [], - output: - out_nb = "results/notebooks/process-notebooks.ipynb", - process = touch("results/rna-seq-pop-results/.processed_nbs") - conda: - f'{workflow.basedir}/envs/pythonGenomics.yaml' - log: - "logs/notebooks/remove_input_param_cell_nbs.log" - params: - wkdir = wkdir - shell: - """ - papermill -k pythonGenomics {input.input_nb} {output.out_nb} -p wkdir {params.wkdir} 2> {log} - """ def getFASTQs(wildcards, rules=None): @@ -109,10 +69,10 @@ def getBAM(wildcards): """ Get BAM files depending on aligner """ - if config['variantAnalysis']['caller'] == 'freebayes': - bam = "results/alignments/{sample}.hisat2.bam" - else: + if config['pipeline'] == 'parabricks': bam = "results/alignments/{sample}.star.bam" + else: + bam = "results/alignments/{sample}.hisat2.bam" return bam def GetDesiredOutputs(wildcards): @@ -203,7 +163,7 @@ def GetDesiredOutputs(wildcards): ) ) - if config['QualityControl']['mosdepth']['activate']: + if config['QualityControl']['coverage']['activate']: wanted_input.extend( expand( [ @@ -304,7 +264,7 @@ def GetDesiredOutputs(wildcards): ) ) - if config['jupyter-book']['activate']: + if config['results-jupyterbook']['activate']: wanted_input.extend( [ "results/rna-seq-pop-results/_build/html/index.html" diff --git a/workflow/rules/diffexp.smk b/workflow/rules/diffexp.smk index e62bcc7..f009605 100644 --- a/workflow/rules/diffexp.smk +++ b/workflow/rules/diffexp.smk @@ -1,3 +1,42 @@ + +rule KallistoIndex: + """ + Create a kallisto index of the reference transcriptome + """ + input: + fasta=config["reference"]["transcriptome"], + output: + index="resources/reference/kallisto.idx", + group: + "diffexp" + log: + "logs/kallisto/index.log", + wrapper: + "v1.15.0/bio/kallisto/index" + + +rule KallistoQuant: + """ + Pseudo-align reads for each sample to the reference transcriptome. + Bootstrap to allow for isoform differential expression. + """ + input: + fastq=lambda wildcards: getFASTQs(wildcards=wildcards, rules="KallistoQuant"), + index="resources/reference/kallisto.idx", + output: + directory("results/counts/{sample}"), + group: + "diffexp" + log: + "logs/kallisto/quant_{sample}.log", + params: + extra="-b 100" if config['fastq']['paired'] is True else "-b 100 --single -l 75 -s 5", + threads: 24 + wrapper: + "v1.15.0/bio/kallisto/quant" + + + rule DifferentialGeneExpression: """ Perform differential expression analysis at the gene-level with DESeq2 diff --git a/workflow/rules/filterAnnotate.smk b/workflow/rules/filterAnnotate.smk deleted file mode 100644 index badbfd0..0000000 --- a/workflow/rules/filterAnnotate.smk +++ /dev/null @@ -1,108 +0,0 @@ -chunks = np.arange(1, config["VariantAnalysis"]["chunks"]) - -rule ConcatVCFs: - """ - Concatenate VCFs together - """ - input: - calls=expand( - "results/variantAnalysis/vcfs/freebayes/{{contig}}/variants.{i}.vcf", - i=chunks, - ), - output: - temp("results/variantAnalysis/vcfs/freebayes/variants.{contig}.vcf"), - log: - "logs/ConcatVCFs/{contig}.log", - conda: - "../envs/variants.yaml" - threads: 4 - shell: - "bcftools concat {input.calls} | vcfuniq > {output} 2> {log}" - - -rule RestrictToSNPs: - """" - Filter out indels - """ - input: - "results/variantAnalysis/vcfs/freebayes/variants.{contig}.vcf" if config['VariantAnalysis']['caller'] == 'freebayes' else "results/variantAnalysis/vcfs/haplotypecaller/variants.{contig}.vcf", - output: - temp("results/variantAnalysis/vcfs/variants.{contig}.vcf"), - log: - "logs/bcftoolsView/{contig}.log", - params: - extra="-v snps", - wrapper: - "v1.15.0/bio/bcftools/view" - - -rule snpEffDbDownload: - """ - Download the snpEff database for your species - """ - output: - touch("workflow/scripts/snpEff/db.dl"), - log: - "logs/snpEff/snpEffDbDownload.log", - conda: - "../envs/snpeff.yaml" - params: - ref=config["reference"]["snpeffdb"], - dir="resources/snpEffdb", - shell: - "snpEff download {params.ref} -dataDir {params.dir} 2> {log}" - -rule createCustomSnpEffDb: - """ - Create a custom SnpEff database from a reference genome and GFF file - """ - input: - fa="path/to/your_organism.fa", - gff="path/to/your_organism.gff", - output: - touch("workflow/scripts/snpEff/custom_db.built"), - log: - "logs/snpEff/createCustomSnpEffDb.log", - conda: - "../envs/snpeff.yaml" - params: - db_name="my_organism", - data_dir="resources/snpEffdb/my_organism", - shell: - """ - mkdir -p {params.data_dir} - cp {input.fa} {params.data_dir}/sequences.fa - cp {input.gff} {params.data_dir}/genes.gff - echo "{params.db_name}.genome : My Organism" >> snpEff.config - snpEff build -gff3 -v {params.db_name} 2> {log} - touch {output} - """ - - -rule snpEff: - """ - Run snpEff on the VCFs - """ - input: - calls="results/variantAnalysis/vcfs/variants.{contig}.vcf", - dl="workflow/scripts/snpEff/db.dl", - output: - calls=expand( - "results/variantAnalysis/vcfs/{dataset}.{{contig}}.vcf.gz", - dataset=config["dataset"], - ), - csvStats="results/variantAnalysis/vcfs/snpEff.summary.{contig}.csv", - log: - "logs/snpEff/snpEff.{contig}.log", - conda: - "../envs/snpeff.yaml" - params: - db=config["reference"]["snpeffdb"], - prefix=lambda w, output: os.path.splitext(output[0])[0], - dir="resources/snpEffdb", - shell: - """ - snpEff eff {params.db} -dataDir {params.dir} -csvStats {output.csvStats} {input.calls} > {params.prefix} 2> {log} - bgzip {params.prefix} - """ - diff --git a/workflow/rules/hisat2-freebayes.smk b/workflow/rules/hisat2-freebayes.smk new file mode 100644 index 0000000..c4767dd --- /dev/null +++ b/workflow/rules/hisat2-freebayes.smk @@ -0,0 +1,122 @@ + +rule HISAT2index: + """ + Make a HISAT2 index of the reference genome + """ + input: + fasta=config["reference"]["genome"].rstrip(".gz"), + output: + "resources/reference/ht2index/idx.1.ht2", + touch("resources/reference/ht2index/.complete"), + log: + "logs/HISAT2/HISAT2index.log", + conda: + "../envs/variants.yaml" + params: + prefix=lambda w, output: output[0].split(os.extsep)[0], + threads: 8 + shell: + "hisat2-build -p {threads} {input.fasta} {params.prefix} 2> {log}" + + +rule HISAT2align: + """ + Align reads to the genome with HISAT2, mark duplicates with samblaster and sort with samtools + """ + input: + reads=lambda wildcards: getFASTQs( + wildcards=wildcards, rules="HISAT2align_input" + ), + idx="resources/reference/ht2index/.complete", + output: + "results/alignments/{sample}.hisat2.bam", + log: + align="logs/HISAT2/{sample}_align.log", + sort="logs/samtoolsSort/{sample}.log", + conda: + "../envs/variants.yaml" + params: + readflags=lambda wildcards: getFASTQs(wildcards=wildcards, rules="HISAT2align"), + extra="--dta -q --rg-id {sample} --rg SM:{sample} --rg PL:ILLUMINA --new-summary", + idx="resources/reference/ht2index/idx", + samblaster="" if config['fastq']['paired'] is True else "--ignoreUnmated" + threads: 12 + shell: + """ + hisat2 {params.extra} --threads {threads} -x {params.idx} {params.readflags} 2> {log.align} | + samblaster {params.samblaster} 2> {log.sort} | samtools sort -@{threads} - -o {output} 2>> {log.sort} + """ + + +chunks = np.arange(1, config["VariantAnalysis"]["chunks"]) + +rule GenerateFreebayesParams: + input: + ref_idx=config["reference"]["genome"].rstrip(".gz"), + index=config["reference"]["genome"].rstrip(".gz") + ".fai", + bams=expand("results/alignments/{sample}.hisat2.bam", sample=samples), + output: + bamlist="results/alignments/bam.list", + pops="results/alignments/populations.tsv", + regions=expand( + "resources/regions/genome.{contig}.region.{i}.bed", + contig=config["contigs"], + i=chunks, + ), + log: + "logs/GenerateFreebayesParams.log", + params: + metadata=config["metadata"], + contigs=config["contigs"], + chunks=config["VariantAnalysis"]["chunks"], + conda: + "../envs/diffexp.yaml" + script: + "../scripts/GenerateFreebayesParams.R" + + +rule VariantCallingFreebayes: + """ + Run freebayes on chunks of the genome, splitting the samples by population (strain) + """ + input: + bams=expand("results/alignments/{sample}.hisat2.bam", sample=samples), + index=expand("results/alignments/{sample}.hisat2.bam.bai", sample=samples), + ref=config["reference"]["genome"].rstrip(".gz"), + samples=ancient("results/alignments/bam.list"), + pops=ancient("results/alignments/populations.tsv"), + regions=ancient("resources/regions/genome.{contig}.region.{i}.bed"), + output: + temp("results/variantAnalysis/vcfs/freebayes/{contig}/variants.{i}.vcf"), + log: + "logs/VariantCallingFreebayes/{contig}.{i}.log", + params: + ploidy=config["VariantAnalysis"]["ploidy"], + conda: + "../envs/variants.yaml" + threads: 1 + shell: + "freebayes -f {input.ref} -t {input.regions} --ploidy {params.ploidy} --populations {input.pops} --pooled-discrete --use-best-n-alleles 5 -L {input.samples} > {output} 2> {log}" + +chunks = np.arange(1, config["VariantAnalysis"]["chunks"]) + +rule ConcatVCFs: + """ + Concatenate VCFs together + """ + input: + calls=expand( + "results/variantAnalysis/vcfs/freebayes/{{contig}}/variants.{i}.vcf", + i=chunks, + ), + output: + temp("results/variantAnalysis/vcfs/freebayes/variants.{contig}.vcf"), + log: + "logs/ConcatVCFs/{contig}.log", + conda: + "../envs/variants.yaml" + threads: 4 + shell: + "bcftools concat {input.calls} | vcfuniq > {output} 2> {log}" + + diff --git a/workflow/rules/jupyter-book.smk b/workflow/rules/jupyter-book.smk index a4c4345..d7502c0 100644 --- a/workflow/rules/jupyter-book.smk +++ b/workflow/rules/jupyter-book.smk @@ -7,7 +7,7 @@ rule jupyterbook: gsea = "docs/rna-seq-pop-results/notebooks/gene-set-enrichment-analysis.ipynb" if config['DifferentialExpression']['GSEA']['activate'] else [], gene_families = "docs/rna-seq-pop-results/notebooks/gene-families-heatmap.ipynb" if config['miscellaneous']['GeneFamiliesHeatmap']['activate'] else [], pca = "docs/rna-seq-pop-results/notebooks/principal-components-analysis.ipynb" if config['VariantAnalysis']['pca']['activate'] else [], - genetic_diversity = "docs/rna-seq-pop-results/notebooks/genetic-diversity.ipynb" if config['VariantAnalysis']['summaryStatistics']['activate'] else [], + genetic_diversity = "docs/rna-seq-pop-results/notebooks/genetic-diversity.ipynb" if config['VariantAnalysis']['geneticDiversity']['activate'] else [], selection = "docs/rna-seq-pop-results/notebooks/windowed-selection.ipynb" if config['VariantAnalysis']['selection']['activate'] else [], voi = "docs/rna-seq-pop-results/notebooks/variants-of-interest.ipynb" if config['miscellaneous']['VariantsOfInterest']['activate'] else [], karyo = "docs/rna-seq-pop-results/notebooks/karyotype.ipynb" if config['VariantAnalysis']['karyotype']['activate'] else [], @@ -24,4 +24,33 @@ rule jupyterbook: """ jupyter-book build --all docs/rna-seq-pop-results --path-output results/rna-seq-pop-results 2> {log} ln -sf docs/rna-seq-pop-results/_build/html/index.html {params.dataset}-results.html 2>> {log} - """ \ No newline at end of file + """ + + + +rule remove_input_param_cell_nbs: + input: + input_nb = f"{workflow.basedir}/notebooks/process-notebooks.ipynb", + counts_qc = "docs/rna-seq-pop-results/notebooks/counts-qc.ipynb", + diffexp = "docs/rna-seq-pop-results/notebooks/differential-expression.ipynb", + qc = "docs/rna-seq-pop-results/notebooks/quality-control.ipynb" if config['QualityControl']['multiqc']['activate'] else [], + gsea = "docs/rna-seq-pop-results/notebooks/gene-set-enrichment-analysis.ipynb" if config['DifferentialExpression']['GSEA']['activate'] else [], + gene_families = "docs/rna-seq-pop-results/notebooks/gene-families-heatmap.ipynb" if config['miscellaneous']['GeneFamiliesHeatmap']['activate'] else [], + pca = "docs/rna-seq-pop-results/notebooks/principal-components-analysis.ipynb" if config['VariantAnalysis']['pca']['activate'] else [], + genetic_diversity = "docs/rna-seq-pop-results/notebooks/genetic-diversity.ipynb" if config['VariantAnalysis']['geneticDiversity']['activate'] else [], + selection = "docs/rna-seq-pop-results/notebooks/windowed-selection.ipynb" if config['VariantAnalysis']['selection']['activate'] else [], + voi = "docs/rna-seq-pop-results/notebooks/variants-of-interest.ipynb" if config['miscellaneous']['VariantsOfInterest']['activate'] else [], + karyo = "docs/rna-seq-pop-results/notebooks/karyotype.ipynb" if config['VariantAnalysis']['karyotype']['activate'] else [], + output: + out_nb = "results/notebooks/process-notebooks.ipynb", + process = touch("results/rna-seq-pop-results/.processed_nbs") + conda: + f'{workflow.basedir}/envs/pythonGenomics.yaml' + log: + "logs/notebooks/remove_input_param_cell_nbs.log" + params: + wkdir = wkdir + shell: + """ + papermill -k pythonGenomics {input.input_nb} {output.out_nb} -p wkdir {params.wkdir} 2> {log} + """ diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 782ac5e..149748c 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -44,8 +44,8 @@ rule BamStats: QC alignment statistics """ input: - bam="results/alignments/{sample}.hisat2.bam" if config['VariantAnalysis']['caller'] == 'freebayes' else "results/alignments/{sample}.star.bam", - idx="results/alignments/{sample}.hisat2.bam.bai" if config['VariantAnalysis']['caller'] == 'freebayes' else "results/alignments/{sample}.star.bam.bai", + bam="results/alignments/{sample}.star.bam" if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam", + idx="results/alignments/{sample}.star.bam.bai" if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam.bai", output: stats="results/qc/alignments/{sample}.flagstat", log: @@ -59,8 +59,8 @@ rule Coverage: Calculate coverage with mosdepth """ input: - bam="results/alignments/{sample}.hisat2.bam" if config['VariantAnalysis']['caller'] == 'freebayes' else "results/alignments/{sample}.star.bam", - idx="results/alignments/{sample}.hisat2.bam.bai" if config['VariantAnalysis']['caller'] == 'freebayes' else "results/alignments/{sample}.star.bam.bai", + bam="results/alignments/{sample}.star.bam" if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam", + idx="results/alignments/{sample}.star.bam.bai" if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam.bai", output: "results/qc/coverage/{sample}.mosdepth.summary.txt", log: @@ -99,7 +99,9 @@ rule multiQC: Integrate QC statistics from other tools into a final .html report """ input: - expand("results/qc/{sample}.json", sample=samples), + expand("results/qc/{sample}.json", sample=samples) + if config['QualityControl']['fastp-trim']['activate'] + else [], expand( "results/qc/vcfs/{contig}.txt", contig=config["contigs"] ) @@ -107,7 +109,8 @@ rule multiQC: else [], expand( "results/qc/coverage/{sample}.mosdepth.summary.txt", sample=samples - ), + ) if config['QualityControl']['coverage'] + else [], expand("results/qc/alignments/{sample}.flagstat", sample=samples), expand("results/counts/{sample}", sample=samples), output: diff --git a/workflow/rules/snpEff.smk b/workflow/rules/snpEff.smk new file mode 100644 index 0000000..497d764 --- /dev/null +++ b/workflow/rules/snpEff.smk @@ -0,0 +1,70 @@ + +rule snpEffDbDownload: + """ + Download the snpEff database for your species + """ + output: + touch("resources/reference/mysnpeffdb/db.dl"), + log: + "logs/snpEff/snpEffDbDownload.log", + conda: + "../envs/snpeff.yaml" + params: + ref=config["reference"]["snpeff"]['dbname'], + dir="resources/reference/mysnpeffdb", + shell: + "snpEff download {params.ref} -dataDir {params.dir} 2> {log}" + + +rule createCustomSnpEffDb: + """ + Create a custom SnpEff database from a reference genome and GFF file + """ + input: + fa=config['reference']['genome'].rstrip(".gz"), + gff=config['reference']['gff'], + output: + "resources/reference/mysnpeffdb/sequences.fa", + "resources/reference/mysnpeffdb/genes.gff", + "resources/reference/mysnpeffdb/snpEffectPredictor.bin" + log: + "logs/snpEff/createCustomSnpEffDb.log", + conda: + "../envs/snpeff.yaml" + params: + dataDir=lambda x: wkdir + "/resources/reference", + wkdir=wkdir + shell: + """ + ln -s {params.wkdir}/{input.fa} {params.dataDir}/mysnpeffdb/sequences.fa 2> {log} + ln -s {params.wkdir}/{input.gff} {params.dataDir}/mysnpeffdb/genes.gff 2> {log} + snpEff build -gff3 -v -dataDir {params.dataDir} -configOption mysnpeffdb.genome=mysnpeffdb mysnpeffdb -noCheckCds -noCheckProtein 2>> {log} + """ + +rule snpEff: + """ + Run snpEff on the VCFs + """ + input: + calls="results/variantAnalysis/vcfs/variants.{contig}.vcf", + db="resources/reference/mysnpeffdb/snpEffectPredictor.bin" if config['reference']['snpeff']['customdb'] else "resources/reference/mysnpeffdb/db.dl", + output: + calls=expand( + "results/variantAnalysis/vcfs/{dataset}.{{contig}}.vcf.gz", + dataset=config["dataset"], + ), + csvStats="results/variantAnalysis/vcfs/snpEff.summary.{contig}.csv", + log: + "logs/snpEff/snpEff.{contig}.log", + conda: + "../envs/snpeff.yaml" + params: + db=config["reference"]["snpeff"]['dbname'] if not config['reference']['snpeff']['customdb'] else "mysnpeffdb", + prefix=lambda w, output: os.path.splitext(output[0])[0], + dataDir=lambda x: wkdir + "/resources/reference/", + shell: + """ + snpEff eff {params.db} -dataDir {params.dataDir} -configOption mysnpeffdb.genome=mysnpeffdb -csvStats {output.csvStats} {input.calls} > {params.prefix} 2> {log} + bgzip {params.prefix} + """ + diff --git a/workflow/rules/parabricks-gpu.smk b/workflow/rules/star-haplotypecaller.smk similarity index 100% rename from workflow/rules/parabricks-gpu.smk rename to workflow/rules/star-haplotypecaller.smk diff --git a/workflow/rules/utilities.smk b/workflow/rules/utilities.smk new file mode 100644 index 0000000..49eca67 --- /dev/null +++ b/workflow/rules/utilities.smk @@ -0,0 +1,69 @@ +rule set_kernel: + input: + f'{workflow.basedir}/envs/pythonGenomics.yaml' + output: + touch("results/.kernel.set") + conda: f'{workflow.basedir}/envs/pythonGenomics.yaml' + log: + "logs/notebooks/set_kernel.log" + shell: + """ + python -m ipykernel install --user --name=pythonGenomics 2> {log} + """ + +rule GenomeUnzip: + """ + Index the reference genome with samtools + """ + input: + config["reference"]["genome"], + output: + config["reference"]["genome"].rstrip(".gz"), + log: + "logs/GenomeUnzip.log", + shell: + "gzip -d -c {input} > {output} 2> {log}" + + +rule GenomeIndex: + """ + Index the reference genome with samtools + """ + input: + config["reference"]["genome"].rstrip(".gz"), + output: + config["reference"]["genome"].rstrip(".gz") + ".fai", + log: + "logs/GenomeIndex.log", + wrapper: + "v1.15.0/bio/samtools/faidx" + + +rule IndexBams: + """ + Index bams with samtools + """ + input: + bam="results/alignments/{sample}.star.bam" if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam", + output: + idx="results/alignments/{sample}.star.bam.bai" if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam.bai", + log: + "logs/IndexBams/{sample}.log", + wrapper: + "v1.15.0/bio/samtools/index" + + +rule RestrictToSNPs: + """" + Filter out indels + """ + input: + "results/variantAnalysis/vcfs/haplotypecaller/variants.{contig}.vcf" if config['pipeline'] == 'parabricks' else "results/variantAnalysis/vcfs/freebayes/variants.{contig}.vcf", + output: + temp("results/variantAnalysis/vcfs/variants.{contig}.vcf"), + log: + "logs/bcftoolsView/{contig}.log", + params: + extra="-v snps", + wrapper: + "v1.15.0/bio/bcftools/view" diff --git a/workflow/rules/variantAnalysis.smk b/workflow/rules/variantAnalysis.smk index 0aca014..8600579 100644 --- a/workflow/rules/variantAnalysis.smk +++ b/workflow/rules/variantAnalysis.smk @@ -72,7 +72,7 @@ rule pca_notebook: cp {output.nb} {output.docs_nb} 2>> {log} """ -rule SummaryStatistics_notebook: +rule geneticDiversity_notebook: """ Calculate population genetic summary statistics on genotype data """ @@ -101,7 +101,7 @@ rule SummaryStatistics_notebook: dataset=config["dataset"], config_path = configpath, ploidy=config["VariantAnalysis"]["ploidy"], - missingprop=config["VariantAnalysis"]["summaryStatistics"]["missingness"], + missingprop=config["VariantAnalysis"]["geneticDiversity"]["missingness"], qualflt=30, shell: """ @@ -135,11 +135,11 @@ rule WindowedFstPBS_notebook: PBS=( expand( "results/variantAnalysis/selection/pbs/{wsize}/{pbscomp}.PBS.{contig}.svg", - pbscomp=config["VariantAnalysis"]["selection"]["pbs"]["contrasts"], + pbscomp=config["VariantAnalysis"]["selection"]["population-branch-statistic"]["contrasts"], contig=config["contigs"], wsize=['1000snp_window', '2000snp_window', '5000snp_window'], ) - if config["VariantAnalysis"]["selection"]["pbs"]["activate"] + if config["VariantAnalysis"]["selection"]["population-branch-statistic"]["activate"] else [] ), log: @@ -149,8 +149,8 @@ rule WindowedFstPBS_notebook: params: dataset=config["dataset"], config = configpath, - pbs=config["VariantAnalysis"]["selection"]["pbs"]["activate"], - pbscomps=config["VariantAnalysis"]["selection"]["pbs"]["contrasts"], + pbs=config["VariantAnalysis"]["selection"]["population-branch-statistic"]["activate"], + pbscomps=config["VariantAnalysis"]["selection"]["population-branch-statistic"]["contrasts"], ploidy=config["VariantAnalysis"]["ploidy"], missingprop=config["VariantAnalysis"]["selection"]["missingness"], qualflt=30, @@ -188,8 +188,8 @@ rule PerGeneFstPBSDxyPi: params: dataset=config["dataset"], DEcontrasts=config["contrasts"], - pbs=config["VariantAnalysis"]["selection"]["pbs"]["activate"], - pbscomps=config["VariantAnalysis"]["selection"]["pbs"]["contrasts"], + pbs=config["VariantAnalysis"]["selection"]["population-branch-statistic"]["activate"], + pbscomps=config["VariantAnalysis"]["selection"]["population-branch-statistic"]["contrasts"], contigs=config["contigs"], ploidy=config["VariantAnalysis"]["ploidy"], missingprop=config["VariantAnalysis"]["selection"]["missingness"], diff --git a/workflow/rules/variantCalling.smk b/workflow/rules/variantCalling.smk deleted file mode 100644 index 76cc956..0000000 --- a/workflow/rules/variantCalling.smk +++ /dev/null @@ -1,49 +0,0 @@ -chunks = np.arange(1, config["VariantAnalysis"]["chunks"]) - -rule GenerateFreebayesParams: - input: - ref_idx=config["reference"]["genome"].rstrip(".gz"), - index=config["reference"]["genome"].rstrip(".gz") + ".fai", - bams=expand("results/alignments/{sample}.hisat2.bam", sample=samples), - output: - bamlist="results/alignments/bam.list", - pops="results/alignments/populations.tsv", - regions=expand( - "resources/regions/genome.{contig}.region.{i}.bed", - contig=config["contigs"], - i=chunks, - ), - log: - "logs/GenerateFreebayesParams.log", - params: - metadata=config["metadata"], - contigs=config["contigs"], - chunks=config["VariantAnalysis"]["chunks"], - conda: - "../envs/diffexp.yaml" - script: - "../scripts/GenerateFreebayesParams.R" - - -rule VariantCallingFreebayes: - """ - Run freebayes on chunks of the genome, splitting the samples by population (strain) - """ - input: - bams=expand("results/alignments/{sample}.hisat2.bam", sample=samples), - index=expand("results/alignments/{sample}.hisat2.bam.bai", sample=samples), - ref=config["reference"]["genome"].rstrip(".gz"), - samples=ancient("results/alignments/bam.list"), - pops=ancient("results/alignments/populations.tsv"), - regions=ancient("resources/regions/genome.{contig}.region.{i}.bed"), - output: - temp("results/variantAnalysis/vcfs/freebayes/{contig}/variants.{i}.vcf"), - log: - "logs/VariantCallingFreebayes/{contig}.{i}.log", - params: - ploidy=config["VariantAnalysis"]["ploidy"], - conda: - "../envs/variants.yaml" - threads: 1 - shell: - "freebayes -f {input.ref} -t {input.regions} --ploidy {params.ploidy} --populations {input.pops} --pooled-discrete --use-best-n-alleles 5 -L {input.samples} > {output} 2> {log}" diff --git a/workflow/rules/variantsOfInterest.smk b/workflow/rules/variantsOfInterest.smk index 4f96fab..36c3ea6 100644 --- a/workflow/rules/variantsOfInterest.smk +++ b/workflow/rules/variantsOfInterest.smk @@ -3,8 +3,8 @@ rule mpileupVariantsOfInterest: Get allele count tables of variants of choice (specified in config file ("IRmutations.tsv")) """ input: - bam="results/alignments/{sample}.hisat2.bam" if config['VariantAnalysis']['caller'] == 'freebayes' else "results/alignments/{sample}.star.bam", - idx="results/alignments/{sample}.hisat2.bam.bai" if config['VariantAnalysis']['caller'] == 'freebayes' else "results/alignments/{sample}.star.bam.bai", + bam="results/alignments/{sample}.star.bam" if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam", + idx="results/alignments/{sample}.star.bam.bai" if config['pipeline'] == 'parabricks' else "results/alignments/{sample}.hisat2.bam.bai", output: "results/variantAnalysis/variantsOfInterest/counts/{sample}_{mut}_allele_counts.tsv", conda: