From fe5ff583eccb55533cdb9538767e93058e20362a Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 25 Aug 2023 16:48:34 +0200 Subject: [PATCH 01/34] fix: tracking of pre-processing in meta to handle downstream handling --- modules/local/cat.nf | 37 +++++++------ modules/local/quast_bins.nf | 9 ++-- modules/nf-core/gtdbtk/classifywf/main.nf | 63 ++++++++++++++--------- subworkflows/local/binning_refinement.nf | 15 ++++-- subworkflows/local/depths.nf | 8 ++- subworkflows/local/gtdbtk.nf | 7 ++- workflows/mag.nf | 33 ++++++++++-- 7 files changed, 115 insertions(+), 57 deletions(-) diff --git a/modules/local/cat.nf b/modules/local/cat.nf index 8bf77cb0..be4375f3 100644 --- a/modules/local/cat.nf +++ b/modules/local/cat.nf @@ -1,5 +1,5 @@ process CAT { - tag "${meta.assembler}-${meta.binner}-${meta.id}-${db_name}" + tag "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}-${db_name}" conda "bioconda::cat=4.6 bioconda::diamond=2.0.6" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? @@ -11,29 +11,32 @@ process CAT { tuple val(db_name), path("database/*"), path("taxonomy/*") output: - path("*.names.txt.gz") , emit: tax_classification - path("raw/*.ORF2LCA.txt.gz") , emit: orf2lca - path("raw/*.predicted_proteins.faa.gz"), emit: faa - path("raw/*.predicted_proteins.gff.gz"), emit: gff - path("raw/*.log") , emit: log - path("raw/*.bin2classification.txt.gz"), emit: tax_classification_taxids - path "versions.yml" , emit: versions + path("*.ORF2LCA.names.txt.gz") , emit: orf2lca_classification + path("*.bin2classification.names.txt.gz") , emit: tax_classification_names + path("raw/*.ORF2LCA.txt.gz") , emit: orf2lca + path("raw/*.predicted_proteins.faa.gz") , emit: faa + path("raw/*.predicted_proteins.gff.gz") , emit: gff + path("raw/*.log") , emit: log + path("raw/*.bin2classification.txt.gz") , emit: tax_classification_taxids + path "versions.yml" , emit: versions script: def official_taxonomy = params.cat_official_taxonomy ? "--only_official" : "" + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" """ - CAT bins -b "bins/" -d database/ -t taxonomy/ -n "${task.cpus}" -s .fa --top 6 -o "${meta.assembler}-${meta.binner}-${meta.id}" --I_know_what_Im_doing - CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.names.txt" -t taxonomy/ ${official_taxonomy} - CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.names.txt" -t taxonomy/ ${official_taxonomy} + CAT bins $args -b "bins/" -d database/ -t taxonomy/ -n "${task.cpus}" -s .fa --top 6 -o "${prefix}" --I_know_what_Im_doing + CAT add_names -i "${prefix}.ORF2LCA.txt" -o "${prefix}.ORF2LCA.names.txt" -t taxonomy/ ${official_taxonomy} + CAT add_names -i "${prefix}.bin2classification.txt" -o "${prefix}.bin2classification.names.txt" -t taxonomy/ ${official_taxonomy} mkdir raw mv *.ORF2LCA.txt *.predicted_proteins.faa *.predicted_proteins.gff *.log *.bin2classification.txt raw/ - gzip "raw/${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.txt" \ - "raw/${meta.assembler}-${meta.binner}-${meta.id}.concatenated.predicted_proteins.faa" \ - "raw/${meta.assembler}-${meta.binner}-${meta.id}.concatenated.predicted_proteins.gff" \ - "raw/${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.txt" \ - "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.names.txt" \ - "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.names.txt" + gzip "raw/${prefix}.ORF2LCA.txt" \ + "raw/${prefix}.concatenated.predicted_proteins.faa" \ + "raw/${prefix}.concatenated.predicted_proteins.gff" \ + "raw/${prefix}.bin2classification.txt" \ + "${prefix}.ORF2LCA.names.txt" \ + "${prefix}.bin2classification.names.txt" cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/quast_bins.nf b/modules/local/quast_bins.nf index 5d43815a..31d521c8 100644 --- a/modules/local/quast_bins.nf +++ b/modules/local/quast_bins.nf @@ -1,5 +1,5 @@ process QUAST_BINS { - tag "${meta.assembler}-${meta.binner}-${meta.id}" + tag "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" conda "bioconda::quast=5.0.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? @@ -15,15 +15,16 @@ process QUAST_BINS { path "versions.yml" , emit: versions script: + def prefix = task.ext.prefix ?: "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" """ BINS=\$(echo \"$bins\" | sed 's/[][]//g') IFS=', ' read -r -a bins <<< \"\$BINS\" for bin in \"\${bins[@]}\"; do metaquast.py --threads "${task.cpus}" --max-ref-number 0 --rna-finding --gene-finding -l "\${bin}" "\${bin}" -o "QUAST/\${bin}" - if ! [ -f "QUAST/${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-quast_summary.tsv" ]; then - cp "QUAST/\${bin}/transposed_report.tsv" "QUAST/${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-quast_summary.tsv" + if ! [ -f "QUAST/${prefix}-quast_summary.tsv" ]; then + cp "QUAST/\${bin}/transposed_report.tsv" "QUAST/${prefix}-quast_summary.tsv" else - tail -n +2 "QUAST/\${bin}/transposed_report.tsv" >> "QUAST/${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-quast_summary.tsv" + tail -n +2 "QUAST/\${bin}/transposed_report.tsv" >> "QUAST/${prefix}-quast_summary.tsv" fi done diff --git a/modules/nf-core/gtdbtk/classifywf/main.nf b/modules/nf-core/gtdbtk/classifywf/main.nf index 0b6b76cc..1e0c99db 100644 --- a/modules/nf-core/gtdbtk/classifywf/main.nf +++ b/modules/nf-core/gtdbtk/classifywf/main.nf @@ -1,5 +1,5 @@ process GTDBTK_CLASSIFYWF { - tag "${meta.assembler}-${meta.id}" + tag "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" label 'process_medium' // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. @@ -13,15 +13,15 @@ process GTDBTK_CLASSIFYWF { tuple val(db_name), path("database/*") output: - path "gtdbtk.${meta.assembler}-${meta.id}.*.summary.tsv" , emit: summary - path "gtdbtk.${meta.assembler}-${meta.id}.*.classify.tree.gz" , emit: tree - path "gtdbtk.${meta.assembler}-${meta.id}.*.markers_summary.tsv", emit: markers - path "gtdbtk.${meta.assembler}-${meta.id}.*.msa.fasta.gz" , emit: msa - path "gtdbtk.${meta.assembler}-${meta.id}.*.user_msa.fasta" , emit: user_msa - path "gtdbtk.${meta.assembler}-${meta.id}.*.filtered.tsv" , emit: filtered - path "gtdbtk.${meta.assembler}-${meta.id}.log" , emit: log - path "gtdbtk.${meta.assembler}-${meta.id}.warnings.log" , emit: warnings - path "gtdbtk.${meta.assembler}-${meta.id}.failed_genomes.tsv" , emit: failed + path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.summary.tsv" , emit: summary + path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.classify.tree.gz" , emit: tree + path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.markers_summary.tsv", emit: markers + path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.msa.fasta.gz" , emit: msa + path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.user_msa.fasta.gz" , emit: user_msa + path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.filtered.tsv" , emit: filtered + path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.log" , emit: log + path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.warnings.log" , emit: warnings + path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.failed_genomes.tsv" , emit: failed path "versions.yml" , emit: versions when: @@ -30,6 +30,7 @@ process GTDBTK_CLASSIFYWF { script: def args = task.ext.args ?: '' def pplacer_scratch = params.gtdbtk_pplacer_scratch ? "--scratch_dir pplacer_tmp" : "" + def prefix = task.ext.prefix ?: "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" """ export GTDBTK_DATA_PATH="\${PWD}/database" @@ -40,7 +41,7 @@ process GTDBTK_CLASSIFYWF { gtdbtk classify_wf \\ $args \\ --genome_dir bins \\ - --prefix "gtdbtk.${meta.assembler}-${meta.id}" \\ + --prefix "gtdbtk.${prefix}" \\ --out_dir "\${PWD}" \\ --cpus $task.cpus \\ --pplacer_cpus $params.gtdbtk_pplacer_cpus \\ @@ -48,9 +49,25 @@ process GTDBTK_CLASSIFYWF { --min_perc_aa $params.gtdbtk_min_perc_aa \\ --min_af $params.gtdbtk_min_af - gzip "gtdbtk.${meta.assembler}-${meta.id}".*.classify.tree "gtdbtk.${meta.assembler}-${meta.id}".*.msa.fasta - mv gtdbtk.log "gtdbtk.${meta.assembler}-${meta.id}.log" - mv gtdbtk.warnings.log "gtdbtk.${meta.assembler}-${meta.id}.warnings.log" + mv classify/gtdbtk.${prefix}.*.classify.tree \\ + classify/gtdbtk.${prefix}.*.summary.tsv \\ + . + + mv identify/gtdbtk.${prefix}.*.markers_summary.tsv \\ + identify/gtdbtk.${prefix}.failed_genomes.tsv \\ + . + + mv align/gtdbtk.${prefix}.*.msa.fasta.gz \\ + align/gtdbtk.${prefix}.*.user_msa.fasta.gz \\ + align/gtdbtk.${prefix}.*.filtered.tsv \\ + . + + gzip gtdbtk.${prefix}.*.classify.tree + + + + mv gtdbtk.log "gtdbtk.${prefix}.log" + mv gtdbtk.warnings.log "gtdbtk.${prefix}.warnings.log" cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -62,15 +79,15 @@ process GTDBTK_CLASSIFYWF { def VERSION = '2.1.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. """ - touch gtdbtk.${meta.assembler}-${meta.id}.stub.summary.tsv - touch gtdbtk.${meta.assembler}-${meta.id}.stub.classify.tree.gz - touch gtdbtk.${meta.assembler}-${meta.id}.stub.markers_summary.tsv - touch gtdbtk.${meta.assembler}-${meta.id}.stub.msa.fasta.gz - touch gtdbtk.${meta.assembler}-${meta.id}.stub.user_msa.fasta - touch gtdbtk.${meta.assembler}-${meta.id}.stub.filtered.tsv - touch gtdbtk.${meta.assembler}-${meta.id}.log - touch gtdbtk.${meta.assembler}-${meta.id}.warnings.log - touch gtdbtk.${meta.assembler}-${meta.id}.failed_genomes.tsv + touch gtdbtk.${prefix}.stub.summary.tsv + touch gtdbtk.${prefix}.stub.classify.tree.gz + touch gtdbtk.${prefix}.stub.markers_summary.tsv + touch gtdbtk.${prefix}.stub.msa.fasta.gz + touch gtdbtk.${prefix}.stub.user_msa.fasta.gz + touch gtdbtk.${prefix}.stub.filtered.tsv + touch gtdbtk.${prefix}.log + touch gtdbtk.${prefix}.warnings.log + touch gtdbtk.${prefix}.failed_genomes.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/subworkflows/local/binning_refinement.nf b/subworkflows/local/binning_refinement.nf index eea8c76a..360bffaa 100644 --- a/subworkflows/local/binning_refinement.nf +++ b/subworkflows/local/binning_refinement.nf @@ -25,7 +25,7 @@ workflow BINNING_REFINEMENT { // everything here is either unclassified or a prokaryote ch_bins = bins .map { meta, bins -> - def meta_new = meta - meta.subMap('domain') + def meta_new = meta - meta.subMap(['domain','refinement']) [meta_new, bins] } .groupTuple() @@ -88,7 +88,7 @@ workflow BINNING_REFINEMENT { .map { meta, bins -> def domain_class = params.bin_domain_classification ? 'prokarya' : 'unclassified' - def meta_new = meta + [domain: domain_class] + def meta_new = meta + [refinement: 'dastool_refined', domain: domain_class] [ meta_new, bins ] } @@ -96,14 +96,21 @@ workflow BINNING_REFINEMENT { .map { meta, bins -> def domain_class = params.bin_domain_classification ? 'prokarya' : 'unclassified' - def meta_new = meta + [binner: 'DASTool', domain: domain_class] + def meta_new = meta + [refinement: 'dastool_refined', binner: 'DASTool', domain: domain_class] [ meta_new, bins ] } RENAME_POSTDASTOOL ( ch_input_for_renamedastool ) + refined_unbins = RENAME_POSTDASTOOL.out.refined_unbins + .map { + meta, bins -> + def meta_new = meta + [refinement: 'dastool_refined_unbinned'] + [meta_new, bins] + } + emit: refined_bins = ch_dastool_bins_newmeta - refined_unbins = RENAME_POSTDASTOOL.out.refined_unbins + refined_unbins = refined_unbins versions = ch_versions } diff --git a/subworkflows/local/depths.nf b/subworkflows/local/depths.nf index 140a809c..5afd6d83 100644 --- a/subworkflows/local/depths.nf +++ b/subworkflows/local/depths.nf @@ -23,17 +23,21 @@ workflow DEPTHS { main: ch_versions = Channel.empty() + bins_unbins.dump(tag: "DEPTH_bins_unbins", pretty: true) + depths.dump(tag: 'DEPTH_depths', pretty: true) + reads.dump(tag: 'DEPTH_reads', pretty: true) + // Compute bin depths for different samples (according to `binning_map_mode`) // Create a new meta joining key first, but copy meta so that // we retain the information about binners and domain classification ch_depth_input = bins_unbins .map { meta, bins -> - def meta_join = meta - meta.subMap('binner','domain') + def meta_join = meta - meta.subMap('binner','domain','refinement') [ meta_join, meta, bins ] } .combine( depths, by: 0 ) .map { meta_join, meta, bins, contig_depths_file -> - def meta_new = meta - meta.subMap('domain') + def meta_new = meta - meta.subMap('domain','refinement') [ meta_new, bins, contig_depths_file ] } .transpose() diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf index 21823962..8eea649d 100644 --- a/subworkflows/local/gtdbtk.nf +++ b/subworkflows/local/gtdbtk.nf @@ -74,6 +74,7 @@ workflow GTDBTK { error("Unsupported object given to --gtdb, database must be supplied as either a directory or a .tar.gz file!") } + ch_filtered_bins.passed.groupTuple().dump(tag: "ch_filtered_bins.passed.groupTuple", pretty: true) GTDBTK_CLASSIFYWF ( ch_filtered_bins.passed.groupTuple(), ch_db_for_gtdbtk @@ -82,8 +83,10 @@ workflow GTDBTK { GTDBTK_SUMMARY ( ch_filtered_bins.discarded.map{it[1]}.collect().ifEmpty([]), GTDBTK_CLASSIFYWF.out.summary.collect().ifEmpty([]), - GTDBTK_CLASSIFYWF.out.filtered.collect().ifEmpty([]), - GTDBTK_CLASSIFYWF.out.failed.collect().ifEmpty([]) + [], + // GTDBTK_CLASSIFYWF.out.filtered.collect().ifEmpty([]), + [] + // GTDBTK_CLASSIFYWF.out.failed.collect().ifEmpty([]) ) emit: diff --git a/workflows/mag.nf b/workflows/mag.nf index 62797405..21f12e55 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -735,6 +735,20 @@ workflow MAG { * DAS Tool: binning refinement */ + ch_binning_results_bins = ch_binning_results_bins + .map { meta, bins -> + def meta_new = meta + [refinement:'unrefined'] + [meta_new , bins] + } + + ch_binning_results_unbins = ch_binning_results_unbins + .map { meta, bins -> + def meta_new = meta + [refinement:'unrefined_unbinned'] + [meta_new, bins] + } + + + // If any two of the binners are both skipped at once, do not run because DAS_Tool needs at least one if ( params.refine_bins_dastool ) { ch_prokarya_bins_dastool = ch_binning_results_bins @@ -755,7 +769,14 @@ workflow MAG { } BINNING_REFINEMENT ( ch_contigs_for_binrefinement, ch_prokarya_bins_dastool ) - ch_refined_bins = ch_eukarya_bins_dastool.mix(BINNING_REFINEMENT.out.refined_bins) + ch_refined_bins = ch_eukarya_bins_dastool + .map{ meta, bins -> + def meta_new = meta + [refinement: 'eukaryote_unrefined'] + [meta_new, bins] + }.mix( BINNING_REFINEMENT.out.refined_bins) + + ch_refined_bins.dump(tag: 'ch_refined_bins', pretty: true) + ch_refined_unbins = BINNING_REFINEMENT.out.refined_unbins ch_versions = ch_versions.mix(BINNING_REFINEMENT.out.versions) @@ -837,17 +858,19 @@ workflow MAG { } ch_quast_bins_summary = Channel.empty() + ch_input_for_postbinning_bins_unbins.dump(tag: 'ch_input_for_postbinning_bins_unbins', pretty: true) if (!params.skip_quast){ ch_input_for_quast_bins = ch_input_for_postbinning_bins_unbins .groupTuple() .map { - meta, reads -> - def new_reads = reads.flatten() - [meta, new_reads] + meta, bins -> + def new_bins = bins.flatten() + [meta, new_bins] } QUAST_BINS ( ch_input_for_quast_bins ) ch_versions = ch_versions.mix(QUAST_BINS.out.versions.first()) + QUAST_BINS.out.quast_bin_summaries.collect().dump(tag: 'quast_bin_summaries_collect', pretty: true) QUAST_BINS_SUMMARY ( QUAST_BINS.out.quast_bin_summaries.collect() ) ch_quast_bins_summary = QUAST_BINS_SUMMARY.out.summary } @@ -868,7 +891,7 @@ workflow MAG { ch_cat_db ) CAT_SUMMARY( - CAT.out.tax_classification.collect() + CAT.out.tax_classification_names.collect() ) ch_versions = ch_versions.mix(CAT.out.versions.first()) ch_versions = ch_versions.mix(CAT_SUMMARY.out.versions) From 0ae55d2c699d342bbbe2ba5ad1b6849c6a2eb256 Mon Sep 17 00:00:00 2001 From: maxibor Date: Mon, 28 Aug 2023 13:43:56 +0200 Subject: [PATCH 02/34] update gtdb-tk version --- modules/nf-core/gtdbtk/classifywf/main.nf | 36 +++++++++++------------ nextflow.config | 1 + nextflow_schema.json | 4 +++ subworkflows/local/gtdbtk.nf | 4 ++- workflows/mag.nf | 4 ++- 5 files changed, 28 insertions(+), 21 deletions(-) diff --git a/modules/nf-core/gtdbtk/classifywf/main.nf b/modules/nf-core/gtdbtk/classifywf/main.nf index 1e0c99db..31647828 100644 --- a/modules/nf-core/gtdbtk/classifywf/main.nf +++ b/modules/nf-core/gtdbtk/classifywf/main.nf @@ -3,25 +3,26 @@ process GTDBTK_CLASSIFYWF { label 'process_medium' // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. - conda "bioconda::gtdbtk=2.1.1" + conda "bioconda::gtdbtk=2.3.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/gtdbtk:2.1.1--pyhdfd78af_1' : - 'biocontainers/gtdbtk:2.1.1--pyhdfd78af_1' }" + 'https://depot.galaxyproject.org/singularity/gtdbtk:2.3.2--pyhdfd78af_0' : + 'biocontainers/gtdbtk:2.3.2--pyhdfd78af_0' }" input: tuple val(meta), path("bins/*") tuple val(db_name), path("database/*") + path(mash_db) output: path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.summary.tsv" , emit: summary - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.classify.tree.gz" , emit: tree - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.markers_summary.tsv", emit: markers - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.msa.fasta.gz" , emit: msa - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.user_msa.fasta.gz" , emit: user_msa - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.filtered.tsv" , emit: filtered + path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.classify.tree.gz" , emit: tree, optional: true + path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.markers_summary.tsv", emit: markers, optional: true + path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.msa.fasta.gz" , emit: msa, optional: true + path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.user_msa.fasta.gz" , emit: user_msa, optional: true + path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.filtered.tsv" , emit: filtered, optional: true path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.log" , emit: log path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.warnings.log" , emit: warnings - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.failed_genomes.tsv" , emit: failed + path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.failed_genomes.tsv" , emit: failed, optional: true path "versions.yml" , emit: versions when: @@ -31,6 +32,7 @@ process GTDBTK_CLASSIFYWF { def args = task.ext.args ?: '' def pplacer_scratch = params.gtdbtk_pplacer_scratch ? "--scratch_dir pplacer_tmp" : "" def prefix = task.ext.prefix ?: "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" + def mash_mode = mash_db ? "--mash_db ${mash_db}" : "--skip_ani_screen" """ export GTDBTK_DATA_PATH="\${PWD}/database" @@ -44,25 +46,21 @@ process GTDBTK_CLASSIFYWF { --prefix "gtdbtk.${prefix}" \\ --out_dir "\${PWD}" \\ --cpus $task.cpus \\ - --pplacer_cpus $params.gtdbtk_pplacer_cpus \\ + $mash_mode \\ $pplacer_scratch \\ --min_perc_aa $params.gtdbtk_min_perc_aa \\ --min_af $params.gtdbtk_min_af - mv classify/gtdbtk.${prefix}.*.classify.tree \\ - classify/gtdbtk.${prefix}.*.summary.tsv \\ + mv classify/* \\ . - mv identify/gtdbtk.${prefix}.*.markers_summary.tsv \\ - identify/gtdbtk.${prefix}.failed_genomes.tsv \\ + mv identify/* \\ . - mv align/gtdbtk.${prefix}.*.msa.fasta.gz \\ - align/gtdbtk.${prefix}.*.user_msa.fasta.gz \\ - align/gtdbtk.${prefix}.*.filtered.tsv \\ + mv align/* \\ . - - gzip gtdbtk.${prefix}.*.classify.tree + + find -name gtdbtk.${prefix}.*.classify.tree | xargs -r gzip # do not fail if .tree is missing diff --git a/nextflow.config b/nextflow.config index 336d378f..0e4ae4ee 100644 --- a/nextflow.config +++ b/nextflow.config @@ -87,6 +87,7 @@ params { save_cat_db = false skip_gtdbtk = false gtdb_db = "https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/auxillary_files/gtdbtk_r214_data.tar.gz" + gtdb_mash = null gtdbtk_min_completeness = 50.0 gtdbtk_max_contamination = 10.0 gtdbtk_min_perc_aa = 10 diff --git a/nextflow_schema.json b/nextflow_schema.json index 4de8c31e..3dd79f57 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -520,6 +520,10 @@ "description": "Specify the location of a GTDBTK database. Can be either an uncompressed directory or a `.tar.gz` archive. If not specified will be downloaded for you when GTDBTK or binning QC is not skipped.", "default": "https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/auxillary_files/gtdbtk_r214_data.tar.gz" }, + "gtdb_mash": { + "type": "string", + "description": "Specify the location of a GTDBTK mash database. If missing, GTDB-Tk will skip the ani_screening step" + }, "gtdbtk_min_completeness": { "type": "number", "default": 50, diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf index 8eea649d..e4eb0307 100644 --- a/subworkflows/local/gtdbtk.nf +++ b/subworkflows/local/gtdbtk.nf @@ -12,6 +12,7 @@ workflow GTDBTK { busco_summary // channel: path checkm_summary // channel: path gtdb // channel: path + gtdb_mash main: // Filter bins: classify only medium & high quality MAGs @@ -77,7 +78,8 @@ workflow GTDBTK { ch_filtered_bins.passed.groupTuple().dump(tag: "ch_filtered_bins.passed.groupTuple", pretty: true) GTDBTK_CLASSIFYWF ( ch_filtered_bins.passed.groupTuple(), - ch_db_for_gtdbtk + ch_db_for_gtdbtk, + gtdb_mash ) GTDBTK_SUMMARY ( diff --git a/workflows/mag.nf b/workflows/mag.nf index 21f12e55..4fe9b96f 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -209,6 +209,7 @@ gtdb = ( params.skip_binqc || params.skip_gtdbtk ) ? false : params.gtdb_db if (gtdb) { gtdb = file( "${gtdb}", checkIfExists: true) + gtdb_mash = params.gtdb_mash ? file("${params.gtdb_mash}", checkIfExists: true) : [] } else { gtdb = [] } @@ -914,7 +915,8 @@ workflow MAG { ch_gtdb_bins, ch_busco_summary, ch_checkm_summary, - gtdb + gtdb, + gtdb_mash ) ch_versions = ch_versions.mix(GTDBTK.out.versions.first()) ch_gtdbtk_summary = GTDBTK.out.summary From ecb7e990cb0cc66b6f6d7d50cc331f6f1ccb0557 Mon Sep 17 00:00:00 2001 From: maxibor Date: Mon, 28 Aug 2023 14:56:47 +0200 Subject: [PATCH 03/34] update CAT --- modules/local/cat.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/local/cat.nf b/modules/local/cat.nf index be4375f3..bf1f4368 100644 --- a/modules/local/cat.nf +++ b/modules/local/cat.nf @@ -1,10 +1,10 @@ process CAT { tag "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}-${db_name}" - conda "bioconda::cat=4.6 bioconda::diamond=2.0.6" + conda "bioconda::cat=5.2.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-75e2a26f10cbf3629edf2d1600db3fed5ebe6e04:eae321284604f7dabbdf121e3070bda907b91266-0' : - 'quay.io/biocontainers/mulled-v2-75e2a26f10cbf3629edf2d1600db3fed5ebe6e04:eae321284604f7dabbdf121e3070bda907b91266-0' }" + 'https://depot.galaxyproject.org/singularity/cat:5.2.3--hdfd78af_1' : + 'quay.io/biocontainers/cat:5.2.3--hdfd78af_1' }" input: tuple val(meta), path("bins/*") From 9069426849c32ae8d5ca3c6ab546a4fbf837a127 Mon Sep 17 00:00:00 2001 From: maxibor Date: Mon, 28 Aug 2023 15:03:55 +0200 Subject: [PATCH 04/34] cleanup: remove dump debugging --- workflows/mag.nf | 4 ---- 1 file changed, 4 deletions(-) diff --git a/workflows/mag.nf b/workflows/mag.nf index 21f12e55..a86f741f 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -775,8 +775,6 @@ workflow MAG { [meta_new, bins] }.mix( BINNING_REFINEMENT.out.refined_bins) - ch_refined_bins.dump(tag: 'ch_refined_bins', pretty: true) - ch_refined_unbins = BINNING_REFINEMENT.out.refined_unbins ch_versions = ch_versions.mix(BINNING_REFINEMENT.out.versions) @@ -858,7 +856,6 @@ workflow MAG { } ch_quast_bins_summary = Channel.empty() - ch_input_for_postbinning_bins_unbins.dump(tag: 'ch_input_for_postbinning_bins_unbins', pretty: true) if (!params.skip_quast){ ch_input_for_quast_bins = ch_input_for_postbinning_bins_unbins .groupTuple() @@ -870,7 +867,6 @@ workflow MAG { QUAST_BINS ( ch_input_for_quast_bins ) ch_versions = ch_versions.mix(QUAST_BINS.out.versions.first()) - QUAST_BINS.out.quast_bin_summaries.collect().dump(tag: 'quast_bin_summaries_collect', pretty: true) QUAST_BINS_SUMMARY ( QUAST_BINS.out.quast_bin_summaries.collect() ) ch_quast_bins_summary = QUAST_BINS_SUMMARY.out.summary } From 724c28d5af2451a8bc67b98723790a8eeabccc94 Mon Sep 17 00:00:00 2001 From: maxibor Date: Mon, 28 Aug 2023 16:45:27 +0200 Subject: [PATCH 05/34] feat: check named results also includes refinement info --- conf/modules.config | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index fcb06d9a..0b1ff8ad 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -379,8 +379,8 @@ process { } withName: 'CHECKM_LINEAGEWF' { - tag = { "${meta.assembler}-${meta.binner}-${meta.id}" } - ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.id}_wf" } + tag = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" } + ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}_wf" } publishDir = [ path: { "${params.outdir}/GenomeBinning/QC/CheckM" }, mode: params.publish_dir_mode, @@ -389,7 +389,7 @@ process { } withName: 'CHECKM_QA' { - ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.id}_qa" } + ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}_qa" } ext.args = "-o 2 --tab_table" publishDir = [ path: { "${params.outdir}/GenomeBinning/QC/CheckM" }, From 49474625c1b46eb5ab16a6a7dc00e38e6cb7e0cc Mon Sep 17 00:00:00 2001 From: maxibor Date: Mon, 28 Aug 2023 16:45:46 +0200 Subject: [PATCH 06/34] cleanup: remove dumps --- subworkflows/local/depths.nf | 3 --- subworkflows/local/gtdbtk.nf | 1 - 2 files changed, 4 deletions(-) diff --git a/subworkflows/local/depths.nf b/subworkflows/local/depths.nf index 5afd6d83..004cf10d 100644 --- a/subworkflows/local/depths.nf +++ b/subworkflows/local/depths.nf @@ -23,9 +23,6 @@ workflow DEPTHS { main: ch_versions = Channel.empty() - bins_unbins.dump(tag: "DEPTH_bins_unbins", pretty: true) - depths.dump(tag: 'DEPTH_depths', pretty: true) - reads.dump(tag: 'DEPTH_reads', pretty: true) // Compute bin depths for different samples (according to `binning_map_mode`) // Create a new meta joining key first, but copy meta so that diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf index 8eea649d..3bb532f6 100644 --- a/subworkflows/local/gtdbtk.nf +++ b/subworkflows/local/gtdbtk.nf @@ -74,7 +74,6 @@ workflow GTDBTK { error("Unsupported object given to --gtdb, database must be supplied as either a directory or a .tar.gz file!") } - ch_filtered_bins.passed.groupTuple().dump(tag: "ch_filtered_bins.passed.groupTuple", pretty: true) GTDBTK_CLASSIFYWF ( ch_filtered_bins.passed.groupTuple(), ch_db_for_gtdbtk From 9a3de4fcee4a0051c964179a822dc979c99fec4f Mon Sep 17 00:00:00 2001 From: maxibor Date: Wed, 30 Aug 2023 13:47:29 +0200 Subject: [PATCH 07/34] feat: update gtdb-tk to 2.3.2 --- modules/nf-core/gtdbtk/classifywf/main.nf | 42 ++++++++++------------ modules/nf-core/gtdbtk/classifywf/meta.yml | 4 +++ 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/modules/nf-core/gtdbtk/classifywf/main.nf b/modules/nf-core/gtdbtk/classifywf/main.nf index 31647828..d490e730 100644 --- a/modules/nf-core/gtdbtk/classifywf/main.nf +++ b/modules/nf-core/gtdbtk/classifywf/main.nf @@ -1,5 +1,5 @@ process GTDBTK_CLASSIFYWF { - tag "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" + tag "${prefix}" label 'process_medium' // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. @@ -14,16 +14,16 @@ process GTDBTK_CLASSIFYWF { path(mash_db) output: - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.summary.tsv" , emit: summary - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.classify.tree.gz" , emit: tree, optional: true - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.markers_summary.tsv", emit: markers, optional: true - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.msa.fasta.gz" , emit: msa, optional: true - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.user_msa.fasta.gz" , emit: user_msa, optional: true - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.filtered.tsv" , emit: filtered, optional: true - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.log" , emit: log - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.warnings.log" , emit: warnings - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.failed_genomes.tsv" , emit: failed, optional: true - path "versions.yml" , emit: versions + path "gtdbtk.${prefix}.*.summary.tsv" , emit: summary + path "gtdbtk.${prefix}.*.classify.tree.gz" , emit: tree, optional: true + path "gtdbtk.${prefix}.*.markers_summary.tsv", emit: markers, optional: true + path "gtdbtk.${prefix}.*.msa.fasta.gz" , emit: msa, optional: true + path "gtdbtk.${prefix}.*.user_msa.fasta.gz" , emit: user_msa, optional: true + path "gtdbtk.${prefix}.*.filtered.tsv" , emit: filtered, optional: true + path "gtdbtk.${prefix}.failed_genomes.tsv" , emit: failed, optional: true + path "gtdbtk.${prefix}.log" , emit: log + path "gtdbtk.${prefix}.warnings.log" , emit: warnings + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -31,8 +31,8 @@ process GTDBTK_CLASSIFYWF { script: def args = task.ext.args ?: '' def pplacer_scratch = params.gtdbtk_pplacer_scratch ? "--scratch_dir pplacer_tmp" : "" - def prefix = task.ext.prefix ?: "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" def mash_mode = mash_db ? "--mash_db ${mash_db}" : "--skip_ani_screen" + prefix = task.ext.prefix ?: "${meta.id}" """ export GTDBTK_DATA_PATH="\${PWD}/database" @@ -51,22 +51,18 @@ process GTDBTK_CLASSIFYWF { --min_perc_aa $params.gtdbtk_min_perc_aa \\ --min_af $params.gtdbtk_min_af - mv classify/* \\ - . + mv classify/* . - mv identify/* \\ - . + mv identify/* . - mv align/* \\ - . - - find -name gtdbtk.${prefix}.*.classify.tree | xargs -r gzip # do not fail if .tree is missing - - + mv align/* .\ mv gtdbtk.log "gtdbtk.${prefix}.log" + mv gtdbtk.warnings.log "gtdbtk.${prefix}.warnings.log" + find -name gtdbtk.${prefix}.*.classify.tree | xargs -r gzip # do not fail if .tree is missing + cat <<-END_VERSIONS > versions.yml "${task.process}": gtdbtk: \$(echo \$(gtdbtk --version -v 2>&1) | sed "s/gtdbtk: version //; s/ Copyright.*//") @@ -74,7 +70,7 @@ process GTDBTK_CLASSIFYWF { """ stub: - def VERSION = '2.1.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + def VERSION = '2.3.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. """ touch gtdbtk.${prefix}.stub.summary.tsv diff --git a/modules/nf-core/gtdbtk/classifywf/meta.yml b/modules/nf-core/gtdbtk/classifywf/meta.yml index 4e7ec5f1..f3e51d3e 100644 --- a/modules/nf-core/gtdbtk/classifywf/meta.yml +++ b/modules/nf-core/gtdbtk/classifywf/meta.yml @@ -31,6 +31,10 @@ input: type: file description: The local copy of the taxonomic database used by GTDB-tk (unzipped copy) pattern: "*" + - mash_db: + type: file + description: The local copy of the Mash sketch database used by GTDB-tk (optional) + pattern: "*" output: - meta: From 0886ca0387d84aff99c038c77a9f311b6cface74 Mon Sep 17 00:00:00 2001 From: Maxime Borry Date: Fri, 1 Sep 2023 11:19:44 +0000 Subject: [PATCH 08/34] doc: update output filenames --- docs/output.md | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/docs/output.md b/docs/output.md index 902d0f12..83561fa8 100644 --- a/docs/output.md +++ b/docs/output.md @@ -425,7 +425,7 @@ By default, only the raw bins (and unbinned contigs) from the actual binning met ⚠️ Due to ability to perform downstream QC of both raw and refined bins in parallel (via `--postbinning_input)`, bin names in DAS Tools's `*_allBins.eval` file will include `Refined`. However for this particular file, they _actually_ refer to the 'raw' input bins. The pipeline renames the input files prior to running DASTool to ensure they can be disambiguated from the original bin files in the downstream QC steps. -### Tiara +### Tiara Tiara is a contig classifier that identifies the domain (prokarya, eukarya) of contigs within an assembly. This is used in this pipeline to rapidly and with few resources identify the most likely domain classification of each bin or unbin based on its contig identities. @@ -476,6 +476,7 @@ For each bin or refined bin the median sequencing depth is computed based on the - `predicted_genes/[assembler]-[bin].rna.gff`: Contig positions for rRNA genes in gff version 3 format - `predicted_genes/barrnap.log`: Barrnap log file (ribosomal RNA predictor) - `GenomeBinning/QC/` + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]-quast_summary.tsv`: QUAST output summarized per sample/condition. - `quast_summary.tsv`: QUAST output for all bins summarized @@ -531,9 +532,9 @@ By default, nf-core/mag runs CheckM with the `check_lineage` workflow that place Output files - `GenomeBinning/QC/CheckM/` - - `[assembler]-[binner]-[sample/group]_qa.txt`: Detailed statistics about bins informing completeness and contamamination scores (output of `checkm qa`). This should normally be your main file to use to evaluate your results. - - `[assembler]-[binner]-[sample/group]_wf.tsv`: Overall summary file for completeness and contamination (output of `checkm lineage_wf`). - - `[assembler]-[binner]-[sample/group]/`: intermediate files for CheckM results, including CheckM generated annotations, log, lineage markers etc. + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]_qa.txt`: Detailed statistics about bins informing completeness and contamamination scores (output of `checkm qa`). This should normally be your main file to use to evaluate your results. + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]_wf.tsv`: Overall summary file for completeness and contamination (output of `checkm lineage_wf`). + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/`: intermediate files for CheckM results, including CheckM generated annotations, log, lineage markers etc. - `checkm_summary.tsv`: A summary table of the CheckM results for all bins (output of `checkm qa`). @@ -581,14 +582,14 @@ If `--gunc_save_db` is specified, the output directory will also contain the req Output files - `Taxonomy/CAT/[assembler]/[binner]/` - - `[assembler]-[binner]-[sample/group].ORF2LCA.names.txt.gz`: Tab-delimited files containing the lineage of each contig, with full lineage names - - `[assembler]-[binner]-[sample/group].bin2classification.names.txt.gz`: Taxonomy classification of the genome bins, with full lineage names + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].ORF2LCA.names.txt.gz`: Tab-delimited files containing the lineage of each contig, with full lineage names + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].bin2classification.names.txt.gz`: Taxonomy classification of the genome bins, with full lineage names - `Taxonomy/CAT/[assembler]/[binner]/raw/` - - `[assembler]-[binner]-[sample/group].concatenated.predicted_proteins.faa.gz`: Predicted protein sequences for each genome bin, in fasta format - - `[assembler]-[binner]-[sample/group].concatenated.predicted_proteins.gff.gz`: Predicted protein features for each genome bin, in gff format - - `[assembler]-[binner]-[sample/group].ORF2LCA.txt.gz`: Tab-delimited files containing the lineage of each contig - - `[assembler]-[binner]-[sample/group].bin2classification.txt.gz`: Taxonomy classification of the genome bins - - `[assembler]-[binner]-[sample/group].log`: Log files + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].concatenated.predicted_proteins.faa.gz`: Predicted protein sequences for each genome bin, in fasta format + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].concatenated.predicted_proteins.gff.gz`: Predicted protein features for each genome bin, in gff format + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].ORF2LCA.txt.gz`: Tab-delimited files containing the lineage of each contig + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].bin2classification.txt.gz`: Taxonomy classification of the genome bins + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].log`: Log files @@ -609,14 +610,14 @@ If the parameters `--cat_db_generate` and `--save_cat_db` are set, additionally Output files - `Taxonomy/GTDB-Tk/[assembler]/[binner]/[sample/group]/` - - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.summary.tsv`: Classifications for bacterial and archaeal genomes (see the [GTDB-Tk documentation for details](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html). - - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.classify.tree.gz`: Reference tree in Newick format containing query genomes placed with pplacer. - - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.markers_summary.tsv`: A summary of unique, duplicated, and missing markers within the 120 bacterial marker set, or the 122 archaeal marker set for each submitted genome. - - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.msa.fasta.gz`: FASTA file containing MSA of submitted and reference genomes. - - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.filtered.tsv`: A list of genomes with an insufficient number of amino acids in MSA. - - `gtdbtk.[assembler]-[binner]-[sample/group].*.log`: Log files. - - `gtdbtk.[assembler]-[binner]-[sample/group].failed_genomes.tsv`: A list of genomes for which the GTDB-Tk analysis failed, e.g. because Prodigal could not detect any genes. -- `Taxonomy/GTDB-Tk/gtdbtk_summary.tsv`: A summary table of the GTDB-Tk classification results for all bins, also containing bins which were discarded based on the BUSCO QC, which were filtered out by GTDB-Tk ((listed in `*.filtered.tsv`) or for which the analysis failed (listed in `*.failed_genomes.tsv`). + - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.summary.tsv`: Classifications for bacterial and archaeal genomes (see the [GTDB-Tk documentation for details](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html)). + - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.classify.tree.gz`: Reference tree in Newick format containing query genomes placed with pplacer. + - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.markers_summary.tsv`: A summary of unique, duplicated, and missing markers within the 120 bacterial marker set, or the 122 archaeal marker set for each submitted genome. + - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.msa.fasta.gz`: FASTA file containing MSA of submitted and reference genomes. + - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.filtered.tsv`: A list of genomes with an insufficient number of amino acids in MSA. + - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].*.log`: Log files. + - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].failed_genomes.tsv`: A list of genomes for which the GTDB-Tk analysis failed, e.g. because Prodigal could not detect any genes. +- `Taxonomy/GTDB-Tk/gtdbtk_summary.tsv`: A summary table of the GTDB-Tk classification results for all bins, also containing bins which were discarded based on the BUSCO QC, which were filtered out by GTDB-Tk (listed in `*.filtered.tsv`) or for which the analysis failed (listed in `*.failed_genomes.tsv`). From 5a359f606b46b9a169550ce397dda671a08e7c1a Mon Sep 17 00:00:00 2001 From: maxibor Date: Thu, 7 Sep 2023 16:05:20 +0200 Subject: [PATCH 09/34] update GTDB-Tk module to 2.3.2 --- modules.json | 2 +- modules/nf-core/gtdbtk/classifywf/main.nf | 58 ++++++++++------------ modules/nf-core/gtdbtk/classifywf/meta.yml | 4 ++ nextflow.config | 1 + nextflow_schema.json | 4 ++ subworkflows/local/gtdbtk.nf | 4 +- workflows/mag.nf | 3 +- 7 files changed, 41 insertions(+), 35 deletions(-) diff --git a/modules.json b/modules.json index 5567070f..7251d102 100644 --- a/modules.json +++ b/modules.json @@ -120,7 +120,7 @@ }, "gtdbtk/classifywf": { "branch": "master", - "git_sha": "c67eaf89682a12966f60008a8fa30f5dd29239df", + "git_sha": "898259a38563f29c3c5d2490876019ec2d6f49c5", "installed_by": ["modules"] }, "gunc/downloaddb": { diff --git a/modules/nf-core/gtdbtk/classifywf/main.nf b/modules/nf-core/gtdbtk/classifywf/main.nf index 1e0c99db..00da4459 100644 --- a/modules/nf-core/gtdbtk/classifywf/main.nf +++ b/modules/nf-core/gtdbtk/classifywf/main.nf @@ -1,28 +1,29 @@ process GTDBTK_CLASSIFYWF { - tag "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" + tag "${prefix}" label 'process_medium' // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. - conda "bioconda::gtdbtk=2.1.1" + conda "bioconda::gtdbtk=2.3.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/gtdbtk:2.1.1--pyhdfd78af_1' : - 'biocontainers/gtdbtk:2.1.1--pyhdfd78af_1' }" + 'https://depot.galaxyproject.org/singularity/gtdbtk:2.3.2--pyhdfd78af_0' : + 'biocontainers/gtdbtk:2.3.2--pyhdfd78af_0' }" input: tuple val(meta), path("bins/*") tuple val(db_name), path("database/*") + path(mash_db) output: - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.summary.tsv" , emit: summary - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.classify.tree.gz" , emit: tree - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.markers_summary.tsv", emit: markers - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.msa.fasta.gz" , emit: msa - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.user_msa.fasta.gz" , emit: user_msa - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.*.filtered.tsv" , emit: filtered - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.log" , emit: log - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.warnings.log" , emit: warnings - path "gtdbtk.${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}.failed_genomes.tsv" , emit: failed - path "versions.yml" , emit: versions + tuple val(meta), path("gtdbtk.${prefix}.*.summary.tsv") , emit: summary + tuple val(meta), path("gtdbtk.${prefix}.*.classify.tree.gz") , emit: tree, optional: true + tuple val(meta), path("gtdbtk.${prefix}.*.markers_summary.tsv") , emit: markers, optional: true + tuple val(meta), path("gtdbtk.${prefix}.*.msa.fasta.gz") , emit: msa, optional: true + tuple val(meta), path("gtdbtk.${prefix}.*.user_msa.fasta.gz") , emit: user_msa, optional: true + tuple val(meta), path("gtdbtk.${prefix}.*.filtered.tsv") , emit: filtered, optional: true + tuple val(meta), path("gtdbtk.${prefix}.failed_genomes.tsv") , emit: failed, optional: true + tuple val(meta), path("gtdbtk.${prefix}.log") , emit: log + tuple val(meta), path("gtdbtk.${prefix}.warnings.log") , emit: warnings + path("versions.yml") , emit: versions when: task.ext.when == null || task.ext.when @@ -30,7 +31,8 @@ process GTDBTK_CLASSIFYWF { script: def args = task.ext.args ?: '' def pplacer_scratch = params.gtdbtk_pplacer_scratch ? "--scratch_dir pplacer_tmp" : "" - def prefix = task.ext.prefix ?: "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" + def mash_mode = mash_db ? "--mash_db ${mash_db}" : "--skip_ani_screen" + prefix = task.ext.prefix ?: "${meta.id}" """ export GTDBTK_DATA_PATH="\${PWD}/database" @@ -44,31 +46,23 @@ process GTDBTK_CLASSIFYWF { --prefix "gtdbtk.${prefix}" \\ --out_dir "\${PWD}" \\ --cpus $task.cpus \\ - --pplacer_cpus $params.gtdbtk_pplacer_cpus \\ + $mash_mode \\ $pplacer_scratch \\ --min_perc_aa $params.gtdbtk_min_perc_aa \\ --min_af $params.gtdbtk_min_af - mv classify/gtdbtk.${prefix}.*.classify.tree \\ - classify/gtdbtk.${prefix}.*.summary.tsv \\ - . - - mv identify/gtdbtk.${prefix}.*.markers_summary.tsv \\ - identify/gtdbtk.${prefix}.failed_genomes.tsv \\ - . + mv classify/* . - mv align/gtdbtk.${prefix}.*.msa.fasta.gz \\ - align/gtdbtk.${prefix}.*.user_msa.fasta.gz \\ - align/gtdbtk.${prefix}.*.filtered.tsv \\ - . - - gzip gtdbtk.${prefix}.*.classify.tree + mv identify/* . - + mv align/* .\ mv gtdbtk.log "gtdbtk.${prefix}.log" + mv gtdbtk.warnings.log "gtdbtk.${prefix}.warnings.log" + find -name gtdbtk.${prefix}.*.classify.tree | xargs -r gzip # do not fail if .tree is missing + cat <<-END_VERSIONS > versions.yml "${task.process}": gtdbtk: \$(echo \$(gtdbtk --version -v 2>&1) | sed "s/gtdbtk: version //; s/ Copyright.*//") @@ -76,8 +70,8 @@ process GTDBTK_CLASSIFYWF { """ stub: - def VERSION = '2.1.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. - + def VERSION = '2.3.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + prefix = task.ext.prefix ?: "${meta.id}" """ touch gtdbtk.${prefix}.stub.summary.tsv touch gtdbtk.${prefix}.stub.classify.tree.gz diff --git a/modules/nf-core/gtdbtk/classifywf/meta.yml b/modules/nf-core/gtdbtk/classifywf/meta.yml index 4e7ec5f1..4319bc74 100644 --- a/modules/nf-core/gtdbtk/classifywf/meta.yml +++ b/modules/nf-core/gtdbtk/classifywf/meta.yml @@ -31,6 +31,10 @@ input: type: file description: The local copy of the taxonomic database used by GTDB-tk (unzipped copy) pattern: "*" + - mash_db: + type: file + description: The local copy of the Mash sketch database used by GTDB-tk if `ani_screen` mode is used (optional) + pattern: "*.msh" output: - meta: diff --git a/nextflow.config b/nextflow.config index 336d378f..0e4ae4ee 100644 --- a/nextflow.config +++ b/nextflow.config @@ -87,6 +87,7 @@ params { save_cat_db = false skip_gtdbtk = false gtdb_db = "https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/auxillary_files/gtdbtk_r214_data.tar.gz" + gtdb_mash = null gtdbtk_min_completeness = 50.0 gtdbtk_max_contamination = 10.0 gtdbtk_min_perc_aa = 10 diff --git a/nextflow_schema.json b/nextflow_schema.json index 4de8c31e..3dd79f57 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -520,6 +520,10 @@ "description": "Specify the location of a GTDBTK database. Can be either an uncompressed directory or a `.tar.gz` archive. If not specified will be downloaded for you when GTDBTK or binning QC is not skipped.", "default": "https://data.ace.uq.edu.au/public/gtdb/data/releases/release214/214.1/auxillary_files/gtdbtk_r214_data.tar.gz" }, + "gtdb_mash": { + "type": "string", + "description": "Specify the location of a GTDBTK mash database. If missing, GTDB-Tk will skip the ani_screening step" + }, "gtdbtk_min_completeness": { "type": "number", "default": 50, diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf index 3bb532f6..3c901bf8 100644 --- a/subworkflows/local/gtdbtk.nf +++ b/subworkflows/local/gtdbtk.nf @@ -12,6 +12,7 @@ workflow GTDBTK { busco_summary // channel: path checkm_summary // channel: path gtdb // channel: path + gtdb_mash // channel: path main: // Filter bins: classify only medium & high quality MAGs @@ -76,7 +77,8 @@ workflow GTDBTK { GTDBTK_CLASSIFYWF ( ch_filtered_bins.passed.groupTuple(), - ch_db_for_gtdbtk + ch_db_for_gtdbtk, + gtdb_mash ) GTDBTK_SUMMARY ( diff --git a/workflows/mag.nf b/workflows/mag.nf index a86f741f..7fe21280 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -209,6 +209,7 @@ gtdb = ( params.skip_binqc || params.skip_gtdbtk ) ? false : params.gtdb_db if (gtdb) { gtdb = file( "${gtdb}", checkIfExists: true) + gtdb_mash = params.gtdb_mash ? file("${params.gtdb_mash}", checkIfExists: true) : [] } else { gtdb = [] } @@ -770,7 +771,7 @@ workflow MAG { BINNING_REFINEMENT ( ch_contigs_for_binrefinement, ch_prokarya_bins_dastool ) ch_refined_bins = ch_eukarya_bins_dastool - .map{ meta, bins -> + .map{ meta, bins -> def meta_new = meta + [refinement: 'eukaryote_unrefined'] [meta_new, bins] }.mix( BINNING_REFINEMENT.out.refined_bins) From 199ca1ccfc23eb5b5ef250eb6b13ce7c106ba0cb Mon Sep 17 00:00:00 2001 From: maxibor Date: Thu, 7 Sep 2023 16:15:29 +0200 Subject: [PATCH 10/34] update GTDB-TK prefix --- conf/modules.config | 1 + 1 file changed, 1 insertion(+) diff --git a/conf/modules.config b/conf/modules.config index 0b1ff8ad..2dfc4410 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -460,6 +460,7 @@ process { withName: GTDBTK_CLASSIFYWF { ext.args = "--extension fa" + ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" } publishDir = [ path: { "${params.outdir}/Taxonomy/GTDB-Tk/${meta.assembler}/${meta.binner}/${meta.id}" }, mode: params.publish_dir_mode, From e9428d0877705d55d76894c29623cc0275a416a6 Mon Sep 17 00:00:00 2001 From: maxibor Date: Mon, 11 Sep 2023 16:07:57 +0200 Subject: [PATCH 11/34] tmp: add dump for depth --- subworkflows/local/depths.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/subworkflows/local/depths.nf b/subworkflows/local/depths.nf index 004cf10d..7b8f20c8 100644 --- a/subworkflows/local/depths.nf +++ b/subworkflows/local/depths.nf @@ -39,6 +39,7 @@ workflow DEPTHS { } .transpose() .groupTuple(by: [0,2]) + .dump(tag: 'ch_depth_input', pretty: true) MAG_DEPTHS ( ch_depth_input ) From 1da34ba440892769ecbf0ce714977ae79dae8c9f Mon Sep 17 00:00:00 2001 From: maxibor Date: Wed, 20 Sep 2023 11:52:13 +0200 Subject: [PATCH 12/34] update CAT --- modules/local/cat.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/local/cat.nf b/modules/local/cat.nf index be4375f3..bf1f4368 100644 --- a/modules/local/cat.nf +++ b/modules/local/cat.nf @@ -1,10 +1,10 @@ process CAT { tag "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}-${db_name}" - conda "bioconda::cat=4.6 bioconda::diamond=2.0.6" + conda "bioconda::cat=5.2.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-75e2a26f10cbf3629edf2d1600db3fed5ebe6e04:eae321284604f7dabbdf121e3070bda907b91266-0' : - 'quay.io/biocontainers/mulled-v2-75e2a26f10cbf3629edf2d1600db3fed5ebe6e04:eae321284604f7dabbdf121e3070bda907b91266-0' }" + 'https://depot.galaxyproject.org/singularity/cat:5.2.3--hdfd78af_1' : + 'quay.io/biocontainers/cat:5.2.3--hdfd78af_1' }" input: tuple val(meta), path("bins/*") From 01b2a03623a46c75a311e9cb364800174ce80b4d Mon Sep 17 00:00:00 2001 From: maxibor Date: Wed, 20 Sep 2023 13:08:40 +0200 Subject: [PATCH 13/34] dump depth --- subworkflows/local/depths.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/depths.nf b/subworkflows/local/depths.nf index 7b8f20c8..3c7a1709 100644 --- a/subworkflows/local/depths.nf +++ b/subworkflows/local/depths.nf @@ -39,7 +39,8 @@ workflow DEPTHS { } .transpose() .groupTuple(by: [0,2]) - .dump(tag: 'ch_depth_input', pretty: true) + + ch_depth_input.dump(tag: 'ch_depth_input', pretty: true) MAG_DEPTHS ( ch_depth_input ) From 78b090ff0ef1c6258de7b8344891cfaaf616fc4b Mon Sep 17 00:00:00 2001 From: maxibor Date: Wed, 20 Sep 2023 14:11:41 +0200 Subject: [PATCH 14/34] remove duplicate bins --- subworkflows/local/depths.nf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/subworkflows/local/depths.nf b/subworkflows/local/depths.nf index 3c7a1709..e46c4a3c 100644 --- a/subworkflows/local/depths.nf +++ b/subworkflows/local/depths.nf @@ -39,6 +39,9 @@ workflow DEPTHS { } .transpose() .groupTuple(by: [0,2]) + .map { meta, bins -> + [meta, bins.unique()] + } ch_depth_input.dump(tag: 'ch_depth_input', pretty: true) From a501fe3a1d5f4b025b068d849b8516c060fc54d3 Mon Sep 17 00:00:00 2001 From: maxibor Date: Wed, 20 Sep 2023 15:00:28 +0200 Subject: [PATCH 15/34] fix: add depth --- subworkflows/local/depths.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/depths.nf b/subworkflows/local/depths.nf index e46c4a3c..fce400c3 100644 --- a/subworkflows/local/depths.nf +++ b/subworkflows/local/depths.nf @@ -39,8 +39,8 @@ workflow DEPTHS { } .transpose() .groupTuple(by: [0,2]) - .map { meta, bins -> - [meta, bins.unique()] + .map { meta, bins, depth -> + [meta, bins.unique(), depth] } ch_depth_input.dump(tag: 'ch_depth_input', pretty: true) From 3153e402419d47dbe23170762048520706a236e4 Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 22 Sep 2023 10:47:06 +0200 Subject: [PATCH 16/34] unique busco bins --- subworkflows/local/busco_qc.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/busco_qc.nf b/subworkflows/local/busco_qc.nf index 6165be47..b55855e7 100644 --- a/subworkflows/local/busco_qc.nf +++ b/subworkflows/local/busco_qc.nf @@ -32,9 +32,9 @@ workflow BUSCO_QC { } BUSCO_SUMMARY ( - BUSCO.out.summary_domain.map{it[1]}.collect().ifEmpty([]), - BUSCO.out.summary_specific.map{it[1]}.collect().ifEmpty([]), - BUSCO.out.failed_bin.map{it[1]}.collect().ifEmpty([]) + BUSCO.out.summary_domain.map{it[1]}.unique().collect().ifEmpty([]), + BUSCO.out.summary_specific.map{it[1]}.unique().collect().ifEmpty([]), + BUSCO.out.failed_bin.map{it[1]}.unique().collect().ifEmpty([]) ) emit: From 5da6c7da7ef9f1795b002172903eb01fb0be76eb Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 22 Sep 2023 11:00:43 +0200 Subject: [PATCH 17/34] log: add busco dump --- subworkflows/local/busco_qc.nf | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/subworkflows/local/busco_qc.nf b/subworkflows/local/busco_qc.nf index b55855e7..2dd05bd8 100644 --- a/subworkflows/local/busco_qc.nf +++ b/subworkflows/local/busco_qc.nf @@ -31,6 +31,14 @@ workflow BUSCO_QC { BUSCO_SAVE_DOWNLOAD ( ch_downloads ) } + busco_summary_domain = BUSCO.out.summary_domain.collect() + busco_summary_specific = BUSCO.out.summary_specific.collect() + busco_failed_bin = BUSCO.out.failed_bin.collect() + + busco_summary_domain.dump(tag: 'busco_summary_domain', pretty: true) + busco_summary_specific.dump(tag: 'busco_summary_specific', pretty: true) + busco_failed_bin.dump(tag: 'busco_failed_bin', pretty: true) + BUSCO_SUMMARY ( BUSCO.out.summary_domain.map{it[1]}.unique().collect().ifEmpty([]), BUSCO.out.summary_specific.map{it[1]}.unique().collect().ifEmpty([]), From 611a53e4e222326a70d0496b5e0ec4a9d140a6cb Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 22 Sep 2023 11:19:59 +0200 Subject: [PATCH 18/34] fix: add missing config entries for concoct --- conf/modules.config | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 2dfc4410..473ab285 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -672,6 +672,10 @@ process { ext.prefix = { "${meta.assembler}-MaxBin2-${meta.id}" } } + withName: DASTOOL_FASTATOCONTIG2BIN_CONCOCT { + ext.prefix = { "${meta.assembler}-CONCOCT-${meta.id}" } + } + withName: DASTOOL_FASTATOCONTIG2BIN_TIARA { ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.id}" } } @@ -699,7 +703,7 @@ process { path: { "${params.outdir}/GenomeBinning/DASTool/bins" }, mode: params.publish_dir_mode, // pattern needs to be updated in case of new binning methods - pattern: '*-{MetaBAT2,MaxBin2}Refined-*.fa' + pattern: '*-{MetaBAT2,MaxBin2,CONCOCT}Refined-*.fa' ] ] } From e4ea75674b7eba45912b0231fee874d8930d4b4b Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 22 Sep 2023 13:30:43 +0200 Subject: [PATCH 19/34] eukaryote bins are not sent to refined channel --- subworkflows/local/busco_qc.nf | 6 +++--- workflows/mag.nf | 11 ++++++----- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/subworkflows/local/busco_qc.nf b/subworkflows/local/busco_qc.nf index 2dd05bd8..6c2132ce 100644 --- a/subworkflows/local/busco_qc.nf +++ b/subworkflows/local/busco_qc.nf @@ -40,9 +40,9 @@ workflow BUSCO_QC { busco_failed_bin.dump(tag: 'busco_failed_bin', pretty: true) BUSCO_SUMMARY ( - BUSCO.out.summary_domain.map{it[1]}.unique().collect().ifEmpty([]), - BUSCO.out.summary_specific.map{it[1]}.unique().collect().ifEmpty([]), - BUSCO.out.failed_bin.map{it[1]}.unique().collect().ifEmpty([]) + BUSCO.out.summary_domain.map{it[1]}.collect().ifEmpty([]), + BUSCO.out.summary_specific.map{it[1]}.collect().ifEmpty([]), + BUSCO.out.failed_bin.map{it[1]}.collect().ifEmpty([]) ) emit: diff --git a/workflows/mag.nf b/workflows/mag.nf index 34ae2932..0a7542a6 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -770,12 +770,13 @@ workflow MAG { } BINNING_REFINEMENT ( ch_contigs_for_binrefinement, ch_prokarya_bins_dastool ) - ch_refined_bins = ch_eukarya_bins_dastool - .map{ meta, bins -> - def meta_new = meta + [refinement: 'eukaryote_unrefined'] - [meta_new, bins] - }.mix( BINNING_REFINEMENT.out.refined_bins) + // ch_refined_bins = ch_eukarya_bins_dastool + // .map{ meta, bins -> + // def meta_new = meta + [refinement: 'eukaryote_unrefined'] + // [meta_new, bins] + // }.mix( BINNING_REFINEMENT.out.refined_bins) + ch_refined_bins = BINNING_REFINEMENT.out.refined_bins ch_refined_unbins = BINNING_REFINEMENT.out.refined_unbins ch_versions = ch_versions.mix(BINNING_REFINEMENT.out.versions) From f9b4a36cdfcd9b1103616245debcaef9443acdf2 Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 22 Sep 2023 14:25:00 +0200 Subject: [PATCH 20/34] debug: add dump for bin metrics --- subworkflows/local/gtdbtk.nf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf index 3c901bf8..d96334ae 100644 --- a/subworkflows/local/gtdbtk.nf +++ b/subworkflows/local/gtdbtk.nf @@ -47,6 +47,9 @@ workflow GTDBTK { } } + ch_bin_metrics.dump(tag: 'ch_bin_metrics', pretty: true) + ch_filtered_bins.transpose().map { meta, bin -> [bin.getName(), bin, meta]}.dump(tag: 'ch_filtered_bins', pretty: true) + // Filter bins based on collected metrics: completeness, contamination ch_filtered_bins = bins .transpose() From 0a16911de5165af12bc1a172142747baa3f543b6 Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 22 Sep 2023 14:27:16 +0200 Subject: [PATCH 21/34] debug: ch_filtered_bins -> bins --- subworkflows/local/gtdbtk.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf index d96334ae..8ca64627 100644 --- a/subworkflows/local/gtdbtk.nf +++ b/subworkflows/local/gtdbtk.nf @@ -48,7 +48,7 @@ workflow GTDBTK { } ch_bin_metrics.dump(tag: 'ch_bin_metrics', pretty: true) - ch_filtered_bins.transpose().map { meta, bin -> [bin.getName(), bin, meta]}.dump(tag: 'ch_filtered_bins', pretty: true) + bins.transpose().map { meta, bin -> [bin.getName(), bin, meta]}.dump(tag: 'ch_filtered_bins', pretty: true) // Filter bins based on collected metrics: completeness, contamination ch_filtered_bins = bins From 7c9479561dc62a729905dbcd6d67316224bc4d82 Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 22 Sep 2023 15:54:17 +0200 Subject: [PATCH 22/34] debug: add groupTuple --- subworkflows/local/depths.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/subworkflows/local/depths.nf b/subworkflows/local/depths.nf index fce400c3..1ac80a4e 100644 --- a/subworkflows/local/depths.nf +++ b/subworkflows/local/depths.nf @@ -42,6 +42,7 @@ workflow DEPTHS { .map { meta, bins, depth -> [meta, bins.unique(), depth] } + .groupTuple(by: [0,2]) ch_depth_input.dump(tag: 'ch_depth_input', pretty: true) From 73a14b50c76fddca9b04b2b251f68ce81b84f5a1 Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 22 Sep 2023 16:20:25 +0200 Subject: [PATCH 23/34] debug: flatten bins --- subworkflows/local/depths.nf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/subworkflows/local/depths.nf b/subworkflows/local/depths.nf index 1ac80a4e..dc7352d3 100644 --- a/subworkflows/local/depths.nf +++ b/subworkflows/local/depths.nf @@ -43,6 +43,10 @@ workflow DEPTHS { [meta, bins.unique(), depth] } .groupTuple(by: [0,2]) + .map { + meta, bins, depth -> + [meta, bins.flatten(), depth] + } ch_depth_input.dump(tag: 'ch_depth_input', pretty: true) From 07c6dbda2ae29d16db05b2cb3f7d81d7052c60cf Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 22 Sep 2023 17:10:01 +0200 Subject: [PATCH 24/34] debug: add bin_unbins dump --- subworkflows/local/depths.nf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/depths.nf b/subworkflows/local/depths.nf index dc7352d3..cc3d2ef7 100644 --- a/subworkflows/local/depths.nf +++ b/subworkflows/local/depths.nf @@ -24,6 +24,8 @@ workflow DEPTHS { ch_versions = Channel.empty() + bins_unbins.dump(tag: 'bins_unbins', pretty: true) + // Compute bin depths for different samples (according to `binning_map_mode`) // Create a new meta joining key first, but copy meta so that // we retain the information about binners and domain classification @@ -45,7 +47,7 @@ workflow DEPTHS { .groupTuple(by: [0,2]) .map { meta, bins, depth -> - [meta, bins.flatten(), depth] + [meta, bins.flatten(),] } ch_depth_input.dump(tag: 'ch_depth_input', pretty: true) From 548a03a89e7ef358397914debbeb101f9bad4e11 Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 22 Sep 2023 17:34:54 +0200 Subject: [PATCH 25/34] debug: new combine for depths to avoid duplicated entries --- subworkflows/local/depths.nf | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/subworkflows/local/depths.nf b/subworkflows/local/depths.nf index cc3d2ef7..6055ad88 100644 --- a/subworkflows/local/depths.nf +++ b/subworkflows/local/depths.nf @@ -27,27 +27,26 @@ workflow DEPTHS { bins_unbins.dump(tag: 'bins_unbins', pretty: true) // Compute bin depths for different samples (according to `binning_map_mode`) - // Create a new meta joining key first, but copy meta so that + // Create a new meta combine key first, but copy meta so that // we retain the information about binners and domain classification ch_depth_input = bins_unbins - .map { meta, bins -> - def meta_join = meta - meta.subMap('binner','domain','refinement') - [ meta_join, meta, bins ] - } - .combine( depths, by: 0 ) - .map { meta_join, meta, bins, contig_depths_file -> - def meta_new = meta - meta.subMap('domain','refinement') - [ meta_new, bins, contig_depths_file ] + .map { + meta, bins -> + def meta_combine = meta - meta.subMap('binner','domain','refinement') + [meta_combine, meta, bins] } + .groupTuple() + .combine(depths, by: 0) .transpose() - .groupTuple(by: [0,2]) - .map { meta, bins, depth -> - [meta, bins.unique(), depth] + .map { + meta_combine, meta, bins, depth -> + def meta_new = meta - meta.subMap('domain','refinement') + [meta_new, bins, depth] } .groupTuple(by: [0,2]) .map { meta, bins, depth -> - [meta, bins.flatten(),] + [meta, bins.unique(), depth] } ch_depth_input.dump(tag: 'ch_depth_input', pretty: true) From 12a24201969283d87afe279b838545e1120602f1 Mon Sep 17 00:00:00 2001 From: maxibor Date: Fri, 22 Sep 2023 17:48:12 +0200 Subject: [PATCH 26/34] debug: add depths dump --- subworkflows/local/depths.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/subworkflows/local/depths.nf b/subworkflows/local/depths.nf index 6055ad88..f399307b 100644 --- a/subworkflows/local/depths.nf +++ b/subworkflows/local/depths.nf @@ -25,6 +25,7 @@ workflow DEPTHS { bins_unbins.dump(tag: 'bins_unbins', pretty: true) + depths.dump(tag: 'depths', pretty: true) // Compute bin depths for different samples (according to `binning_map_mode`) // Create a new meta combine key first, but copy meta so that From 1d846926ec49e30e13f21a70c75fb373eb628539 Mon Sep 17 00:00:00 2001 From: Maxime Borry Date: Tue, 10 Oct 2023 15:00:15 +0000 Subject: [PATCH 27/34] pin nf-validation --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 0e4ae4ee..d701a58f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -324,7 +324,7 @@ singularity.registry = 'quay.io' // Nextflow plugins plugins { - id 'nf-validation' // Validation of pipeline parameters and creation of an input channel from a sample sheet + id 'nf-validation@0.3.3' // Validation of pipeline parameters and creation of an input channel from a sample sheet } // Load igenomes.config if required From db0c32e100a5f64067c95917b4d994c7b65cfc36 Mon Sep 17 00:00:00 2001 From: maxibor Date: Mon, 16 Oct 2023 15:24:26 +0200 Subject: [PATCH 28/34] dev: both is an accepted value --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 862cd008..6dad7784 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -812,7 +812,7 @@ "default": "raw_bins_only", "description": "Specify which binning output is sent for downstream annotation, taxonomic classification, bin quality control etc.", "help_text": "`raw_bins_only`: only bins (and unbinned contigs) from the binners.\n`refined_bins_only`: only bins (and unbinned contigs) from the bin refinement step .\n\n ~~`both`: bins and unbinned contigs from both the binning and bin refinement steps.~~ `both` option is disabled in v2.4 due a bug that will be fixed in a later release.", - "enum": ["raw_bins_only", "refined_bins_only"] + "enum": ["raw_bins_only", "refined_bins_only","both"] }, "run_gunc": { "type": "boolean", From 425079ae240d628d53348f298c0d7521487db95b Mon Sep 17 00:00:00 2001 From: maxibor Date: Mon, 16 Oct 2023 15:27:20 +0200 Subject: [PATCH 29/34] dev: reactivate bin mixing --- workflows/mag.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflows/mag.nf b/workflows/mag.nf index a8a1ea95..d5aff32a 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -834,10 +834,10 @@ workflow MAG { ch_input_for_postbinning_bins_unbins = ch_refined_bins.mix(ch_refined_unbins) // TODO REACTIVATE ONCE PR #489 IS READY! // TODO RE-ADD BOTH TO SCHEMA ONCE RE-ADDING - // } else if ( params.postbinning_input == 'both' ) { - // ch_all_bins = ch_binning_results_bins.mix(ch_refined_bins) - // ch_input_for_postbinning_bins = ch_all_bins - // ch_input_for_postbinning_bins_unbins = ch_all_bins.mix(ch_binning_results_unbins).mix(ch_refined_unbins) + } else if ( params.postbinning_input == 'both' ) { + ch_all_bins = ch_binning_results_bins.mix(ch_refined_bins) + ch_input_for_postbinning_bins = ch_all_bins + ch_input_for_postbinning_bins_unbins = ch_all_bins.mix(ch_binning_results_unbins).mix(ch_refined_unbins) } } else { ch_input_for_postbinning_bins = ch_binning_results_bins From 1cfc3c539f3cfd7cf85db99721cce0fdc2ffaa53 Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Mon, 30 Oct 2023 17:05:12 +0000 Subject: [PATCH 30/34] [automated] Fix linting with Prettier --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 6dad7784..5488af90 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -812,7 +812,7 @@ "default": "raw_bins_only", "description": "Specify which binning output is sent for downstream annotation, taxonomic classification, bin quality control etc.", "help_text": "`raw_bins_only`: only bins (and unbinned contigs) from the binners.\n`refined_bins_only`: only bins (and unbinned contigs) from the bin refinement step .\n\n ~~`both`: bins and unbinned contigs from both the binning and bin refinement steps.~~ `both` option is disabled in v2.4 due a bug that will be fixed in a later release.", - "enum": ["raw_bins_only", "refined_bins_only","both"] + "enum": ["raw_bins_only", "refined_bins_only", "both"] }, "run_gunc": { "type": "boolean", From 9856ffee3281f5e791b9701726bb06e2ac8d7835 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 1 Nov 2023 13:17:46 +0100 Subject: [PATCH 31/34] Update CHANGELOG.md --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a587782b..e81fae56 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Fixed` +- [#489] Fix file name collision clashes for CHECKM, CAT, GTDBTK, and QUAST (by @maxibor) + ### `Dependencies` ### `Deprecated` From 1ae197359b45963c0e094681dc2c3e352f002b50 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 1 Nov 2023 13:18:08 +0100 Subject: [PATCH 32/34] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e81fae56..bdea0e5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Fixed` -- [#489] Fix file name collision clashes for CHECKM, CAT, GTDBTK, and QUAST (by @maxibor) +- [#489](https://github.com/nf-core/mag/pull/489) Fix file name collision clashes for CHECKM, CAT, GTDBTK, and QUAST (by @maxibor) ### `Dependencies` From 86681f72d570343278c929645f30ac8dee98bc29 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 1 Nov 2023 13:18:30 +0100 Subject: [PATCH 33/34] Update subworkflows/local/depths.nf --- subworkflows/local/depths.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/depths.nf b/subworkflows/local/depths.nf index 53511b23..3ae92162 100644 --- a/subworkflows/local/depths.nf +++ b/subworkflows/local/depths.nf @@ -43,7 +43,7 @@ workflow DEPTHS { .groupTuple(by: [0,2]) .map { meta, bins, depth -> - [meta, bins.unique(), depth] + [meta, bins.unique().flatten(), depth] } ch_depth_input.dump(tag: 'ch_depth_input', pretty: true) From 1c41f35441b307bfb07a427b62956ae5835ed12f Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Thu, 2 Nov 2023 14:24:58 +0100 Subject: [PATCH 34/34] Apply suggestions from code review --- subworkflows/local/busco_qc.nf | 4 ---- subworkflows/local/depths.nf | 2 -- subworkflows/local/gtdbtk.nf | 2 -- 3 files changed, 8 deletions(-) diff --git a/subworkflows/local/busco_qc.nf b/subworkflows/local/busco_qc.nf index 9d7d2210..a5c3be8d 100644 --- a/subworkflows/local/busco_qc.nf +++ b/subworkflows/local/busco_qc.nf @@ -69,10 +69,6 @@ workflow BUSCO_QC { busco_summary_specific = BUSCO.out.summary_specific.collect() busco_failed_bin = BUSCO.out.failed_bin.collect() - busco_summary_domain.dump(tag: 'busco_summary_domain', pretty: true) - busco_summary_specific.dump(tag: 'busco_summary_specific', pretty: true) - busco_failed_bin.dump(tag: 'busco_failed_bin', pretty: true) - BUSCO_SUMMARY ( BUSCO.out.summary_domain.map{it[1]}.collect().ifEmpty([]), BUSCO.out.summary_specific.map{it[1]}.collect().ifEmpty([]), diff --git a/subworkflows/local/depths.nf b/subworkflows/local/depths.nf index 3ae92162..012899ad 100644 --- a/subworkflows/local/depths.nf +++ b/subworkflows/local/depths.nf @@ -20,7 +20,6 @@ workflow DEPTHS { ch_versions = Channel.empty() - bins_unbins.dump(tag: 'bins_unbins', pretty: true) depths.dump(tag: 'depths', pretty: true) // Compute bin depths for different samples (according to `binning_map_mode`) @@ -46,7 +45,6 @@ workflow DEPTHS { [meta, bins.unique().flatten(), depth] } - ch_depth_input.dump(tag: 'ch_depth_input', pretty: true) MAG_DEPTHS ( ch_depth_input ) diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf index ba58dfea..2f110a43 100644 --- a/subworkflows/local/gtdbtk.nf +++ b/subworkflows/local/gtdbtk.nf @@ -47,8 +47,6 @@ workflow GTDBTK { } } - ch_bin_metrics.dump(tag: 'ch_bin_metrics', pretty: true) - bins.transpose().map { meta, bin -> [bin.getName(), bin, meta]}.dump(tag: 'ch_filtered_bins', pretty: true) // Filter bins based on collected metrics: completeness, contamination ch_filtered_bins = bins