Merge pull request #489 from maxibor/custom

Fix for `NFCORE_MAG:MAG:CAT_SUMMARY input file name collision`
nf-core · Nov 3, 2023 · 9917795 · 9917795
2 parents fae10a1 + 1c41f35
commit 9917795
Show file tree

Hide file tree

Showing 15 changed files with 181 additions and 104 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Fixed`
 
+- [#489](https://github.com/nf-core/mag/pull/489) Fix file name collisions for CHECKM, CAT, GTDBTK, and QUAST (by @maxibor)
+
 ### `Dependencies`
 
 ### `Deprecated`

diff --git a/conf/modules.config b/conf/modules.config
@@ -377,8 +377,8 @@ process {
     }
 
     withName: 'CHECKM_LINEAGEWF' {
-        tag = { "${meta.assembler}-${meta.binner}-${meta.id}" }
-        ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.id}_wf" }
+        tag = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" }
+        ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}_wf" }
         publishDir = [
             path: { "${params.outdir}/GenomeBinning/QC/CheckM" },
             mode: params.publish_dir_mode,
@@ -387,7 +387,7 @@ process {
     }
 
     withName: 'CHECKM_QA' {
-        ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.id}_qa" }
+        ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}_qa" }
         ext.args = "-o 2 --tab_table"
         publishDir = [
             path: { "${params.outdir}/GenomeBinning/QC/CheckM" },
@@ -458,6 +458,7 @@ process {
 
     withName: GTDBTK_CLASSIFYWF {
         ext.args   = "--extension fa"
+        ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" }
         publishDir = [
             path: { "${params.outdir}/Taxonomy/GTDB-Tk/${meta.assembler}/${meta.binner}/${meta.id}" },
             mode: params.publish_dir_mode,

diff --git a/docs/output.md b/docs/output.md
@@ -476,6 +476,7 @@ For each bin or refined bin the median sequencing depth is computed based on the
   - `predicted_genes/[assembler]-[bin].rna.gff`: Contig positions for rRNA genes in gff version 3 format
   - `predicted_genes/barrnap.log`: Barrnap log file (ribosomal RNA predictor)
 - `GenomeBinning/QC/`
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]-quast_summary.tsv`: QUAST output summarized per sample/condition.
   - `quast_summary.tsv`: QUAST output for all bins summarized
 
 </details>
@@ -531,9 +532,9 @@ By default, nf-core/mag runs CheckM with the `check_lineage` workflow that place
 <summary>Output files</summary>
 
 - `GenomeBinning/QC/CheckM/`
-  - `[assembler]-[binner]-[sample/group]_qa.txt`: Detailed statistics about bins informing completeness and contamamination scores (output of `checkm qa`). This should normally be your main file to use to evaluate your results.
-  - `[assembler]-[binner]-[sample/group]_wf.tsv`: Overall summary file for completeness and contamination (output of `checkm lineage_wf`).
-  - `[assembler]-[binner]-[sample/group]/`: intermediate files for CheckM results, including CheckM generated annotations, log, lineage markers etc.
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]_qa.txt`: Detailed statistics about bins informing completeness and contamination scores (output of `checkm qa`). This should normally be your main file to use to evaluate your results.
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]_wf.tsv`: Overall summary file for completeness and contamination (output of `checkm lineage_wf`).
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/`: intermediate files for CheckM results, including CheckM generated annotations, log, lineage markers etc.
   - `checkm_summary.tsv`: A summary table of the CheckM results for all bins (output of `checkm qa`).
 
 </details>
@@ -581,14 +582,14 @@ If `--gunc_save_db` is specified, the output directory will also contain the req
 <summary>Output files</summary>
 
 - `Taxonomy/CAT/[assembler]/[binner]/`
-  - `[assembler]-[binner]-[sample/group].ORF2LCA.names.txt.gz`: Tab-delimited files containing the lineage of each contig, with full lineage names
-  - `[assembler]-[binner]-[sample/group].bin2classification.names.txt.gz`: Taxonomy classification of the genome bins, with full lineage names
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].ORF2LCA.names.txt.gz`: Tab-delimited files containing the lineage of each contig, with full lineage names
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].bin2classification.names.txt.gz`: Taxonomy classification of the genome bins, with full lineage names
 - `Taxonomy/CAT/[assembler]/[binner]/raw/`
-  - `[assembler]-[binner]-[sample/group].concatenated.predicted_proteins.faa.gz`: Predicted protein sequences for each genome bin, in fasta format
-  - `[assembler]-[binner]-[sample/group].concatenated.predicted_proteins.gff.gz`: Predicted protein features for each genome bin, in gff format
-  - `[assembler]-[binner]-[sample/group].ORF2LCA.txt.gz`: Tab-delimited files containing the lineage of each contig
-  - `[assembler]-[binner]-[sample/group].bin2classification.txt.gz`: Taxonomy classification of the genome bins
-  - `[assembler]-[binner]-[sample/group].log`: Log files
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].concatenated.predicted_proteins.faa.gz`: Predicted protein sequences for each genome bin, in fasta format
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].concatenated.predicted_proteins.gff.gz`: Predicted protein features for each genome bin, in gff format
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].ORF2LCA.txt.gz`: Tab-delimited files containing the lineage of each contig
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].bin2classification.txt.gz`: Taxonomy classification of the genome bins
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].log`: Log files
 
 </details>
 
@@ -609,14 +610,14 @@ If the parameters `--cat_db_generate` and `--save_cat_db` are set, additionally
 <summary>Output files</summary>
 
 - `Taxonomy/GTDB-Tk/[assembler]/[binner]/[sample/group]/`
-  - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.summary.tsv`: Classifications for bacterial and archaeal genomes (see the [GTDB-Tk documentation for details](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html).
-  - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.classify.tree.gz`: Reference tree in Newick format containing query genomes placed with pplacer.
-  - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.markers_summary.tsv`: A summary of unique, duplicated, and missing markers within the 120 bacterial marker set, or the 122 archaeal marker set for each submitted genome.
-  - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.msa.fasta.gz`: FASTA file containing MSA of submitted and reference genomes.
-  - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.filtered.tsv`: A list of genomes with an insufficient number of amino acids in MSA.
-  - `gtdbtk.[assembler]-[binner]-[sample/group].*.log`: Log files.
-  - `gtdbtk.[assembler]-[binner]-[sample/group].failed_genomes.tsv`: A list of genomes for which the GTDB-Tk analysis failed, e.g. because Prodigal could not detect any genes.
-- `Taxonomy/GTDB-Tk/gtdbtk_summary.tsv`: A summary table of the GTDB-Tk classification results for all bins, also containing bins which were discarded based on the BUSCO QC, which were filtered out by GTDB-Tk ((listed in `*.filtered.tsv`) or for which the analysis failed (listed in `*.failed_genomes.tsv`).
+  - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.summary.tsv`: Classifications for bacterial and archaeal genomes (see the [GTDB-Tk documentation for details](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html)).
+  - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.classify.tree.gz`: Reference tree in Newick format containing query genomes placed with pplacer.
+  - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.markers_summary.tsv`: A summary of unique, duplicated, and missing markers within the 120 bacterial marker set, or the 122 archaeal marker set for each submitted genome.
+  - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.msa.fasta.gz`: FASTA file containing MSA of submitted and reference genomes.
+  - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.filtered.tsv`: A list of genomes with an insufficient number of amino acids in MSA.
+  - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].*.log`: Log files.
+  - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].failed_genomes.tsv`: A list of genomes for which the GTDB-Tk analysis failed, e.g. because Prodigal could not detect any genes.
+- `Taxonomy/GTDB-Tk/gtdbtk_summary.tsv`: A summary table of the GTDB-Tk classification results for all bins, also containing bins which were discarded based on the BUSCO QC, which were filtered out by GTDB-Tk (listed in `*.filtered.tsv`) or for which the analysis failed (listed in `*.failed_genomes.tsv`).
 
 </details>
 

diff --git a/modules.json b/modules.json
@@ -118,7 +118,7 @@
                     },
                     "gtdbtk/classifywf": {
                         "branch": "master",
-                        "git_sha": "c67eaf89682a12966f60008a8fa30f5dd29239df",
+                        "git_sha": "898259a38563f29c3c5d2490876019ec2d6f49c5",
                         "installed_by": ["modules"]
                     },
                     "gunc/downloaddb": {

diff --git a/modules/local/cat.nf b/modules/local/cat.nf
@@ -1,39 +1,42 @@
 process CAT {
-    tag "${meta.assembler}-${meta.binner}-${meta.id}-${db_name}"
+    tag "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}-${db_name}"
 
-    conda "bioconda::cat=4.6 bioconda::diamond=2.0.6"
+    conda "bioconda::cat=5.2.3"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/mulled-v2-75e2a26f10cbf3629edf2d1600db3fed5ebe6e04:eae321284604f7dabbdf121e3070bda907b91266-0' :
-        'biocontainers/mulled-v2-75e2a26f10cbf3629edf2d1600db3fed5ebe6e04:eae321284604f7dabbdf121e3070bda907b91266-0' }"
+        'https://depot.galaxyproject.org/singularity/cat:5.2.3--hdfd78af_1' :
+        'biocontainers/cat:5.2.3--hdfd78af_1' }"
 
     input:
     tuple val(meta), path("bins/*")
     tuple val(db_name), path("database/*"), path("taxonomy/*")
 
     output:
-    path("*.names.txt.gz")                 , emit: tax_classification
-    path("raw/*.ORF2LCA.txt.gz")           , emit: orf2lca
-    path("raw/*.predicted_proteins.faa.gz"), emit: faa
-    path("raw/*.predicted_proteins.gff.gz"), emit: gff
-    path("raw/*.log")                      , emit: log
-    path("raw/*.bin2classification.txt.gz"), emit: tax_classification_taxids
-    path "versions.yml"                    , emit: versions
+    path("*.ORF2LCA.names.txt.gz")            , emit: orf2lca_classification
+    path("*.bin2classification.names.txt.gz") , emit: tax_classification_names
+    path("raw/*.ORF2LCA.txt.gz")              , emit: orf2lca
+    path("raw/*.predicted_proteins.faa.gz")   , emit: faa
+    path("raw/*.predicted_proteins.gff.gz")   , emit: gff
+    path("raw/*.log")                         , emit: log
+    path("raw/*.bin2classification.txt.gz")   , emit: tax_classification_taxids
+    path "versions.yml"                       , emit: versions
 
     script:
     def official_taxonomy = params.cat_official_taxonomy ? "--only_official" : ""
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}"
     """
-    CAT bins -b "bins/" -d database/ -t taxonomy/ -n "${task.cpus}" -s .fa --top 6 -o "${meta.assembler}-${meta.binner}-${meta.id}" --I_know_what_Im_doing
-    CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.names.txt" -t taxonomy/ ${official_taxonomy}
-    CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.names.txt" -t taxonomy/ ${official_taxonomy}
+    CAT bins $args -b "bins/" -d database/ -t taxonomy/ -n "${task.cpus}" -s .fa --top 6 -o "${prefix}" --I_know_what_Im_doing
+    CAT add_names -i "${prefix}.ORF2LCA.txt" -o "${prefix}.ORF2LCA.names.txt" -t taxonomy/ ${official_taxonomy}
+    CAT add_names -i "${prefix}.bin2classification.txt" -o "${prefix}.bin2classification.names.txt" -t taxonomy/ ${official_taxonomy}
 
     mkdir raw
     mv *.ORF2LCA.txt *.predicted_proteins.faa *.predicted_proteins.gff *.log *.bin2classification.txt raw/
-    gzip "raw/${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.txt" \
-        "raw/${meta.assembler}-${meta.binner}-${meta.id}.concatenated.predicted_proteins.faa" \
-        "raw/${meta.assembler}-${meta.binner}-${meta.id}.concatenated.predicted_proteins.gff" \
-        "raw/${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.txt" \
-        "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.names.txt" \
-        "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.names.txt"
+    gzip "raw/${prefix}.ORF2LCA.txt" \
+        "raw/${prefix}.concatenated.predicted_proteins.faa" \
+        "raw/${prefix}.concatenated.predicted_proteins.gff" \
+        "raw/${prefix}.bin2classification.txt" \
+        "${prefix}.ORF2LCA.names.txt" \
+        "${prefix}.bin2classification.names.txt"
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

diff --git a/modules/local/quast_bins.nf b/modules/local/quast_bins.nf
@@ -1,5 +1,5 @@
 process QUAST_BINS {
-    tag "${meta.assembler}-${meta.binner}-${meta.id}"
+    tag "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}"
 
     conda "bioconda::quast=5.0.2"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
@@ -15,15 +15,16 @@ process QUAST_BINS {
     path "versions.yml"             , emit: versions
 
     script:
+    def prefix = task.ext.prefix ?: "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}"
     """
     BINS=\$(echo \"$bins\" | sed 's/[][]//g')
     IFS=', ' read -r -a bins <<< \"\$BINS\"
     for bin in \"\${bins[@]}\"; do
         metaquast.py --threads "${task.cpus}" --max-ref-number 0 --rna-finding --gene-finding -l "\${bin}" "\${bin}" -o "QUAST/\${bin}"
-        if ! [ -f "QUAST/${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-quast_summary.tsv" ]; then
-            cp "QUAST/\${bin}/transposed_report.tsv" "QUAST/${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-quast_summary.tsv"
+        if ! [ -f "QUAST/${prefix}-quast_summary.tsv" ]; then
+            cp "QUAST/\${bin}/transposed_report.tsv" "QUAST/${prefix}-quast_summary.tsv"
         else
-            tail -n +2 "QUAST/\${bin}/transposed_report.tsv" >> "QUAST/${meta.assembler}-${meta.domain}-${meta.binner}-${meta.id}-quast_summary.tsv"
+            tail -n +2 "QUAST/\${bin}/transposed_report.tsv" >> "QUAST/${prefix}-quast_summary.tsv"
         fi
     done