Merge pull request #98 from sanger-tol/clean_params

Overall clean up
sanger-tol · May 23, 2024 · 6581abf · 6581abf
2 parents acbc472 + 9b1ecc3
commit 6581abf
Show file tree

Hide file tree

Showing 16 changed files with 105 additions and 118 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,27 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [[0.5.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.5.0)] – Snorlax – []
+
+General tidy up of the configuration and the pipeline
+
+### Enhancements & fixes
+
+- Increased the resources for blastn
+- Removed some options that were not used or not needed
+
+### Parameters
+
+| Old parameter   | New parameter |
+| --------------- | ------------- |
+| --taxa_file     |               |
+| --blastp_outext |               |
+| --blastp_cols   |               |
+| --blastx_outext |               |
+| --blastx_cols   |               |
+
+> **NB:** Parameter has been **updated** if both old and new parameter information is present. <br> **NB:** Parameter has been **added** if just the new parameter information is present. <br> **NB:** Parameter has been **removed** if new parameter information isn't present.
+
 ## [[0.4.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.4.0)] – Buneary – [2024-04-17]
 
 The pipeline has now been validated on dozens of genomes, up to 11 Gbp.

diff --git a/README.md b/README.md
@@ -20,8 +20,8 @@ It takes a samplesheet of BAM/CRAM/FASTQ/FASTA files as input, calculates genome
 4. Run BUSCO ([`busco`](https://busco.ezlab.org/))
 5. Extract BUSCO genes ([`blobtoolkit/extractbuscos`](https://github.com/blobtoolkit/blobtoolkit))
 6. Run Diamond BLASTp against extracted BUSCO genes ([`diamond/blastp`](https://github.com/bbuchfink/diamond))
-7. Run BLASTn against extracted BUSCO genes ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/))
-8. Run BLASTx against extracted BUSCO genes ([`blast/blastx`](https://www.ncbi.nlm.nih.gov/books/NBK131777/))
+7. Run BLASTx against sequences with no hit ([`diamond/blastx`](https://github.com/bbuchfink/diamond))
+8. Run BLASTn against sequences still with no hit ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/))
 9. Count BUSCO genes ([`blobtoolkit/countbuscos`](https://github.com/blobtoolkit/blobtoolkit))
 10. Generate combined sequence stats across various window sizes ([`blobtoolkit/windowstats`](https://github.com/blobtoolkit/blobtoolkit))
 11. Imports analysis results into a BlobDir dataset ([`blobtoolkit/blobdir`](https://github.com/blobtoolkit/blobtoolkit))

diff --git a/conf/base.config b/conf/base.config
@@ -104,6 +104,12 @@ process {
         time   = { check_max( 3.h * Math.ceil(meta.genome_size/1000000000) * task.attempt, 'time') }
     }
 
+    withName: "BLAST_BLASTN" {
+        cpus   = { check_max( 24    * task.attempt, 'cpus'    ) }
+        memory = { check_max( 100.MB * task.attempt, 'memory' ) }
+        time   = { check_max( 12.h  * task.attempt, 'time'    ) }
+    }
+
     withName:CUSTOM_DUMPSOFTWAREVERSIONS {
         cache = false
     }

diff --git a/modules/nf-core/blast/blastn/blast-blastn.diff b/modules/nf-core/blast/blastn/blast-blastn.diff
diff --git a/modules/nf-core/blast/blastn/main.nf b/modules/nf-core/blast/blastn/main.nf
diff --git a/nextflow.config b/nextflow.config
@@ -17,11 +17,10 @@ params {
     mask                       = false
     fetchngs_samplesheet       = false
 
-    // Reference options    
+    // Reference options
     fasta                      = null
     accession                  = null
     taxon                      = null
-    taxa_file                  = null
 
     // Output options
     image_format               = 'png'
@@ -32,10 +31,6 @@ params {
     blastp                     = null
     blastx                     = null
     blastn                     = null
-    blastp_outext              = 'txt'
-    blastp_cols                = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore'
-    blastx_outext              = 'txt'
-    blastx_cols                = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore'
 
     // MultiQC options
     multiqc_config             = null
@@ -248,7 +243,7 @@ manifest {
     description     = """Quality assessment of genome assemblies"""
     mainScript      = 'main.nf'
     nextflowVersion = '!>=23.04.0'
-    version         = '0.4.0'
+    version         = '0.5.0'
     doi             = '10.5281/zenodo.7949058'
 }
 

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -75,7 +75,7 @@
             "type": "object",
             "fa_icon": "fas fa-dna",
             "description": "Reference genome related files and options required for the workflow.",
-            "required": ["taxon", "accession", "fasta"],
+            "required": ["taxon", "fasta"],
             "properties": {
                 "taxon": {
                     "type": ["string", "integer"],
@@ -102,43 +102,12 @@
             "description": "Define the location and parameters to work with databases.",
             "required": ["blastp", "blastx", "blastn", "taxdump"],
             "properties": {
-                "taxa_file": {
-                    "type": "string",
-                    "format": "file-path",
-                    "description": "Path to file containing the BUSCO lineages for the genome species",
-                    "help_text": "If this file is not included, the relevant BUSCO lineages are automatically calculated using the taxon parameter.",
-                    "fa_icon": "fas fa-file-alt"
-                },
                 "busco": {
                     "type": "string",
                     "format": "directory-path",
                     "description": "Local directory where clade-specific BUSCO lineage datasets are stored",
                     "fa_icon": "fas fa-folder-open"
                 },
-                "blastp_cols": {
-                    "type": "string",
-                    "description": "When blastp_outext is 'txt', this is the list of columns that Diamond BLAST should print.",
-                    "default": "qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore"
-                },
-                "blastp_outext": {
-                    "type": "string",
-                    "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"],
-                    "description": "Extension (file format) of the output file from Diamond BLAST.",
-                    "fa_icon": "fas fa-file-circle-question",
-                    "default": "txt"
-                },
-                "blastx_cols": {
-                    "type": "string",
-                    "description": "When blastx_outext is 'txt', this is the list of columns that Diamond BLAST should print.",
-                    "default": "qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore"
-                },
-                "blastx_outext": {
-                    "type": "string",
-                    "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"],
-                    "description": "Extension (file format) of the output file from Diamond BLAST.",
-                    "fa_icon": "fas fa-file-circle-question",
-                    "default": "txt"
-                },
                 "blastp": {
                     "type": "string",
                     "format": "file-path",

diff --git a/subworkflows/local/blobtools.nf b/subworkflows/local/blobtools.nf
@@ -28,14 +28,14 @@ workflow BLOBTOOLS {
     ch_versions = ch_versions.mix ( BLOBTOOLKIT_METADATA.out.versions.first() )
 
 
-    //  
+    //
     // Create Blobtools dataset files
     //
     BLOBTOOLKIT_CREATEBLOBDIR ( windowstats, busco, blastp, BLOBTOOLKIT_METADATA.out.yaml, taxdump )
     ch_versions = ch_versions.mix ( BLOBTOOLKIT_CREATEBLOBDIR.out.versions.first() )
 
 
-    //  
+    //
     // Update Blobtools dataset files
     //
     BLOBTOOLKIT_UPDATEBLOBDIR ( BLOBTOOLKIT_CREATEBLOBDIR.out.blobdir, blastx, blastn, taxdump )

diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf
@@ -12,23 +12,23 @@ include { RESTRUCTUREBUSCODIR       } from '../../modules/local/restructurebusco
 workflow BUSCO_DIAMOND {
     take:
     fasta        // channel: [ val(meta), path(fasta) ]
-    taxon_taxa   // channel: [ val(meta, val(taxon), path(taxa) ]
+    taxon        // channel: val(taxon)
     busco_db     // channel: path(busco_db)
     blastp       // channel: path(blastp_db)
-    outext       // channel: val(out_format)
-    cols         // channel: val(column_names)
 
 
     main:
     ch_versions = Channel.empty()
 
 
     //
-    // Fetch BUSCO lineages for taxon (or taxa)
+    // Fetch BUSCO lineages for taxon
     //
-    GOAT_TAXONSEARCH ( taxon_taxa )
+    GOAT_TAXONSEARCH (
+        fasta.combine(taxon).map { meta, fasta, taxon -> [ meta, taxon, [] ] }
+    )
     ch_versions = ch_versions.mix ( GOAT_TAXONSEARCH.out.versions.first() )
-    
+
 
     //
     // Get NCBI species ID
@@ -70,7 +70,7 @@ workflow BUSCO_DIAMOND {
         ch_fasta_with_lineage,
         "genome",
         ch_fasta_with_lineage.map { it[0].lineage_name },
-        busco_db.collect().ifEmpty([]),
+        busco_db,
         [],
     )
     ch_versions = ch_versions.mix ( BUSCO.out.versions.first() )
@@ -108,11 +108,14 @@ workflow BUSCO_DIAMOND {
 
     //
     // Align BUSCO genes against the BLASTp database
-    //    
+    //
     BLOBTOOLKIT_EXTRACTBUSCOS.out.genes
     | filter { it[1].size() > 140 }
     | set { ch_busco_genes }
 
+    // Hardcoded to match the format expected by blobtools
+    def outext = 'txt'
+    def cols   = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore'
     DIAMOND_BLASTP ( ch_busco_genes, blastp, outext, cols )
     ch_versions = ch_versions.mix ( DIAMOND_BLASTP.out.versions.first() )
 
@@ -141,7 +144,7 @@ workflow BUSCO_DIAMOND {
 
 
     emit:
-    first_table = ch_first_table          // channel: [ val(meta), path(full_table) ] 
+    first_table = ch_first_table          // channel: [ val(meta), path(full_table) ]
     all_tables  = ch_indexed_buscos       // channel: [ val(meta), path(full_tables) ]
     blastp_txt  = DIAMOND_BLASTP.out.txt  // channel: [ val(meta), path(txt) ]
     taxon_id    = ch_taxid                // channel: taxon_id

diff --git a/subworkflows/local/collate_stats.nf b/subworkflows/local/collate_stats.nf
@@ -8,7 +8,7 @@ include { BLOBTOOLKIT_WINDOWSTATS } from '../../modules/local/blobtoolkit/window
 
 
 workflow COLLATE_STATS {
-    take: 
+    take:
     busco       // channel: [ val(meta), path(full_table) ]
     bed         // channel: [ val(meta), path(bed) ]
     freq        // channel: [ val(meta), path(freq) ]

diff --git a/subworkflows/local/coverage_stats.nf b/subworkflows/local/coverage_stats.nf
@@ -10,8 +10,8 @@ include { CREATE_BED     } from '../../modules/local/create_bed'
 
 
 workflow COVERAGE_STATS {
-    take: 
-    input    // channel: [ val(meta), path(aln) ] 
+    take:
+    input    // channel: [ val(meta), path(aln) ]
     fasta    // channel: [ val(meta), path(fasta) ]
 
 
@@ -57,7 +57,7 @@ workflow COVERAGE_STATS {
     CREATE_BED ( FASTAWINDOWS.out.mononuc )
     ch_versions = ch_versions.mix ( CREATE_BED.out.versions.first() )
 
-    
+
     // Calculate coverage
     BLOBTK_DEPTH ( ch_bam_csi )
     ch_versions = ch_versions.mix ( BLOBTK_DEPTH.out.versions.first() )

diff --git a/subworkflows/local/minimap_alignment.nf b/subworkflows/local/minimap_alignment.nf
@@ -1,4 +1,4 @@
-// 
+//
 // Optional alignment subworkflow using Minimap2
 //
 
@@ -52,7 +52,7 @@ workflow MINIMAP2_ALIGNMENT {
     // Align with Minimap2
     MINIMAP2_HIC ( ch_input.hic, fasta, true, false, false )
     ch_versions = ch_versions.mix(MINIMAP2_HIC.out.versions.first())
-    
+
     MINIMAP2_ILMN ( ch_input.illumina, fasta, true, false, false )
     ch_versions = ch_versions.mix(MINIMAP2_ILMN.out.versions.first())
 

diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf
@@ -48,7 +48,7 @@ workflow PREPARE_GENOME {
         ch_fasta = ch_genome
     }
 
-    
+
     emit:
     genome   = ch_fasta            // channel: [ meta, path(genome) ]
     versions = ch_versions         // channel: [ versions.yml ]

diff --git a/subworkflows/local/run_blastn.nf b/subworkflows/local/run_blastn.nf
@@ -12,8 +12,8 @@ include { BLOBTOOLKIT_UNCHUNK          } from '../../modules/local/blobtoolkit/u
 
 
 workflow RUN_BLASTN {
-    take: 
-    blast_table  // channel: [ val(meta), path(blast_table) ] 
+    take:
+    blast_table  // channel: [ val(meta), path(blast_table) ]
     fasta        // channel: [ val(meta), path(fasta) ]
     blastn       // channel: [ val(meta), path(blastn_db) ]
     taxon_id     // channel: val(taxon_id)
@@ -27,16 +27,16 @@ workflow RUN_BLASTN {
     // Get list of sequence ids with no hits in diamond blastx search
     NOHIT_LIST ( blast_table, fasta )
     ch_versions = ch_versions.mix ( NOHIT_LIST.out.versions.first() )
- 
+
     // Subset of sequences with no hits
     SEQTK_SUBSEQ (
         fasta,
-        NOHIT_LIST.out.nohitlist.map { meta, nohit -> nohit }
+        NOHIT_LIST.out.nohitlist.map { meta, nohit -> nohit } . filter { it.size() > 0 }
     )
     ch_versions = ch_versions.mix ( SEQTK_SUBSEQ.out.versions.first() )
-    
-    
-    //  Split long contigs into chunks 
+
+
+    //  Split long contigs into chunks
     // create chunks
     BLOBTOOLKIT_CHUNK ( SEQTK_SUBSEQ.out.sequences, [[],[]] )
     ch_versions = ch_versions.mix ( BLOBTOOLKIT_CHUNK.out.versions.first() )

diff --git a/subworkflows/local/run_blastx.nf b/subworkflows/local/run_blastx.nf
@@ -11,8 +11,6 @@ workflow RUN_BLASTX {
     fasta      // channel: [ val(meta), path(fasta) ]
     table      // channel: [ val(meta), path(busco_table) ]
     blastx     // channel: [ val(meta), path(blastx_db) ]
-    outext     // channel: val(out_format)
-    cols       // channel: val(column_names)
 
 
     main:
@@ -29,9 +27,12 @@ workflow RUN_BLASTX {
     //
     // Run diamond_blastx
     //
+    // Hardcoded to match the format expected by blobtools
+    def outext = 'txt'
+    def cols   = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore'
     DIAMOND_BLASTX ( BLOBTOOLKIT_CHUNK.out.chunks, blastx, outext, cols)
     ch_versions = ch_versions.mix ( DIAMOND_BLASTX.out.versions.first() )
-    
+
 
     //
     // Unchunk chunked blastx results
-Original file line number
+Diff line change
@@ Expand Up / @@ -48,7 +48,7 @@ workflow PREPARE_GENOME { @@
             ch_fasta = ch_genome
         }
         emit:
         genome   = ch_fasta            // channel: [ meta, path(genome) ]
         versions = ch_versions         // channel: [ versions.yml ]
@@ Expand Down @@