From e1ef4516753f7b55f2082a8ca553629a40b80ef4 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Thu, 15 Aug 2024 20:08:42 +0100
Subject: [PATCH 01/30] Skip blastn if there are no chunks

The filter in SEQTK_SUBSEQ is not sufficient because some BLOBTOOLKIT_CHUNK further excludes masked regions
---
 subworkflows/local/run_blastn.nf | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/subworkflows/local/run_blastn.nf b/subworkflows/local/run_blastn.nf
index 1ea64b82..7dc92fd7 100644
--- a/subworkflows/local/run_blastn.nf
+++ b/subworkflows/local/run_blastn.nf
@@ -41,10 +41,14 @@ workflow RUN_BLASTN {
     BLOBTOOLKIT_CHUNK ( SEQTK_SUBSEQ.out.sequences, [[],[]] )
     ch_versions = ch_versions.mix ( BLOBTOOLKIT_CHUNK.out.versions.first() )
 
+    // Check that there are still sequences left after chunking (which excludes masked regions)
+    BLOBTOOLKIT_CHUNK.out.chunks
+    | filter { it[1].size() > 0 }
+    | set { ch_chunks }
 
     // Run blastn search
     // run blastn excluding taxon_id
-    BLASTN_TAXON ( BLOBTOOLKIT_CHUNK.out.chunks, blastn, taxon_id )
+    BLASTN_TAXON ( ch_chunks, blastn, taxon_id )
     ch_versions = ch_versions.mix ( BLASTN_TAXON.out.versions.first() )
 
     // check if blastn output table is empty
@@ -56,7 +60,8 @@ workflow RUN_BLASTN {
     | set { ch_blastn_taxon_out }
 
     // repeat the blastn search without excluding taxon_id
-    ch_blastn_taxon_out.empty.join ( BLOBTOOLKIT_CHUNK.out.chunks )
+    ch_blastn_taxon_out.empty
+    | join ( ch_chunks )
     | map { meta, txt, fasta -> [meta, fasta] }
     | set { ch_blast_blastn_input }
 

From ed44da107826cea25752fdbc3fc294d79701b54b Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Thu, 15 Aug 2024 20:12:56 +0100
Subject: [PATCH 02/30] Version bump

---
 CHANGELOG.md    | 6 ++++++
 nextflow.config | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b1f08975..5146c972 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,12 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [[0.5.1](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.5.1)] – Snorlax (patch 1) – [2024-08-15]
+
+### Enhancements & fixes
+
+- Bugfix: skip BLASTN if there are no chunks to align
+
 ## [[0.5.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.5.0)] – Snorlax – [2024-07-31]
 
 General tidy up of the configuration and the pipeline
diff --git a/nextflow.config b/nextflow.config
index db5ef388..84bae361 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -243,7 +243,7 @@ manifest {
     description     = """Quality assessment of genome assemblies"""
     mainScript      = 'main.nf'
     nextflowVersion = '!>=23.04.0'
-    version         = '0.5.0'
+    version         = '0.5.1'
     doi             = '10.5281/zenodo.7949058'
 }
 

From 3053bdbf48d6e79793254d90ffc10210e72cfc35 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Thu, 22 Aug 2024 08:12:57 +0100
Subject: [PATCH 03/30] New date

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5146c972..bf3e3e6a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,7 +3,7 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [[0.5.1](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.5.1)] – Snorlax (patch 1) – [2024-08-15]
+## [[0.5.1](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.5.1)] – Snorlax (patch 1) – [2024-08-22]
 
 ### Enhancements & fixes
 

From 12a75579da3d947304d9caefcd8d3e45355c2c6d Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Mon, 13 May 2024 10:32:31 +0100
Subject: [PATCH 04/30] Version bump

---
 CHANGELOG.md    | 4 ++++
 nextflow.config | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index bf3e3e6a..0614c6d9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,10 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [[1.0.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/1.0.0)] – – []
+
+The pipeline has now been validated for draft (unpublished) assemblies.
+
 ## [[0.5.1](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.5.1)] – Snorlax (patch 1) – [2024-08-22]
 
 ### Enhancements & fixes
diff --git a/nextflow.config b/nextflow.config
index 84bae361..2345a399 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -243,7 +243,7 @@ manifest {
     description     = """Quality assessment of genome assemblies"""
     mainScript      = 'main.nf'
     nextflowVersion = '!>=23.04.0'
-    version         = '0.5.1'
+    version         = '1.0.0'
     doi             = '10.5281/zenodo.7949058'
 }
 

From bd2334e098a3f542dcc99b83d1d03f980d3c5dcc Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Mon, 13 May 2024 10:53:39 +0100
Subject: [PATCH 05/30] Directly query the NCBI API

---
 CHANGELOG.md                                  | 11 +++
 CITATIONS.md                                  |  4 --
 README.md                                     |  2 +-
 ...ataset_name.eukaryota_odb10.2019-12-16.txt | 67 +++++++++++++++++++
 bin/get_odb_taxon.py                          | 61 +++++++++++++++++
 conf/modules.config                           |  4 --
 docs/usage.md                                 |  1 -
 modules.json                                  |  5 --
 modules/local/get_odb_taxon.nf                | 32 +++++++++
 .../nf-core/goat/taxonsearch/environment.yml  |  7 --
 modules/nf-core/goat/taxonsearch/main.nf      | 36 ----------
 modules/nf-core/goat/taxonsearch/meta.yml     | 50 --------------
 nextflow.config                               |  2 +
 nextflow_schema.json                          |  9 ++-
 subworkflows/local/busco_diamond_blastp.nf    | 35 +++++-----
 workflows/blobtoolkit.nf                      |  7 +-
 16 files changed, 206 insertions(+), 127 deletions(-)
 create mode 100644 assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt
 create mode 100755 bin/get_odb_taxon.py
 create mode 100644 modules/local/get_odb_taxon.nf
 delete mode 100644 modules/nf-core/goat/taxonsearch/environment.yml
 delete mode 100644 modules/nf-core/goat/taxonsearch/main.nf
 delete mode 100644 modules/nf-core/goat/taxonsearch/meta.yml

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0614c6d9..5f8f155d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 The pipeline has now been validated for draft (unpublished) assemblies.
 
+- The pipeline now queries the NCBI database instead of GoaT to establish the
+  taxonomic classification of the species and the relevant Busco lineages.
+
+### Software dependencies
+
+Note, since the pipeline is using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference. Only `Docker` or `Singularity` containers are supported, `conda` is not supported.
+
+| Dependency | Old version | New version |
+| ---------- | ----------- | ----------- |
+| goat       | 0.2.5       |             |
+
 ## [[0.5.1](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.5.1)] – Snorlax (patch 1) – [2024-08-22]
 
 ### Enhancements & fixes
diff --git a/CITATIONS.md b/CITATIONS.md
index 8b960779..c2cadb65 100644
--- a/CITATIONS.md
+++ b/CITATIONS.md
@@ -26,10 +26,6 @@
 
 - [Fasta_windows](https://github.com/tolkit/fasta_windows)
 
-- [GoaT](https://goat.genomehubs.org)
-
-  > Challis, Richard, et al. “Genomes on a Tree (GoaT): A versatile, scalable search engine for genomic and sequencing project metadata across the eukaryotic tree of life.” Wellcome Open Research, vol. 8, no. 24, 2023, https://doi.org/10.12688/wellcomeopenres.18658.1.
-
 - [Minimap2](https://github.com/lh3/minimap2)
 
   > Li, Heng. "Minimap2: pairwise alignment for nucleotide sequences." Bioinformatics, vol. 34, no. 18, Sep. 2018, pp. 3094-100, https://doi.org/10.1093/bioinformatics/bty191.
diff --git a/README.md b/README.md
index c7b92970..b55ab353 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ It takes a samplesheet of BAM/CRAM/FASTQ/FASTA files as input, calculates genome
 
 1. Calculate genome statistics in windows ([`fastawindows`](https://github.com/tolkit/fasta_windows))
 2. Calculate Coverage ([`blobtk/depth`](https://github.com/blobtoolkit/blobtk))
-3. Fetch associated BUSCO lineages ([`goat/taxonsearch`](https://github.com/genomehubs/goat-cli))
+3. Determine the appropriate BUSCO lineages from the taxonomy.
 4. Run BUSCO ([`busco`](https://busco.ezlab.org/))
 5. Extract BUSCO genes ([`blobtoolkit/extractbuscos`](https://github.com/blobtoolkit/blobtoolkit))
 6. Run Diamond BLASTp against extracted BUSCO genes ([`diamond/blastp`](https://github.com/bbuchfink/diamond))
diff --git a/assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt b/assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt
new file mode 100644
index 00000000..5d8af90f
--- /dev/null
+++ b/assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt
@@ -0,0 +1,67 @@
+422676	aconoidasida
+7898	actinopterygii
+5338	agaricales
+155619	agaricomycetes
+33630	alveolata
+5794	apicomplexa
+6854	arachnida
+6656	arthropoda
+4890	ascomycota
+8782	aves
+5204	basidiomycota
+68889	boletales
+3699	brassicales
+134362	capnodiales
+33554	carnivora
+91561	cetartiodactyla
+34395	chaetothyriales
+3041	chlorophyta
+5796	coccidia
+28738	cyprinodontiformes
+7147	diptera
+147541	dothideomycetes
+3193	embryophyta
+33392	endopterygota
+314146	euarchontoglires
+33682	euglenozoa
+2759	eukaryota
+5042	eurotiales
+147545	eurotiomycetes
+9347	eutheria
+72025	fabales
+4751	fungi
+314147	glires
+1028384	glomerellales
+5178	helotiales
+7524	hemiptera
+7399	hymenoptera
+5125	hypocreales
+50557	insecta
+314145	laurasiatheria
+147548	leotiomycetes
+7088	lepidoptera
+4447	liliopsida
+40674	mammalia
+33208	metazoa
+6029	microsporidia
+6447	mollusca
+4827	mucorales
+1913637	mucoromycota
+6231	nematoda
+33183	onygenales
+9126	passeriformes
+5820	plasmodium
+92860	pleosporales
+38820	poales
+5303	polyporales
+9443	primates
+4891	saccharomycetes
+8457	sauropsida
+4069	solanales
+147550	sordariomycetes
+33634	stramenopiles
+32523	tetrapoda
+155616	tremellomycetes
+7742	vertebrata
+33090	viridiplantae
+71240	eudicots
diff --git a/bin/get_odb_taxon.py b/bin/get_odb_taxon.py
new file mode 100755
index 00000000..ade35e34
--- /dev/null
+++ b/bin/get_odb_taxon.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import sys
+import requests
+
+
+NCBI_TAXONOMY_API = "https://api.ncbi.nlm.nih.gov/datasets/v1/taxonomy/taxon/%s"
+
+
+def parse_args(args=None):
+    Description = "Get Taxon ID and ODB database value using NCBI API and BUSCO configuration file"
+
+    parser = argparse.ArgumentParser(description=Description)
+    parser.add_argument("TAXON_QUERY", help="Query string/integer for this taxon.")
+    parser.add_argument("LINEAGE_TAX_IDS", help="Mapping between BUSCO lineages and taxon IDs.")
+    parser.add_argument("FILE_OUT", help="Output CSV file.")
+    parser.add_argument("--version", action="version", version="%(prog)s 1.0")
+    return parser.parse_args(args)
+
+
+def make_dir(path):
+    if len(path) > 0:
+        os.makedirs(path, exist_ok=True)
+
+
+def get_odb(taxon_query, lineage_tax_ids, file_out):
+    # Read the mapping between the BUSCO lineages and their taxon_id
+    with open(lineage_tax_ids) as file_in:
+        lineage_tax_ids_dict = {}
+        for line in file_in:
+            arr = line.split()
+            lineage_tax_ids_dict[int(arr[0])] = arr[1]
+
+    # Using API, get the taxon_ids of the species and all parents
+    response = requests.get(NCBI_TAXONOMY_API % taxon_query).json()
+    this_taxon_id = response["taxonomy_nodes"][0]["taxonomy"]["tax_id"]
+    ancestor_taxon_ids = response["taxonomy_nodes"][0]["taxonomy"]["lineage"]
+
+    # Do the intersection to find the ancestors that have a BUSCO lineage
+    odb_arr = [
+        lineage_tax_ids_dict[taxon_id] for taxon_id in reversed(ancestor_taxon_ids) if taxon_id in lineage_tax_ids_dict
+    ]
+
+    out_dir = os.path.dirname(file_out)
+    make_dir(out_dir)
+
+    with open(file_out, "w") as fout:
+        print("taxon_id", this_taxon_id, sep=",", file=fout)
+        for odb_val in odb_arr:
+            print("busco_lineage", odb_val + "_odb10", sep=",", file=fout)
+
+
+def main(args=None):
+    args = parse_args(args)
+    get_odb(args.TAXON_QUERY, args.LINEAGE_TAX_IDS, args.FILE_OUT)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/conf/modules.config b/conf/modules.config
index ac597dc4..d4b3f897 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -64,10 +64,6 @@ process {
         ext.args = "-c"
     }
 
-    withName: "GOAT_TAXONSEARCH" {
-        ext.args = "--lineage --busco"
-    }
-
     withName: "PIGZ_COMPRESS" {
         publishDir = [
             path: { "${params.outdir}/base_content" },
diff --git a/docs/usage.md b/docs/usage.md
index 4789ff84..0619d8e0 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -235,7 +235,6 @@ List of tools for any given dataset can be fetched from the API, for example htt
 | busco             | 5.3.2     | 5.5.0    |
 | diamond           | 2.0.15    | 2.1.8    |
 | fasta_windows     |           | 0.2.4    |
-| goat              |           | 0.2.5    |
 | minimap2          | 2.24      | 2.24     |
 | ncbi-datasets-cli | 14.1.0    |          |
 | nextflow          |           | 23.10.0  |
diff --git a/modules.json b/modules.json
index ebb45a6c..3ce2a831 100644
--- a/modules.json
+++ b/modules.json
@@ -45,11 +45,6 @@
                         "installed_by": ["modules"],
                         "patch": "modules/nf-core/fastawindows/fastawindows.diff"
                     },
-                    "goat/taxonsearch": {
-                        "branch": "master",
-                        "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
-                        "installed_by": ["modules"]
-                    },
                     "gunzip": {
                         "branch": "master",
                         "git_sha": "3a5fef109d113b4997c9822198664ca5f2716208",
diff --git a/modules/local/get_odb_taxon.nf b/modules/local/get_odb_taxon.nf
new file mode 100644
index 00000000..ff061ea3
--- /dev/null
+++ b/modules/local/get_odb_taxon.nf
@@ -0,0 +1,32 @@
+process NCBI_GET_ODB_TAXON {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "conda-forge::requests=2.26.0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/requests:2.26.0':
+        'biocontainers/requests:2.26.0' }"
+
+    input:
+    tuple val(meta), val(taxon_query)
+    path lineage_tax_ids
+
+    output:
+    tuple val(meta), path("*.csv"), emit: csv
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    get_odb_taxon.py "$taxon_query" $lineage_tax_ids ${prefix}.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        get_odb_taxon.py: \$(get_odb.py --version | cut -d' ' -f2)
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/goat/taxonsearch/environment.yml b/modules/nf-core/goat/taxonsearch/environment.yml
deleted file mode 100644
index e56e71f1..00000000
--- a/modules/nf-core/goat/taxonsearch/environment.yml
+++ /dev/null
@@ -1,7 +0,0 @@
-name: goat_taxonsearch
-channels:
-  - conda-forge
-  - bioconda
-  - defaults
-dependencies:
-  - bioconda::goat=0.2.5
diff --git a/modules/nf-core/goat/taxonsearch/main.nf b/modules/nf-core/goat/taxonsearch/main.nf
deleted file mode 100644
index 62c12baa..00000000
--- a/modules/nf-core/goat/taxonsearch/main.nf
+++ /dev/null
@@ -1,36 +0,0 @@
-process GOAT_TAXONSEARCH {
-    tag "$meta.id"
-    label 'process_single'
-
-    conda "${moduleDir}/environment.yml"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/goat:0.2.5--h9d3141d_2':
-        'biocontainers/goat:0.2.5--h9d3141d_2' }"
-
-    input:
-    tuple val(meta), val(taxon), path(taxa_file)
-
-    output:
-    tuple val(meta), path("*.tsv"), emit: taxonsearch
-    path "versions.yml"           , emit: versions
-
-    when:
-    task.ext.when == null || task.ext.when
-
-    script:
-    def args = task.ext.args ?: ''
-    def prefix = task.ext.prefix ?: "${meta.id}"
-    input = taxa_file ? "-f ${taxa_file}" : "-t \"${taxon}\""
-    if (!taxon && !taxa_file) error "No input. Valid input: single taxon identifier or a .txt file with identifiers"
-    if (taxon && taxa_file ) error "Only one input is required: a single taxon identifier or a .txt file with identifiers"
-    """
-    goat-cli taxon search \\
-        $args \\
-        $input > ${prefix}.tsv
-
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        goat: \$(goat-cli --version | cut -d' ' -f2)
-    END_VERSIONS
-    """
-}
diff --git a/modules/nf-core/goat/taxonsearch/meta.yml b/modules/nf-core/goat/taxonsearch/meta.yml
deleted file mode 100644
index 1bb19e30..00000000
--- a/modules/nf-core/goat/taxonsearch/meta.yml
+++ /dev/null
@@ -1,50 +0,0 @@
-name: "goat_taxonsearch"
-description: Query metadata for any taxon across the tree of life.
-keywords:
-  - public datasets
-  - ncbi
-  - genomes on a tree
-tools:
-  - goat:
-      description: |
-        goat-cli is a command line interface to query the
-        Genomes on a Tree Open API.
-      homepage: https://github.com/genomehubs/goat-cli
-      documentation: https://github.com/genomehubs/goat-cli/wiki
-      tool_dev_url: https://genomehubs.github.io/goat-cli/goat_cli/
-      licence: ["MIT"]
-input:
-  - meta:
-      type: map
-      description: |
-        Groovy Map containing sample information
-        e.g. [ id:'test']
-  - taxon:
-      type: string
-      description: |
-        The taxon to search. An NCBI taxon ID, or the name of a taxon at any rank.
-  - taxa_file:
-      type: file
-      description: |
-        A file of NCBI taxonomy ID's (tips) and/or binomial names. Each line
-        should contain a single entry.File size is limited to 500 entries.
-      pattern: "*.txt"
-output:
-  - meta:
-      type: map
-      description: |
-        Groovy Map containing sample information
-        e.g. [ id:'test', single_end:false ]
-  - versions:
-      type: file
-      description: File containing software versions
-      pattern: "versions.yml"
-  - taxonsearch:
-      type: file
-      description: TSV file containing search results.
-      pattern: "*.tsv"
-authors:
-  - "@alxndrdiaz"
-maintainers:
-  - "@alxndrdiaz"
-  - "@muffato"
diff --git a/nextflow.config b/nextflow.config
index 2345a399..38964a66 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -28,6 +28,8 @@ params {
     // Databases and related options
     taxdump                    = null
     busco                      = null
+    // Taken from https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt.tar.gz
+    lineage_tax_ids            = "${projectDir}/assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt"
     blastp                     = null
     blastx                     = null
     blastn                     = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index b392e2a5..8e7f0fba 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -100,7 +100,7 @@
             "type": "object",
             "fa_icon": "fas fa-database",
             "description": "Define the location and parameters to work with databases.",
-            "required": ["blastp", "blastx", "blastn", "taxdump"],
+            "required": ["blastp", "blastx", "blastn", "lineage_tax_ids", "taxdump"],
             "properties": {
                 "busco": {
                     "type": "string",
@@ -108,6 +108,13 @@
                     "description": "Local directory where clade-specific BUSCO lineage datasets are stored",
                     "fa_icon": "fas fa-folder-open"
                 },
+                "lineage_tax_ids": {
+                    "type": "string",
+                    "format": "file-path",
+                    "description": "Local file that holds a mapping between BUSCO lineages and taxon IDs.",
+                    "help_text": "Initialised from https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt.tar.gz",
+                    "fa_icon": "fas fa-file-code"
+                },
                 "blastp": {
                     "type": "string",
                     "format": "file-path",
diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf
index 2a89471f..5a20823c 100644
--- a/subworkflows/local/busco_diamond_blastp.nf
+++ b/subworkflows/local/busco_diamond_blastp.nf
@@ -1,8 +1,8 @@
 //
-// Run BUSCO for a genome from GOAT and runs diamond_blastp
+// Run BUSCO for a genome and runs diamond_blastp
 //
 
-include { GOAT_TAXONSEARCH          } from '../../modules/nf-core/goat/taxonsearch/main'
+include { NCBI_GET_ODB_TAXON        } from '../../modules/local/get_odb_taxon'
 include { BUSCO                     } from '../../modules/nf-core/busco/main'
 include { BLOBTOOLKIT_EXTRACTBUSCOS } from '../../modules/local/blobtoolkit/extractbuscos'
 include { DIAMOND_BLASTP            } from '../../modules/nf-core/diamond/blastp/main'
@@ -14,6 +14,7 @@ workflow BUSCO_DIAMOND {
     fasta        // channel: [ val(meta), path(fasta) ]
     taxon        // channel: val(taxon)
     busco_db     // channel: path(busco_db)
+    lineage_tax_ids        // channel: /path/to/lineage_tax_ids
     blastp       // channel: path(blastp_db)
 
 
@@ -24,22 +25,21 @@ workflow BUSCO_DIAMOND {
     //
     // Fetch BUSCO lineages for taxon
     //
-    GOAT_TAXONSEARCH (
-        fasta.combine(taxon).map { meta, fasta, taxon -> [ meta, taxon, [] ] }
+    NCBI_GET_ODB_TAXON (
+        fasta.combine(taxon).map { meta, fasta, taxon -> [ meta, taxon ] },
+        lineage_tax_ids,
     )
-    ch_versions = ch_versions.mix ( GOAT_TAXONSEARCH.out.versions.first() )
+    ch_versions = ch_versions.mix ( NCBI_GET_ODB_TAXON.out.versions.first() )
 
 
     //
     // Get NCBI species ID
     //
-    GOAT_TAXONSEARCH.out.taxonsearch
-    | map { meta, csv -> csv.splitCsv(header:true, sep:'\t', strip:true) }
-    | map { row -> [ row.taxon_rank, row.taxon_id ] }
-    | transpose()
-    | filter { rank,id -> rank =~ /species/ }
-    | map { rank, id -> id}
-    | first
+    NCBI_GET_ODB_TAXON.out.csv
+    | map { meta, csv -> csv }
+    | splitCsv(header: ['key', 'value'])
+    | filter { it.key == "taxon_id" }
+    | map { it.value }
     | set { ch_taxid }
 
 
@@ -49,10 +49,13 @@ workflow BUSCO_DIAMOND {
     // 0. Initialise sone variables
     basal_lineages = [ "eukaryota_odb10", "bacteria_odb10", "archaea_odb10" ]
     def lineage_position = 0
-    // 1. Parse the GOAT_TAXONSEARCH output
-    GOAT_TAXONSEARCH.out.taxonsearch
-    | map { meta, csv -> csv.splitCsv(header:true, sep:'\t', strip:true) }
-    | map { row -> row.odb10_lineage.findAll { it != "" } }
+    // 1. Parse the NCBI_GET_ODB_TAXON output
+    NCBI_GET_ODB_TAXON.out.csv
+    | map { meta, csv -> csv }
+    | splitCsv(header: ['key', 'value'])
+    | filter { it.key == "busco_lineage" }
+    | map { it.value }
+    | collect
     // 2. Add the (missing) basal lineages
     | map { lineages -> (lineages + basal_lineages).unique() }
     | flatten ()
diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf
index 3610cdde..7d8a3a75 100644
--- a/workflows/blobtoolkit.nf
+++ b/workflows/blobtoolkit.nf
@@ -17,7 +17,7 @@ WorkflowBlobtoolkit.initialise(params, log)
 
 // Add all file path parameters for the pipeline to the list below
 // Check input path parameters to see if they exist
-def checkPathParamList = [ params.input, params.multiqc_config, params.fasta, params.taxdump, params.busco, params.blastp, params.blastx ]
+def checkPathParamList = [ params.input, params.multiqc_config, params.fasta, params.taxdump, params.busco, params.blastp, params.blastx, params.lineage_tax_ids ]
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
 
 // Check mandatory parameters
@@ -30,6 +30,8 @@ if (params.blastn) { ch_blastn = Channel.value([ [ 'id': file(params.blastn).bas
 if (params.taxdump) { ch_taxdump = file(params.taxdump) } else { exit 1, 'NCBI Taxonomy database not specified!' }
 if (params.fetchngs_samplesheet && !params.align) { exit 1, '--align not specified, even though the input samplesheet is a nf-core/fetchngs one - i.e has fastq files!' }
 
+if (params.lineage_tax_ids) { ch_lineage_tax_ids = Channel.fromPath(params.lineage_tax_ids) } else { exit 1, 'Mapping BUSCO lineage <-> taxon_ids not specified' }
+
 // Create channel for optional parameters
 if (params.busco) { ch_busco_db = Channel.fromPath(params.busco).first() } else { ch_busco_db = Channel.value([]) }
 if (params.yaml) { ch_yaml = Channel.fromPath(params.yaml) } else { ch_yaml = Channel.empty() }
@@ -123,12 +125,13 @@ workflow BLOBTOOLKIT {
     ch_versions = ch_versions.mix ( COVERAGE_STATS.out.versions )
 
     //
-    // SUBWORKFLOW: Run BUSCO using lineages fetched from GOAT, then run diamond_blastp
+    // SUBWORKFLOW: Run BUSCO using lineages fetched from NCBI, then run diamond_blastp
     //
     BUSCO_DIAMOND (
         PREPARE_GENOME.out.genome,
         ch_taxon,
         ch_busco_db,
+        ch_lineage_tax_ids,
         ch_blastp,
     )
     ch_versions = ch_versions.mix ( BUSCO_DIAMOND.out.versions )

From da7faf9155020d3fb2bda17b3e28310432a28e47 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Tue, 28 May 2024 09:15:08 +0000
Subject: [PATCH 06/30] bugfix: Need all lineage names, not just eukaryotes

---
 ...g_taxids-busco_dataset_name.2019-12-16.txt | 166 ++++++++++++++++++
 ...ataset_name.eukaryota_odb10.2019-12-16.txt |  67 -------
 nextflow.config                               |   4 +-
 nextflow_schema.json                          |   2 +-
 4 files changed, 169 insertions(+), 70 deletions(-)
 create mode 100644 assets/mapping_taxids-busco_dataset_name.2019-12-16.txt
 delete mode 100644 assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt

diff --git a/assets/mapping_taxids-busco_dataset_name.2019-12-16.txt b/assets/mapping_taxids-busco_dataset_name.2019-12-16.txt
new file mode 100644
index 00000000..7105beeb
--- /dev/null
+++ b/assets/mapping_taxids-busco_dataset_name.2019-12-16.txt
@@ -0,0 +1,166 @@
+422676	aconoidasida
+7898	actinopterygii
+5338	agaricales
+155619	agaricomycetes
+33630	alveolata
+5794	apicomplexa
+6854	arachnida
+6656	arthropoda
+4890	ascomycota
+8782	aves
+5204	basidiomycota
+68889	boletales
+3699	brassicales
+134362	capnodiales
+33554	carnivora
+91561	cetartiodactyla
+34395	chaetothyriales
+3041	chlorophyta
+5796	coccidia
+28738	cyprinodontiformes
+7147	diptera
+147541	dothideomycetes
+3193	embryophyta
+33392	endopterygota
+314146	euarchontoglires
+33682	euglenozoa
+2759	eukaryota
+5042	eurotiales
+147545	eurotiomycetes
+9347	eutheria
+72025	fabales
+4751	fungi
+314147	glires
+1028384	glomerellales
+5178	helotiales
+7524	hemiptera
+7399	hymenoptera
+5125	hypocreales
+50557	insecta
+314145	laurasiatheria
+147548	leotiomycetes
+7088	lepidoptera
+4447	liliopsida
+40674	mammalia
+33208	metazoa
+6029	microsporidia
+6447	mollusca
+4827	mucorales
+1913637	mucoromycota
+6231	nematoda
+33183	onygenales
+9126	passeriformes
+5820	plasmodium
+92860	pleosporales
+38820	poales
+5303	polyporales
+9443	primates
+4891	saccharomycetes
+8457	sauropsida
+4069	solanales
+147550	sordariomycetes
+33634	stramenopiles
+32523	tetrapoda
+155616	tremellomycetes
+7742	vertebrata
+33090	viridiplantae
+71240	eudicots
+57723	acidobacteria
+201174	actinobacteria_phylum
+1760	actinobacteria_class
+28211	alphaproteobacteria
+135622	alteromonadales
+200783	aquificae
+1385	bacillales
+91061	bacilli
+2	bacteria
+171549	bacteroidales
+976	bacteroidetes
+68336	bacteroidetes-chlorobi_group
+200643	bacteroidia
+28216	betaproteobacteria
+80840	burkholderiales
+213849	campylobacterales
+1706369	cellvibrionales
+204428	chlamydiae
+1090	chlorobi
+200795	chloroflexi
+135613	chromatiales
+1118	chroococcales
+186801	clostridia
+186802	clostridiales
+84999	coriobacteriales
+84998	coriobacteriia
+85007	corynebacteriales
+1117	cyanobacteria
+768507	cytophagales
+768503	cytophagia
+68525	delta-epsilon-subdivisions
+28221	deltaproteobacteria
+213118	desulfobacterales
+213115	desulfovibrionales
+69541	desulfuromonadales
+91347	enterobacterales
+186328	entomoplasmatales
+29547	epsilonproteobacteria
+1239	firmicutes
+200644	flavobacteriales
+117743	flavobacteriia
+32066	fusobacteria
+203491	fusobacteriales
+1236	gammaproteobacteria
+186826	lactobacillales
+118969	legionellales
+85006	micrococcales
+31969	mollicutes
+2085	mycoplasmatales
+206351	neisseriales
+32003	nitrosomonadales
+1161	nostocales
+135619	oceanospirillales
+1150	oscillatoriales
+135625	pasteurellales
+203682	planctomycetes
+85009	propionibacteriales
+1224	proteobacteria
+72274	pseudomonadales
+356	rhizobiales
+227290	rhizobium-agrobacterium_group
+204455	rhodobacterales
+204441	rhodospirillales
+766	rickettsiales
+909929	selenomonadales
+117747	sphingobacteriia
+204457	sphingomonadales
+136	spirochaetales
+203691	spirochaetes
+203692	spirochaetia
+85011	streptomycetales
+85012	streptosporangiales
+1890424	synechococcales
+508458	synergistetes
+544448	tenericutes
+68295	thermoanaerobacterales
+200918	thermotogae
+72273	thiotrichales
+1737405	tissierellales
+1737404	tissierellia
+74201	verrucomicrobia
+135623	vibrionales
+135614	xanthomonadales
+2157	archaea
+2266	thermoproteales
+2281	sulfolobales
+114380	desulfurococcales
+183967	thermoplasmata
+651137	thaumarchaeota
+2182	methanococcales
+2191	methanomicrobiales
+183925	methanobacteria
+183924	thermoprotei
+2235	halobacteriales
+1644060	natrialbales
+224756	methanomicrobia
+1644055	haloferacales
+183963	halobacteria
+28890	euryarchaeota
diff --git a/assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt b/assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt
deleted file mode 100644
index 5d8af90f..00000000
--- a/assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt
+++ /dev/null
@@ -1,67 +0,0 @@
-422676	aconoidasida
-7898	actinopterygii
-5338	agaricales
-155619	agaricomycetes
-33630	alveolata
-5794	apicomplexa
-6854	arachnida
-6656	arthropoda
-4890	ascomycota
-8782	aves
-5204	basidiomycota
-68889	boletales
-3699	brassicales
-134362	capnodiales
-33554	carnivora
-91561	cetartiodactyla
-34395	chaetothyriales
-3041	chlorophyta
-5796	coccidia
-28738	cyprinodontiformes
-7147	diptera
-147541	dothideomycetes
-3193	embryophyta
-33392	endopterygota
-314146	euarchontoglires
-33682	euglenozoa
-2759	eukaryota
-5042	eurotiales
-147545	eurotiomycetes
-9347	eutheria
-72025	fabales
-4751	fungi
-314147	glires
-1028384	glomerellales
-5178	helotiales
-7524	hemiptera
-7399	hymenoptera
-5125	hypocreales
-50557	insecta
-314145	laurasiatheria
-147548	leotiomycetes
-7088	lepidoptera
-4447	liliopsida
-40674	mammalia
-33208	metazoa
-6029	microsporidia
-6447	mollusca
-4827	mucorales
-1913637	mucoromycota
-6231	nematoda
-33183	onygenales
-9126	passeriformes
-5820	plasmodium
-92860	pleosporales
-38820	poales
-5303	polyporales
-9443	primates
-4891	saccharomycetes
-8457	sauropsida
-4069	solanales
-147550	sordariomycetes
-33634	stramenopiles
-32523	tetrapoda
-155616	tremellomycetes
-7742	vertebrata
-33090	viridiplantae
-71240	eudicots
diff --git a/nextflow.config b/nextflow.config
index 38964a66..1b4db5ed 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -28,8 +28,8 @@ params {
     // Databases and related options
     taxdump                    = null
     busco                      = null
-    // Taken from https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt.tar.gz
-    lineage_tax_ids            = "${projectDir}/assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt"
+    // Taken from https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.*.2019-12-16.txt.tar.gz
+    lineage_tax_ids            = "${projectDir}/assets/mapping_taxids-busco_dataset_name.2019-12-16.txt"
     blastp                     = null
     blastx                     = null
     blastn                     = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 8e7f0fba..529faa7e 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -112,7 +112,7 @@
                     "type": "string",
                     "format": "file-path",
                     "description": "Local file that holds a mapping between BUSCO lineages and taxon IDs.",
-                    "help_text": "Initialised from https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt.tar.gz",
+                    "help_text": "Initialised from https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.*.2019-12-16.txt.tar.gz",
                     "fa_icon": "fas fa-file-code"
                 },
                 "blastp": {

From 9c5b4124039e82f9a5a133de053ce4152e4d8bc0 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Mon, 13 May 2024 11:26:48 +0100
Subject: [PATCH 07/30] Adding a parameter to choose the busco lineages

---
 CHANGELOG.md                               | 10 +++++++
 bin/get_odb_taxon.py                       | 32 +++++++++++++++-------
 modules/local/get_odb_taxon.nf             |  4 ++-
 nextflow.config                            |  1 +
 nextflow_schema.json                       |  6 ++++
 subworkflows/local/busco_diamond_blastp.nf |  2 ++
 workflows/blobtoolkit.nf                   |  4 ++-
 7 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5f8f155d..5ca038f1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,16 @@ The pipeline has now been validated for draft (unpublished) assemblies.
 
 - The pipeline now queries the NCBI database instead of GoaT to establish the
   taxonomic classification of the species and the relevant Busco lineages.
+- New `--busco_lineages` parameter to choose specific Busco lineages instead of
+  automatically selecting based on the taxonomy.
+
+### Parameters
+
+| Old parameter | New parameter    |
+| ------------- | ---------------- |
+|               | --busco_lineages |
+
+> **NB:** Parameter has been **updated** if both old and new parameter information is present. </br> **NB:** Parameter has been **added** if just the new parameter information is present. </br> **NB:** Parameter has been **removed** if new parameter information isn't present.
 
 ### Software dependencies
 
diff --git a/bin/get_odb_taxon.py b/bin/get_odb_taxon.py
index ade35e34..6038122a 100755
--- a/bin/get_odb_taxon.py
+++ b/bin/get_odb_taxon.py
@@ -16,7 +16,8 @@ def parse_args(args=None):
     parser.add_argument("TAXON_QUERY", help="Query string/integer for this taxon.")
     parser.add_argument("LINEAGE_TAX_IDS", help="Mapping between BUSCO lineages and taxon IDs.")
     parser.add_argument("FILE_OUT", help="Output CSV file.")
-    parser.add_argument("--version", action="version", version="%(prog)s 1.0")
+    parser.add_argument("--busco", dest="REQUESTED_BUSCOS", help="Requested BUSCO lineages.", default=None)
+    parser.add_argument("--version", action="version", version="%(prog)s 1.1")
     return parser.parse_args(args)
 
 
@@ -25,23 +26,34 @@ def make_dir(path):
         os.makedirs(path, exist_ok=True)
 
 
-def get_odb(taxon_query, lineage_tax_ids, file_out):
+def get_odb(taxon_query, lineage_tax_ids, requested_buscos, file_out):
     # Read the mapping between the BUSCO lineages and their taxon_id
     with open(lineage_tax_ids) as file_in:
         lineage_tax_ids_dict = {}
         for line in file_in:
             arr = line.split()
-            lineage_tax_ids_dict[int(arr[0])] = arr[1]
+            lineage_tax_ids_dict[int(arr[0])] = arr[1] + "_odb10"
 
     # Using API, get the taxon_ids of the species and all parents
     response = requests.get(NCBI_TAXONOMY_API % taxon_query).json()
+
     this_taxon_id = response["taxonomy_nodes"][0]["taxonomy"]["tax_id"]
-    ancestor_taxon_ids = response["taxonomy_nodes"][0]["taxonomy"]["lineage"]
 
-    # Do the intersection to find the ancestors that have a BUSCO lineage
-    odb_arr = [
-        lineage_tax_ids_dict[taxon_id] for taxon_id in reversed(ancestor_taxon_ids) if taxon_id in lineage_tax_ids_dict
-    ]
+    if requested_buscos:
+        odb_arr = requested_buscos.split(",")
+        valid_odbs = set(lineage_tax_ids_dict.values())
+        for odb in odb_arr:
+            if odb not in valid_odbs:
+                print(f"Invalid BUSCO lineage: {odb}", file=sys.stderr)
+                sys.exit(1)
+    else:
+        # Do the intersection to find the ancestors that have a BUSCO lineage
+        ancestor_taxon_ids = response["taxonomy_nodes"][0]["taxonomy"]["lineage"]
+        odb_arr = [
+            lineage_tax_ids_dict[taxon_id]
+            for taxon_id in reversed(ancestor_taxon_ids)
+            if taxon_id in lineage_tax_ids_dict
+        ]
 
     out_dir = os.path.dirname(file_out)
     make_dir(out_dir)
@@ -49,12 +61,12 @@ def get_odb(taxon_query, lineage_tax_ids, file_out):
     with open(file_out, "w") as fout:
         print("taxon_id", this_taxon_id, sep=",", file=fout)
         for odb_val in odb_arr:
-            print("busco_lineage", odb_val + "_odb10", sep=",", file=fout)
+            print("busco_lineage", odb_val, sep=",", file=fout)
 
 
 def main(args=None):
     args = parse_args(args)
-    get_odb(args.TAXON_QUERY, args.LINEAGE_TAX_IDS, args.FILE_OUT)
+    get_odb(args.TAXON_QUERY, args.LINEAGE_TAX_IDS, args.REQUESTED_BUSCOS, args.FILE_OUT)
 
 
 if __name__ == "__main__":
diff --git a/modules/local/get_odb_taxon.nf b/modules/local/get_odb_taxon.nf
index ff061ea3..8787c276 100644
--- a/modules/local/get_odb_taxon.nf
+++ b/modules/local/get_odb_taxon.nf
@@ -9,6 +9,7 @@ process NCBI_GET_ODB_TAXON {
 
     input:
     tuple val(meta), val(taxon_query)
+    val busco_lin
     path lineage_tax_ids
 
     output:
@@ -21,8 +22,9 @@ process NCBI_GET_ODB_TAXON {
     script:
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
+    def busco_param = busco_lin ? "--busco '${busco_lin}'" : ""
     """
-    get_odb_taxon.py "$taxon_query" $lineage_tax_ids ${prefix}.csv
+    get_odb_taxon.py "$taxon_query" $lineage_tax_ids $busco_param ${prefix}.csv
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/nextflow.config b/nextflow.config
index 1b4db5ed..67e5bd70 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -16,6 +16,7 @@ params {
     align                      = false
     mask                       = false
     fetchngs_samplesheet       = false
+    busco_lineages             = null
 
     // Reference options
     fasta                      = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 529faa7e..0a474234 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -43,6 +43,12 @@
                     "description": "Custom config file for draft assembly",
                     "fa_icon": "fas fa-file-alt"
                 },
+                "busco_lineages": {
+                    "type": "string",
+                    "pattern": "([^,]+_odb10,)*[^,]+_odb10",
+                    "description": "Name of the ODB10 lineages to run BUSCO on. By default, the pipeline will automatically selecy the correct lineage based on the taxonomy.",
+                    "fa_icon": "fas fa-bone"
+                },
                 "image_format": {
                     "type": "string",
                     "enum": ["png", "svg"],
diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf
index 5a20823c..696805ed 100644
--- a/subworkflows/local/busco_diamond_blastp.nf
+++ b/subworkflows/local/busco_diamond_blastp.nf
@@ -13,6 +13,7 @@ workflow BUSCO_DIAMOND {
     take:
     fasta        // channel: [ val(meta), path(fasta) ]
     taxon        // channel: val(taxon)
+    busco_lin    // channel: val([busco_lin])
     busco_db     // channel: path(busco_db)
     lineage_tax_ids        // channel: /path/to/lineage_tax_ids
     blastp       // channel: path(blastp_db)
@@ -27,6 +28,7 @@ workflow BUSCO_DIAMOND {
     //
     NCBI_GET_ODB_TAXON (
         fasta.combine(taxon).map { meta, fasta, taxon -> [ meta, taxon ] },
+        busco_lin,
         lineage_tax_ids,
     )
     ch_versions = ch_versions.mix ( NCBI_GET_ODB_TAXON.out.versions.first() )
diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf
index 7d8a3a75..81b2163e 100644
--- a/workflows/blobtoolkit.nf
+++ b/workflows/blobtoolkit.nf
@@ -30,9 +30,10 @@ if (params.blastn) { ch_blastn = Channel.value([ [ 'id': file(params.blastn).bas
 if (params.taxdump) { ch_taxdump = file(params.taxdump) } else { exit 1, 'NCBI Taxonomy database not specified!' }
 if (params.fetchngs_samplesheet && !params.align) { exit 1, '--align not specified, even though the input samplesheet is a nf-core/fetchngs one - i.e has fastq files!' }
 
-if (params.lineage_tax_ids) { ch_lineage_tax_ids = Channel.fromPath(params.lineage_tax_ids) } else { exit 1, 'Mapping BUSCO lineage <-> taxon_ids not specified' }
+if (params.lineage_tax_ids) { ch_lineage_tax_ids = Channel.fromPath(params.lineage_tax_ids).first() } else { exit 1, 'Mapping BUSCO lineage <-> taxon_ids not specified' }
 
 // Create channel for optional parameters
+if (params.busco_lineages) { ch_busco_lin = Channel.value(params.busco_lineages) } else { ch_busco_lin = Channel.value([]) }
 if (params.busco) { ch_busco_db = Channel.fromPath(params.busco).first() } else { ch_busco_db = Channel.value([]) }
 if (params.yaml) { ch_yaml = Channel.fromPath(params.yaml) } else { ch_yaml = Channel.empty() }
 if (params.yaml && params.accession) { exit 1, '--yaml cannot be provided at the same time as --accession !' }
@@ -130,6 +131,7 @@ workflow BLOBTOOLKIT {
     BUSCO_DIAMOND (
         PREPARE_GENOME.out.genome,
         ch_taxon,
+        ch_busco_lin,
         ch_busco_db,
         ch_lineage_tax_ids,
         ch_blastp,

From cb89c3b9222397566b1368f43bb42b511379262b Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Sat, 18 May 2024 11:36:39 +0100
Subject: [PATCH 08/30] Removed the legacy yaml configuration file

---
 CHANGELOG.md                      |  3 +++
 assets/minimal.yaml               | 21 +++++++++++++++++++++
 docs/usage.md                     | 12 +++++-------
 nextflow.config                   |  1 -
 nextflow_schema.json              |  6 ------
 subworkflows/local/input_check.nf |  7 ++++---
 workflows/blobtoolkit.nf          |  5 +----
 7 files changed, 34 insertions(+), 21 deletions(-)
 create mode 100644 assets/minimal.yaml

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5ca038f1..35c1b810 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,11 +11,14 @@ The pipeline has now been validated for draft (unpublished) assemblies.
   taxonomic classification of the species and the relevant Busco lineages.
 - New `--busco_lineages` parameter to choose specific Busco lineages instead of
   automatically selecting based on the taxonomy.
+- All parameters are now passed the regular Nextflow way. There is no support
+  for the original Yaml configuration files of the Snakemake version.
 
 ### Parameters
 
 | Old parameter | New parameter    |
 | ------------- | ---------------- |
+| --yaml        |                  |
 |               | --busco_lineages |
 
 > **NB:** Parameter has been **updated** if both old and new parameter information is present. </br> **NB:** Parameter has been **added** if just the new parameter information is present. </br> **NB:** Parameter has been **removed** if new parameter information isn't present.
diff --git a/assets/minimal.yaml b/assets/minimal.yaml
new file mode 100644
index 00000000..e2ed4465
--- /dev/null
+++ b/assets/minimal.yaml
@@ -0,0 +1,21 @@
+assembly:
+  level: bar
+settings:
+  stats_windows:
+  - 0.1
+  - 0.01
+  - 100000
+  - 1000000
+similarity:
+  diamond_blastx:
+    foo: 0
+taxon:
+  class: class_name
+  family: family_name
+  genus: genus_name
+  kingdom: kingdom_name
+  name: species_name
+  order: order_name
+  phylum: phylum_name
+  superkingdom: superkingdom_name
+  taxid: 0
diff --git a/docs/usage.md b/docs/usage.md
index 0619d8e0..4d0ad33e 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -162,12 +162,6 @@ If you have [GNU parallel](https://www.gnu.org/software/parallel/) installed, yo
 find v5/data -name "*.tar.gz" | parallel "cd {//}; tar -xzf {/}"
 ```
 
-## YAML File and Nextflow configuration
-
-As in the Snakemake version [a YAML configuration file](https://github.com/blobtoolkit/blobtoolkit/tree/main/src/blobtoolkit-pipeline/src#configuration) is needed to generate metadata summary. This YAML config file can be generated with a genome accession value for released assemblies (for example, GCA_XXXXXXXXX.X) or can be passed for draft assemblies (for example, [GCA_922984935.2.yaml](assets/test/GCA_922984935.2.yaml) using the `--yaml` parameter. Even for draft assemblies, a placeholder value should be passed with the `--accession` parameter.
-
-The data in the YAML is currently ignored in the Nextflow pipeline version. The YAML file is retained only to allow compatibility with the BlobDir dataset generated by the [Snakemake version](https://github.com/blobtoolkit/blobtoolkit/tree/main/src/blobtoolkit-pipeline/src). The taxonomic information in the YAML file can be obtained from [NCBI Taxonomy](https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/).
-
 ## Changes from Snakemake to Nextflow
 
 ### Commands
@@ -189,9 +183,13 @@ Nextflow
 nextflow run sanger-tol/blobtoolkit --input SAMPLESHEET --fasta GENOME –-accession GCA_ACCESSION --taxon TAXON_ID --taxdump TAXDUMP_DB --blastp DMND_db --blastn BLASTN_DB --blastx BLASTX_DB
 
 # Draft Assemblies
-nextflow run sanger-tol/blobtoolkit --input SAMPLESHEET --fasta GENOME –-accession TAG --taxon TAXON_ID --yaml CONFIG --taxdump TAXDUMP_DB --blastp DMND_db --blastn BLASTN_DB --blastx BLASTX_DB
+nextflow run sanger-tol/blobtoolkit --input SAMPLESHEET --fasta GENOME --taxon TAXON_ID --taxdump TAXDUMP_DB --blastp DMND_db --blastn BLASTN_DB --blastx BLASTX_DB
 ```
 
+The Nextflow pipeline does not support taking input from the Yaml files of the Snakemake pipeline.
+Instead, Nextflow has a uniform way of setting input parameters on the command-line or via a JSON / Yaml file,
+see <https://training.nextflow.io/basic_training/config/> for some examples.
+
 ### Subworkflows
 
 Here is a full list of snakemake subworkflows and their Nextflow couterparts:
diff --git a/nextflow.config b/nextflow.config
index 67e5bd70..a1ea3ae9 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -12,7 +12,6 @@ params {
     // Specify your pipeline's command line flags
     // Input options
     input                      = null
-    yaml                       = null
     align                      = false
     mask                       = false
     fetchngs_samplesheet       = false
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 0a474234..cd0b7341 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -37,12 +37,6 @@
                     "description": "Turn on the conversion from a nf-core/fetchngs samplesheet.",
                     "fa_icon": "fas fa-toggle-off"
                 },
-                "yaml": {
-                    "type": "string",
-                    "format": "file-path",
-                    "description": "Custom config file for draft assembly",
-                    "fa_icon": "fas fa-file-alt"
-                },
                 "busco_lineages": {
                     "type": "string",
                     "pattern": "([^,]+_odb10,)*[^,]+_odb10",
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index d498269f..a7a4a017 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -12,7 +12,6 @@ workflow INPUT_CHECK {
     take:
     samplesheet // file: /path/to/samplesheet.csv
     fasta       // channel: [ meta, path(fasta) ]
-    yaml        // channel: [ meta, path(config ]
 
     main:
     ch_versions = Channel.empty()
@@ -58,7 +57,7 @@ workflow INPUT_CHECK {
     | set { reads }
 
 
-    if ( !params.yaml ) {
+    if ( params.accession ) {
         read_files
         | map { meta, data -> meta.id.split("_")[0..-2].join("_") }
         | combine ( fasta )
@@ -70,7 +69,9 @@ workflow INPUT_CHECK {
         ch_versions = ch_versions.mix ( BLOBTOOLKIT_CONFIG.out.versions.first() )
         ch_config = BLOBTOOLKIT_CONFIG.out.yaml
     } else {
-        ch_config   = yaml
+        fasta
+        | map { meta, fa -> [meta, file("${projectDir}/assets/minimal.yaml")] }
+        | set { ch_config }
     }
 
     emit:
diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf
index 81b2163e..a1318967 100644
--- a/workflows/blobtoolkit.nf
+++ b/workflows/blobtoolkit.nf
@@ -35,9 +35,6 @@ if (params.lineage_tax_ids) { ch_lineage_tax_ids = Channel.fromPath(params.linea
 // Create channel for optional parameters
 if (params.busco_lineages) { ch_busco_lin = Channel.value(params.busco_lineages) } else { ch_busco_lin = Channel.value([]) }
 if (params.busco) { ch_busco_db = Channel.fromPath(params.busco).first() } else { ch_busco_db = Channel.value([]) }
-if (params.yaml) { ch_yaml = Channel.fromPath(params.yaml) } else { ch_yaml = Channel.empty() }
-if (params.yaml && params.accession) { exit 1, '--yaml cannot be provided at the same time as --accession !' }
-if (!params.yaml && !params.accession) { exit 1, '--yaml and --accession are both mising. Pick one !' }
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -105,7 +102,7 @@ workflow BLOBTOOLKIT {
     //
     // SUBWORKFLOW: Check samplesheet and create channels for downstream analysis
     //
-    INPUT_CHECK ( ch_input, PREPARE_GENOME.out.genome, ch_yaml )
+    INPUT_CHECK ( ch_input, PREPARE_GENOME.out.genome )
     ch_versions = ch_versions.mix ( INPUT_CHECK.out.versions )
 
     //

From d08ba188e863ee7f2b9dc732275cc5ef8ddbf262 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Sat, 18 May 2024 11:49:21 +0100
Subject: [PATCH 09/30] Generate a yaml file internally

---
 assets/minimal.yaml                        |  21 ---
 bin/generate_config.py                     | 154 +++++++++++++++++++++
 bin/get_odb_taxon.py                       |  73 ----------
 modules/local/generate_config.nf           |  34 +++++
 modules/local/get_odb_taxon.nf             |  34 -----
 subworkflows/local/busco_diamond_blastp.nf |  20 +--
 subworkflows/local/input_check.nf          |  19 ++-
 workflows/blobtoolkit.nf                   |  12 +-
 8 files changed, 215 insertions(+), 152 deletions(-)
 delete mode 100644 assets/minimal.yaml
 create mode 100755 bin/generate_config.py
 delete mode 100755 bin/get_odb_taxon.py
 create mode 100644 modules/local/generate_config.nf
 delete mode 100644 modules/local/get_odb_taxon.nf

diff --git a/assets/minimal.yaml b/assets/minimal.yaml
deleted file mode 100644
index e2ed4465..00000000
--- a/assets/minimal.yaml
+++ /dev/null
@@ -1,21 +0,0 @@
-assembly:
-  level: bar
-settings:
-  stats_windows:
-  - 0.1
-  - 0.01
-  - 100000
-  - 1000000
-similarity:
-  diamond_blastx:
-    foo: 0
-taxon:
-  class: class_name
-  family: family_name
-  genus: genus_name
-  kingdom: kingdom_name
-  name: species_name
-  order: order_name
-  phylum: phylum_name
-  superkingdom: superkingdom_name
-  taxid: 0
diff --git a/bin/generate_config.py b/bin/generate_config.py
new file mode 100755
index 00000000..8127b40a
--- /dev/null
+++ b/bin/generate_config.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+
+import argparse
+import dataclasses
+import os
+import sys
+import requests
+import typing
+import yaml
+
+NCBI_TAXONOMY_API = "https://api.ncbi.nlm.nih.gov/datasets/v1/taxonomy/taxon/%s"
+
+RANKS = [
+    "genus",
+    "family",
+    "order",
+    "class",
+    "phylum",
+    "kingdom",
+    "superkingdom",
+]
+
+STATS_WINDOWS = [
+    0.1,
+    0.01,
+    100000,
+    1000000,
+]
+
+
+def parse_args(args=None):
+    Description = "Produce the various configuration files needed within the pipeline"
+
+    parser = argparse.ArgumentParser(description=Description)
+    parser.add_argument("FASTA", help="Path to the Fasta file of the assembly.")
+    parser.add_argument("TAXON_QUERY", help="Query string/integer for this taxon.")
+    parser.add_argument("LINEAGE_TAX_IDS", help="Mapping between BUSCO lineages and taxon IDs.")
+    parser.add_argument("YML_OUT", help="Output YML file.")
+    parser.add_argument("CSV_OUT", help="Output CSV file.")
+    parser.add_argument("--busco", dest="REQUESTED_BUSCOS", help="Requested BUSCO lineages.", default=None)
+    parser.add_argument("--version", action="version", version="%(prog)s 1.0")
+    return parser.parse_args(args)
+
+
+def make_dir(path):
+    if len(path) > 0:
+        os.makedirs(path, exist_ok=True)
+
+
+@dataclasses.dataclass
+class TaxonInfo:
+    taxon_id: int
+    organism_name: str
+    rank: typing.Optional[str]
+    lineage: typing.List[int]
+
+
+def make_taxon_info(taxon_name: typing.Union[str, int]) -> TaxonInfo:
+    # Using API, get the taxon_ids of the species and all parents
+    response = requests.get(NCBI_TAXONOMY_API % taxon_name).json()
+    body = response["taxonomy_nodes"][0]["taxonomy"]
+    return TaxonInfo(body["tax_id"], body["organism_name"], body.get("rank"), body["lineage"])
+
+
+def get_classification(taxon_info: TaxonInfo) -> typing.Dict[str, str]:
+    ancestors: typing.Dict[str, str] = {}
+    for anc_taxon_id in taxon_info.lineage[1:]:  # skip the root taxon_id=1
+        anc_taxon_info = make_taxon_info(anc_taxon_id)
+        if anc_taxon_info.rank:
+            ancestors[anc_taxon_info.rank.lower()] = anc_taxon_info.organism_name
+    return {r: ancestors[r] for r in RANKS if r in ancestors}
+
+
+def get_odb(taxon_info: TaxonInfo, lineage_tax_ids: str, requested_buscos: typing.Optional[str]) -> typing.List[str]:
+
+    # Read the mapping between the BUSCO lineages and their taxon_id
+    with open(lineage_tax_ids) as file_in:
+        lineage_tax_ids_dict: typing.Dict[int, str] = {}
+        for line in file_in:
+            arr = line.split()
+            lineage_tax_ids_dict[int(arr[0])] = arr[1] + "_odb10"
+
+    if requested_buscos:
+        odb_arr = requested_buscos.split(",")
+        valid_odbs = set(lineage_tax_ids_dict.values())
+        for odb in odb_arr:
+            if odb not in valid_odbs:
+                print(f"Invalid BUSCO lineage: {odb}", file=sys.stderr)
+                sys.exit(1)
+    else:
+        # Do the intersection to find the ancestors that have a BUSCO lineage
+        odb_arr = [
+            lineage_tax_ids_dict[taxon_id]
+            for taxon_id in reversed(taxon_info.lineage)
+            if taxon_id in lineage_tax_ids_dict
+        ]
+
+    return odb_arr
+
+
+def print_yaml(
+    file_out, fasta: str, taxon_info: TaxonInfo, classification: typing.Dict[str, str], odb_arr: typing.List[str]
+):
+
+    data = {
+        "assembly": {
+            "file": fasta,
+            "level": "scaffold",
+        },
+        "settings": {
+            "stats_windows": STATS_WINDOWS,
+        },
+        "similarity": {
+            "diamond_blastx": {
+                "foo": 0,
+            },
+        },
+        "taxon": {
+            "taxid": str(taxon_info.taxon_id),  # str on purpose, to mimic blobtoolkit
+            "name": taxon_info.organism_name,
+            **classification,
+        },
+    }
+    out_dir = os.path.dirname(file_out)
+    make_dir(out_dir)
+
+    with open(file_out, "w") as fout:
+        yaml.dump(data, fout)
+
+
+def print_csv(file_out, taxon_info: TaxonInfo, odb_arr: typing.List[str]):
+
+    out_dir = os.path.dirname(file_out)
+    make_dir(out_dir)
+
+    with open(file_out, "w") as fout:
+        print("taxon_id", taxon_info.taxon_id, sep=",", file=fout)
+        for odb_val in odb_arr:
+            print("busco_lineage", odb_val, sep=",", file=fout)
+
+
+def main(args=None):
+    args = parse_args(args)
+
+    taxon_info = make_taxon_info(args.TAXON_QUERY)
+    classification = get_classification(taxon_info)
+    odb_arr = get_odb(taxon_info, args.LINEAGE_TAX_IDS, args.REQUESTED_BUSCOS)
+
+    print_yaml(args.YML_OUT, args.FASTA, taxon_info, classification, odb_arr)
+    print_csv(args.CSV_OUT, taxon_info, odb_arr)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/bin/get_odb_taxon.py b/bin/get_odb_taxon.py
deleted file mode 100755
index 6038122a..00000000
--- a/bin/get_odb_taxon.py
+++ /dev/null
@@ -1,73 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import os
-import sys
-import requests
-
-
-NCBI_TAXONOMY_API = "https://api.ncbi.nlm.nih.gov/datasets/v1/taxonomy/taxon/%s"
-
-
-def parse_args(args=None):
-    Description = "Get Taxon ID and ODB database value using NCBI API and BUSCO configuration file"
-
-    parser = argparse.ArgumentParser(description=Description)
-    parser.add_argument("TAXON_QUERY", help="Query string/integer for this taxon.")
-    parser.add_argument("LINEAGE_TAX_IDS", help="Mapping between BUSCO lineages and taxon IDs.")
-    parser.add_argument("FILE_OUT", help="Output CSV file.")
-    parser.add_argument("--busco", dest="REQUESTED_BUSCOS", help="Requested BUSCO lineages.", default=None)
-    parser.add_argument("--version", action="version", version="%(prog)s 1.1")
-    return parser.parse_args(args)
-
-
-def make_dir(path):
-    if len(path) > 0:
-        os.makedirs(path, exist_ok=True)
-
-
-def get_odb(taxon_query, lineage_tax_ids, requested_buscos, file_out):
-    # Read the mapping between the BUSCO lineages and their taxon_id
-    with open(lineage_tax_ids) as file_in:
-        lineage_tax_ids_dict = {}
-        for line in file_in:
-            arr = line.split()
-            lineage_tax_ids_dict[int(arr[0])] = arr[1] + "_odb10"
-
-    # Using API, get the taxon_ids of the species and all parents
-    response = requests.get(NCBI_TAXONOMY_API % taxon_query).json()
-
-    this_taxon_id = response["taxonomy_nodes"][0]["taxonomy"]["tax_id"]
-
-    if requested_buscos:
-        odb_arr = requested_buscos.split(",")
-        valid_odbs = set(lineage_tax_ids_dict.values())
-        for odb in odb_arr:
-            if odb not in valid_odbs:
-                print(f"Invalid BUSCO lineage: {odb}", file=sys.stderr)
-                sys.exit(1)
-    else:
-        # Do the intersection to find the ancestors that have a BUSCO lineage
-        ancestor_taxon_ids = response["taxonomy_nodes"][0]["taxonomy"]["lineage"]
-        odb_arr = [
-            lineage_tax_ids_dict[taxon_id]
-            for taxon_id in reversed(ancestor_taxon_ids)
-            if taxon_id in lineage_tax_ids_dict
-        ]
-
-    out_dir = os.path.dirname(file_out)
-    make_dir(out_dir)
-
-    with open(file_out, "w") as fout:
-        print("taxon_id", this_taxon_id, sep=",", file=fout)
-        for odb_val in odb_arr:
-            print("busco_lineage", odb_val, sep=",", file=fout)
-
-
-def main(args=None):
-    args = parse_args(args)
-    get_odb(args.TAXON_QUERY, args.LINEAGE_TAX_IDS, args.REQUESTED_BUSCOS, args.FILE_OUT)
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/modules/local/generate_config.nf b/modules/local/generate_config.nf
new file mode 100644
index 00000000..8e8478e8
--- /dev/null
+++ b/modules/local/generate_config.nf
@@ -0,0 +1,34 @@
+process GENERATE_CONFIG {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "conda-forge::requests=2.28.1 conda-forge::pyyaml=6.0"
+    container "docker.io/genomehubs/blobtoolkit:4.3.9"
+
+    input:
+    tuple val(meta), val(fasta)
+    val taxon_query
+    val busco_lin
+    path lineage_tax_ids
+
+    output:
+    tuple val(meta), path("*.yaml"), emit: yaml
+    tuple val(meta), path("*.csv") , emit: csv
+    path "versions.yml"            , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def busco_param = busco_lin ? "--busco '${busco_lin}'" : ""
+    """
+    generate_config.py $fasta "$taxon_query" $lineage_tax_ids $busco_param ${prefix}.yaml ${prefix}.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        generate_config.py: \$(generate_config.py --version | cut -d' ' -f2)
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/get_odb_taxon.nf b/modules/local/get_odb_taxon.nf
deleted file mode 100644
index 8787c276..00000000
--- a/modules/local/get_odb_taxon.nf
+++ /dev/null
@@ -1,34 +0,0 @@
-process NCBI_GET_ODB_TAXON {
-    tag "$meta.id"
-    label 'process_single'
-
-    conda "conda-forge::requests=2.26.0"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/requests:2.26.0':
-        'biocontainers/requests:2.26.0' }"
-
-    input:
-    tuple val(meta), val(taxon_query)
-    val busco_lin
-    path lineage_tax_ids
-
-    output:
-    tuple val(meta), path("*.csv"), emit: csv
-    path "versions.yml"           , emit: versions
-
-    when:
-    task.ext.when == null || task.ext.when
-
-    script:
-    def args = task.ext.args ?: ''
-    def prefix = task.ext.prefix ?: "${meta.id}"
-    def busco_param = busco_lin ? "--busco '${busco_lin}'" : ""
-    """
-    get_odb_taxon.py "$taxon_query" $lineage_tax_ids $busco_param ${prefix}.csv
-
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        get_odb_taxon.py: \$(get_odb.py --version | cut -d' ' -f2)
-    END_VERSIONS
-    """
-}
diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf
index 696805ed..0d084f87 100644
--- a/subworkflows/local/busco_diamond_blastp.nf
+++ b/subworkflows/local/busco_diamond_blastp.nf
@@ -2,7 +2,6 @@
 // Run BUSCO for a genome and runs diamond_blastp
 //
 
-include { NCBI_GET_ODB_TAXON        } from '../../modules/local/get_odb_taxon'
 include { BUSCO                     } from '../../modules/nf-core/busco/main'
 include { BLOBTOOLKIT_EXTRACTBUSCOS } from '../../modules/local/blobtoolkit/extractbuscos'
 include { DIAMOND_BLASTP            } from '../../modules/nf-core/diamond/blastp/main'
@@ -12,10 +11,8 @@ include { RESTRUCTUREBUSCODIR       } from '../../modules/local/restructurebusco
 workflow BUSCO_DIAMOND {
     take:
     fasta        // channel: [ val(meta), path(fasta) ]
-    taxon        // channel: val(taxon)
-    busco_lin    // channel: val([busco_lin])
+    csv          // channel: [ val(meta), path(csv) ]
     busco_db     // channel: path(busco_db)
-    lineage_tax_ids        // channel: /path/to/lineage_tax_ids
     blastp       // channel: path(blastp_db)
 
 
@@ -23,21 +20,10 @@ workflow BUSCO_DIAMOND {
     ch_versions = Channel.empty()
 
 
-    //
-    // Fetch BUSCO lineages for taxon
-    //
-    NCBI_GET_ODB_TAXON (
-        fasta.combine(taxon).map { meta, fasta, taxon -> [ meta, taxon ] },
-        busco_lin,
-        lineage_tax_ids,
-    )
-    ch_versions = ch_versions.mix ( NCBI_GET_ODB_TAXON.out.versions.first() )
-
-
     //
     // Get NCBI species ID
     //
-    NCBI_GET_ODB_TAXON.out.csv
+    csv
     | map { meta, csv -> csv }
     | splitCsv(header: ['key', 'value'])
     | filter { it.key == "taxon_id" }
@@ -52,7 +38,7 @@ workflow BUSCO_DIAMOND {
     basal_lineages = [ "eukaryota_odb10", "bacteria_odb10", "archaea_odb10" ]
     def lineage_position = 0
     // 1. Parse the NCBI_GET_ODB_TAXON output
-    NCBI_GET_ODB_TAXON.out.csv
+    csv
     | map { meta, csv -> csv }
     | splitCsv(header: ['key', 'value'])
     | filter { it.key == "busco_lineage" }
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index a7a4a017..466c283e 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -7,11 +7,15 @@ include { SAMTOOLS_FLAGSTAT         } from '../../modules/nf-core/samtools/flags
 include { SAMPLESHEET_CHECK         } from '../../modules/local/samplesheet_check'
 include { FETCHNGSSAMPLESHEET_CHECK } from '../../modules/local/fetchngssamplesheet_check'
 include { BLOBTOOLKIT_CONFIG        } from '../../modules/local/blobtoolkit/config'
+include { GENERATE_CONFIG           } from '../../modules/local/generate_config'
 
 workflow INPUT_CHECK {
     take:
     samplesheet // file: /path/to/samplesheet.csv
     fasta       // channel: [ meta, path(fasta) ]
+    taxon       // channel: val(taxon)
+    busco_lin   // channel: val([busco_lin])
+    lineage_tax_ids        // channel: /path/to/lineage_tax_ids
 
     main:
     ch_versions = Channel.empty()
@@ -57,6 +61,15 @@ workflow INPUT_CHECK {
     | set { reads }
 
 
+    GENERATE_CONFIG (
+        fasta,
+        taxon,
+        busco_lin,
+        lineage_tax_ids,
+    )
+    ch_versions = ch_versions.mix(GENERATE_CONFIG.out.versions.first())
+
+
     if ( params.accession ) {
         read_files
         | map { meta, data -> meta.id.split("_")[0..-2].join("_") }
@@ -68,15 +81,15 @@ workflow INPUT_CHECK {
         BLOBTOOLKIT_CONFIG ( grouped_reads, fasta )
         ch_versions = ch_versions.mix ( BLOBTOOLKIT_CONFIG.out.versions.first() )
         ch_config = BLOBTOOLKIT_CONFIG.out.yaml
+
     } else {
-        fasta
-        | map { meta, fa -> [meta, file("${projectDir}/assets/minimal.yaml")] }
-        | set { ch_config }
+        ch_config = GENERATE_CONFIG.out.yaml
     }
 
     emit:
     reads                                   // channel: [ val(meta), path(datafile) ]
     config = ch_config                      // channel: [ val(meta), path(yaml) ]
+    csv_params = GENERATE_CONFIG.out.csv    // channel: [ val(meta), path(csv) ]
     versions = ch_versions                  // channel: [ versions.yml ]
 }
 
diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf
index a1318967..098dfccd 100644
--- a/workflows/blobtoolkit.nf
+++ b/workflows/blobtoolkit.nf
@@ -102,7 +102,13 @@ workflow BLOBTOOLKIT {
     //
     // SUBWORKFLOW: Check samplesheet and create channels for downstream analysis
     //
-    INPUT_CHECK ( ch_input, PREPARE_GENOME.out.genome )
+    INPUT_CHECK (
+        ch_input,
+        PREPARE_GENOME.out.genome,
+        ch_taxon,
+        ch_busco_lin,
+        ch_lineage_tax_ids,
+    )
     ch_versions = ch_versions.mix ( INPUT_CHECK.out.versions )
 
     //
@@ -127,10 +133,8 @@ workflow BLOBTOOLKIT {
     //
     BUSCO_DIAMOND (
         PREPARE_GENOME.out.genome,
-        ch_taxon,
-        ch_busco_lin,
+        INPUT_CHECK.out.csv_params,
         ch_busco_db,
-        ch_lineage_tax_ids,
         ch_blastp,
     )
     ch_versions = ch_versions.mix ( BUSCO_DIAMOND.out.versions )

From 4e79fb011965bfa7f2fea5f5b9318f9c7fd3125f Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Sat, 18 May 2024 11:52:18 +0100
Subject: [PATCH 10/30] Hide the CSV from the rest of the pipeline

---
 subworkflows/local/busco_diamond_blastp.nf | 26 ++++-------------
 subworkflows/local/input_check.nf          | 34 +++++++++++++++++++++-
 workflows/blobtoolkit.nf                   |  7 +++--
 3 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf
index 0d084f87..4b07723e 100644
--- a/subworkflows/local/busco_diamond_blastp.nf
+++ b/subworkflows/local/busco_diamond_blastp.nf
@@ -11,39 +11,24 @@ include { RESTRUCTUREBUSCODIR       } from '../../modules/local/restructurebusco
 workflow BUSCO_DIAMOND {
     take:
     fasta        // channel: [ val(meta), path(fasta) ]
-    csv          // channel: [ val(meta), path(csv) ]
+    busco_lin    // channel: val([busco_lineages])
     busco_db     // channel: path(busco_db)
     blastp       // channel: path(blastp_db)
+    taxon_id     // channel: val(taxon_id)
 
 
     main:
     ch_versions = Channel.empty()
 
 
-    //
-    // Get NCBI species ID
-    //
-    csv
-    | map { meta, csv -> csv }
-    | splitCsv(header: ['key', 'value'])
-    | filter { it.key == "taxon_id" }
-    | map { it.value }
-    | set { ch_taxid }
-
-
     //
     // Prepare the BUSCO linages
     //
     // 0. Initialise sone variables
     basal_lineages = [ "eukaryota_odb10", "bacteria_odb10", "archaea_odb10" ]
     def lineage_position = 0
-    // 1. Parse the NCBI_GET_ODB_TAXON output
-    csv
-    | map { meta, csv -> csv }
-    | splitCsv(header: ['key', 'value'])
-    | filter { it.key == "busco_lineage" }
-    | map { it.value }
-    | collect
+    // 1. Start from the taxon's lineages
+    busco_lin
     // 2. Add the (missing) basal lineages
     | map { lineages -> (lineages + basal_lineages).unique() }
     | flatten ()
@@ -108,7 +93,7 @@ workflow BUSCO_DIAMOND {
     // Hardcoded to match the format expected by blobtools
     def outext = 'txt'
     def cols   = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore'
-    DIAMOND_BLASTP ( ch_busco_genes, blastp, outext, cols, ch_taxid )
+    DIAMOND_BLASTP ( ch_busco_genes, blastp, outext, cols, taxon_id )
     ch_versions = ch_versions.mix ( DIAMOND_BLASTP.out.versions.first() )
 
 
@@ -139,7 +124,6 @@ workflow BUSCO_DIAMOND {
     first_table = ch_first_table          // channel: [ val(meta), path(full_table) ]
     all_tables  = ch_indexed_buscos       // channel: [ val(meta), path(full_tables) ]
     blastp_txt  = DIAMOND_BLASTP.out.txt  // channel: [ val(meta), path(txt) ]
-    taxon_id    = ch_taxid                // channel: taxon_id
     multiqc                               // channel: [ meta, summary ]
     versions    = ch_versions             // channel: [ versions.yml ]
 }
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 466c283e..7da0d1b7 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -70,6 +70,37 @@ workflow INPUT_CHECK {
     ch_versions = ch_versions.mix(GENERATE_CONFIG.out.versions.first())
 
 
+    //
+    // Parse the CSV file
+    //
+    GENERATE_CONFIG.out.csv
+    | map { meta, csv -> csv }
+    | splitCsv(header: ['key', 'value'])
+    | branch {
+        taxon_id: it.key == "taxon_id"
+                    return it.value
+        busco_lineage: it.key == "busco_lineage"
+                    return it.value
+    }
+    | set { ch_parsed_csv }
+
+
+    //
+    // Get the taxon ID
+    //
+    ch_parsed_csv.taxon_id
+    | first
+    | set { ch_taxon_id }
+
+
+    //
+    // Get the BUSCO linages
+    //
+    ch_parsed_csv.busco_lineage
+    | collect
+    | set { ch_busco_lineages }
+
+
     if ( params.accession ) {
         read_files
         | map { meta, data -> meta.id.split("_")[0..-2].join("_") }
@@ -89,7 +120,8 @@ workflow INPUT_CHECK {
     emit:
     reads                                   // channel: [ val(meta), path(datafile) ]
     config = ch_config                      // channel: [ val(meta), path(yaml) ]
-    csv_params = GENERATE_CONFIG.out.csv    // channel: [ val(meta), path(csv) ]
+    taxon_id = ch_taxon_id                  // channel: val(taxon_id)
+    busco_lineages = ch_busco_lineages      // channel: val([busco_lin])
     versions = ch_versions                  // channel: [ versions.yml ]
 }
 
diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf
index 098dfccd..4c557af4 100644
--- a/workflows/blobtoolkit.nf
+++ b/workflows/blobtoolkit.nf
@@ -133,9 +133,10 @@ workflow BLOBTOOLKIT {
     //
     BUSCO_DIAMOND (
         PREPARE_GENOME.out.genome,
-        INPUT_CHECK.out.csv_params,
+        INPUT_CHECK.out.busco_lineages,
         ch_busco_db,
         ch_blastp,
+        INPUT_CHECK.out.taxon_id,
     )
     ch_versions = ch_versions.mix ( BUSCO_DIAMOND.out.versions )
 
@@ -146,7 +147,7 @@ workflow BLOBTOOLKIT {
         PREPARE_GENOME.out.genome,
         BUSCO_DIAMOND.out.first_table,
         ch_blastx,
-        BUSCO_DIAMOND.out.taxon_id,
+        INPUT_CHECK.out.taxon_id,
     )
     ch_versions = ch_versions.mix ( RUN_BLASTX.out.versions )
 
@@ -158,7 +159,7 @@ workflow BLOBTOOLKIT {
         RUN_BLASTX.out.blastx_out,
         PREPARE_GENOME.out.genome,
         ch_blastn,
-        BUSCO_DIAMOND.out.taxon_id,
+        INPUT_CHECK.out.taxon_id,
     )
 
     //

From 4c7cef6f782b3e22fe4310701443dbcf5384ac4c Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Mon, 20 May 2024 15:21:32 +0100
Subject: [PATCH 11/30] Generate a more complete yaml to match the one we get
 from blobtoolkit

---
 bin/generate_config.py | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/bin/generate_config.py b/bin/generate_config.py
index 8127b40a..da2fd278 100755
--- a/bin/generate_config.py
+++ b/bin/generate_config.py
@@ -20,12 +20,7 @@
     "superkingdom",
 ]
 
-STATS_WINDOWS = [
-    0.1,
-    0.01,
-    100000,
-    1000000,
-]
+BUSCO_BASAL_LINEAGES = ["eukaryota_odb10", "bacteria_odb10", "archaea_odb10"]
 
 
 def parse_args(args=None):
@@ -38,7 +33,7 @@ def parse_args(args=None):
     parser.add_argument("YML_OUT", help="Output YML file.")
     parser.add_argument("CSV_OUT", help="Output CSV file.")
     parser.add_argument("--busco", dest="REQUESTED_BUSCOS", help="Requested BUSCO lineages.", default=None)
-    parser.add_argument("--version", action="version", version="%(prog)s 1.0")
+    parser.add_argument("--version", action="version", version="%(prog)s 1.1")
     return parser.parse_args(args)
 
 
@@ -104,15 +99,38 @@ def print_yaml(
 
     data = {
         "assembly": {
+            # Other attributes can't be filled in for draft assemblies
             "file": fasta,
             "level": "scaffold",
         },
+        "busco": {
+            "basal_lineages": BUSCO_BASAL_LINEAGES,
+            "lineages": odb_arr + [lin for lin in BUSCO_BASAL_LINEAGES if lin not in odb_arr],
+        },
+        "revision": 1,
         "settings": {
-            "stats_windows": STATS_WINDOWS,
+            # Only settings.stats_windows is mandatory, everything else is superfluous
+            "blast_chunk": 100000,
+            "blast_max_chunks": 10,
+            "blast_min_length": 1000,
+            "blast_overlap": 0,
+            "stats_chunk": 1000,
+            "stats_windows": [0.1, 0.01, 100000, 1000000],
+            "tmp": "/tmp",
         },
         "similarity": {
+            # Only the presence similarity.diamond_blastx seems mandatory, everything else is superfluous
+            "blastn": {
+                "name": "nt",
+            },
+            "defaults": {"evalue": 1e-10, "import_evalue": 1e-25, "max_target_seqs": 10, "taxrule": "buscogenes"},
+            "diamond_blastp": {
+                "import_max_target_seqs": 100000,
+                "name": "reference_proteomes",
+                "taxrule": "blastp=buscogenes",
+            },
             "diamond_blastx": {
-                "foo": 0,
+                "name": "reference_proteomes",
             },
         },
         "taxon": {
@@ -120,6 +138,7 @@ def print_yaml(
             "name": taxon_info.organism_name,
             **classification,
         },
+        "version": 2,
     }
     out_dir = os.path.dirname(file_out)
     make_dir(out_dir)

From eb871935480ea6fea82b4c12e4afa3301f3b8299 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Mon, 20 May 2024 15:22:25 +0100
Subject: [PATCH 12/30] Update the database paths in the final meta.json

---
 bin/generate_config.py                  |  6 ++++++
 bin/update_versions.py                  | 21 ++++++++++++++++-----
 modules/local/blobtoolkit/updatemeta.nf |  9 +++++++++
 subworkflows/local/finalise_blobdir.nf  |  9 ++++++++-
 4 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/bin/generate_config.py b/bin/generate_config.py
index da2fd278..e8a40f33 100755
--- a/bin/generate_config.py
+++ b/bin/generate_config.py
@@ -105,8 +105,10 @@ def print_yaml(
         },
         "busco": {
             "basal_lineages": BUSCO_BASAL_LINEAGES,
+            # "download_dir": <completely skipped because missing from final meta.json>
             "lineages": odb_arr + [lin for lin in BUSCO_BASAL_LINEAGES if lin not in odb_arr],
         },
+        # TODO: reads
         "revision": 1,
         "settings": {
             # Only settings.stats_windows is mandatory, everything else is superfluous
@@ -116,21 +118,25 @@ def print_yaml(
             "blast_overlap": 0,
             "stats_chunk": 1000,
             "stats_windows": [0.1, 0.01, 100000, 1000000],
+            # "taxdump": <added by update_versions.py at the end>,
             "tmp": "/tmp",
         },
         "similarity": {
             # Only the presence similarity.diamond_blastx seems mandatory, everything else is superfluous
             "blastn": {
                 "name": "nt",
+                # "path": <added by update_versions.py at the end>,
             },
             "defaults": {"evalue": 1e-10, "import_evalue": 1e-25, "max_target_seqs": 10, "taxrule": "buscogenes"},
             "diamond_blastp": {
                 "import_max_target_seqs": 100000,
                 "name": "reference_proteomes",
+                # "path": <added by update_versions.py at the end>,
                 "taxrule": "blastp=buscogenes",
             },
             "diamond_blastx": {
                 "name": "reference_proteomes",
+                # "path": <added by update_versions.py at the end>,
             },
         },
         "taxon": {
diff --git a/bin/update_versions.py b/bin/update_versions.py
index 9e014b46..f9c5f4b5 100755
--- a/bin/update_versions.py
+++ b/bin/update_versions.py
@@ -15,15 +15,19 @@ def parse_args(args=None):
     parser.add_argument("--meta_in", help="Input JSON file.", required=True)
     parser.add_argument("--meta_out", help="Output JSON file.", required=True)
     parser.add_argument("--software", help="Input YAML file.", required=True)
-    parser.add_argument("--version", action="version", version="%(prog)s 1.1.0")
+    parser.add_argument("--blastp", help="Path to the blastp database", required=True)
+    parser.add_argument("--blastx", help="Path to the blastx database", required=True)
+    parser.add_argument("--blastn", help="Path to the blastn database", required=True)
+    parser.add_argument("--taxdump", help="Path to the taxonomy database", required=True)
+    parser.add_argument("--version", action="version", version="%(prog)s 1.2.0")
     return parser.parse_args(args)
 
 
-def update_meta(meta, software):
-    with open(meta) as fh:
+def update_meta(args):
+    with open(args.meta_in) as fh:
         infile = json.load(fh)
 
-    with open(software) as fh:
+    with open(args.software) as fh:
         versions = yaml.safe_load(fh)
 
     new_dict = dict()
@@ -36,13 +40,20 @@ def update_meta(meta, software):
     del new_dict["sanger-tol/blobtoolkit"]
     infile["settings"]["software_versions"] = new_dict
 
+    infile["settings"]["taxdump"] = args.taxdump
+    for k in ["blastn", "diamond_blastp", "diamond_blastx"]:
+        infile["similarity"].setdefault(k, {})
+    infile["similarity"]["blastn"]["path"] = args.blastn
+    infile["similarity"]["diamond_blastp"]["path"] = args.blastp
+    infile["similarity"]["diamond_blastx"]["path"] = args.blastx
+
     return infile
 
 
 def main(args=None):
     args = parse_args(args)
 
-    data = update_meta(args.meta_in, args.software)
+    data = update_meta(args)
     with open(args.meta_out, "w") as fh:
         json.dump(data, fh)
 
diff --git a/modules/local/blobtoolkit/updatemeta.nf b/modules/local/blobtoolkit/updatemeta.nf
index a5556348..f39bee85 100644
--- a/modules/local/blobtoolkit/updatemeta.nf
+++ b/modules/local/blobtoolkit/updatemeta.nf
@@ -10,6 +10,11 @@ process BLOBTOOLKIT_UPDATEMETA {
     input:
     tuple val(meta), path(input)
     path versions
+    // The following are passed as "val" because we just want to know the full paths. No staging necessary
+    val blastp
+    val blastx
+    val blastn
+    val taxdump
 
     output:
     tuple val(meta), path("*.json"), emit: json
@@ -26,6 +31,10 @@ process BLOBTOOLKIT_UPDATEMETA {
         ${args} \\
         --meta_in ${input}/meta.json \\
         --software ${versions} \\
+        --blastp ${blastp} \\
+        --blastx ${blastx} \\
+        --blastn ${blastn} \\
+        --taxdump ${taxdump} \\
         --meta_out ${prefix}.meta.json
 
     cat <<-END_VERSIONS > versions.yml
diff --git a/subworkflows/local/finalise_blobdir.nf b/subworkflows/local/finalise_blobdir.nf
index ffbbd534..a993c705 100644
--- a/subworkflows/local/finalise_blobdir.nf
+++ b/subworkflows/local/finalise_blobdir.nf
@@ -18,7 +18,14 @@ workflow FINALISE_BLOBDIR {
     //
     // MODULE: Update meta json file
     //
-    BLOBTOOLKIT_UPDATEMETA ( blobdir, software )
+    BLOBTOOLKIT_UPDATEMETA (
+        blobdir,
+        software,
+        params.blastp,
+        params.blastx,
+        params.blastn,
+        params.taxdump,
+    )
 
     //
     // MODULE: Compress all the json files

From f4c82fb02951d182aa45735ac7a3de677546dfe2 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Mon, 20 May 2024 16:05:07 +0100
Subject: [PATCH 13/30] Fill in the reads too

---
 bin/generate_config.py                  |  2 +-
 bin/update_versions.py                  | 21 +++++++++++++++++++++
 modules/local/blobtoolkit/updatemeta.nf |  3 +++
 subworkflows/local/finalise_blobdir.nf  |  2 ++
 workflows/blobtoolkit.nf                |  1 +
 5 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/bin/generate_config.py b/bin/generate_config.py
index e8a40f33..ed5868e5 100755
--- a/bin/generate_config.py
+++ b/bin/generate_config.py
@@ -108,7 +108,7 @@ def print_yaml(
             # "download_dir": <completely skipped because missing from final meta.json>
             "lineages": odb_arr + [lin for lin in BUSCO_BASAL_LINEAGES if lin not in odb_arr],
         },
-        # TODO: reads
+        # "reads": {}, <added by update_versions.py at the end>
         "revision": 1,
         "settings": {
             # Only settings.stats_windows is mandatory, everything else is superfluous
diff --git a/bin/update_versions.py b/bin/update_versions.py
index f9c5f4b5..1d3491a3 100755
--- a/bin/update_versions.py
+++ b/bin/update_versions.py
@@ -15,6 +15,9 @@ def parse_args(args=None):
     parser.add_argument("--meta_in", help="Input JSON file.", required=True)
     parser.add_argument("--meta_out", help="Output JSON file.", required=True)
     parser.add_argument("--software", help="Input YAML file.", required=True)
+    parser.add_argument("--read_id", action="append", help="ID of a read set")
+    parser.add_argument("--read_type", action="append", help="Type of a read set")
+    parser.add_argument("--read_path", action="append", help="Path of a read set")
     parser.add_argument("--blastp", help="Path to the blastp database", required=True)
     parser.add_argument("--blastx", help="Path to the blastx database", required=True)
     parser.add_argument("--blastn", help="Path to the blastn database", required=True)
@@ -23,6 +26,17 @@ def parse_args(args=None):
     return parser.parse_args(args)
 
 
+def datatype_to_platform(s):
+    if s == "ont":
+        return "OXFORD_NANOPORE"
+    elif s.startswith("pacbio"):
+        return "PACBIO_SMRT"
+    elif s in ["hic", "illumina"]:
+        return "ILLUMINA"
+    else:
+        return "OTHER"
+
+
 def update_meta(args):
     with open(args.meta_in) as fh:
         infile = json.load(fh)
@@ -47,6 +61,13 @@ def update_meta(args):
     infile["similarity"]["diamond_blastp"]["path"] = args.blastp
     infile["similarity"]["diamond_blastx"]["path"] = args.blastx
 
+    infile["reads"] = {}
+    for id, datatype, path in zip(args.read_id, args.read_type, args.read_path):
+        infile["reads"][id] = {
+            "file": path,
+            "plaform": datatype_to_platform(datatype),
+        }
+
     return infile
 
 
diff --git a/modules/local/blobtoolkit/updatemeta.nf b/modules/local/blobtoolkit/updatemeta.nf
index f39bee85..0c3aaaaf 100644
--- a/modules/local/blobtoolkit/updatemeta.nf
+++ b/modules/local/blobtoolkit/updatemeta.nf
@@ -9,6 +9,7 @@ process BLOBTOOLKIT_UPDATEMETA {
 
     input:
     tuple val(meta), path(input)
+    val reads
     path versions
     // The following are passed as "val" because we just want to know the full paths. No staging necessary
     val blastp
@@ -26,6 +27,7 @@ process BLOBTOOLKIT_UPDATEMETA {
     script:
     def args = task.ext.args ?: ''
     prefix = task.ext.prefix ?: "${meta.id}"
+    def input_reads = reads.collect{"--read_id ${it[0].id} --read_type ${it[0].datatype} --read_path ${it[1]}"}.join(' ')
     """
     update_versions.py \\
         ${args} \\
@@ -35,6 +37,7 @@ process BLOBTOOLKIT_UPDATEMETA {
         --blastx ${blastx} \\
         --blastn ${blastn} \\
         --taxdump ${taxdump} \\
+        $input_reads \\
         --meta_out ${prefix}.meta.json
 
     cat <<-END_VERSIONS > versions.yml
diff --git a/subworkflows/local/finalise_blobdir.nf b/subworkflows/local/finalise_blobdir.nf
index a993c705..352cb01b 100644
--- a/subworkflows/local/finalise_blobdir.nf
+++ b/subworkflows/local/finalise_blobdir.nf
@@ -8,6 +8,7 @@ include { COMPRESSBLOBDIR        } from '../../modules/local/compressblobdir'
 workflow FINALISE_BLOBDIR {
     take:
     blobdir     // channel: [ val(meta), path(blobdir) ]
+    reads       // channel: [ [meta, reads] ]
     software    // channel: [ val(meta), path(software_yml) ]
     summary     // channel: [ val(meta), path(summary_json) ]
 
@@ -20,6 +21,7 @@ workflow FINALISE_BLOBDIR {
     //
     BLOBTOOLKIT_UPDATEMETA (
         blobdir,
+        reads,
         software,
         params.blastp,
         params.blastx,
diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf
index 4c557af4..d608803a 100644
--- a/workflows/blobtoolkit.nf
+++ b/workflows/blobtoolkit.nf
@@ -206,6 +206,7 @@ workflow BLOBTOOLKIT {
     //
     FINALISE_BLOBDIR (
         BLOBTOOLS.out.blobdir,
+        INPUT_CHECK.out.reads.collect(flat: false).ifEmpty([]),
         CUSTOM_DUMPSOFTWAREVERSIONS.out.yml,
         VIEW.out.summary
     )

From a28690faca5f25087163203903c7c5b3a558833e Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Mon, 20 May 2024 17:22:16 +0100
Subject: [PATCH 14/30] Fill in the assembly information too

---
 bin/generate_config.py           | 48 ++++++++++++++++++++++++++------
 modules/local/generate_config.nf |  3 +-
 2 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/bin/generate_config.py b/bin/generate_config.py
index ed5868e5..cc95dbd9 100755
--- a/bin/generate_config.py
+++ b/bin/generate_config.py
@@ -9,6 +9,7 @@
 import yaml
 
 NCBI_TAXONOMY_API = "https://api.ncbi.nlm.nih.gov/datasets/v1/taxonomy/taxon/%s"
+NCBI_DATASETS_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/%s/dataset_report"
 
 RANKS = [
     "genus",
@@ -32,8 +33,11 @@ def parse_args(args=None):
     parser.add_argument("LINEAGE_TAX_IDS", help="Mapping between BUSCO lineages and taxon IDs.")
     parser.add_argument("YML_OUT", help="Output YML file.")
     parser.add_argument("CSV_OUT", help="Output CSV file.")
+    parser.add_argument(
+        "--accession", dest="ACCESSION", help="Accession number of the assembly (optional).", default=None
+    )
     parser.add_argument("--busco", dest="REQUESTED_BUSCOS", help="Requested BUSCO lineages.", default=None)
-    parser.add_argument("--version", action="version", version="%(prog)s 1.1")
+    parser.add_argument("--version", action="version", version="%(prog)s 1.2")
     return parser.parse_args(args)
 
 
@@ -93,16 +97,37 @@ def get_odb(taxon_info: TaxonInfo, lineage_tax_ids: str, requested_buscos: typin
     return odb_arr
 
 
+def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int]]:
+    response = requests.get(NCBI_DATASETS_API % accession).json()
+    if response["total_count"] != 1:
+        print(f"Assembly not found: {accession}", file=sys.stderr)
+        sys.exit(1)
+    assembly_report = response["reports"][0]
+    assembly_info = assembly_report["assembly_info"]
+    return {
+        "accession": accession,
+        "alias": assembly_info["assembly_name"],
+        "bioproject": assembly_info["bioproject_accession"],
+        "biosample": assembly_info["biosample"]["accession"],
+        "level": assembly_info["assembly_level"].lower(),
+        "prefix": assembly_report["wgs_info"]["wgs_project_accession"],
+        "scaffold-count": assembly_report["assembly_stats"]["number_of_component_sequences"]
+        + assembly_report["assembly_stats"]["number_of_organelles"],
+        "span": int(assembly_report["assembly_stats"]["total_sequence_length"])
+        + sum(int(oi["total_seq_length"]) for oi in assembly_report["organelle_info"]),
+    }
+
+
 def print_yaml(
-    file_out, fasta: str, taxon_info: TaxonInfo, classification: typing.Dict[str, str], odb_arr: typing.List[str]
+    file_out,
+    assembly_info: typing.Dict[str, typing.Union[str, int]],
+    taxon_info: TaxonInfo,
+    classification: typing.Dict[str, str],
+    odb_arr: typing.List[str],
 ):
 
     data = {
-        "assembly": {
-            # Other attributes can't be filled in for draft assemblies
-            "file": fasta,
-            "level": "scaffold",
-        },
+        "assembly": assembly_info,
         "busco": {
             "basal_lineages": BUSCO_BASAL_LINEAGES,
             # "download_dir": <completely skipped because missing from final meta.json>
@@ -167,11 +192,18 @@ def print_csv(file_out, taxon_info: TaxonInfo, odb_arr: typing.List[str]):
 def main(args=None):
     args = parse_args(args)
 
+    assembly_info: typing.Dict[str, typing.Union[str, int]]
+    if args.ACCESSION:
+        assembly_info = get_assembly_info(args.ACCESSION)
+    else:
+        assembly_info = {"level": "scaffold"}
+    assembly_info["file"] = args.FASTA
+
     taxon_info = make_taxon_info(args.TAXON_QUERY)
     classification = get_classification(taxon_info)
     odb_arr = get_odb(taxon_info, args.LINEAGE_TAX_IDS, args.REQUESTED_BUSCOS)
 
-    print_yaml(args.YML_OUT, args.FASTA, taxon_info, classification, odb_arr)
+    print_yaml(args.YML_OUT, assembly_info, taxon_info, classification, odb_arr)
     print_csv(args.CSV_OUT, taxon_info, odb_arr)
 
 
diff --git a/modules/local/generate_config.nf b/modules/local/generate_config.nf
index 8e8478e8..f80b1a0c 100644
--- a/modules/local/generate_config.nf
+++ b/modules/local/generate_config.nf
@@ -23,8 +23,9 @@ process GENERATE_CONFIG {
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
     def busco_param = busco_lin ? "--busco '${busco_lin}'" : ""
+    def accession_params = params.accession ? "--accession ${params.accession}" : ""
     """
-    generate_config.py $fasta "$taxon_query" $lineage_tax_ids $busco_param ${prefix}.yaml ${prefix}.csv
+    generate_config.py $fasta "$taxon_query" $lineage_tax_ids $busco_param $accession_params ${prefix}.yaml ${prefix}.csv
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

From a514f2f48d16cf2b396fe10319226797bbdfe05d Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Mon, 20 May 2024 17:30:57 +0100
Subject: [PATCH 15/30] No need to generate the reference initial yaml file

---
 modules/local/blobtoolkit/config.nf | 37 -----------------------------
 subworkflows/local/input_check.nf   | 19 +--------------
 2 files changed, 1 insertion(+), 55 deletions(-)
 delete mode 100644 modules/local/blobtoolkit/config.nf

diff --git a/modules/local/blobtoolkit/config.nf b/modules/local/blobtoolkit/config.nf
deleted file mode 100644
index 32d4eacd..00000000
--- a/modules/local/blobtoolkit/config.nf
+++ /dev/null
@@ -1,37 +0,0 @@
-process BLOBTOOLKIT_CONFIG {
-    tag "$meta.id"
-    label 'process_single'
-
-    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
-        exit 1, "GENERATE_CONFIG module does not support Conda. Please use Docker / Singularity / Podman instead."
-    }
-    container "docker.io/genomehubs/blobtoolkit:4.3.9"
-
-    input:
-    tuple val(meta), val(reads)
-    tuple val(meta), val(fasta)
-
-    output:
-    tuple val(meta), path("${meta.id}/*.yaml"), emit: yaml
-    path "versions.yml"                       , emit: versions
-
-    when:
-    task.ext.when == null || task.ext.when
-
-    script:
-    def args = task.ext.args ?: ''
-    def prefix = task.ext.prefix ?: "${meta.id}"
-    def input_reads = reads.collect{"--reads $it"}.join(' ')
-    """
-    btk pipeline \\
-        generate-config \\
-        ${prefix} \\
-        $args \\
-        ${input_reads}
-
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        blobtoolkit: \$(btk --version | cut -d' ' -f2 | sed 's/v//')
-    END_VERSIONS
-    """
-}
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 7da0d1b7..ef2a4e42 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -6,7 +6,6 @@ include { CAT_CAT                   } from '../../modules/nf-core/cat/cat/main'
 include { SAMTOOLS_FLAGSTAT         } from '../../modules/nf-core/samtools/flagstat/main'
 include { SAMPLESHEET_CHECK         } from '../../modules/local/samplesheet_check'
 include { FETCHNGSSAMPLESHEET_CHECK } from '../../modules/local/fetchngssamplesheet_check'
-include { BLOBTOOLKIT_CONFIG        } from '../../modules/local/blobtoolkit/config'
 include { GENERATE_CONFIG           } from '../../modules/local/generate_config'
 
 workflow INPUT_CHECK {
@@ -101,25 +100,9 @@ workflow INPUT_CHECK {
     | set { ch_busco_lineages }
 
 
-    if ( params.accession ) {
-        read_files
-        | map { meta, data -> meta.id.split("_")[0..-2].join("_") }
-        | combine ( fasta )
-        | map { sample, meta, fasta -> [ meta, sample ] }
-        | groupTuple()
-        | set { grouped_reads }
-
-        BLOBTOOLKIT_CONFIG ( grouped_reads, fasta )
-        ch_versions = ch_versions.mix ( BLOBTOOLKIT_CONFIG.out.versions.first() )
-        ch_config = BLOBTOOLKIT_CONFIG.out.yaml
-
-    } else {
-        ch_config = GENERATE_CONFIG.out.yaml
-    }
-
     emit:
     reads                                   // channel: [ val(meta), path(datafile) ]
-    config = ch_config                      // channel: [ val(meta), path(yaml) ]
+    config = GENERATE_CONFIG.out.yaml       // channel: [ val(meta), path(yaml) ]
     taxon_id = ch_taxon_id                  // channel: val(taxon_id)
     busco_lineages = ch_busco_lineages      // channel: val([busco_lin])
     versions = ch_versions                  // channel: [ versions.yml ]

From df6d7d87fec1976884d6586813f69d987b52bf52 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Tue, 21 May 2024 10:08:14 +0100
Subject: [PATCH 16/30] Switched to the newer endpoint

---
 bin/generate_config.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/generate_config.py b/bin/generate_config.py
index cc95dbd9..37a39ea5 100755
--- a/bin/generate_config.py
+++ b/bin/generate_config.py
@@ -8,7 +8,7 @@
 import typing
 import yaml
 
-NCBI_TAXONOMY_API = "https://api.ncbi.nlm.nih.gov/datasets/v1/taxonomy/taxon/%s"
+NCBI_TAXONOMY_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/taxonomy/taxon/%s"
 NCBI_DATASETS_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/%s/dataset_report"
 
 RANKS = [
@@ -37,7 +37,7 @@ def parse_args(args=None):
         "--accession", dest="ACCESSION", help="Accession number of the assembly (optional).", default=None
     )
     parser.add_argument("--busco", dest="REQUESTED_BUSCOS", help="Requested BUSCO lineages.", default=None)
-    parser.add_argument("--version", action="version", version="%(prog)s 1.2")
+    parser.add_argument("--version", action="version", version="%(prog)s 1.3")
     return parser.parse_args(args)
 
 

From e190bddce636f447cac91ff29e68ccb47679da69 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Fri, 24 May 2024 11:43:35 +0000
Subject: [PATCH 17/30] Introduced --parameters to have flexibility regarding
 their order

---
 bin/generate_config.py           | 33 ++++++++++++++------------------
 modules/local/generate_config.nf |  9 ++++++++-
 2 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/bin/generate_config.py b/bin/generate_config.py
index 37a39ea5..99bf70db 100755
--- a/bin/generate_config.py
+++ b/bin/generate_config.py
@@ -28,15 +28,13 @@ def parse_args(args=None):
     Description = "Produce the various configuration files needed within the pipeline"
 
     parser = argparse.ArgumentParser(description=Description)
-    parser.add_argument("FASTA", help="Path to the Fasta file of the assembly.")
-    parser.add_argument("TAXON_QUERY", help="Query string/integer for this taxon.")
-    parser.add_argument("LINEAGE_TAX_IDS", help="Mapping between BUSCO lineages and taxon IDs.")
-    parser.add_argument("YML_OUT", help="Output YML file.")
-    parser.add_argument("CSV_OUT", help="Output CSV file.")
-    parser.add_argument(
-        "--accession", dest="ACCESSION", help="Accession number of the assembly (optional).", default=None
-    )
-    parser.add_argument("--busco", dest="REQUESTED_BUSCOS", help="Requested BUSCO lineages.", default=None)
+    parser.add_argument("--fasta", help="Path to the Fasta file of the assembly.")
+    parser.add_argument("--taxon_query", help="Query string/integer for this taxon.")
+    parser.add_argument("--lineage_tax_ids", help="Mapping between BUSCO lineages and taxon IDs.")
+    parser.add_argument("--yml_out", help="Output YML file.")
+    parser.add_argument("--csv_out", help="Output CSV file.")
+    parser.add_argument("--accession", help="Accession number of the assembly (optional).", default=None)
+    parser.add_argument("--busco", help="Requested BUSCO lineages.", default=None)
     parser.add_argument("--version", action="version", version="%(prog)s 1.3")
     return parser.parse_args(args)
 
@@ -71,7 +69,6 @@ def get_classification(taxon_info: TaxonInfo) -> typing.Dict[str, str]:
 
 
 def get_odb(taxon_info: TaxonInfo, lineage_tax_ids: str, requested_buscos: typing.Optional[str]) -> typing.List[str]:
-
     # Read the mapping between the BUSCO lineages and their taxon_id
     with open(lineage_tax_ids) as file_in:
         lineage_tax_ids_dict: typing.Dict[int, str] = {}
@@ -125,7 +122,6 @@ def print_yaml(
     classification: typing.Dict[str, str],
     odb_arr: typing.List[str],
 ):
-
     data = {
         "assembly": assembly_info,
         "busco": {
@@ -179,7 +175,6 @@ def print_yaml(
 
 
 def print_csv(file_out, taxon_info: TaxonInfo, odb_arr: typing.List[str]):
-
     out_dir = os.path.dirname(file_out)
     make_dir(out_dir)
 
@@ -193,18 +188,18 @@ def main(args=None):
     args = parse_args(args)
 
     assembly_info: typing.Dict[str, typing.Union[str, int]]
-    if args.ACCESSION:
-        assembly_info = get_assembly_info(args.ACCESSION)
+    if args.accession:
+        assembly_info = get_assembly_info(args.accession)
     else:
         assembly_info = {"level": "scaffold"}
-    assembly_info["file"] = args.FASTA
+    assembly_info["file"] = args.fasta
 
-    taxon_info = make_taxon_info(args.TAXON_QUERY)
+    taxon_info = make_taxon_info(args.taxon_query)
     classification = get_classification(taxon_info)
-    odb_arr = get_odb(taxon_info, args.LINEAGE_TAX_IDS, args.REQUESTED_BUSCOS)
+    odb_arr = get_odb(taxon_info, args.lineage_tax_ids, args.busco)
 
-    print_yaml(args.YML_OUT, assembly_info, taxon_info, classification, odb_arr)
-    print_csv(args.CSV_OUT, taxon_info, odb_arr)
+    print_yaml(args.yml_out, assembly_info, taxon_info, classification, odb_arr)
+    print_csv(args.csv_out, taxon_info, odb_arr)
 
 
 if __name__ == "__main__":
diff --git a/modules/local/generate_config.nf b/modules/local/generate_config.nf
index f80b1a0c..95ed385d 100644
--- a/modules/local/generate_config.nf
+++ b/modules/local/generate_config.nf
@@ -25,7 +25,14 @@ process GENERATE_CONFIG {
     def busco_param = busco_lin ? "--busco '${busco_lin}'" : ""
     def accession_params = params.accession ? "--accession ${params.accession}" : ""
     """
-    generate_config.py $fasta "$taxon_query" $lineage_tax_ids $busco_param $accession_params ${prefix}.yaml ${prefix}.csv
+    generate_config.py \\
+        --fasta $fasta \\
+        --taxon_query "$taxon_query" \\
+        --lineage_tax_ids $lineage_tax_ids \\
+        $busco_param \\
+        $accession_params \\
+        --yml_out ${prefix}.yaml \\
+        --csv_out ${prefix}.csv
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

From c37044c14af7e605fc1d8f12d2e41c9ee1952cbb Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Fri, 24 May 2024 12:06:14 +0000
Subject: [PATCH 18/30] Adjust the taxon_id to make sure it exists in the NT
 database

---
 .../test/nt_mMelMel3.1/taxonomy4blast.sqlite3 | Bin 0 -> 3072 bytes
 .../nt_gfLaeSulp1.1/taxonomy4blast.sqlite3    | Bin 0 -> 3072 bytes
 bin/generate_config.py                        |  20 ++++++++++++++----
 modules/local/generate_config.nf              |   2 ++
 subworkflows/local/input_check.nf             |   2 ++
 workflows/blobtoolkit.nf                      |   1 +
 6 files changed, 21 insertions(+), 4 deletions(-)
 create mode 100644 assets/test/nt_mMelMel3.1/taxonomy4blast.sqlite3
 create mode 100644 assets/test_full/nt_gfLaeSulp1.1/taxonomy4blast.sqlite3

diff --git a/assets/test/nt_mMelMel3.1/taxonomy4blast.sqlite3 b/assets/test/nt_mMelMel3.1/taxonomy4blast.sqlite3
new file mode 100644
index 0000000000000000000000000000000000000000..dc933c1f9b259bd9900799e3f5094e925b8d1e23
GIT binary patch
literal 3072
zcmWFz^vNtqRY=P(%1ta$FlJz3U}R))P*7lCU|<DeW*~+E5Ss<Y0n&^}Y+OuM2EA}E
zUZ5-!GY1261CSU6qaiQ|L!gX_UEEcku~9TLFD11iB(Wkh#WOE0-#I_GAio%xLp)O|
z;tLXsQu9jSa?H*_u8twD3Z8y0t`Q1wVJun{{QVH}8W3Z2N<emK)-ke+%LDE8E=f$v
zNrf8%XD~r632}7taYY!Spuwd81Yk2j3@GsQ3vqRK4N?dQ^7M5Kid67+jnn}vgxCpF
zrT}y^SRt1tC~q<{|6l;3Q7{?;qaiRtLV)$O9Ww)y>Rv9Oe;Jv-G62yi7!85Z5Euy|
Qz{SGE1kC%Y`&9P<0A{9KsQ>@~

literal 0
HcmV?d00001

diff --git a/assets/test_full/nt_gfLaeSulp1.1/taxonomy4blast.sqlite3 b/assets/test_full/nt_gfLaeSulp1.1/taxonomy4blast.sqlite3
new file mode 100644
index 0000000000000000000000000000000000000000..2a56a82f490f58163c36942b09edc6d60de9122f
GIT binary patch
literal 3072
zcmWFz^vNtqRY=P(%1ta$FlJz3U}R))P*7lCU|<DeW*~+E5Ss<Y0n&^}Y+OuM2EA}E
zUZ5-!GY1261CSU6qaiQ|L!gX_UEEcku~9TLFD11iB(Wkh#WOE0-#I_GAio%xLp)O|
z;tLXsQu9jSa?H*_u8twD3Z8y0t`Q1wVJun{{QVH}8W3Z2N<emK)-ke+%LDE8E=f$v
zNrf8%XD~r632}7taYY!Spuwd81Yk2j3@GsQ3vqRK4N?dQ^7M5Kid67+jnn}vgxCpF
zrT}y^SRt1tC~q<{|6l;3Q7{?;qaiRtLV$I39Ww)y=wB|Ne;Jv-G62yi7!85Z5Euy|
Qz{SGE1kC%Q|3v=*0I2m{i2wiq

literal 0
HcmV?d00001

diff --git a/bin/generate_config.py b/bin/generate_config.py
index 99bf70db..15cd6ef9 100755
--- a/bin/generate_config.py
+++ b/bin/generate_config.py
@@ -4,6 +4,7 @@
 import dataclasses
 import os
 import sys
+import sqlite3
 import requests
 import typing
 import yaml
@@ -35,7 +36,8 @@ def parse_args(args=None):
     parser.add_argument("--csv_out", help="Output CSV file.")
     parser.add_argument("--accession", help="Accession number of the assembly (optional).", default=None)
     parser.add_argument("--busco", help="Requested BUSCO lineages.", default=None)
-    parser.add_argument("--version", action="version", version="%(prog)s 1.3")
+    parser.add_argument("--blastn", help="Path to the NCBI Taxonomy database")
+    parser.add_argument("--version", action="version", version="%(prog)s 1.4")
     return parser.parse_args(args)
 
 
@@ -115,6 +117,15 @@ def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int]
     }
 
 
+def adjust_taxon_id(blastn: str, taxon_info: TaxonInfo) -> int:
+    con = sqlite3.connect(os.path.join(blastn, "taxonomy4blast.sqlite3"))
+    cur = con.cursor()
+    for taxon_id in [taxon_info.taxon_id] + list(reversed(taxon_info.lineage)):
+        res = cur.execute("SELECT * FROM TaxidInfo WHERE taxid = ?", (taxon_id,))
+        if res.fetchone():
+            return taxon_id
+
+
 def print_yaml(
     file_out,
     assembly_info: typing.Dict[str, typing.Union[str, int]],
@@ -174,12 +185,12 @@ def print_yaml(
         yaml.dump(data, fout)
 
 
-def print_csv(file_out, taxon_info: TaxonInfo, odb_arr: typing.List[str]):
+def print_csv(file_out, taxon_id: int, odb_arr: typing.List[str]):
     out_dir = os.path.dirname(file_out)
     make_dir(out_dir)
 
     with open(file_out, "w") as fout:
-        print("taxon_id", taxon_info.taxon_id, sep=",", file=fout)
+        print("taxon_id", taxon_id, sep=",", file=fout)
         for odb_val in odb_arr:
             print("busco_lineage", odb_val, sep=",", file=fout)
 
@@ -197,9 +208,10 @@ def main(args=None):
     taxon_info = make_taxon_info(args.taxon_query)
     classification = get_classification(taxon_info)
     odb_arr = get_odb(taxon_info, args.lineage_tax_ids, args.busco)
+    taxon_id = adjust_taxon_id(args.blastn, taxon_info)
 
     print_yaml(args.yml_out, assembly_info, taxon_info, classification, odb_arr)
-    print_csv(args.csv_out, taxon_info, odb_arr)
+    print_csv(args.csv_out, taxon_id, odb_arr)
 
 
 if __name__ == "__main__":
diff --git a/modules/local/generate_config.nf b/modules/local/generate_config.nf
index 95ed385d..5cf08eda 100644
--- a/modules/local/generate_config.nf
+++ b/modules/local/generate_config.nf
@@ -10,6 +10,7 @@ process GENERATE_CONFIG {
     val taxon_query
     val busco_lin
     path lineage_tax_ids
+    tuple val(meta2), val(blastn)
 
     output:
     tuple val(meta), path("*.yaml"), emit: yaml
@@ -31,6 +32,7 @@ process GENERATE_CONFIG {
         --lineage_tax_ids $lineage_tax_ids \\
         $busco_param \\
         $accession_params \\
+        --blastn $blastn \\
         --yml_out ${prefix}.yaml \\
         --csv_out ${prefix}.csv
 
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index ef2a4e42..53d21d80 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -15,6 +15,7 @@ workflow INPUT_CHECK {
     taxon       // channel: val(taxon)
     busco_lin   // channel: val([busco_lin])
     lineage_tax_ids        // channel: /path/to/lineage_tax_ids
+    blastn       // channel: [ val(meta), path(blastn_db) ]
 
     main:
     ch_versions = Channel.empty()
@@ -65,6 +66,7 @@ workflow INPUT_CHECK {
         taxon,
         busco_lin,
         lineage_tax_ids,
+        blastn,
     )
     ch_versions = ch_versions.mix(GENERATE_CONFIG.out.versions.first())
 
diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf
index d608803a..1541c9b3 100644
--- a/workflows/blobtoolkit.nf
+++ b/workflows/blobtoolkit.nf
@@ -108,6 +108,7 @@ workflow BLOBTOOLKIT {
         ch_taxon,
         ch_busco_lin,
         ch_lineage_tax_ids,
+        ch_blastn,
     )
     ch_versions = ch_versions.mix ( INPUT_CHECK.out.versions )
 

From 9963aa5b59ac8a0ac28a8c2c4f2bb96364cb57b7 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Tue, 28 May 2024 09:15:25 +0000
Subject: [PATCH 19/30] All these parameters are mandatory

---
 bin/generate_config.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/bin/generate_config.py b/bin/generate_config.py
index 15cd6ef9..d070bccd 100755
--- a/bin/generate_config.py
+++ b/bin/generate_config.py
@@ -29,14 +29,14 @@ def parse_args(args=None):
     Description = "Produce the various configuration files needed within the pipeline"
 
     parser = argparse.ArgumentParser(description=Description)
-    parser.add_argument("--fasta", help="Path to the Fasta file of the assembly.")
-    parser.add_argument("--taxon_query", help="Query string/integer for this taxon.")
-    parser.add_argument("--lineage_tax_ids", help="Mapping between BUSCO lineages and taxon IDs.")
-    parser.add_argument("--yml_out", help="Output YML file.")
-    parser.add_argument("--csv_out", help="Output CSV file.")
+    parser.add_argument("--fasta", required=True, help="Path to the Fasta file of the assembly.")
+    parser.add_argument("--taxon_query", required=True, help="Query string/integer for this taxon.")
+    parser.add_argument("--lineage_tax_ids", required=True, help="Mapping between BUSCO lineages and taxon IDs.")
+    parser.add_argument("--yml_out", required=True, help="Output YML file.")
+    parser.add_argument("--csv_out", required=True, help="Output CSV file.")
     parser.add_argument("--accession", help="Accession number of the assembly (optional).", default=None)
-    parser.add_argument("--busco", help="Requested BUSCO lineages.", default=None)
-    parser.add_argument("--blastn", help="Path to the NCBI Taxonomy database")
+    parser.add_argument("--busco", required=True, help="Requested BUSCO lineages.", default=None)
+    parser.add_argument("--blastn", required=True, help="Path to the NCBI Taxonomy database")
     parser.add_argument("--version", action="version", version="%(prog)s 1.4")
     return parser.parse_args(args)
 

From e959c0407548cc70866a2837d2c0b335245f78aa Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Fri, 31 May 2024 10:55:10 +0000
Subject: [PATCH 20/30] bugfix: --busco is optional

---
 bin/generate_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/generate_config.py b/bin/generate_config.py
index d070bccd..6b1fb211 100755
--- a/bin/generate_config.py
+++ b/bin/generate_config.py
@@ -35,7 +35,7 @@ def parse_args(args=None):
     parser.add_argument("--yml_out", required=True, help="Output YML file.")
     parser.add_argument("--csv_out", required=True, help="Output CSV file.")
     parser.add_argument("--accession", help="Accession number of the assembly (optional).", default=None)
-    parser.add_argument("--busco", required=True, help="Requested BUSCO lineages.", default=None)
+    parser.add_argument("--busco", help="Requested BUSCO lineages.", default=None)
     parser.add_argument("--blastn", required=True, help="Path to the NCBI Taxonomy database")
     parser.add_argument("--version", action="version", version="%(prog)s 1.4")
     return parser.parse_args(args)

From ed941118058c30565eb02af812111f73e9c5fdc2 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Fri, 31 May 2024 15:39:45 +0000
Subject: [PATCH 21/30] bugfix: this should be a "path" so that the file is
 made available to the container

---
 modules/local/generate_config.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/local/generate_config.nf b/modules/local/generate_config.nf
index 5cf08eda..3aab7ee9 100644
--- a/modules/local/generate_config.nf
+++ b/modules/local/generate_config.nf
@@ -10,7 +10,7 @@ process GENERATE_CONFIG {
     val taxon_query
     val busco_lin
     path lineage_tax_ids
-    tuple val(meta2), val(blastn)
+    tuple val(meta2), path(blastn)
 
     output:
     tuple val(meta), path("*.yaml"), emit: yaml

From 596203496292161c45298959dc68d1bb3b0ca9e0 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Tue, 4 Jun 2024 14:21:35 +0100
Subject: [PATCH 22/30] bugfix: accept older assembly versions

---
 bin/generate_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/generate_config.py b/bin/generate_config.py
index 6b1fb211..d6e3c684 100755
--- a/bin/generate_config.py
+++ b/bin/generate_config.py
@@ -10,7 +10,7 @@
 import yaml
 
 NCBI_TAXONOMY_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/taxonomy/taxon/%s"
-NCBI_DATASETS_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/%s/dataset_report"
+NCBI_DATASETS_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/%s/dataset_report?filters.assembly_version=all_assemblies"
 
 RANKS = [
     "genus",

From 121e3722ec8a6adf2880c79f41d282124a16eadc Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Tue, 4 Jun 2024 14:24:19 +0100
Subject: [PATCH 23/30] These fields can be missing

---
 bin/generate_config.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/bin/generate_config.py b/bin/generate_config.py
index d6e3c684..bcf4a4ab 100755
--- a/bin/generate_config.py
+++ b/bin/generate_config.py
@@ -103,18 +103,21 @@ def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int]
         sys.exit(1)
     assembly_report = response["reports"][0]
     assembly_info = assembly_report["assembly_info"]
-    return {
+    d = {
         "accession": accession,
         "alias": assembly_info["assembly_name"],
         "bioproject": assembly_info["bioproject_accession"],
-        "biosample": assembly_info["biosample"]["accession"],
         "level": assembly_info["assembly_level"].lower(),
-        "prefix": assembly_report["wgs_info"]["wgs_project_accession"],
         "scaffold-count": assembly_report["assembly_stats"]["number_of_component_sequences"]
         + assembly_report["assembly_stats"]["number_of_organelles"],
         "span": int(assembly_report["assembly_stats"]["total_sequence_length"])
         + sum(int(oi["total_seq_length"]) for oi in assembly_report["organelle_info"]),
     }
+    if "biosample" in assembly_info:
+        d["biosample"] = assembly_info["biosample"]["accession"]
+    if "wgs_info" in assembly_report:
+        d["prefix"] = assembly_report["wgs_info"]["wgs_project_accession"]
+    return d
 
 
 def adjust_taxon_id(blastn: str, taxon_info: TaxonInfo) -> int:

From 8085a740d508f6e79ac4d43f42ec9493a8691db6 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Tue, 4 Jun 2024 14:25:58 +0100
Subject: [PATCH 24/30] Some genomes don't have organelles

---
 bin/generate_config.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/generate_config.py b/bin/generate_config.py
index bcf4a4ab..8036c4b1 100755
--- a/bin/generate_config.py
+++ b/bin/generate_config.py
@@ -109,9 +109,9 @@ def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int]
         "bioproject": assembly_info["bioproject_accession"],
         "level": assembly_info["assembly_level"].lower(),
         "scaffold-count": assembly_report["assembly_stats"]["number_of_component_sequences"]
-        + assembly_report["assembly_stats"]["number_of_organelles"],
+        + assembly_report["assembly_stats"].get("number_of_organelles", 0),
         "span": int(assembly_report["assembly_stats"]["total_sequence_length"])
-        + sum(int(oi["total_seq_length"]) for oi in assembly_report["organelle_info"]),
+        + sum(int(oi["total_seq_length"]) for oi in assembly_report.get("organelle_info", [])),
     }
     if "biosample" in assembly_info:
         d["biosample"] = assembly_info["biosample"]["accession"]

From 30af129a204b0dba1f98c1796858165f8dcadcbb Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Wed, 10 Jul 2024 18:39:34 +0100
Subject: [PATCH 25/30] Easier to read

---
 bin/generate_config.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/bin/generate_config.py b/bin/generate_config.py
index 8036c4b1..cbe3a2ab 100755
--- a/bin/generate_config.py
+++ b/bin/generate_config.py
@@ -103,15 +103,17 @@ def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int]
         sys.exit(1)
     assembly_report = response["reports"][0]
     assembly_info = assembly_report["assembly_info"]
+    scaffold_count = assembly_report["assembly_stats"]["number_of_component_sequences"]
+    scaffold_count += assembly_report["assembly_stats"].get("number_of_organelles", 0)
+    span = int(assembly_report["assembly_stats"]["total_sequence_length"])
+    span += sum(int(oi["total_seq_length"]) for oi in assembly_report.get("organelle_info", []))
     d = {
         "accession": accession,
         "alias": assembly_info["assembly_name"],
         "bioproject": assembly_info["bioproject_accession"],
         "level": assembly_info["assembly_level"].lower(),
-        "scaffold-count": assembly_report["assembly_stats"]["number_of_component_sequences"]
-        + assembly_report["assembly_stats"].get("number_of_organelles", 0),
-        "span": int(assembly_report["assembly_stats"]["total_sequence_length"])
-        + sum(int(oi["total_seq_length"]) for oi in assembly_report.get("organelle_info", [])),
+        "scaffold-count": scaffold_count,
+        "span": span,
     }
     if "biosample" in assembly_info:
         d["biosample"] = assembly_info["biosample"]["accession"]

From 9dcd338d88958ea7fcc152958cccf1c239bc4e25 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Wed, 10 Jul 2024 19:43:33 +0100
Subject: [PATCH 26/30] Release name

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 35c1b810..dfe865ba 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,7 +3,7 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [[1.0.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/1.0.0)] – – []
+## [[1.0.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/1.0.0)] – Bellsprout – []
 
 The pipeline has now been validated for draft (unpublished) assemblies.
 

From 8886dba40a86c3ab9fb3ef8cd9f8fbd97f54e3a7 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Fri, 21 Jun 2024 16:59:19 +0100
Subject: [PATCH 27/30] 
 https://ncbiinsights.ncbi.nlm.nih.gov/2024/06/04/changes-ncbi-taxonomy-classifications/

---
 bin/generate_config.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/bin/generate_config.py b/bin/generate_config.py
index cbe3a2ab..7330ae48 100755
--- a/bin/generate_config.py
+++ b/bin/generate_config.py
@@ -67,6 +67,10 @@ def get_classification(taxon_info: TaxonInfo) -> typing.Dict[str, str]:
         anc_taxon_info = make_taxon_info(anc_taxon_id)
         if anc_taxon_info.rank:
             ancestors[anc_taxon_info.rank.lower()] = anc_taxon_info.organism_name
+    # https://ncbiinsights.ncbi.nlm.nih.gov/2024/06/04/changes-ncbi-taxonomy-classifications/
+    # "superkingdom" will be called "domain"
+    if "superkingdom" not in ancestors:
+        ancestors["superkingdom"] = ancestors["domain"]
     return {r: ancestors[r] for r in RANKS if r in ancestors}
 
 

From 70f961ce72811ca2ca2b239d08589001a12e4a2c Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Tue, 2 Jul 2024 13:52:12 +0100
Subject: [PATCH 28/30] Use GoaT in addition to the NCBI because GoaT also has
 the freshest ENA taxon_ids

NCBI is still the first database we query
---
 CHANGELOG.md             |  2 ++
 CITATIONS.md             |  4 +++
 bin/generate_config.py   | 61 ++++++++++++++++++++++++++++++++--------
 workflows/blobtoolkit.nf |  2 +-
 4 files changed, 57 insertions(+), 12 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index dfe865ba..c6330537 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,8 @@ The pipeline has now been validated for draft (unpublished) assemblies.
 
 - The pipeline now queries the NCBI database instead of GoaT to establish the
   taxonomic classification of the species and the relevant Busco lineages.
+  In case the taxon_id is not found, the pipeline falls back to GoaT, which
+  is aware of upcoming taxon_ids in ENA.
 - New `--busco_lineages` parameter to choose specific Busco lineages instead of
   automatically selecting based on the taxonomy.
 - All parameters are now passed the regular Nextflow way. There is no support
diff --git a/CITATIONS.md b/CITATIONS.md
index c2cadb65..8b960779 100644
--- a/CITATIONS.md
+++ b/CITATIONS.md
@@ -26,6 +26,10 @@
 
 - [Fasta_windows](https://github.com/tolkit/fasta_windows)
 
+- [GoaT](https://goat.genomehubs.org)
+
+  > Challis, Richard, et al. “Genomes on a Tree (GoaT): A versatile, scalable search engine for genomic and sequencing project metadata across the eukaryotic tree of life.” Wellcome Open Research, vol. 8, no. 24, 2023, https://doi.org/10.12688/wellcomeopenres.18658.1.
+
 - [Minimap2](https://github.com/lh3/minimap2)
 
   > Li, Heng. "Minimap2: pairwise alignment for nucleotide sequences." Bioinformatics, vol. 34, no. 18, Sep. 2018, pp. 3094-100, https://doi.org/10.1093/bioinformatics/bty191.
diff --git a/bin/generate_config.py b/bin/generate_config.py
index 7330ae48..e3d00dfa 100755
--- a/bin/generate_config.py
+++ b/bin/generate_config.py
@@ -10,6 +10,8 @@
 import yaml
 
 NCBI_TAXONOMY_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/taxonomy/taxon/%s"
+GOAT_LOOKUP_API = "https://goat.genomehubs.org/api/v2/lookup?searchTerm=%s&result=taxon&size=10&taxonomy=ncbi"
+GOAT_RECORD_API = "https://goat.genomehubs.org/api/v2/record?recordId=%s&result=taxon&size=10&taxonomy=ncbi"
 NCBI_DATASETS_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/%s/dataset_report?filters.assembly_version=all_assemblies"
 
 RANKS = [
@@ -51,20 +53,57 @@ class TaxonInfo:
     taxon_id: int
     organism_name: str
     rank: typing.Optional[str]
-    lineage: typing.List[int]
+    lineage: typing.List["TaxonInfo"]
 
 
-def make_taxon_info(taxon_name: typing.Union[str, int]) -> TaxonInfo:
+def make_taxon_info_from_goat(body: dict[str, str]) -> TaxonInfo:
+    rank = body["taxon_rank"]
+    if rank == "no rank":
+        rank = None
+    if "lineage" in body:
+        lineage = [make_taxon_info_from_goat(b) for b in body["lineage"]]
+    else:
+        lineage = []
+    return TaxonInfo(int(body["taxon_id"]), body["scientific_name"], rank, lineage)
+
+
+def fetch_taxon_info_from_goat(taxon_name: typing.Union[str, int]) -> TaxonInfo:
+    if isinstance(taxon_name, int):
+        taxon_id = taxon_name
+        record_id = "taxon-%d" % taxon_name
+    else:
+        # Resolve the taxon_id of the species
+        response = requests.get(GOAT_LOOKUP_API % taxon_name).json()
+        taxon_id = int(response["results"][0]["result"]["taxon_id"])
+        record_id = response["results"][0]["id"]
+
+    # Using API, get the taxon_ids of the species and all parents
+    response = requests.get(GOAT_RECORD_API % record_id).json()
+    body = response["records"][0]["record"]
+    return make_taxon_info_from_goat(body)
+
+
+def fetch_taxon_info_from_ncbi(taxon_name: typing.Union[str, int], with_lineage=True) -> typing.Optional[TaxonInfo]:
     # Using API, get the taxon_ids of the species and all parents
     response = requests.get(NCBI_TAXONOMY_API % taxon_name).json()
-    body = response["taxonomy_nodes"][0]["taxonomy"]
-    return TaxonInfo(body["tax_id"], body["organism_name"], body.get("rank"), body["lineage"])
+    if "taxonomy" in response["taxonomy_nodes"][0]:
+        body = response["taxonomy_nodes"][0]["taxonomy"]
+        if with_lineage:
+            lineage = [
+                fetch_taxon_info_from_ncbi(t, with_lineage=False) for t in reversed(body["lineage"][2:])
+            ]  # skip root and cellular_organisms
+        else:
+            lineage = []
+        return TaxonInfo(body["tax_id"], body["organism_name"], body.get("rank"), lineage)
+
+
+def fetch_taxon_info(taxon_name: typing.Union[str, int]) -> TaxonInfo:
+    return fetch_taxon_info_from_ncbi(taxon_name) or fetch_taxon_info_from_goat(taxon_name)
 
 
 def get_classification(taxon_info: TaxonInfo) -> typing.Dict[str, str]:
     ancestors: typing.Dict[str, str] = {}
-    for anc_taxon_id in taxon_info.lineage[1:]:  # skip the root taxon_id=1
-        anc_taxon_info = make_taxon_info(anc_taxon_id)
+    for anc_taxon_info in taxon_info.lineage:
         if anc_taxon_info.rank:
             ancestors[anc_taxon_info.rank.lower()] = anc_taxon_info.organism_name
     # https://ncbiinsights.ncbi.nlm.nih.gov/2024/06/04/changes-ncbi-taxonomy-classifications/
@@ -92,9 +131,9 @@ def get_odb(taxon_info: TaxonInfo, lineage_tax_ids: str, requested_buscos: typin
     else:
         # Do the intersection to find the ancestors that have a BUSCO lineage
         odb_arr = [
-            lineage_tax_ids_dict[taxon_id]
-            for taxon_id in reversed(taxon_info.lineage)
-            if taxon_id in lineage_tax_ids_dict
+            lineage_tax_ids_dict[anc_taxon_info.taxon_id]
+            for anc_taxon_info in taxon_info.lineage
+            if anc_taxon_info.taxon_id in lineage_tax_ids_dict
         ]
 
     return odb_arr
@@ -129,7 +168,7 @@ def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int]
 def adjust_taxon_id(blastn: str, taxon_info: TaxonInfo) -> int:
     con = sqlite3.connect(os.path.join(blastn, "taxonomy4blast.sqlite3"))
     cur = con.cursor()
-    for taxon_id in [taxon_info.taxon_id] + list(reversed(taxon_info.lineage)):
+    for taxon_id in [taxon_info.taxon_id] + [anc_taxon_info.taxon_id for anc_taxon_info in taxon_info.lineage]:
         res = cur.execute("SELECT * FROM TaxidInfo WHERE taxid = ?", (taxon_id,))
         if res.fetchone():
             return taxon_id
@@ -214,7 +253,7 @@ def main(args=None):
         assembly_info = {"level": "scaffold"}
     assembly_info["file"] = args.fasta
 
-    taxon_info = make_taxon_info(args.taxon_query)
+    taxon_info = fetch_taxon_info(args.taxon_query)
     classification = get_classification(taxon_info)
     odb_arr = get_odb(taxon_info, args.lineage_tax_ids, args.busco)
     taxon_id = adjust_taxon_id(args.blastn, taxon_info)
diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf
index 1541c9b3..cd97fd99 100644
--- a/workflows/blobtoolkit.nf
+++ b/workflows/blobtoolkit.nf
@@ -130,7 +130,7 @@ workflow BLOBTOOLKIT {
     ch_versions = ch_versions.mix ( COVERAGE_STATS.out.versions )
 
     //
-    // SUBWORKFLOW: Run BUSCO using lineages fetched from NCBI, then run diamond_blastp
+    // SUBWORKFLOW: Run BUSCO using lineages fetched from GoaT, then run diamond_blastp
     //
     BUSCO_DIAMOND (
         PREPARE_GENOME.out.genome,

From e9d3a645442b0c747c452e25f375e202de856390 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Thu, 22 Aug 2024 03:34:14 +0100
Subject: [PATCH 29/30] Added an option to skip filtering hits from the same
 species

---
 CHANGELOG.md                      | 11 +++++---
 nextflow.config                   |  1 +
 nextflow_schema.json              |  5 ++++
 subworkflows/local/input_check.nf |  3 +-
 subworkflows/local/run_blastn.nf  | 46 ++++++++++++++++++++-----------
 5 files changed, 45 insertions(+), 21 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c6330537..ff57d235 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,13 +15,16 @@ The pipeline has now been validated for draft (unpublished) assemblies.
   automatically selecting based on the taxonomy.
 - All parameters are now passed the regular Nextflow way. There is no support
   for the original Yaml configuration files of the Snakemake version.
+- New option `--skip_taxon_filtering` to skip the taxon filtering in blast searches.
+  Mostly relevant for draft assemblies.
 
 ### Parameters
 
-| Old parameter | New parameter    |
-| ------------- | ---------------- |
-| --yaml        |                  |
-|               | --busco_lineages |
+| Old parameter | New parameter          |
+| ------------- | ---------------------- |
+| --yaml        |                        |
+|               | --busco_lineages       |
+|               | --skip_taxon_filtering |
 
 > **NB:** Parameter has been **updated** if both old and new parameter information is present. </br> **NB:** Parameter has been **added** if just the new parameter information is present. </br> **NB:** Parameter has been **removed** if new parameter information isn't present.
 
diff --git a/nextflow.config b/nextflow.config
index a1ea3ae9..dc8cc891 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -33,6 +33,7 @@ params {
     blastp                     = null
     blastx                     = null
     blastn                     = null
+    skip_taxon_filtering       = false
 
     // MultiQC options
     multiqc_config             = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index cd0b7341..e838ab04 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -32,6 +32,11 @@
                     "description": "Turn on optional genome masking if needed.",
                     "fa_icon": "fas fa-toggle-off"
                 },
+                "skip_taxon_filtering": {
+                    "type": "boolean",
+                    "description": "Skip filtering out hits from the same species in blast* searches.",
+                    "fa_icon": "fas fa-toggle-off"
+                },
                 "fetchngs_samplesheet": {
                     "type": "boolean",
                     "description": "Turn on the conversion from a nf-core/fetchngs samplesheet.",
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 53d21d80..7a8fd112 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -87,9 +87,10 @@ workflow INPUT_CHECK {
 
 
     //
-    // Get the taxon ID
+    // Get the taxon ID if we do taxon filtering in blast* searches
     //
     ch_parsed_csv.taxon_id
+    | map { params.skip_taxon_filtering ? '' : it }
     | first
     | set { ch_taxon_id }
 
diff --git a/subworkflows/local/run_blastn.nf b/subworkflows/local/run_blastn.nf
index 7dc92fd7..d479e832 100644
--- a/subworkflows/local/run_blastn.nf
+++ b/subworkflows/local/run_blastn.nf
@@ -47,23 +47,37 @@ workflow RUN_BLASTN {
     | set { ch_chunks }
 
     // Run blastn search
-    // run blastn excluding taxon_id
-    BLASTN_TAXON ( ch_chunks, blastn, taxon_id )
-    ch_versions = ch_versions.mix ( BLASTN_TAXON.out.versions.first() )
-
-    // check if blastn output table is empty
-    BLASTN_TAXON.out.txt
-    | branch { meta, txt ->
-        empty:     txt.isEmpty()
-        not_empty: true
-    }
-    | set { ch_blastn_taxon_out }
+    if (params.skip_taxon_filtering) {
+
+        // skip BLASTN_TAXON
+        ch_blast_blastn_input = ch_chunks
+
+        // fake ch_blastn_taxon_out.not_empty
+        ch_blastn_taxon_out = [
+            not_empty: Channel.empty()
+        ]
+
+    } else {
 
-    // repeat the blastn search without excluding taxon_id
-    ch_blastn_taxon_out.empty
-    | join ( ch_chunks )
-    | map { meta, txt, fasta -> [meta, fasta] }
-    | set { ch_blast_blastn_input }
+        // run blastn excluding taxon_id
+        BLASTN_TAXON ( ch_chunks, blastn, taxon_id )
+        ch_versions = ch_versions.mix ( BLASTN_TAXON.out.versions.first() )
+
+        // check if blastn output table is empty
+        BLASTN_TAXON.out.txt
+        | branch { meta, txt ->
+            empty:     txt.isEmpty()
+            not_empty: true
+        }
+        | set { ch_blastn_taxon_out }
+
+        // repeat the blastn search without excluding taxon_id
+        ch_blastn_taxon_out.empty
+        | join ( ch_chunks )
+        | map { meta, txt, fasta -> [meta, fasta] }
+        | set { ch_blast_blastn_input }
+
+    }
 
     BLAST_BLASTN ( ch_blast_blastn_input, blastn, [] )
     ch_versions = ch_versions.mix ( BLAST_BLASTN.out.versions.first() )

From 8c70c777ade843c965d603393119ed8cda6a6424 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato <mm49@sanger.ac.uk>
Date: Fri, 23 Aug 2024 16:59:10 +0100
Subject: [PATCH 30/30] Corrected the version as I want to be sure this is
 really ready for the big show

---
 CHANGELOG.md    | 2 +-
 nextflow.config | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ff57d235..0cc1a361 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,7 +3,7 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [[1.0.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/1.0.0)] – Bellsprout – []
+## – Bellsprout – []
 
 The pipeline has now been validated for draft (unpublished) assemblies.
 
diff --git a/nextflow.config b/nextflow.config
index dc8cc891..319b9b3d 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -246,7 +246,7 @@ manifest {
     description     = """Quality assessment of genome assemblies"""
     mainScript      = 'main.nf'
     nextflowVersion = '!>=23.04.0'
-    version         = '1.0.0'
+    version         = '0.6.0-dev'
     doi             = '10.5281/zenodo.7949058'
 }