From e1ef4516753f7b55f2082a8ca553629a40b80ef4 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 15 Aug 2024 20:08:42 +0100 Subject: [PATCH 01/30] Skip blastn if there are no chunks The filter in SEQTK_SUBSEQ is not sufficient because some BLOBTOOLKIT_CHUNK further excludes masked regions --- subworkflows/local/run_blastn.nf | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/run_blastn.nf b/subworkflows/local/run_blastn.nf index 1ea64b82..7dc92fd7 100644 --- a/subworkflows/local/run_blastn.nf +++ b/subworkflows/local/run_blastn.nf @@ -41,10 +41,14 @@ workflow RUN_BLASTN { BLOBTOOLKIT_CHUNK ( SEQTK_SUBSEQ.out.sequences, [[],[]] ) ch_versions = ch_versions.mix ( BLOBTOOLKIT_CHUNK.out.versions.first() ) + // Check that there are still sequences left after chunking (which excludes masked regions) + BLOBTOOLKIT_CHUNK.out.chunks + | filter { it[1].size() > 0 } + | set { ch_chunks } // Run blastn search // run blastn excluding taxon_id - BLASTN_TAXON ( BLOBTOOLKIT_CHUNK.out.chunks, blastn, taxon_id ) + BLASTN_TAXON ( ch_chunks, blastn, taxon_id ) ch_versions = ch_versions.mix ( BLASTN_TAXON.out.versions.first() ) // check if blastn output table is empty @@ -56,7 +60,8 @@ workflow RUN_BLASTN { | set { ch_blastn_taxon_out } // repeat the blastn search without excluding taxon_id - ch_blastn_taxon_out.empty.join ( BLOBTOOLKIT_CHUNK.out.chunks ) + ch_blastn_taxon_out.empty + | join ( ch_chunks ) | map { meta, txt, fasta -> [meta, fasta] } | set { ch_blast_blastn_input } From ed44da107826cea25752fdbc3fc294d79701b54b Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 15 Aug 2024 20:12:56 +0100 Subject: [PATCH 02/30] Version bump --- CHANGELOG.md | 6 ++++++ nextflow.config | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b1f08975..5146c972 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[0.5.1](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.5.1)] – Snorlax (patch 1) – [2024-08-15] + +### Enhancements & fixes + +- Bugfix: skip BLASTN if there are no chunks to align + ## [[0.5.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.5.0)] – Snorlax – [2024-07-31] General tidy up of the configuration and the pipeline diff --git a/nextflow.config b/nextflow.config index db5ef388..84bae361 100644 --- a/nextflow.config +++ b/nextflow.config @@ -243,7 +243,7 @@ manifest { description = """Quality assessment of genome assemblies""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '0.5.0' + version = '0.5.1' doi = '10.5281/zenodo.7949058' } From 3053bdbf48d6e79793254d90ffc10210e72cfc35 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 22 Aug 2024 08:12:57 +0100 Subject: [PATCH 03/30] New date --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5146c972..bf3e3e6a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [[0.5.1](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.5.1)] – Snorlax (patch 1) – [2024-08-15] +## [[0.5.1](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.5.1)] – Snorlax (patch 1) – [2024-08-22] ### Enhancements & fixes From 12a75579da3d947304d9caefcd8d3e45355c2c6d Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 13 May 2024 10:32:31 +0100 Subject: [PATCH 04/30] Version bump --- CHANGELOG.md | 4 ++++ nextflow.config | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bf3e3e6a..0614c6d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[1.0.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/1.0.0)] – – [] + +The pipeline has now been validated for draft (unpublished) assemblies. + ## [[0.5.1](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.5.1)] – Snorlax (patch 1) – [2024-08-22] ### Enhancements & fixes diff --git a/nextflow.config b/nextflow.config index 84bae361..2345a399 100644 --- a/nextflow.config +++ b/nextflow.config @@ -243,7 +243,7 @@ manifest { description = """Quality assessment of genome assemblies""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '0.5.1' + version = '1.0.0' doi = '10.5281/zenodo.7949058' } From bd2334e098a3f542dcc99b83d1d03f980d3c5dcc Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 13 May 2024 10:53:39 +0100 Subject: [PATCH 05/30] Directly query the NCBI API --- CHANGELOG.md | 11 +++ CITATIONS.md | 4 -- README.md | 2 +- ...ataset_name.eukaryota_odb10.2019-12-16.txt | 67 +++++++++++++++++++ bin/get_odb_taxon.py | 61 +++++++++++++++++ conf/modules.config | 4 -- docs/usage.md | 1 - modules.json | 5 -- modules/local/get_odb_taxon.nf | 32 +++++++++ .../nf-core/goat/taxonsearch/environment.yml | 7 -- modules/nf-core/goat/taxonsearch/main.nf | 36 ---------- modules/nf-core/goat/taxonsearch/meta.yml | 50 -------------- nextflow.config | 2 + nextflow_schema.json | 9 ++- subworkflows/local/busco_diamond_blastp.nf | 35 +++++----- workflows/blobtoolkit.nf | 7 +- 16 files changed, 206 insertions(+), 127 deletions(-) create mode 100644 assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt create mode 100755 bin/get_odb_taxon.py create mode 100644 modules/local/get_odb_taxon.nf delete mode 100644 modules/nf-core/goat/taxonsearch/environment.yml delete mode 100644 modules/nf-core/goat/taxonsearch/main.nf delete mode 100644 modules/nf-core/goat/taxonsearch/meta.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index 0614c6d9..5f8f155d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 The pipeline has now been validated for draft (unpublished) assemblies. +- The pipeline now queries the NCBI database instead of GoaT to establish the + taxonomic classification of the species and the relevant Busco lineages. + +### Software dependencies + +Note, since the pipeline is using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference. Only `Docker` or `Singularity` containers are supported, `conda` is not supported. + +| Dependency | Old version | New version | +| ---------- | ----------- | ----------- | +| goat | 0.2.5 | | + ## [[0.5.1](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.5.1)] – Snorlax (patch 1) – [2024-08-22] ### Enhancements & fixes diff --git a/CITATIONS.md b/CITATIONS.md index 8b960779..c2cadb65 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -26,10 +26,6 @@ - [Fasta_windows](https://github.com/tolkit/fasta_windows) -- [GoaT](https://goat.genomehubs.org) - - > Challis, Richard, et al. “Genomes on a Tree (GoaT): A versatile, scalable search engine for genomic and sequencing project metadata across the eukaryotic tree of life.” Wellcome Open Research, vol. 8, no. 24, 2023, https://doi.org/10.12688/wellcomeopenres.18658.1. - - [Minimap2](https://github.com/lh3/minimap2) > Li, Heng. "Minimap2: pairwise alignment for nucleotide sequences." Bioinformatics, vol. 34, no. 18, Sep. 2018, pp. 3094-100, https://doi.org/10.1093/bioinformatics/bty191. diff --git a/README.md b/README.md index c7b92970..b55ab353 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ It takes a samplesheet of BAM/CRAM/FASTQ/FASTA files as input, calculates genome 1. Calculate genome statistics in windows ([`fastawindows`](https://github.com/tolkit/fasta_windows)) 2. Calculate Coverage ([`blobtk/depth`](https://github.com/blobtoolkit/blobtk)) -3. Fetch associated BUSCO lineages ([`goat/taxonsearch`](https://github.com/genomehubs/goat-cli)) +3. Determine the appropriate BUSCO lineages from the taxonomy. 4. Run BUSCO ([`busco`](https://busco.ezlab.org/)) 5. Extract BUSCO genes ([`blobtoolkit/extractbuscos`](https://github.com/blobtoolkit/blobtoolkit)) 6. Run Diamond BLASTp against extracted BUSCO genes ([`diamond/blastp`](https://github.com/bbuchfink/diamond)) diff --git a/assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt b/assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt new file mode 100644 index 00000000..5d8af90f --- /dev/null +++ b/assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt @@ -0,0 +1,67 @@ +422676 aconoidasida +7898 actinopterygii +5338 agaricales +155619 agaricomycetes +33630 alveolata +5794 apicomplexa +6854 arachnida +6656 arthropoda +4890 ascomycota +8782 aves +5204 basidiomycota +68889 boletales +3699 brassicales +134362 capnodiales +33554 carnivora +91561 cetartiodactyla +34395 chaetothyriales +3041 chlorophyta +5796 coccidia +28738 cyprinodontiformes +7147 diptera +147541 dothideomycetes +3193 embryophyta +33392 endopterygota +314146 euarchontoglires +33682 euglenozoa +2759 eukaryota +5042 eurotiales +147545 eurotiomycetes +9347 eutheria +72025 fabales +4751 fungi +314147 glires +1028384 glomerellales +5178 helotiales +7524 hemiptera +7399 hymenoptera +5125 hypocreales +50557 insecta +314145 laurasiatheria +147548 leotiomycetes +7088 lepidoptera +4447 liliopsida +40674 mammalia +33208 metazoa +6029 microsporidia +6447 mollusca +4827 mucorales +1913637 mucoromycota +6231 nematoda +33183 onygenales +9126 passeriformes +5820 plasmodium +92860 pleosporales +38820 poales +5303 polyporales +9443 primates +4891 saccharomycetes +8457 sauropsida +4069 solanales +147550 sordariomycetes +33634 stramenopiles +32523 tetrapoda +155616 tremellomycetes +7742 vertebrata +33090 viridiplantae +71240 eudicots diff --git a/bin/get_odb_taxon.py b/bin/get_odb_taxon.py new file mode 100755 index 00000000..ade35e34 --- /dev/null +++ b/bin/get_odb_taxon.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 + +import argparse +import os +import sys +import requests + + +NCBI_TAXONOMY_API = "https://api.ncbi.nlm.nih.gov/datasets/v1/taxonomy/taxon/%s" + + +def parse_args(args=None): + Description = "Get Taxon ID and ODB database value using NCBI API and BUSCO configuration file" + + parser = argparse.ArgumentParser(description=Description) + parser.add_argument("TAXON_QUERY", help="Query string/integer for this taxon.") + parser.add_argument("LINEAGE_TAX_IDS", help="Mapping between BUSCO lineages and taxon IDs.") + parser.add_argument("FILE_OUT", help="Output CSV file.") + parser.add_argument("--version", action="version", version="%(prog)s 1.0") + return parser.parse_args(args) + + +def make_dir(path): + if len(path) > 0: + os.makedirs(path, exist_ok=True) + + +def get_odb(taxon_query, lineage_tax_ids, file_out): + # Read the mapping between the BUSCO lineages and their taxon_id + with open(lineage_tax_ids) as file_in: + lineage_tax_ids_dict = {} + for line in file_in: + arr = line.split() + lineage_tax_ids_dict[int(arr[0])] = arr[1] + + # Using API, get the taxon_ids of the species and all parents + response = requests.get(NCBI_TAXONOMY_API % taxon_query).json() + this_taxon_id = response["taxonomy_nodes"][0]["taxonomy"]["tax_id"] + ancestor_taxon_ids = response["taxonomy_nodes"][0]["taxonomy"]["lineage"] + + # Do the intersection to find the ancestors that have a BUSCO lineage + odb_arr = [ + lineage_tax_ids_dict[taxon_id] for taxon_id in reversed(ancestor_taxon_ids) if taxon_id in lineage_tax_ids_dict + ] + + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + + with open(file_out, "w") as fout: + print("taxon_id", this_taxon_id, sep=",", file=fout) + for odb_val in odb_arr: + print("busco_lineage", odb_val + "_odb10", sep=",", file=fout) + + +def main(args=None): + args = parse_args(args) + get_odb(args.TAXON_QUERY, args.LINEAGE_TAX_IDS, args.FILE_OUT) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/conf/modules.config b/conf/modules.config index ac597dc4..d4b3f897 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -64,10 +64,6 @@ process { ext.args = "-c" } - withName: "GOAT_TAXONSEARCH" { - ext.args = "--lineage --busco" - } - withName: "PIGZ_COMPRESS" { publishDir = [ path: { "${params.outdir}/base_content" }, diff --git a/docs/usage.md b/docs/usage.md index 4789ff84..0619d8e0 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -235,7 +235,6 @@ List of tools for any given dataset can be fetched from the API, for example htt | busco | 5.3.2 | 5.5.0 | | diamond | 2.0.15 | 2.1.8 | | fasta_windows | | 0.2.4 | -| goat | | 0.2.5 | | minimap2 | 2.24 | 2.24 | | ncbi-datasets-cli | 14.1.0 | | | nextflow | | 23.10.0 | diff --git a/modules.json b/modules.json index ebb45a6c..3ce2a831 100644 --- a/modules.json +++ b/modules.json @@ -45,11 +45,6 @@ "installed_by": ["modules"], "patch": "modules/nf-core/fastawindows/fastawindows.diff" }, - "goat/taxonsearch": { - "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] - }, "gunzip": { "branch": "master", "git_sha": "3a5fef109d113b4997c9822198664ca5f2716208", diff --git a/modules/local/get_odb_taxon.nf b/modules/local/get_odb_taxon.nf new file mode 100644 index 00000000..ff061ea3 --- /dev/null +++ b/modules/local/get_odb_taxon.nf @@ -0,0 +1,32 @@ +process NCBI_GET_ODB_TAXON { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::requests=2.26.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/requests:2.26.0': + 'biocontainers/requests:2.26.0' }" + + input: + tuple val(meta), val(taxon_query) + path lineage_tax_ids + + output: + tuple val(meta), path("*.csv"), emit: csv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + get_odb_taxon.py "$taxon_query" $lineage_tax_ids ${prefix}.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + get_odb_taxon.py: \$(get_odb.py --version | cut -d' ' -f2) + END_VERSIONS + """ +} diff --git a/modules/nf-core/goat/taxonsearch/environment.yml b/modules/nf-core/goat/taxonsearch/environment.yml deleted file mode 100644 index e56e71f1..00000000 --- a/modules/nf-core/goat/taxonsearch/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: goat_taxonsearch -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - bioconda::goat=0.2.5 diff --git a/modules/nf-core/goat/taxonsearch/main.nf b/modules/nf-core/goat/taxonsearch/main.nf deleted file mode 100644 index 62c12baa..00000000 --- a/modules/nf-core/goat/taxonsearch/main.nf +++ /dev/null @@ -1,36 +0,0 @@ -process GOAT_TAXONSEARCH { - tag "$meta.id" - label 'process_single' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/goat:0.2.5--h9d3141d_2': - 'biocontainers/goat:0.2.5--h9d3141d_2' }" - - input: - tuple val(meta), val(taxon), path(taxa_file) - - output: - tuple val(meta), path("*.tsv"), emit: taxonsearch - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - input = taxa_file ? "-f ${taxa_file}" : "-t \"${taxon}\"" - if (!taxon && !taxa_file) error "No input. Valid input: single taxon identifier or a .txt file with identifiers" - if (taxon && taxa_file ) error "Only one input is required: a single taxon identifier or a .txt file with identifiers" - """ - goat-cli taxon search \\ - $args \\ - $input > ${prefix}.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - goat: \$(goat-cli --version | cut -d' ' -f2) - END_VERSIONS - """ -} diff --git a/modules/nf-core/goat/taxonsearch/meta.yml b/modules/nf-core/goat/taxonsearch/meta.yml deleted file mode 100644 index 1bb19e30..00000000 --- a/modules/nf-core/goat/taxonsearch/meta.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: "goat_taxonsearch" -description: Query metadata for any taxon across the tree of life. -keywords: - - public datasets - - ncbi - - genomes on a tree -tools: - - goat: - description: | - goat-cli is a command line interface to query the - Genomes on a Tree Open API. - homepage: https://github.com/genomehubs/goat-cli - documentation: https://github.com/genomehubs/goat-cli/wiki - tool_dev_url: https://genomehubs.github.io/goat-cli/goat_cli/ - licence: ["MIT"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test'] - - taxon: - type: string - description: | - The taxon to search. An NCBI taxon ID, or the name of a taxon at any rank. - - taxa_file: - type: file - description: | - A file of NCBI taxonomy ID's (tips) and/or binomial names. Each line - should contain a single entry.File size is limited to 500 entries. - pattern: "*.txt" -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - taxonsearch: - type: file - description: TSV file containing search results. - pattern: "*.tsv" -authors: - - "@alxndrdiaz" -maintainers: - - "@alxndrdiaz" - - "@muffato" diff --git a/nextflow.config b/nextflow.config index 2345a399..38964a66 100644 --- a/nextflow.config +++ b/nextflow.config @@ -28,6 +28,8 @@ params { // Databases and related options taxdump = null busco = null + // Taken from https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt.tar.gz + lineage_tax_ids = "${projectDir}/assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt" blastp = null blastx = null blastn = null diff --git a/nextflow_schema.json b/nextflow_schema.json index b392e2a5..8e7f0fba 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -100,7 +100,7 @@ "type": "object", "fa_icon": "fas fa-database", "description": "Define the location and parameters to work with databases.", - "required": ["blastp", "blastx", "blastn", "taxdump"], + "required": ["blastp", "blastx", "blastn", "lineage_tax_ids", "taxdump"], "properties": { "busco": { "type": "string", @@ -108,6 +108,13 @@ "description": "Local directory where clade-specific BUSCO lineage datasets are stored", "fa_icon": "fas fa-folder-open" }, + "lineage_tax_ids": { + "type": "string", + "format": "file-path", + "description": "Local file that holds a mapping between BUSCO lineages and taxon IDs.", + "help_text": "Initialised from https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt.tar.gz", + "fa_icon": "fas fa-file-code" + }, "blastp": { "type": "string", "format": "file-path", diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf index 2a89471f..5a20823c 100644 --- a/subworkflows/local/busco_diamond_blastp.nf +++ b/subworkflows/local/busco_diamond_blastp.nf @@ -1,8 +1,8 @@ // -// Run BUSCO for a genome from GOAT and runs diamond_blastp +// Run BUSCO for a genome and runs diamond_blastp // -include { GOAT_TAXONSEARCH } from '../../modules/nf-core/goat/taxonsearch/main' +include { NCBI_GET_ODB_TAXON } from '../../modules/local/get_odb_taxon' include { BUSCO } from '../../modules/nf-core/busco/main' include { BLOBTOOLKIT_EXTRACTBUSCOS } from '../../modules/local/blobtoolkit/extractbuscos' include { DIAMOND_BLASTP } from '../../modules/nf-core/diamond/blastp/main' @@ -14,6 +14,7 @@ workflow BUSCO_DIAMOND { fasta // channel: [ val(meta), path(fasta) ] taxon // channel: val(taxon) busco_db // channel: path(busco_db) + lineage_tax_ids // channel: /path/to/lineage_tax_ids blastp // channel: path(blastp_db) @@ -24,22 +25,21 @@ workflow BUSCO_DIAMOND { // // Fetch BUSCO lineages for taxon // - GOAT_TAXONSEARCH ( - fasta.combine(taxon).map { meta, fasta, taxon -> [ meta, taxon, [] ] } + NCBI_GET_ODB_TAXON ( + fasta.combine(taxon).map { meta, fasta, taxon -> [ meta, taxon ] }, + lineage_tax_ids, ) - ch_versions = ch_versions.mix ( GOAT_TAXONSEARCH.out.versions.first() ) + ch_versions = ch_versions.mix ( NCBI_GET_ODB_TAXON.out.versions.first() ) // // Get NCBI species ID // - GOAT_TAXONSEARCH.out.taxonsearch - | map { meta, csv -> csv.splitCsv(header:true, sep:'\t', strip:true) } - | map { row -> [ row.taxon_rank, row.taxon_id ] } - | transpose() - | filter { rank,id -> rank =~ /species/ } - | map { rank, id -> id} - | first + NCBI_GET_ODB_TAXON.out.csv + | map { meta, csv -> csv } + | splitCsv(header: ['key', 'value']) + | filter { it.key == "taxon_id" } + | map { it.value } | set { ch_taxid } @@ -49,10 +49,13 @@ workflow BUSCO_DIAMOND { // 0. Initialise sone variables basal_lineages = [ "eukaryota_odb10", "bacteria_odb10", "archaea_odb10" ] def lineage_position = 0 - // 1. Parse the GOAT_TAXONSEARCH output - GOAT_TAXONSEARCH.out.taxonsearch - | map { meta, csv -> csv.splitCsv(header:true, sep:'\t', strip:true) } - | map { row -> row.odb10_lineage.findAll { it != "" } } + // 1. Parse the NCBI_GET_ODB_TAXON output + NCBI_GET_ODB_TAXON.out.csv + | map { meta, csv -> csv } + | splitCsv(header: ['key', 'value']) + | filter { it.key == "busco_lineage" } + | map { it.value } + | collect // 2. Add the (missing) basal lineages | map { lineages -> (lineages + basal_lineages).unique() } | flatten () diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index 3610cdde..7d8a3a75 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -17,7 +17,7 @@ WorkflowBlobtoolkit.initialise(params, log) // Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config, params.fasta, params.taxdump, params.busco, params.blastp, params.blastx ] +def checkPathParamList = [ params.input, params.multiqc_config, params.fasta, params.taxdump, params.busco, params.blastp, params.blastx, params.lineage_tax_ids ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Check mandatory parameters @@ -30,6 +30,8 @@ if (params.blastn) { ch_blastn = Channel.value([ [ 'id': file(params.blastn).bas if (params.taxdump) { ch_taxdump = file(params.taxdump) } else { exit 1, 'NCBI Taxonomy database not specified!' } if (params.fetchngs_samplesheet && !params.align) { exit 1, '--align not specified, even though the input samplesheet is a nf-core/fetchngs one - i.e has fastq files!' } +if (params.lineage_tax_ids) { ch_lineage_tax_ids = Channel.fromPath(params.lineage_tax_ids) } else { exit 1, 'Mapping BUSCO lineage <-> taxon_ids not specified' } + // Create channel for optional parameters if (params.busco) { ch_busco_db = Channel.fromPath(params.busco).first() } else { ch_busco_db = Channel.value([]) } if (params.yaml) { ch_yaml = Channel.fromPath(params.yaml) } else { ch_yaml = Channel.empty() } @@ -123,12 +125,13 @@ workflow BLOBTOOLKIT { ch_versions = ch_versions.mix ( COVERAGE_STATS.out.versions ) // - // SUBWORKFLOW: Run BUSCO using lineages fetched from GOAT, then run diamond_blastp + // SUBWORKFLOW: Run BUSCO using lineages fetched from NCBI, then run diamond_blastp // BUSCO_DIAMOND ( PREPARE_GENOME.out.genome, ch_taxon, ch_busco_db, + ch_lineage_tax_ids, ch_blastp, ) ch_versions = ch_versions.mix ( BUSCO_DIAMOND.out.versions ) From da7faf9155020d3fb2bda17b3e28310432a28e47 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 28 May 2024 09:15:08 +0000 Subject: [PATCH 06/30] bugfix: Need all lineage names, not just eukaryotes --- ...g_taxids-busco_dataset_name.2019-12-16.txt | 166 ++++++++++++++++++ ...ataset_name.eukaryota_odb10.2019-12-16.txt | 67 ------- nextflow.config | 4 +- nextflow_schema.json | 2 +- 4 files changed, 169 insertions(+), 70 deletions(-) create mode 100644 assets/mapping_taxids-busco_dataset_name.2019-12-16.txt delete mode 100644 assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt diff --git a/assets/mapping_taxids-busco_dataset_name.2019-12-16.txt b/assets/mapping_taxids-busco_dataset_name.2019-12-16.txt new file mode 100644 index 00000000..7105beeb --- /dev/null +++ b/assets/mapping_taxids-busco_dataset_name.2019-12-16.txt @@ -0,0 +1,166 @@ +422676 aconoidasida +7898 actinopterygii +5338 agaricales +155619 agaricomycetes +33630 alveolata +5794 apicomplexa +6854 arachnida +6656 arthropoda +4890 ascomycota +8782 aves +5204 basidiomycota +68889 boletales +3699 brassicales +134362 capnodiales +33554 carnivora +91561 cetartiodactyla +34395 chaetothyriales +3041 chlorophyta +5796 coccidia +28738 cyprinodontiformes +7147 diptera +147541 dothideomycetes +3193 embryophyta +33392 endopterygota +314146 euarchontoglires +33682 euglenozoa +2759 eukaryota +5042 eurotiales +147545 eurotiomycetes +9347 eutheria +72025 fabales +4751 fungi +314147 glires +1028384 glomerellales +5178 helotiales +7524 hemiptera +7399 hymenoptera +5125 hypocreales +50557 insecta +314145 laurasiatheria +147548 leotiomycetes +7088 lepidoptera +4447 liliopsida +40674 mammalia +33208 metazoa +6029 microsporidia +6447 mollusca +4827 mucorales +1913637 mucoromycota +6231 nematoda +33183 onygenales +9126 passeriformes +5820 plasmodium +92860 pleosporales +38820 poales +5303 polyporales +9443 primates +4891 saccharomycetes +8457 sauropsida +4069 solanales +147550 sordariomycetes +33634 stramenopiles +32523 tetrapoda +155616 tremellomycetes +7742 vertebrata +33090 viridiplantae +71240 eudicots +57723 acidobacteria +201174 actinobacteria_phylum +1760 actinobacteria_class +28211 alphaproteobacteria +135622 alteromonadales +200783 aquificae +1385 bacillales +91061 bacilli +2 bacteria +171549 bacteroidales +976 bacteroidetes +68336 bacteroidetes-chlorobi_group +200643 bacteroidia +28216 betaproteobacteria +80840 burkholderiales +213849 campylobacterales +1706369 cellvibrionales +204428 chlamydiae +1090 chlorobi +200795 chloroflexi +135613 chromatiales +1118 chroococcales +186801 clostridia +186802 clostridiales +84999 coriobacteriales +84998 coriobacteriia +85007 corynebacteriales +1117 cyanobacteria +768507 cytophagales +768503 cytophagia +68525 delta-epsilon-subdivisions +28221 deltaproteobacteria +213118 desulfobacterales +213115 desulfovibrionales +69541 desulfuromonadales +91347 enterobacterales +186328 entomoplasmatales +29547 epsilonproteobacteria +1239 firmicutes +200644 flavobacteriales +117743 flavobacteriia +32066 fusobacteria +203491 fusobacteriales +1236 gammaproteobacteria +186826 lactobacillales +118969 legionellales +85006 micrococcales +31969 mollicutes +2085 mycoplasmatales +206351 neisseriales +32003 nitrosomonadales +1161 nostocales +135619 oceanospirillales +1150 oscillatoriales +135625 pasteurellales +203682 planctomycetes +85009 propionibacteriales +1224 proteobacteria +72274 pseudomonadales +356 rhizobiales +227290 rhizobium-agrobacterium_group +204455 rhodobacterales +204441 rhodospirillales +766 rickettsiales +909929 selenomonadales +117747 sphingobacteriia +204457 sphingomonadales +136 spirochaetales +203691 spirochaetes +203692 spirochaetia +85011 streptomycetales +85012 streptosporangiales +1890424 synechococcales +508458 synergistetes +544448 tenericutes +68295 thermoanaerobacterales +200918 thermotogae +72273 thiotrichales +1737405 tissierellales +1737404 tissierellia +74201 verrucomicrobia +135623 vibrionales +135614 xanthomonadales +2157 archaea +2266 thermoproteales +2281 sulfolobales +114380 desulfurococcales +183967 thermoplasmata +651137 thaumarchaeota +2182 methanococcales +2191 methanomicrobiales +183925 methanobacteria +183924 thermoprotei +2235 halobacteriales +1644060 natrialbales +224756 methanomicrobia +1644055 haloferacales +183963 halobacteria +28890 euryarchaeota diff --git a/assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt b/assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt deleted file mode 100644 index 5d8af90f..00000000 --- a/assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt +++ /dev/null @@ -1,67 +0,0 @@ -422676 aconoidasida -7898 actinopterygii -5338 agaricales -155619 agaricomycetes -33630 alveolata -5794 apicomplexa -6854 arachnida -6656 arthropoda -4890 ascomycota -8782 aves -5204 basidiomycota -68889 boletales -3699 brassicales -134362 capnodiales -33554 carnivora -91561 cetartiodactyla -34395 chaetothyriales -3041 chlorophyta -5796 coccidia -28738 cyprinodontiformes -7147 diptera -147541 dothideomycetes -3193 embryophyta -33392 endopterygota -314146 euarchontoglires -33682 euglenozoa -2759 eukaryota -5042 eurotiales -147545 eurotiomycetes -9347 eutheria -72025 fabales -4751 fungi -314147 glires -1028384 glomerellales -5178 helotiales -7524 hemiptera -7399 hymenoptera -5125 hypocreales -50557 insecta -314145 laurasiatheria -147548 leotiomycetes -7088 lepidoptera -4447 liliopsida -40674 mammalia -33208 metazoa -6029 microsporidia -6447 mollusca -4827 mucorales -1913637 mucoromycota -6231 nematoda -33183 onygenales -9126 passeriformes -5820 plasmodium -92860 pleosporales -38820 poales -5303 polyporales -9443 primates -4891 saccharomycetes -8457 sauropsida -4069 solanales -147550 sordariomycetes -33634 stramenopiles -32523 tetrapoda -155616 tremellomycetes -7742 vertebrata -33090 viridiplantae -71240 eudicots diff --git a/nextflow.config b/nextflow.config index 38964a66..1b4db5ed 100644 --- a/nextflow.config +++ b/nextflow.config @@ -28,8 +28,8 @@ params { // Databases and related options taxdump = null busco = null - // Taken from https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt.tar.gz - lineage_tax_ids = "${projectDir}/assets/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt" + // Taken from https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.*.2019-12-16.txt.tar.gz + lineage_tax_ids = "${projectDir}/assets/mapping_taxids-busco_dataset_name.2019-12-16.txt" blastp = null blastx = null blastn = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 8e7f0fba..529faa7e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -112,7 +112,7 @@ "type": "string", "format": "file-path", "description": "Local file that holds a mapping between BUSCO lineages and taxon IDs.", - "help_text": "Initialised from https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.eukaryota_odb10.2019-12-16.txt.tar.gz", + "help_text": "Initialised from https://busco-data.ezlab.org/v5/data/placement_files/mapping_taxids-busco_dataset_name.*.2019-12-16.txt.tar.gz", "fa_icon": "fas fa-file-code" }, "blastp": { From 9c5b4124039e82f9a5a133de053ce4152e4d8bc0 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 13 May 2024 11:26:48 +0100 Subject: [PATCH 07/30] Adding a parameter to choose the busco lineages --- CHANGELOG.md | 10 +++++++ bin/get_odb_taxon.py | 32 +++++++++++++++------- modules/local/get_odb_taxon.nf | 4 ++- nextflow.config | 1 + nextflow_schema.json | 6 ++++ subworkflows/local/busco_diamond_blastp.nf | 2 ++ workflows/blobtoolkit.nf | 4 ++- 7 files changed, 47 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f8f155d..5ca038f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,16 @@ The pipeline has now been validated for draft (unpublished) assemblies. - The pipeline now queries the NCBI database instead of GoaT to establish the taxonomic classification of the species and the relevant Busco lineages. +- New `--busco_lineages` parameter to choose specific Busco lineages instead of + automatically selecting based on the taxonomy. + +### Parameters + +| Old parameter | New parameter | +| ------------- | ---------------- | +| | --busco_lineages | + +> **NB:** Parameter has been **updated** if both old and new parameter information is present.
**NB:** Parameter has been **added** if just the new parameter information is present.
**NB:** Parameter has been **removed** if new parameter information isn't present. ### Software dependencies diff --git a/bin/get_odb_taxon.py b/bin/get_odb_taxon.py index ade35e34..6038122a 100755 --- a/bin/get_odb_taxon.py +++ b/bin/get_odb_taxon.py @@ -16,7 +16,8 @@ def parse_args(args=None): parser.add_argument("TAXON_QUERY", help="Query string/integer for this taxon.") parser.add_argument("LINEAGE_TAX_IDS", help="Mapping between BUSCO lineages and taxon IDs.") parser.add_argument("FILE_OUT", help="Output CSV file.") - parser.add_argument("--version", action="version", version="%(prog)s 1.0") + parser.add_argument("--busco", dest="REQUESTED_BUSCOS", help="Requested BUSCO lineages.", default=None) + parser.add_argument("--version", action="version", version="%(prog)s 1.1") return parser.parse_args(args) @@ -25,23 +26,34 @@ def make_dir(path): os.makedirs(path, exist_ok=True) -def get_odb(taxon_query, lineage_tax_ids, file_out): +def get_odb(taxon_query, lineage_tax_ids, requested_buscos, file_out): # Read the mapping between the BUSCO lineages and their taxon_id with open(lineage_tax_ids) as file_in: lineage_tax_ids_dict = {} for line in file_in: arr = line.split() - lineage_tax_ids_dict[int(arr[0])] = arr[1] + lineage_tax_ids_dict[int(arr[0])] = arr[1] + "_odb10" # Using API, get the taxon_ids of the species and all parents response = requests.get(NCBI_TAXONOMY_API % taxon_query).json() + this_taxon_id = response["taxonomy_nodes"][0]["taxonomy"]["tax_id"] - ancestor_taxon_ids = response["taxonomy_nodes"][0]["taxonomy"]["lineage"] - # Do the intersection to find the ancestors that have a BUSCO lineage - odb_arr = [ - lineage_tax_ids_dict[taxon_id] for taxon_id in reversed(ancestor_taxon_ids) if taxon_id in lineage_tax_ids_dict - ] + if requested_buscos: + odb_arr = requested_buscos.split(",") + valid_odbs = set(lineage_tax_ids_dict.values()) + for odb in odb_arr: + if odb not in valid_odbs: + print(f"Invalid BUSCO lineage: {odb}", file=sys.stderr) + sys.exit(1) + else: + # Do the intersection to find the ancestors that have a BUSCO lineage + ancestor_taxon_ids = response["taxonomy_nodes"][0]["taxonomy"]["lineage"] + odb_arr = [ + lineage_tax_ids_dict[taxon_id] + for taxon_id in reversed(ancestor_taxon_ids) + if taxon_id in lineage_tax_ids_dict + ] out_dir = os.path.dirname(file_out) make_dir(out_dir) @@ -49,12 +61,12 @@ def get_odb(taxon_query, lineage_tax_ids, file_out): with open(file_out, "w") as fout: print("taxon_id", this_taxon_id, sep=",", file=fout) for odb_val in odb_arr: - print("busco_lineage", odb_val + "_odb10", sep=",", file=fout) + print("busco_lineage", odb_val, sep=",", file=fout) def main(args=None): args = parse_args(args) - get_odb(args.TAXON_QUERY, args.LINEAGE_TAX_IDS, args.FILE_OUT) + get_odb(args.TAXON_QUERY, args.LINEAGE_TAX_IDS, args.REQUESTED_BUSCOS, args.FILE_OUT) if __name__ == "__main__": diff --git a/modules/local/get_odb_taxon.nf b/modules/local/get_odb_taxon.nf index ff061ea3..8787c276 100644 --- a/modules/local/get_odb_taxon.nf +++ b/modules/local/get_odb_taxon.nf @@ -9,6 +9,7 @@ process NCBI_GET_ODB_TAXON { input: tuple val(meta), val(taxon_query) + val busco_lin path lineage_tax_ids output: @@ -21,8 +22,9 @@ process NCBI_GET_ODB_TAXON { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" + def busco_param = busco_lin ? "--busco '${busco_lin}'" : "" """ - get_odb_taxon.py "$taxon_query" $lineage_tax_ids ${prefix}.csv + get_odb_taxon.py "$taxon_query" $lineage_tax_ids $busco_param ${prefix}.csv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index 1b4db5ed..67e5bd70 100644 --- a/nextflow.config +++ b/nextflow.config @@ -16,6 +16,7 @@ params { align = false mask = false fetchngs_samplesheet = false + busco_lineages = null // Reference options fasta = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 529faa7e..0a474234 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -43,6 +43,12 @@ "description": "Custom config file for draft assembly", "fa_icon": "fas fa-file-alt" }, + "busco_lineages": { + "type": "string", + "pattern": "([^,]+_odb10,)*[^,]+_odb10", + "description": "Name of the ODB10 lineages to run BUSCO on. By default, the pipeline will automatically selecy the correct lineage based on the taxonomy.", + "fa_icon": "fas fa-bone" + }, "image_format": { "type": "string", "enum": ["png", "svg"], diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf index 5a20823c..696805ed 100644 --- a/subworkflows/local/busco_diamond_blastp.nf +++ b/subworkflows/local/busco_diamond_blastp.nf @@ -13,6 +13,7 @@ workflow BUSCO_DIAMOND { take: fasta // channel: [ val(meta), path(fasta) ] taxon // channel: val(taxon) + busco_lin // channel: val([busco_lin]) busco_db // channel: path(busco_db) lineage_tax_ids // channel: /path/to/lineage_tax_ids blastp // channel: path(blastp_db) @@ -27,6 +28,7 @@ workflow BUSCO_DIAMOND { // NCBI_GET_ODB_TAXON ( fasta.combine(taxon).map { meta, fasta, taxon -> [ meta, taxon ] }, + busco_lin, lineage_tax_ids, ) ch_versions = ch_versions.mix ( NCBI_GET_ODB_TAXON.out.versions.first() ) diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index 7d8a3a75..81b2163e 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -30,9 +30,10 @@ if (params.blastn) { ch_blastn = Channel.value([ [ 'id': file(params.blastn).bas if (params.taxdump) { ch_taxdump = file(params.taxdump) } else { exit 1, 'NCBI Taxonomy database not specified!' } if (params.fetchngs_samplesheet && !params.align) { exit 1, '--align not specified, even though the input samplesheet is a nf-core/fetchngs one - i.e has fastq files!' } -if (params.lineage_tax_ids) { ch_lineage_tax_ids = Channel.fromPath(params.lineage_tax_ids) } else { exit 1, 'Mapping BUSCO lineage <-> taxon_ids not specified' } +if (params.lineage_tax_ids) { ch_lineage_tax_ids = Channel.fromPath(params.lineage_tax_ids).first() } else { exit 1, 'Mapping BUSCO lineage <-> taxon_ids not specified' } // Create channel for optional parameters +if (params.busco_lineages) { ch_busco_lin = Channel.value(params.busco_lineages) } else { ch_busco_lin = Channel.value([]) } if (params.busco) { ch_busco_db = Channel.fromPath(params.busco).first() } else { ch_busco_db = Channel.value([]) } if (params.yaml) { ch_yaml = Channel.fromPath(params.yaml) } else { ch_yaml = Channel.empty() } if (params.yaml && params.accession) { exit 1, '--yaml cannot be provided at the same time as --accession !' } @@ -130,6 +131,7 @@ workflow BLOBTOOLKIT { BUSCO_DIAMOND ( PREPARE_GENOME.out.genome, ch_taxon, + ch_busco_lin, ch_busco_db, ch_lineage_tax_ids, ch_blastp, From cb89c3b9222397566b1368f43bb42b511379262b Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sat, 18 May 2024 11:36:39 +0100 Subject: [PATCH 08/30] Removed the legacy yaml configuration file --- CHANGELOG.md | 3 +++ assets/minimal.yaml | 21 +++++++++++++++++++++ docs/usage.md | 12 +++++------- nextflow.config | 1 - nextflow_schema.json | 6 ------ subworkflows/local/input_check.nf | 7 ++++--- workflows/blobtoolkit.nf | 5 +---- 7 files changed, 34 insertions(+), 21 deletions(-) create mode 100644 assets/minimal.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ca038f1..35c1b810 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,11 +11,14 @@ The pipeline has now been validated for draft (unpublished) assemblies. taxonomic classification of the species and the relevant Busco lineages. - New `--busco_lineages` parameter to choose specific Busco lineages instead of automatically selecting based on the taxonomy. +- All parameters are now passed the regular Nextflow way. There is no support + for the original Yaml configuration files of the Snakemake version. ### Parameters | Old parameter | New parameter | | ------------- | ---------------- | +| --yaml | | | | --busco_lineages | > **NB:** Parameter has been **updated** if both old and new parameter information is present.
**NB:** Parameter has been **added** if just the new parameter information is present.
**NB:** Parameter has been **removed** if new parameter information isn't present. diff --git a/assets/minimal.yaml b/assets/minimal.yaml new file mode 100644 index 00000000..e2ed4465 --- /dev/null +++ b/assets/minimal.yaml @@ -0,0 +1,21 @@ +assembly: + level: bar +settings: + stats_windows: + - 0.1 + - 0.01 + - 100000 + - 1000000 +similarity: + diamond_blastx: + foo: 0 +taxon: + class: class_name + family: family_name + genus: genus_name + kingdom: kingdom_name + name: species_name + order: order_name + phylum: phylum_name + superkingdom: superkingdom_name + taxid: 0 diff --git a/docs/usage.md b/docs/usage.md index 0619d8e0..4d0ad33e 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -162,12 +162,6 @@ If you have [GNU parallel](https://www.gnu.org/software/parallel/) installed, yo find v5/data -name "*.tar.gz" | parallel "cd {//}; tar -xzf {/}" ``` -## YAML File and Nextflow configuration - -As in the Snakemake version [a YAML configuration file](https://github.com/blobtoolkit/blobtoolkit/tree/main/src/blobtoolkit-pipeline/src#configuration) is needed to generate metadata summary. This YAML config file can be generated with a genome accession value for released assemblies (for example, GCA_XXXXXXXXX.X) or can be passed for draft assemblies (for example, [GCA_922984935.2.yaml](assets/test/GCA_922984935.2.yaml) using the `--yaml` parameter. Even for draft assemblies, a placeholder value should be passed with the `--accession` parameter. - -The data in the YAML is currently ignored in the Nextflow pipeline version. The YAML file is retained only to allow compatibility with the BlobDir dataset generated by the [Snakemake version](https://github.com/blobtoolkit/blobtoolkit/tree/main/src/blobtoolkit-pipeline/src). The taxonomic information in the YAML file can be obtained from [NCBI Taxonomy](https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/). - ## Changes from Snakemake to Nextflow ### Commands @@ -189,9 +183,13 @@ Nextflow nextflow run sanger-tol/blobtoolkit --input SAMPLESHEET --fasta GENOME –-accession GCA_ACCESSION --taxon TAXON_ID --taxdump TAXDUMP_DB --blastp DMND_db --blastn BLASTN_DB --blastx BLASTX_DB # Draft Assemblies -nextflow run sanger-tol/blobtoolkit --input SAMPLESHEET --fasta GENOME –-accession TAG --taxon TAXON_ID --yaml CONFIG --taxdump TAXDUMP_DB --blastp DMND_db --blastn BLASTN_DB --blastx BLASTX_DB +nextflow run sanger-tol/blobtoolkit --input SAMPLESHEET --fasta GENOME --taxon TAXON_ID --taxdump TAXDUMP_DB --blastp DMND_db --blastn BLASTN_DB --blastx BLASTX_DB ``` +The Nextflow pipeline does not support taking input from the Yaml files of the Snakemake pipeline. +Instead, Nextflow has a uniform way of setting input parameters on the command-line or via a JSON / Yaml file, +see for some examples. + ### Subworkflows Here is a full list of snakemake subworkflows and their Nextflow couterparts: diff --git a/nextflow.config b/nextflow.config index 67e5bd70..a1ea3ae9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,7 +12,6 @@ params { // Specify your pipeline's command line flags // Input options input = null - yaml = null align = false mask = false fetchngs_samplesheet = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 0a474234..cd0b7341 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -37,12 +37,6 @@ "description": "Turn on the conversion from a nf-core/fetchngs samplesheet.", "fa_icon": "fas fa-toggle-off" }, - "yaml": { - "type": "string", - "format": "file-path", - "description": "Custom config file for draft assembly", - "fa_icon": "fas fa-file-alt" - }, "busco_lineages": { "type": "string", "pattern": "([^,]+_odb10,)*[^,]+_odb10", diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index d498269f..a7a4a017 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -12,7 +12,6 @@ workflow INPUT_CHECK { take: samplesheet // file: /path/to/samplesheet.csv fasta // channel: [ meta, path(fasta) ] - yaml // channel: [ meta, path(config ] main: ch_versions = Channel.empty() @@ -58,7 +57,7 @@ workflow INPUT_CHECK { | set { reads } - if ( !params.yaml ) { + if ( params.accession ) { read_files | map { meta, data -> meta.id.split("_")[0..-2].join("_") } | combine ( fasta ) @@ -70,7 +69,9 @@ workflow INPUT_CHECK { ch_versions = ch_versions.mix ( BLOBTOOLKIT_CONFIG.out.versions.first() ) ch_config = BLOBTOOLKIT_CONFIG.out.yaml } else { - ch_config = yaml + fasta + | map { meta, fa -> [meta, file("${projectDir}/assets/minimal.yaml")] } + | set { ch_config } } emit: diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index 81b2163e..a1318967 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -35,9 +35,6 @@ if (params.lineage_tax_ids) { ch_lineage_tax_ids = Channel.fromPath(params.linea // Create channel for optional parameters if (params.busco_lineages) { ch_busco_lin = Channel.value(params.busco_lineages) } else { ch_busco_lin = Channel.value([]) } if (params.busco) { ch_busco_db = Channel.fromPath(params.busco).first() } else { ch_busco_db = Channel.value([]) } -if (params.yaml) { ch_yaml = Channel.fromPath(params.yaml) } else { ch_yaml = Channel.empty() } -if (params.yaml && params.accession) { exit 1, '--yaml cannot be provided at the same time as --accession !' } -if (!params.yaml && !params.accession) { exit 1, '--yaml and --accession are both mising. Pick one !' } /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -105,7 +102,7 @@ workflow BLOBTOOLKIT { // // SUBWORKFLOW: Check samplesheet and create channels for downstream analysis // - INPUT_CHECK ( ch_input, PREPARE_GENOME.out.genome, ch_yaml ) + INPUT_CHECK ( ch_input, PREPARE_GENOME.out.genome ) ch_versions = ch_versions.mix ( INPUT_CHECK.out.versions ) // From d08ba188e863ee7f2b9dc732275cc5ef8ddbf262 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sat, 18 May 2024 11:49:21 +0100 Subject: [PATCH 09/30] Generate a yaml file internally --- assets/minimal.yaml | 21 --- bin/generate_config.py | 154 +++++++++++++++++++++ bin/get_odb_taxon.py | 73 ---------- modules/local/generate_config.nf | 34 +++++ modules/local/get_odb_taxon.nf | 34 ----- subworkflows/local/busco_diamond_blastp.nf | 20 +-- subworkflows/local/input_check.nf | 19 ++- workflows/blobtoolkit.nf | 12 +- 8 files changed, 215 insertions(+), 152 deletions(-) delete mode 100644 assets/minimal.yaml create mode 100755 bin/generate_config.py delete mode 100755 bin/get_odb_taxon.py create mode 100644 modules/local/generate_config.nf delete mode 100644 modules/local/get_odb_taxon.nf diff --git a/assets/minimal.yaml b/assets/minimal.yaml deleted file mode 100644 index e2ed4465..00000000 --- a/assets/minimal.yaml +++ /dev/null @@ -1,21 +0,0 @@ -assembly: - level: bar -settings: - stats_windows: - - 0.1 - - 0.01 - - 100000 - - 1000000 -similarity: - diamond_blastx: - foo: 0 -taxon: - class: class_name - family: family_name - genus: genus_name - kingdom: kingdom_name - name: species_name - order: order_name - phylum: phylum_name - superkingdom: superkingdom_name - taxid: 0 diff --git a/bin/generate_config.py b/bin/generate_config.py new file mode 100755 index 00000000..8127b40a --- /dev/null +++ b/bin/generate_config.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 + +import argparse +import dataclasses +import os +import sys +import requests +import typing +import yaml + +NCBI_TAXONOMY_API = "https://api.ncbi.nlm.nih.gov/datasets/v1/taxonomy/taxon/%s" + +RANKS = [ + "genus", + "family", + "order", + "class", + "phylum", + "kingdom", + "superkingdom", +] + +STATS_WINDOWS = [ + 0.1, + 0.01, + 100000, + 1000000, +] + + +def parse_args(args=None): + Description = "Produce the various configuration files needed within the pipeline" + + parser = argparse.ArgumentParser(description=Description) + parser.add_argument("FASTA", help="Path to the Fasta file of the assembly.") + parser.add_argument("TAXON_QUERY", help="Query string/integer for this taxon.") + parser.add_argument("LINEAGE_TAX_IDS", help="Mapping between BUSCO lineages and taxon IDs.") + parser.add_argument("YML_OUT", help="Output YML file.") + parser.add_argument("CSV_OUT", help="Output CSV file.") + parser.add_argument("--busco", dest="REQUESTED_BUSCOS", help="Requested BUSCO lineages.", default=None) + parser.add_argument("--version", action="version", version="%(prog)s 1.0") + return parser.parse_args(args) + + +def make_dir(path): + if len(path) > 0: + os.makedirs(path, exist_ok=True) + + +@dataclasses.dataclass +class TaxonInfo: + taxon_id: int + organism_name: str + rank: typing.Optional[str] + lineage: typing.List[int] + + +def make_taxon_info(taxon_name: typing.Union[str, int]) -> TaxonInfo: + # Using API, get the taxon_ids of the species and all parents + response = requests.get(NCBI_TAXONOMY_API % taxon_name).json() + body = response["taxonomy_nodes"][0]["taxonomy"] + return TaxonInfo(body["tax_id"], body["organism_name"], body.get("rank"), body["lineage"]) + + +def get_classification(taxon_info: TaxonInfo) -> typing.Dict[str, str]: + ancestors: typing.Dict[str, str] = {} + for anc_taxon_id in taxon_info.lineage[1:]: # skip the root taxon_id=1 + anc_taxon_info = make_taxon_info(anc_taxon_id) + if anc_taxon_info.rank: + ancestors[anc_taxon_info.rank.lower()] = anc_taxon_info.organism_name + return {r: ancestors[r] for r in RANKS if r in ancestors} + + +def get_odb(taxon_info: TaxonInfo, lineage_tax_ids: str, requested_buscos: typing.Optional[str]) -> typing.List[str]: + + # Read the mapping between the BUSCO lineages and their taxon_id + with open(lineage_tax_ids) as file_in: + lineage_tax_ids_dict: typing.Dict[int, str] = {} + for line in file_in: + arr = line.split() + lineage_tax_ids_dict[int(arr[0])] = arr[1] + "_odb10" + + if requested_buscos: + odb_arr = requested_buscos.split(",") + valid_odbs = set(lineage_tax_ids_dict.values()) + for odb in odb_arr: + if odb not in valid_odbs: + print(f"Invalid BUSCO lineage: {odb}", file=sys.stderr) + sys.exit(1) + else: + # Do the intersection to find the ancestors that have a BUSCO lineage + odb_arr = [ + lineage_tax_ids_dict[taxon_id] + for taxon_id in reversed(taxon_info.lineage) + if taxon_id in lineage_tax_ids_dict + ] + + return odb_arr + + +def print_yaml( + file_out, fasta: str, taxon_info: TaxonInfo, classification: typing.Dict[str, str], odb_arr: typing.List[str] +): + + data = { + "assembly": { + "file": fasta, + "level": "scaffold", + }, + "settings": { + "stats_windows": STATS_WINDOWS, + }, + "similarity": { + "diamond_blastx": { + "foo": 0, + }, + }, + "taxon": { + "taxid": str(taxon_info.taxon_id), # str on purpose, to mimic blobtoolkit + "name": taxon_info.organism_name, + **classification, + }, + } + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + + with open(file_out, "w") as fout: + yaml.dump(data, fout) + + +def print_csv(file_out, taxon_info: TaxonInfo, odb_arr: typing.List[str]): + + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + + with open(file_out, "w") as fout: + print("taxon_id", taxon_info.taxon_id, sep=",", file=fout) + for odb_val in odb_arr: + print("busco_lineage", odb_val, sep=",", file=fout) + + +def main(args=None): + args = parse_args(args) + + taxon_info = make_taxon_info(args.TAXON_QUERY) + classification = get_classification(taxon_info) + odb_arr = get_odb(taxon_info, args.LINEAGE_TAX_IDS, args.REQUESTED_BUSCOS) + + print_yaml(args.YML_OUT, args.FASTA, taxon_info, classification, odb_arr) + print_csv(args.CSV_OUT, taxon_info, odb_arr) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/get_odb_taxon.py b/bin/get_odb_taxon.py deleted file mode 100755 index 6038122a..00000000 --- a/bin/get_odb_taxon.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import os -import sys -import requests - - -NCBI_TAXONOMY_API = "https://api.ncbi.nlm.nih.gov/datasets/v1/taxonomy/taxon/%s" - - -def parse_args(args=None): - Description = "Get Taxon ID and ODB database value using NCBI API and BUSCO configuration file" - - parser = argparse.ArgumentParser(description=Description) - parser.add_argument("TAXON_QUERY", help="Query string/integer for this taxon.") - parser.add_argument("LINEAGE_TAX_IDS", help="Mapping between BUSCO lineages and taxon IDs.") - parser.add_argument("FILE_OUT", help="Output CSV file.") - parser.add_argument("--busco", dest="REQUESTED_BUSCOS", help="Requested BUSCO lineages.", default=None) - parser.add_argument("--version", action="version", version="%(prog)s 1.1") - return parser.parse_args(args) - - -def make_dir(path): - if len(path) > 0: - os.makedirs(path, exist_ok=True) - - -def get_odb(taxon_query, lineage_tax_ids, requested_buscos, file_out): - # Read the mapping between the BUSCO lineages and their taxon_id - with open(lineage_tax_ids) as file_in: - lineage_tax_ids_dict = {} - for line in file_in: - arr = line.split() - lineage_tax_ids_dict[int(arr[0])] = arr[1] + "_odb10" - - # Using API, get the taxon_ids of the species and all parents - response = requests.get(NCBI_TAXONOMY_API % taxon_query).json() - - this_taxon_id = response["taxonomy_nodes"][0]["taxonomy"]["tax_id"] - - if requested_buscos: - odb_arr = requested_buscos.split(",") - valid_odbs = set(lineage_tax_ids_dict.values()) - for odb in odb_arr: - if odb not in valid_odbs: - print(f"Invalid BUSCO lineage: {odb}", file=sys.stderr) - sys.exit(1) - else: - # Do the intersection to find the ancestors that have a BUSCO lineage - ancestor_taxon_ids = response["taxonomy_nodes"][0]["taxonomy"]["lineage"] - odb_arr = [ - lineage_tax_ids_dict[taxon_id] - for taxon_id in reversed(ancestor_taxon_ids) - if taxon_id in lineage_tax_ids_dict - ] - - out_dir = os.path.dirname(file_out) - make_dir(out_dir) - - with open(file_out, "w") as fout: - print("taxon_id", this_taxon_id, sep=",", file=fout) - for odb_val in odb_arr: - print("busco_lineage", odb_val, sep=",", file=fout) - - -def main(args=None): - args = parse_args(args) - get_odb(args.TAXON_QUERY, args.LINEAGE_TAX_IDS, args.REQUESTED_BUSCOS, args.FILE_OUT) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/modules/local/generate_config.nf b/modules/local/generate_config.nf new file mode 100644 index 00000000..8e8478e8 --- /dev/null +++ b/modules/local/generate_config.nf @@ -0,0 +1,34 @@ +process GENERATE_CONFIG { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::requests=2.28.1 conda-forge::pyyaml=6.0" + container "docker.io/genomehubs/blobtoolkit:4.3.9" + + input: + tuple val(meta), val(fasta) + val taxon_query + val busco_lin + path lineage_tax_ids + + output: + tuple val(meta), path("*.yaml"), emit: yaml + tuple val(meta), path("*.csv") , emit: csv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def busco_param = busco_lin ? "--busco '${busco_lin}'" : "" + """ + generate_config.py $fasta "$taxon_query" $lineage_tax_ids $busco_param ${prefix}.yaml ${prefix}.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + generate_config.py: \$(generate_config.py --version | cut -d' ' -f2) + END_VERSIONS + """ +} diff --git a/modules/local/get_odb_taxon.nf b/modules/local/get_odb_taxon.nf deleted file mode 100644 index 8787c276..00000000 --- a/modules/local/get_odb_taxon.nf +++ /dev/null @@ -1,34 +0,0 @@ -process NCBI_GET_ODB_TAXON { - tag "$meta.id" - label 'process_single' - - conda "conda-forge::requests=2.26.0" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/requests:2.26.0': - 'biocontainers/requests:2.26.0' }" - - input: - tuple val(meta), val(taxon_query) - val busco_lin - path lineage_tax_ids - - output: - tuple val(meta), path("*.csv"), emit: csv - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def busco_param = busco_lin ? "--busco '${busco_lin}'" : "" - """ - get_odb_taxon.py "$taxon_query" $lineage_tax_ids $busco_param ${prefix}.csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - get_odb_taxon.py: \$(get_odb.py --version | cut -d' ' -f2) - END_VERSIONS - """ -} diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf index 696805ed..0d084f87 100644 --- a/subworkflows/local/busco_diamond_blastp.nf +++ b/subworkflows/local/busco_diamond_blastp.nf @@ -2,7 +2,6 @@ // Run BUSCO for a genome and runs diamond_blastp // -include { NCBI_GET_ODB_TAXON } from '../../modules/local/get_odb_taxon' include { BUSCO } from '../../modules/nf-core/busco/main' include { BLOBTOOLKIT_EXTRACTBUSCOS } from '../../modules/local/blobtoolkit/extractbuscos' include { DIAMOND_BLASTP } from '../../modules/nf-core/diamond/blastp/main' @@ -12,10 +11,8 @@ include { RESTRUCTUREBUSCODIR } from '../../modules/local/restructurebusco workflow BUSCO_DIAMOND { take: fasta // channel: [ val(meta), path(fasta) ] - taxon // channel: val(taxon) - busco_lin // channel: val([busco_lin]) + csv // channel: [ val(meta), path(csv) ] busco_db // channel: path(busco_db) - lineage_tax_ids // channel: /path/to/lineage_tax_ids blastp // channel: path(blastp_db) @@ -23,21 +20,10 @@ workflow BUSCO_DIAMOND { ch_versions = Channel.empty() - // - // Fetch BUSCO lineages for taxon - // - NCBI_GET_ODB_TAXON ( - fasta.combine(taxon).map { meta, fasta, taxon -> [ meta, taxon ] }, - busco_lin, - lineage_tax_ids, - ) - ch_versions = ch_versions.mix ( NCBI_GET_ODB_TAXON.out.versions.first() ) - - // // Get NCBI species ID // - NCBI_GET_ODB_TAXON.out.csv + csv | map { meta, csv -> csv } | splitCsv(header: ['key', 'value']) | filter { it.key == "taxon_id" } @@ -52,7 +38,7 @@ workflow BUSCO_DIAMOND { basal_lineages = [ "eukaryota_odb10", "bacteria_odb10", "archaea_odb10" ] def lineage_position = 0 // 1. Parse the NCBI_GET_ODB_TAXON output - NCBI_GET_ODB_TAXON.out.csv + csv | map { meta, csv -> csv } | splitCsv(header: ['key', 'value']) | filter { it.key == "busco_lineage" } diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index a7a4a017..466c283e 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -7,11 +7,15 @@ include { SAMTOOLS_FLAGSTAT } from '../../modules/nf-core/samtools/flags include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' include { FETCHNGSSAMPLESHEET_CHECK } from '../../modules/local/fetchngssamplesheet_check' include { BLOBTOOLKIT_CONFIG } from '../../modules/local/blobtoolkit/config' +include { GENERATE_CONFIG } from '../../modules/local/generate_config' workflow INPUT_CHECK { take: samplesheet // file: /path/to/samplesheet.csv fasta // channel: [ meta, path(fasta) ] + taxon // channel: val(taxon) + busco_lin // channel: val([busco_lin]) + lineage_tax_ids // channel: /path/to/lineage_tax_ids main: ch_versions = Channel.empty() @@ -57,6 +61,15 @@ workflow INPUT_CHECK { | set { reads } + GENERATE_CONFIG ( + fasta, + taxon, + busco_lin, + lineage_tax_ids, + ) + ch_versions = ch_versions.mix(GENERATE_CONFIG.out.versions.first()) + + if ( params.accession ) { read_files | map { meta, data -> meta.id.split("_")[0..-2].join("_") } @@ -68,15 +81,15 @@ workflow INPUT_CHECK { BLOBTOOLKIT_CONFIG ( grouped_reads, fasta ) ch_versions = ch_versions.mix ( BLOBTOOLKIT_CONFIG.out.versions.first() ) ch_config = BLOBTOOLKIT_CONFIG.out.yaml + } else { - fasta - | map { meta, fa -> [meta, file("${projectDir}/assets/minimal.yaml")] } - | set { ch_config } + ch_config = GENERATE_CONFIG.out.yaml } emit: reads // channel: [ val(meta), path(datafile) ] config = ch_config // channel: [ val(meta), path(yaml) ] + csv_params = GENERATE_CONFIG.out.csv // channel: [ val(meta), path(csv) ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index a1318967..098dfccd 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -102,7 +102,13 @@ workflow BLOBTOOLKIT { // // SUBWORKFLOW: Check samplesheet and create channels for downstream analysis // - INPUT_CHECK ( ch_input, PREPARE_GENOME.out.genome ) + INPUT_CHECK ( + ch_input, + PREPARE_GENOME.out.genome, + ch_taxon, + ch_busco_lin, + ch_lineage_tax_ids, + ) ch_versions = ch_versions.mix ( INPUT_CHECK.out.versions ) // @@ -127,10 +133,8 @@ workflow BLOBTOOLKIT { // BUSCO_DIAMOND ( PREPARE_GENOME.out.genome, - ch_taxon, - ch_busco_lin, + INPUT_CHECK.out.csv_params, ch_busco_db, - ch_lineage_tax_ids, ch_blastp, ) ch_versions = ch_versions.mix ( BUSCO_DIAMOND.out.versions ) From 4e79fb011965bfa7f2fea5f5b9318f9c7fd3125f Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Sat, 18 May 2024 11:52:18 +0100 Subject: [PATCH 10/30] Hide the CSV from the rest of the pipeline --- subworkflows/local/busco_diamond_blastp.nf | 26 ++++------------- subworkflows/local/input_check.nf | 34 +++++++++++++++++++++- workflows/blobtoolkit.nf | 7 +++-- 3 files changed, 42 insertions(+), 25 deletions(-) diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf index 0d084f87..4b07723e 100644 --- a/subworkflows/local/busco_diamond_blastp.nf +++ b/subworkflows/local/busco_diamond_blastp.nf @@ -11,39 +11,24 @@ include { RESTRUCTUREBUSCODIR } from '../../modules/local/restructurebusco workflow BUSCO_DIAMOND { take: fasta // channel: [ val(meta), path(fasta) ] - csv // channel: [ val(meta), path(csv) ] + busco_lin // channel: val([busco_lineages]) busco_db // channel: path(busco_db) blastp // channel: path(blastp_db) + taxon_id // channel: val(taxon_id) main: ch_versions = Channel.empty() - // - // Get NCBI species ID - // - csv - | map { meta, csv -> csv } - | splitCsv(header: ['key', 'value']) - | filter { it.key == "taxon_id" } - | map { it.value } - | set { ch_taxid } - - // // Prepare the BUSCO linages // // 0. Initialise sone variables basal_lineages = [ "eukaryota_odb10", "bacteria_odb10", "archaea_odb10" ] def lineage_position = 0 - // 1. Parse the NCBI_GET_ODB_TAXON output - csv - | map { meta, csv -> csv } - | splitCsv(header: ['key', 'value']) - | filter { it.key == "busco_lineage" } - | map { it.value } - | collect + // 1. Start from the taxon's lineages + busco_lin // 2. Add the (missing) basal lineages | map { lineages -> (lineages + basal_lineages).unique() } | flatten () @@ -108,7 +93,7 @@ workflow BUSCO_DIAMOND { // Hardcoded to match the format expected by blobtools def outext = 'txt' def cols = 'qseqid staxids bitscore qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore' - DIAMOND_BLASTP ( ch_busco_genes, blastp, outext, cols, ch_taxid ) + DIAMOND_BLASTP ( ch_busco_genes, blastp, outext, cols, taxon_id ) ch_versions = ch_versions.mix ( DIAMOND_BLASTP.out.versions.first() ) @@ -139,7 +124,6 @@ workflow BUSCO_DIAMOND { first_table = ch_first_table // channel: [ val(meta), path(full_table) ] all_tables = ch_indexed_buscos // channel: [ val(meta), path(full_tables) ] blastp_txt = DIAMOND_BLASTP.out.txt // channel: [ val(meta), path(txt) ] - taxon_id = ch_taxid // channel: taxon_id multiqc // channel: [ meta, summary ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 466c283e..7da0d1b7 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -70,6 +70,37 @@ workflow INPUT_CHECK { ch_versions = ch_versions.mix(GENERATE_CONFIG.out.versions.first()) + // + // Parse the CSV file + // + GENERATE_CONFIG.out.csv + | map { meta, csv -> csv } + | splitCsv(header: ['key', 'value']) + | branch { + taxon_id: it.key == "taxon_id" + return it.value + busco_lineage: it.key == "busco_lineage" + return it.value + } + | set { ch_parsed_csv } + + + // + // Get the taxon ID + // + ch_parsed_csv.taxon_id + | first + | set { ch_taxon_id } + + + // + // Get the BUSCO linages + // + ch_parsed_csv.busco_lineage + | collect + | set { ch_busco_lineages } + + if ( params.accession ) { read_files | map { meta, data -> meta.id.split("_")[0..-2].join("_") } @@ -89,7 +120,8 @@ workflow INPUT_CHECK { emit: reads // channel: [ val(meta), path(datafile) ] config = ch_config // channel: [ val(meta), path(yaml) ] - csv_params = GENERATE_CONFIG.out.csv // channel: [ val(meta), path(csv) ] + taxon_id = ch_taxon_id // channel: val(taxon_id) + busco_lineages = ch_busco_lineages // channel: val([busco_lin]) versions = ch_versions // channel: [ versions.yml ] } diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index 098dfccd..4c557af4 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -133,9 +133,10 @@ workflow BLOBTOOLKIT { // BUSCO_DIAMOND ( PREPARE_GENOME.out.genome, - INPUT_CHECK.out.csv_params, + INPUT_CHECK.out.busco_lineages, ch_busco_db, ch_blastp, + INPUT_CHECK.out.taxon_id, ) ch_versions = ch_versions.mix ( BUSCO_DIAMOND.out.versions ) @@ -146,7 +147,7 @@ workflow BLOBTOOLKIT { PREPARE_GENOME.out.genome, BUSCO_DIAMOND.out.first_table, ch_blastx, - BUSCO_DIAMOND.out.taxon_id, + INPUT_CHECK.out.taxon_id, ) ch_versions = ch_versions.mix ( RUN_BLASTX.out.versions ) @@ -158,7 +159,7 @@ workflow BLOBTOOLKIT { RUN_BLASTX.out.blastx_out, PREPARE_GENOME.out.genome, ch_blastn, - BUSCO_DIAMOND.out.taxon_id, + INPUT_CHECK.out.taxon_id, ) // From 4c7cef6f782b3e22fe4310701443dbcf5384ac4c Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 20 May 2024 15:21:32 +0100 Subject: [PATCH 11/30] Generate a more complete yaml to match the one we get from blobtoolkit --- bin/generate_config.py | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/bin/generate_config.py b/bin/generate_config.py index 8127b40a..da2fd278 100755 --- a/bin/generate_config.py +++ b/bin/generate_config.py @@ -20,12 +20,7 @@ "superkingdom", ] -STATS_WINDOWS = [ - 0.1, - 0.01, - 100000, - 1000000, -] +BUSCO_BASAL_LINEAGES = ["eukaryota_odb10", "bacteria_odb10", "archaea_odb10"] def parse_args(args=None): @@ -38,7 +33,7 @@ def parse_args(args=None): parser.add_argument("YML_OUT", help="Output YML file.") parser.add_argument("CSV_OUT", help="Output CSV file.") parser.add_argument("--busco", dest="REQUESTED_BUSCOS", help="Requested BUSCO lineages.", default=None) - parser.add_argument("--version", action="version", version="%(prog)s 1.0") + parser.add_argument("--version", action="version", version="%(prog)s 1.1") return parser.parse_args(args) @@ -104,15 +99,38 @@ def print_yaml( data = { "assembly": { + # Other attributes can't be filled in for draft assemblies "file": fasta, "level": "scaffold", }, + "busco": { + "basal_lineages": BUSCO_BASAL_LINEAGES, + "lineages": odb_arr + [lin for lin in BUSCO_BASAL_LINEAGES if lin not in odb_arr], + }, + "revision": 1, "settings": { - "stats_windows": STATS_WINDOWS, + # Only settings.stats_windows is mandatory, everything else is superfluous + "blast_chunk": 100000, + "blast_max_chunks": 10, + "blast_min_length": 1000, + "blast_overlap": 0, + "stats_chunk": 1000, + "stats_windows": [0.1, 0.01, 100000, 1000000], + "tmp": "/tmp", }, "similarity": { + # Only the presence similarity.diamond_blastx seems mandatory, everything else is superfluous + "blastn": { + "name": "nt", + }, + "defaults": {"evalue": 1e-10, "import_evalue": 1e-25, "max_target_seqs": 10, "taxrule": "buscogenes"}, + "diamond_blastp": { + "import_max_target_seqs": 100000, + "name": "reference_proteomes", + "taxrule": "blastp=buscogenes", + }, "diamond_blastx": { - "foo": 0, + "name": "reference_proteomes", }, }, "taxon": { @@ -120,6 +138,7 @@ def print_yaml( "name": taxon_info.organism_name, **classification, }, + "version": 2, } out_dir = os.path.dirname(file_out) make_dir(out_dir) From eb871935480ea6fea82b4c12e4afa3301f3b8299 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 20 May 2024 15:22:25 +0100 Subject: [PATCH 12/30] Update the database paths in the final meta.json --- bin/generate_config.py | 6 ++++++ bin/update_versions.py | 21 ++++++++++++++++----- modules/local/blobtoolkit/updatemeta.nf | 9 +++++++++ subworkflows/local/finalise_blobdir.nf | 9 ++++++++- 4 files changed, 39 insertions(+), 6 deletions(-) diff --git a/bin/generate_config.py b/bin/generate_config.py index da2fd278..e8a40f33 100755 --- a/bin/generate_config.py +++ b/bin/generate_config.py @@ -105,8 +105,10 @@ def print_yaml( }, "busco": { "basal_lineages": BUSCO_BASAL_LINEAGES, + # "download_dir": "lineages": odb_arr + [lin for lin in BUSCO_BASAL_LINEAGES if lin not in odb_arr], }, + # TODO: reads "revision": 1, "settings": { # Only settings.stats_windows is mandatory, everything else is superfluous @@ -116,21 +118,25 @@ def print_yaml( "blast_overlap": 0, "stats_chunk": 1000, "stats_windows": [0.1, 0.01, 100000, 1000000], + # "taxdump": , "tmp": "/tmp", }, "similarity": { # Only the presence similarity.diamond_blastx seems mandatory, everything else is superfluous "blastn": { "name": "nt", + # "path": , }, "defaults": {"evalue": 1e-10, "import_evalue": 1e-25, "max_target_seqs": 10, "taxrule": "buscogenes"}, "diamond_blastp": { "import_max_target_seqs": 100000, "name": "reference_proteomes", + # "path": , "taxrule": "blastp=buscogenes", }, "diamond_blastx": { "name": "reference_proteomes", + # "path": , }, }, "taxon": { diff --git a/bin/update_versions.py b/bin/update_versions.py index 9e014b46..f9c5f4b5 100755 --- a/bin/update_versions.py +++ b/bin/update_versions.py @@ -15,15 +15,19 @@ def parse_args(args=None): parser.add_argument("--meta_in", help="Input JSON file.", required=True) parser.add_argument("--meta_out", help="Output JSON file.", required=True) parser.add_argument("--software", help="Input YAML file.", required=True) - parser.add_argument("--version", action="version", version="%(prog)s 1.1.0") + parser.add_argument("--blastp", help="Path to the blastp database", required=True) + parser.add_argument("--blastx", help="Path to the blastx database", required=True) + parser.add_argument("--blastn", help="Path to the blastn database", required=True) + parser.add_argument("--taxdump", help="Path to the taxonomy database", required=True) + parser.add_argument("--version", action="version", version="%(prog)s 1.2.0") return parser.parse_args(args) -def update_meta(meta, software): - with open(meta) as fh: +def update_meta(args): + with open(args.meta_in) as fh: infile = json.load(fh) - with open(software) as fh: + with open(args.software) as fh: versions = yaml.safe_load(fh) new_dict = dict() @@ -36,13 +40,20 @@ def update_meta(meta, software): del new_dict["sanger-tol/blobtoolkit"] infile["settings"]["software_versions"] = new_dict + infile["settings"]["taxdump"] = args.taxdump + for k in ["blastn", "diamond_blastp", "diamond_blastx"]: + infile["similarity"].setdefault(k, {}) + infile["similarity"]["blastn"]["path"] = args.blastn + infile["similarity"]["diamond_blastp"]["path"] = args.blastp + infile["similarity"]["diamond_blastx"]["path"] = args.blastx + return infile def main(args=None): args = parse_args(args) - data = update_meta(args.meta_in, args.software) + data = update_meta(args) with open(args.meta_out, "w") as fh: json.dump(data, fh) diff --git a/modules/local/blobtoolkit/updatemeta.nf b/modules/local/blobtoolkit/updatemeta.nf index a5556348..f39bee85 100644 --- a/modules/local/blobtoolkit/updatemeta.nf +++ b/modules/local/blobtoolkit/updatemeta.nf @@ -10,6 +10,11 @@ process BLOBTOOLKIT_UPDATEMETA { input: tuple val(meta), path(input) path versions + // The following are passed as "val" because we just want to know the full paths. No staging necessary + val blastp + val blastx + val blastn + val taxdump output: tuple val(meta), path("*.json"), emit: json @@ -26,6 +31,10 @@ process BLOBTOOLKIT_UPDATEMETA { ${args} \\ --meta_in ${input}/meta.json \\ --software ${versions} \\ + --blastp ${blastp} \\ + --blastx ${blastx} \\ + --blastn ${blastn} \\ + --taxdump ${taxdump} \\ --meta_out ${prefix}.meta.json cat <<-END_VERSIONS > versions.yml diff --git a/subworkflows/local/finalise_blobdir.nf b/subworkflows/local/finalise_blobdir.nf index ffbbd534..a993c705 100644 --- a/subworkflows/local/finalise_blobdir.nf +++ b/subworkflows/local/finalise_blobdir.nf @@ -18,7 +18,14 @@ workflow FINALISE_BLOBDIR { // // MODULE: Update meta json file // - BLOBTOOLKIT_UPDATEMETA ( blobdir, software ) + BLOBTOOLKIT_UPDATEMETA ( + blobdir, + software, + params.blastp, + params.blastx, + params.blastn, + params.taxdump, + ) // // MODULE: Compress all the json files From f4c82fb02951d182aa45735ac7a3de677546dfe2 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 20 May 2024 16:05:07 +0100 Subject: [PATCH 13/30] Fill in the reads too --- bin/generate_config.py | 2 +- bin/update_versions.py | 21 +++++++++++++++++++++ modules/local/blobtoolkit/updatemeta.nf | 3 +++ subworkflows/local/finalise_blobdir.nf | 2 ++ workflows/blobtoolkit.nf | 1 + 5 files changed, 28 insertions(+), 1 deletion(-) diff --git a/bin/generate_config.py b/bin/generate_config.py index e8a40f33..ed5868e5 100755 --- a/bin/generate_config.py +++ b/bin/generate_config.py @@ -108,7 +108,7 @@ def print_yaml( # "download_dir": "lineages": odb_arr + [lin for lin in BUSCO_BASAL_LINEAGES if lin not in odb_arr], }, - # TODO: reads + # "reads": {}, "revision": 1, "settings": { # Only settings.stats_windows is mandatory, everything else is superfluous diff --git a/bin/update_versions.py b/bin/update_versions.py index f9c5f4b5..1d3491a3 100755 --- a/bin/update_versions.py +++ b/bin/update_versions.py @@ -15,6 +15,9 @@ def parse_args(args=None): parser.add_argument("--meta_in", help="Input JSON file.", required=True) parser.add_argument("--meta_out", help="Output JSON file.", required=True) parser.add_argument("--software", help="Input YAML file.", required=True) + parser.add_argument("--read_id", action="append", help="ID of a read set") + parser.add_argument("--read_type", action="append", help="Type of a read set") + parser.add_argument("--read_path", action="append", help="Path of a read set") parser.add_argument("--blastp", help="Path to the blastp database", required=True) parser.add_argument("--blastx", help="Path to the blastx database", required=True) parser.add_argument("--blastn", help="Path to the blastn database", required=True) @@ -23,6 +26,17 @@ def parse_args(args=None): return parser.parse_args(args) +def datatype_to_platform(s): + if s == "ont": + return "OXFORD_NANOPORE" + elif s.startswith("pacbio"): + return "PACBIO_SMRT" + elif s in ["hic", "illumina"]: + return "ILLUMINA" + else: + return "OTHER" + + def update_meta(args): with open(args.meta_in) as fh: infile = json.load(fh) @@ -47,6 +61,13 @@ def update_meta(args): infile["similarity"]["diamond_blastp"]["path"] = args.blastp infile["similarity"]["diamond_blastx"]["path"] = args.blastx + infile["reads"] = {} + for id, datatype, path in zip(args.read_id, args.read_type, args.read_path): + infile["reads"][id] = { + "file": path, + "plaform": datatype_to_platform(datatype), + } + return infile diff --git a/modules/local/blobtoolkit/updatemeta.nf b/modules/local/blobtoolkit/updatemeta.nf index f39bee85..0c3aaaaf 100644 --- a/modules/local/blobtoolkit/updatemeta.nf +++ b/modules/local/blobtoolkit/updatemeta.nf @@ -9,6 +9,7 @@ process BLOBTOOLKIT_UPDATEMETA { input: tuple val(meta), path(input) + val reads path versions // The following are passed as "val" because we just want to know the full paths. No staging necessary val blastp @@ -26,6 +27,7 @@ process BLOBTOOLKIT_UPDATEMETA { script: def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" + def input_reads = reads.collect{"--read_id ${it[0].id} --read_type ${it[0].datatype} --read_path ${it[1]}"}.join(' ') """ update_versions.py \\ ${args} \\ @@ -35,6 +37,7 @@ process BLOBTOOLKIT_UPDATEMETA { --blastx ${blastx} \\ --blastn ${blastn} \\ --taxdump ${taxdump} \\ + $input_reads \\ --meta_out ${prefix}.meta.json cat <<-END_VERSIONS > versions.yml diff --git a/subworkflows/local/finalise_blobdir.nf b/subworkflows/local/finalise_blobdir.nf index a993c705..352cb01b 100644 --- a/subworkflows/local/finalise_blobdir.nf +++ b/subworkflows/local/finalise_blobdir.nf @@ -8,6 +8,7 @@ include { COMPRESSBLOBDIR } from '../../modules/local/compressblobdir' workflow FINALISE_BLOBDIR { take: blobdir // channel: [ val(meta), path(blobdir) ] + reads // channel: [ [meta, reads] ] software // channel: [ val(meta), path(software_yml) ] summary // channel: [ val(meta), path(summary_json) ] @@ -20,6 +21,7 @@ workflow FINALISE_BLOBDIR { // BLOBTOOLKIT_UPDATEMETA ( blobdir, + reads, software, params.blastp, params.blastx, diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index 4c557af4..d608803a 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -206,6 +206,7 @@ workflow BLOBTOOLKIT { // FINALISE_BLOBDIR ( BLOBTOOLS.out.blobdir, + INPUT_CHECK.out.reads.collect(flat: false).ifEmpty([]), CUSTOM_DUMPSOFTWAREVERSIONS.out.yml, VIEW.out.summary ) From a28690faca5f25087163203903c7c5b3a558833e Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 20 May 2024 17:22:16 +0100 Subject: [PATCH 14/30] Fill in the assembly information too --- bin/generate_config.py | 48 ++++++++++++++++++++++++++------ modules/local/generate_config.nf | 3 +- 2 files changed, 42 insertions(+), 9 deletions(-) diff --git a/bin/generate_config.py b/bin/generate_config.py index ed5868e5..cc95dbd9 100755 --- a/bin/generate_config.py +++ b/bin/generate_config.py @@ -9,6 +9,7 @@ import yaml NCBI_TAXONOMY_API = "https://api.ncbi.nlm.nih.gov/datasets/v1/taxonomy/taxon/%s" +NCBI_DATASETS_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/%s/dataset_report" RANKS = [ "genus", @@ -32,8 +33,11 @@ def parse_args(args=None): parser.add_argument("LINEAGE_TAX_IDS", help="Mapping between BUSCO lineages and taxon IDs.") parser.add_argument("YML_OUT", help="Output YML file.") parser.add_argument("CSV_OUT", help="Output CSV file.") + parser.add_argument( + "--accession", dest="ACCESSION", help="Accession number of the assembly (optional).", default=None + ) parser.add_argument("--busco", dest="REQUESTED_BUSCOS", help="Requested BUSCO lineages.", default=None) - parser.add_argument("--version", action="version", version="%(prog)s 1.1") + parser.add_argument("--version", action="version", version="%(prog)s 1.2") return parser.parse_args(args) @@ -93,16 +97,37 @@ def get_odb(taxon_info: TaxonInfo, lineage_tax_ids: str, requested_buscos: typin return odb_arr +def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int]]: + response = requests.get(NCBI_DATASETS_API % accession).json() + if response["total_count"] != 1: + print(f"Assembly not found: {accession}", file=sys.stderr) + sys.exit(1) + assembly_report = response["reports"][0] + assembly_info = assembly_report["assembly_info"] + return { + "accession": accession, + "alias": assembly_info["assembly_name"], + "bioproject": assembly_info["bioproject_accession"], + "biosample": assembly_info["biosample"]["accession"], + "level": assembly_info["assembly_level"].lower(), + "prefix": assembly_report["wgs_info"]["wgs_project_accession"], + "scaffold-count": assembly_report["assembly_stats"]["number_of_component_sequences"] + + assembly_report["assembly_stats"]["number_of_organelles"], + "span": int(assembly_report["assembly_stats"]["total_sequence_length"]) + + sum(int(oi["total_seq_length"]) for oi in assembly_report["organelle_info"]), + } + + def print_yaml( - file_out, fasta: str, taxon_info: TaxonInfo, classification: typing.Dict[str, str], odb_arr: typing.List[str] + file_out, + assembly_info: typing.Dict[str, typing.Union[str, int]], + taxon_info: TaxonInfo, + classification: typing.Dict[str, str], + odb_arr: typing.List[str], ): data = { - "assembly": { - # Other attributes can't be filled in for draft assemblies - "file": fasta, - "level": "scaffold", - }, + "assembly": assembly_info, "busco": { "basal_lineages": BUSCO_BASAL_LINEAGES, # "download_dir": @@ -167,11 +192,18 @@ def print_csv(file_out, taxon_info: TaxonInfo, odb_arr: typing.List[str]): def main(args=None): args = parse_args(args) + assembly_info: typing.Dict[str, typing.Union[str, int]] + if args.ACCESSION: + assembly_info = get_assembly_info(args.ACCESSION) + else: + assembly_info = {"level": "scaffold"} + assembly_info["file"] = args.FASTA + taxon_info = make_taxon_info(args.TAXON_QUERY) classification = get_classification(taxon_info) odb_arr = get_odb(taxon_info, args.LINEAGE_TAX_IDS, args.REQUESTED_BUSCOS) - print_yaml(args.YML_OUT, args.FASTA, taxon_info, classification, odb_arr) + print_yaml(args.YML_OUT, assembly_info, taxon_info, classification, odb_arr) print_csv(args.CSV_OUT, taxon_info, odb_arr) diff --git a/modules/local/generate_config.nf b/modules/local/generate_config.nf index 8e8478e8..f80b1a0c 100644 --- a/modules/local/generate_config.nf +++ b/modules/local/generate_config.nf @@ -23,8 +23,9 @@ process GENERATE_CONFIG { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" def busco_param = busco_lin ? "--busco '${busco_lin}'" : "" + def accession_params = params.accession ? "--accession ${params.accession}" : "" """ - generate_config.py $fasta "$taxon_query" $lineage_tax_ids $busco_param ${prefix}.yaml ${prefix}.csv + generate_config.py $fasta "$taxon_query" $lineage_tax_ids $busco_param $accession_params ${prefix}.yaml ${prefix}.csv cat <<-END_VERSIONS > versions.yml "${task.process}": From a514f2f48d16cf2b396fe10319226797bbdfe05d Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 20 May 2024 17:30:57 +0100 Subject: [PATCH 15/30] No need to generate the reference initial yaml file --- modules/local/blobtoolkit/config.nf | 37 ----------------------------- subworkflows/local/input_check.nf | 19 +-------------- 2 files changed, 1 insertion(+), 55 deletions(-) delete mode 100644 modules/local/blobtoolkit/config.nf diff --git a/modules/local/blobtoolkit/config.nf b/modules/local/blobtoolkit/config.nf deleted file mode 100644 index 32d4eacd..00000000 --- a/modules/local/blobtoolkit/config.nf +++ /dev/null @@ -1,37 +0,0 @@ -process BLOBTOOLKIT_CONFIG { - tag "$meta.id" - label 'process_single' - - if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { - exit 1, "GENERATE_CONFIG module does not support Conda. Please use Docker / Singularity / Podman instead." - } - container "docker.io/genomehubs/blobtoolkit:4.3.9" - - input: - tuple val(meta), val(reads) - tuple val(meta), val(fasta) - - output: - tuple val(meta), path("${meta.id}/*.yaml"), emit: yaml - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def input_reads = reads.collect{"--reads $it"}.join(' ') - """ - btk pipeline \\ - generate-config \\ - ${prefix} \\ - $args \\ - ${input_reads} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - blobtoolkit: \$(btk --version | cut -d' ' -f2 | sed 's/v//') - END_VERSIONS - """ -} diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 7da0d1b7..ef2a4e42 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -6,7 +6,6 @@ include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' include { SAMTOOLS_FLAGSTAT } from '../../modules/nf-core/samtools/flagstat/main' include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' include { FETCHNGSSAMPLESHEET_CHECK } from '../../modules/local/fetchngssamplesheet_check' -include { BLOBTOOLKIT_CONFIG } from '../../modules/local/blobtoolkit/config' include { GENERATE_CONFIG } from '../../modules/local/generate_config' workflow INPUT_CHECK { @@ -101,25 +100,9 @@ workflow INPUT_CHECK { | set { ch_busco_lineages } - if ( params.accession ) { - read_files - | map { meta, data -> meta.id.split("_")[0..-2].join("_") } - | combine ( fasta ) - | map { sample, meta, fasta -> [ meta, sample ] } - | groupTuple() - | set { grouped_reads } - - BLOBTOOLKIT_CONFIG ( grouped_reads, fasta ) - ch_versions = ch_versions.mix ( BLOBTOOLKIT_CONFIG.out.versions.first() ) - ch_config = BLOBTOOLKIT_CONFIG.out.yaml - - } else { - ch_config = GENERATE_CONFIG.out.yaml - } - emit: reads // channel: [ val(meta), path(datafile) ] - config = ch_config // channel: [ val(meta), path(yaml) ] + config = GENERATE_CONFIG.out.yaml // channel: [ val(meta), path(yaml) ] taxon_id = ch_taxon_id // channel: val(taxon_id) busco_lineages = ch_busco_lineages // channel: val([busco_lin]) versions = ch_versions // channel: [ versions.yml ] From df6d7d87fec1976884d6586813f69d987b52bf52 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 21 May 2024 10:08:14 +0100 Subject: [PATCH 16/30] Switched to the newer endpoint --- bin/generate_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/generate_config.py b/bin/generate_config.py index cc95dbd9..37a39ea5 100755 --- a/bin/generate_config.py +++ b/bin/generate_config.py @@ -8,7 +8,7 @@ import typing import yaml -NCBI_TAXONOMY_API = "https://api.ncbi.nlm.nih.gov/datasets/v1/taxonomy/taxon/%s" +NCBI_TAXONOMY_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/taxonomy/taxon/%s" NCBI_DATASETS_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/%s/dataset_report" RANKS = [ @@ -37,7 +37,7 @@ def parse_args(args=None): "--accession", dest="ACCESSION", help="Accession number of the assembly (optional).", default=None ) parser.add_argument("--busco", dest="REQUESTED_BUSCOS", help="Requested BUSCO lineages.", default=None) - parser.add_argument("--version", action="version", version="%(prog)s 1.2") + parser.add_argument("--version", action="version", version="%(prog)s 1.3") return parser.parse_args(args) From e190bddce636f447cac91ff29e68ccb47679da69 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 24 May 2024 11:43:35 +0000 Subject: [PATCH 17/30] Introduced --parameters to have flexibility regarding their order --- bin/generate_config.py | 33 ++++++++++++++------------------ modules/local/generate_config.nf | 9 ++++++++- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/bin/generate_config.py b/bin/generate_config.py index 37a39ea5..99bf70db 100755 --- a/bin/generate_config.py +++ b/bin/generate_config.py @@ -28,15 +28,13 @@ def parse_args(args=None): Description = "Produce the various configuration files needed within the pipeline" parser = argparse.ArgumentParser(description=Description) - parser.add_argument("FASTA", help="Path to the Fasta file of the assembly.") - parser.add_argument("TAXON_QUERY", help="Query string/integer for this taxon.") - parser.add_argument("LINEAGE_TAX_IDS", help="Mapping between BUSCO lineages and taxon IDs.") - parser.add_argument("YML_OUT", help="Output YML file.") - parser.add_argument("CSV_OUT", help="Output CSV file.") - parser.add_argument( - "--accession", dest="ACCESSION", help="Accession number of the assembly (optional).", default=None - ) - parser.add_argument("--busco", dest="REQUESTED_BUSCOS", help="Requested BUSCO lineages.", default=None) + parser.add_argument("--fasta", help="Path to the Fasta file of the assembly.") + parser.add_argument("--taxon_query", help="Query string/integer for this taxon.") + parser.add_argument("--lineage_tax_ids", help="Mapping between BUSCO lineages and taxon IDs.") + parser.add_argument("--yml_out", help="Output YML file.") + parser.add_argument("--csv_out", help="Output CSV file.") + parser.add_argument("--accession", help="Accession number of the assembly (optional).", default=None) + parser.add_argument("--busco", help="Requested BUSCO lineages.", default=None) parser.add_argument("--version", action="version", version="%(prog)s 1.3") return parser.parse_args(args) @@ -71,7 +69,6 @@ def get_classification(taxon_info: TaxonInfo) -> typing.Dict[str, str]: def get_odb(taxon_info: TaxonInfo, lineage_tax_ids: str, requested_buscos: typing.Optional[str]) -> typing.List[str]: - # Read the mapping between the BUSCO lineages and their taxon_id with open(lineage_tax_ids) as file_in: lineage_tax_ids_dict: typing.Dict[int, str] = {} @@ -125,7 +122,6 @@ def print_yaml( classification: typing.Dict[str, str], odb_arr: typing.List[str], ): - data = { "assembly": assembly_info, "busco": { @@ -179,7 +175,6 @@ def print_yaml( def print_csv(file_out, taxon_info: TaxonInfo, odb_arr: typing.List[str]): - out_dir = os.path.dirname(file_out) make_dir(out_dir) @@ -193,18 +188,18 @@ def main(args=None): args = parse_args(args) assembly_info: typing.Dict[str, typing.Union[str, int]] - if args.ACCESSION: - assembly_info = get_assembly_info(args.ACCESSION) + if args.accession: + assembly_info = get_assembly_info(args.accession) else: assembly_info = {"level": "scaffold"} - assembly_info["file"] = args.FASTA + assembly_info["file"] = args.fasta - taxon_info = make_taxon_info(args.TAXON_QUERY) + taxon_info = make_taxon_info(args.taxon_query) classification = get_classification(taxon_info) - odb_arr = get_odb(taxon_info, args.LINEAGE_TAX_IDS, args.REQUESTED_BUSCOS) + odb_arr = get_odb(taxon_info, args.lineage_tax_ids, args.busco) - print_yaml(args.YML_OUT, assembly_info, taxon_info, classification, odb_arr) - print_csv(args.CSV_OUT, taxon_info, odb_arr) + print_yaml(args.yml_out, assembly_info, taxon_info, classification, odb_arr) + print_csv(args.csv_out, taxon_info, odb_arr) if __name__ == "__main__": diff --git a/modules/local/generate_config.nf b/modules/local/generate_config.nf index f80b1a0c..95ed385d 100644 --- a/modules/local/generate_config.nf +++ b/modules/local/generate_config.nf @@ -25,7 +25,14 @@ process GENERATE_CONFIG { def busco_param = busco_lin ? "--busco '${busco_lin}'" : "" def accession_params = params.accession ? "--accession ${params.accession}" : "" """ - generate_config.py $fasta "$taxon_query" $lineage_tax_ids $busco_param $accession_params ${prefix}.yaml ${prefix}.csv + generate_config.py \\ + --fasta $fasta \\ + --taxon_query "$taxon_query" \\ + --lineage_tax_ids $lineage_tax_ids \\ + $busco_param \\ + $accession_params \\ + --yml_out ${prefix}.yaml \\ + --csv_out ${prefix}.csv cat <<-END_VERSIONS > versions.yml "${task.process}": From c37044c14af7e605fc1d8f12d2e41c9ee1952cbb Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 24 May 2024 12:06:14 +0000 Subject: [PATCH 18/30] Adjust the taxon_id to make sure it exists in the NT database --- .../test/nt_mMelMel3.1/taxonomy4blast.sqlite3 | Bin 0 -> 3072 bytes .../nt_gfLaeSulp1.1/taxonomy4blast.sqlite3 | Bin 0 -> 3072 bytes bin/generate_config.py | 20 ++++++++++++++---- modules/local/generate_config.nf | 2 ++ subworkflows/local/input_check.nf | 2 ++ workflows/blobtoolkit.nf | 1 + 6 files changed, 21 insertions(+), 4 deletions(-) create mode 100644 assets/test/nt_mMelMel3.1/taxonomy4blast.sqlite3 create mode 100644 assets/test_full/nt_gfLaeSulp1.1/taxonomy4blast.sqlite3 diff --git a/assets/test/nt_mMelMel3.1/taxonomy4blast.sqlite3 b/assets/test/nt_mMelMel3.1/taxonomy4blast.sqlite3 new file mode 100644 index 0000000000000000000000000000000000000000..dc933c1f9b259bd9900799e3f5094e925b8d1e23 GIT binary patch literal 3072 zcmWFz^vNtqRY=P(%1ta$FlJz3U}R))P*7lCU|Rv9Oe;Jv-G62yi7!85Z5Euy| Qz{SGE1kC%Y`&9P<0A{9KsQ>@~ literal 0 HcmV?d00001 diff --git a/assets/test_full/nt_gfLaeSulp1.1/taxonomy4blast.sqlite3 b/assets/test_full/nt_gfLaeSulp1.1/taxonomy4blast.sqlite3 new file mode 100644 index 0000000000000000000000000000000000000000..2a56a82f490f58163c36942b09edc6d60de9122f GIT binary patch literal 3072 zcmWFz^vNtqRY=P(%1ta$FlJz3U}R))P*7lCU| typing.Dict[str, typing.Union[str, int] } +def adjust_taxon_id(blastn: str, taxon_info: TaxonInfo) -> int: + con = sqlite3.connect(os.path.join(blastn, "taxonomy4blast.sqlite3")) + cur = con.cursor() + for taxon_id in [taxon_info.taxon_id] + list(reversed(taxon_info.lineage)): + res = cur.execute("SELECT * FROM TaxidInfo WHERE taxid = ?", (taxon_id,)) + if res.fetchone(): + return taxon_id + + def print_yaml( file_out, assembly_info: typing.Dict[str, typing.Union[str, int]], @@ -174,12 +185,12 @@ def print_yaml( yaml.dump(data, fout) -def print_csv(file_out, taxon_info: TaxonInfo, odb_arr: typing.List[str]): +def print_csv(file_out, taxon_id: int, odb_arr: typing.List[str]): out_dir = os.path.dirname(file_out) make_dir(out_dir) with open(file_out, "w") as fout: - print("taxon_id", taxon_info.taxon_id, sep=",", file=fout) + print("taxon_id", taxon_id, sep=",", file=fout) for odb_val in odb_arr: print("busco_lineage", odb_val, sep=",", file=fout) @@ -197,9 +208,10 @@ def main(args=None): taxon_info = make_taxon_info(args.taxon_query) classification = get_classification(taxon_info) odb_arr = get_odb(taxon_info, args.lineage_tax_ids, args.busco) + taxon_id = adjust_taxon_id(args.blastn, taxon_info) print_yaml(args.yml_out, assembly_info, taxon_info, classification, odb_arr) - print_csv(args.csv_out, taxon_info, odb_arr) + print_csv(args.csv_out, taxon_id, odb_arr) if __name__ == "__main__": diff --git a/modules/local/generate_config.nf b/modules/local/generate_config.nf index 95ed385d..5cf08eda 100644 --- a/modules/local/generate_config.nf +++ b/modules/local/generate_config.nf @@ -10,6 +10,7 @@ process GENERATE_CONFIG { val taxon_query val busco_lin path lineage_tax_ids + tuple val(meta2), val(blastn) output: tuple val(meta), path("*.yaml"), emit: yaml @@ -31,6 +32,7 @@ process GENERATE_CONFIG { --lineage_tax_ids $lineage_tax_ids \\ $busco_param \\ $accession_params \\ + --blastn $blastn \\ --yml_out ${prefix}.yaml \\ --csv_out ${prefix}.csv diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index ef2a4e42..53d21d80 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -15,6 +15,7 @@ workflow INPUT_CHECK { taxon // channel: val(taxon) busco_lin // channel: val([busco_lin]) lineage_tax_ids // channel: /path/to/lineage_tax_ids + blastn // channel: [ val(meta), path(blastn_db) ] main: ch_versions = Channel.empty() @@ -65,6 +66,7 @@ workflow INPUT_CHECK { taxon, busco_lin, lineage_tax_ids, + blastn, ) ch_versions = ch_versions.mix(GENERATE_CONFIG.out.versions.first()) diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index d608803a..1541c9b3 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -108,6 +108,7 @@ workflow BLOBTOOLKIT { ch_taxon, ch_busco_lin, ch_lineage_tax_ids, + ch_blastn, ) ch_versions = ch_versions.mix ( INPUT_CHECK.out.versions ) From 9963aa5b59ac8a0ac28a8c2c4f2bb96364cb57b7 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 28 May 2024 09:15:25 +0000 Subject: [PATCH 19/30] All these parameters are mandatory --- bin/generate_config.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/bin/generate_config.py b/bin/generate_config.py index 15cd6ef9..d070bccd 100755 --- a/bin/generate_config.py +++ b/bin/generate_config.py @@ -29,14 +29,14 @@ def parse_args(args=None): Description = "Produce the various configuration files needed within the pipeline" parser = argparse.ArgumentParser(description=Description) - parser.add_argument("--fasta", help="Path to the Fasta file of the assembly.") - parser.add_argument("--taxon_query", help="Query string/integer for this taxon.") - parser.add_argument("--lineage_tax_ids", help="Mapping between BUSCO lineages and taxon IDs.") - parser.add_argument("--yml_out", help="Output YML file.") - parser.add_argument("--csv_out", help="Output CSV file.") + parser.add_argument("--fasta", required=True, help="Path to the Fasta file of the assembly.") + parser.add_argument("--taxon_query", required=True, help="Query string/integer for this taxon.") + parser.add_argument("--lineage_tax_ids", required=True, help="Mapping between BUSCO lineages and taxon IDs.") + parser.add_argument("--yml_out", required=True, help="Output YML file.") + parser.add_argument("--csv_out", required=True, help="Output CSV file.") parser.add_argument("--accession", help="Accession number of the assembly (optional).", default=None) - parser.add_argument("--busco", help="Requested BUSCO lineages.", default=None) - parser.add_argument("--blastn", help="Path to the NCBI Taxonomy database") + parser.add_argument("--busco", required=True, help="Requested BUSCO lineages.", default=None) + parser.add_argument("--blastn", required=True, help="Path to the NCBI Taxonomy database") parser.add_argument("--version", action="version", version="%(prog)s 1.4") return parser.parse_args(args) From e959c0407548cc70866a2837d2c0b335245f78aa Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 31 May 2024 10:55:10 +0000 Subject: [PATCH 20/30] bugfix: --busco is optional --- bin/generate_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/generate_config.py b/bin/generate_config.py index d070bccd..6b1fb211 100755 --- a/bin/generate_config.py +++ b/bin/generate_config.py @@ -35,7 +35,7 @@ def parse_args(args=None): parser.add_argument("--yml_out", required=True, help="Output YML file.") parser.add_argument("--csv_out", required=True, help="Output CSV file.") parser.add_argument("--accession", help="Accession number of the assembly (optional).", default=None) - parser.add_argument("--busco", required=True, help="Requested BUSCO lineages.", default=None) + parser.add_argument("--busco", help="Requested BUSCO lineages.", default=None) parser.add_argument("--blastn", required=True, help="Path to the NCBI Taxonomy database") parser.add_argument("--version", action="version", version="%(prog)s 1.4") return parser.parse_args(args) From ed941118058c30565eb02af812111f73e9c5fdc2 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 31 May 2024 15:39:45 +0000 Subject: [PATCH 21/30] bugfix: this should be a "path" so that the file is made available to the container --- modules/local/generate_config.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/generate_config.nf b/modules/local/generate_config.nf index 5cf08eda..3aab7ee9 100644 --- a/modules/local/generate_config.nf +++ b/modules/local/generate_config.nf @@ -10,7 +10,7 @@ process GENERATE_CONFIG { val taxon_query val busco_lin path lineage_tax_ids - tuple val(meta2), val(blastn) + tuple val(meta2), path(blastn) output: tuple val(meta), path("*.yaml"), emit: yaml From 596203496292161c45298959dc68d1bb3b0ca9e0 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 4 Jun 2024 14:21:35 +0100 Subject: [PATCH 22/30] bugfix: accept older assembly versions --- bin/generate_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/generate_config.py b/bin/generate_config.py index 6b1fb211..d6e3c684 100755 --- a/bin/generate_config.py +++ b/bin/generate_config.py @@ -10,7 +10,7 @@ import yaml NCBI_TAXONOMY_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/taxonomy/taxon/%s" -NCBI_DATASETS_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/%s/dataset_report" +NCBI_DATASETS_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/%s/dataset_report?filters.assembly_version=all_assemblies" RANKS = [ "genus", From 121e3722ec8a6adf2880c79f41d282124a16eadc Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 4 Jun 2024 14:24:19 +0100 Subject: [PATCH 23/30] These fields can be missing --- bin/generate_config.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/bin/generate_config.py b/bin/generate_config.py index d6e3c684..bcf4a4ab 100755 --- a/bin/generate_config.py +++ b/bin/generate_config.py @@ -103,18 +103,21 @@ def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int] sys.exit(1) assembly_report = response["reports"][0] assembly_info = assembly_report["assembly_info"] - return { + d = { "accession": accession, "alias": assembly_info["assembly_name"], "bioproject": assembly_info["bioproject_accession"], - "biosample": assembly_info["biosample"]["accession"], "level": assembly_info["assembly_level"].lower(), - "prefix": assembly_report["wgs_info"]["wgs_project_accession"], "scaffold-count": assembly_report["assembly_stats"]["number_of_component_sequences"] + assembly_report["assembly_stats"]["number_of_organelles"], "span": int(assembly_report["assembly_stats"]["total_sequence_length"]) + sum(int(oi["total_seq_length"]) for oi in assembly_report["organelle_info"]), } + if "biosample" in assembly_info: + d["biosample"] = assembly_info["biosample"]["accession"] + if "wgs_info" in assembly_report: + d["prefix"] = assembly_report["wgs_info"]["wgs_project_accession"] + return d def adjust_taxon_id(blastn: str, taxon_info: TaxonInfo) -> int: From 8085a740d508f6e79ac4d43f42ec9493a8691db6 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 4 Jun 2024 14:25:58 +0100 Subject: [PATCH 24/30] Some genomes don't have organelles --- bin/generate_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/generate_config.py b/bin/generate_config.py index bcf4a4ab..8036c4b1 100755 --- a/bin/generate_config.py +++ b/bin/generate_config.py @@ -109,9 +109,9 @@ def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int] "bioproject": assembly_info["bioproject_accession"], "level": assembly_info["assembly_level"].lower(), "scaffold-count": assembly_report["assembly_stats"]["number_of_component_sequences"] - + assembly_report["assembly_stats"]["number_of_organelles"], + + assembly_report["assembly_stats"].get("number_of_organelles", 0), "span": int(assembly_report["assembly_stats"]["total_sequence_length"]) - + sum(int(oi["total_seq_length"]) for oi in assembly_report["organelle_info"]), + + sum(int(oi["total_seq_length"]) for oi in assembly_report.get("organelle_info", [])), } if "biosample" in assembly_info: d["biosample"] = assembly_info["biosample"]["accession"] From 30af129a204b0dba1f98c1796858165f8dcadcbb Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 10 Jul 2024 18:39:34 +0100 Subject: [PATCH 25/30] Easier to read --- bin/generate_config.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/bin/generate_config.py b/bin/generate_config.py index 8036c4b1..cbe3a2ab 100755 --- a/bin/generate_config.py +++ b/bin/generate_config.py @@ -103,15 +103,17 @@ def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int] sys.exit(1) assembly_report = response["reports"][0] assembly_info = assembly_report["assembly_info"] + scaffold_count = assembly_report["assembly_stats"]["number_of_component_sequences"] + scaffold_count += assembly_report["assembly_stats"].get("number_of_organelles", 0) + span = int(assembly_report["assembly_stats"]["total_sequence_length"]) + span += sum(int(oi["total_seq_length"]) for oi in assembly_report.get("organelle_info", [])) d = { "accession": accession, "alias": assembly_info["assembly_name"], "bioproject": assembly_info["bioproject_accession"], "level": assembly_info["assembly_level"].lower(), - "scaffold-count": assembly_report["assembly_stats"]["number_of_component_sequences"] - + assembly_report["assembly_stats"].get("number_of_organelles", 0), - "span": int(assembly_report["assembly_stats"]["total_sequence_length"]) - + sum(int(oi["total_seq_length"]) for oi in assembly_report.get("organelle_info", [])), + "scaffold-count": scaffold_count, + "span": span, } if "biosample" in assembly_info: d["biosample"] = assembly_info["biosample"]["accession"] From 9dcd338d88958ea7fcc152958cccf1c239bc4e25 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 10 Jul 2024 19:43:33 +0100 Subject: [PATCH 26/30] Release name --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 35c1b810..dfe865ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [[1.0.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/1.0.0)] – – [] +## [[1.0.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/1.0.0)] – Bellsprout – [] The pipeline has now been validated for draft (unpublished) assemblies. From 8886dba40a86c3ab9fb3ef8cd9f8fbd97f54e3a7 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 21 Jun 2024 16:59:19 +0100 Subject: [PATCH 27/30] https://ncbiinsights.ncbi.nlm.nih.gov/2024/06/04/changes-ncbi-taxonomy-classifications/ --- bin/generate_config.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bin/generate_config.py b/bin/generate_config.py index cbe3a2ab..7330ae48 100755 --- a/bin/generate_config.py +++ b/bin/generate_config.py @@ -67,6 +67,10 @@ def get_classification(taxon_info: TaxonInfo) -> typing.Dict[str, str]: anc_taxon_info = make_taxon_info(anc_taxon_id) if anc_taxon_info.rank: ancestors[anc_taxon_info.rank.lower()] = anc_taxon_info.organism_name + # https://ncbiinsights.ncbi.nlm.nih.gov/2024/06/04/changes-ncbi-taxonomy-classifications/ + # "superkingdom" will be called "domain" + if "superkingdom" not in ancestors: + ancestors["superkingdom"] = ancestors["domain"] return {r: ancestors[r] for r in RANKS if r in ancestors} From 70f961ce72811ca2ca2b239d08589001a12e4a2c Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 2 Jul 2024 13:52:12 +0100 Subject: [PATCH 28/30] Use GoaT in addition to the NCBI because GoaT also has the freshest ENA taxon_ids NCBI is still the first database we query --- CHANGELOG.md | 2 ++ CITATIONS.md | 4 +++ bin/generate_config.py | 61 ++++++++++++++++++++++++++++++++-------- workflows/blobtoolkit.nf | 2 +- 4 files changed, 57 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dfe865ba..c6330537 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ The pipeline has now been validated for draft (unpublished) assemblies. - The pipeline now queries the NCBI database instead of GoaT to establish the taxonomic classification of the species and the relevant Busco lineages. + In case the taxon_id is not found, the pipeline falls back to GoaT, which + is aware of upcoming taxon_ids in ENA. - New `--busco_lineages` parameter to choose specific Busco lineages instead of automatically selecting based on the taxonomy. - All parameters are now passed the regular Nextflow way. There is no support diff --git a/CITATIONS.md b/CITATIONS.md index c2cadb65..8b960779 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -26,6 +26,10 @@ - [Fasta_windows](https://github.com/tolkit/fasta_windows) +- [GoaT](https://goat.genomehubs.org) + + > Challis, Richard, et al. “Genomes on a Tree (GoaT): A versatile, scalable search engine for genomic and sequencing project metadata across the eukaryotic tree of life.” Wellcome Open Research, vol. 8, no. 24, 2023, https://doi.org/10.12688/wellcomeopenres.18658.1. + - [Minimap2](https://github.com/lh3/minimap2) > Li, Heng. "Minimap2: pairwise alignment for nucleotide sequences." Bioinformatics, vol. 34, no. 18, Sep. 2018, pp. 3094-100, https://doi.org/10.1093/bioinformatics/bty191. diff --git a/bin/generate_config.py b/bin/generate_config.py index 7330ae48..e3d00dfa 100755 --- a/bin/generate_config.py +++ b/bin/generate_config.py @@ -10,6 +10,8 @@ import yaml NCBI_TAXONOMY_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/taxonomy/taxon/%s" +GOAT_LOOKUP_API = "https://goat.genomehubs.org/api/v2/lookup?searchTerm=%s&result=taxon&size=10&taxonomy=ncbi" +GOAT_RECORD_API = "https://goat.genomehubs.org/api/v2/record?recordId=%s&result=taxon&size=10&taxonomy=ncbi" NCBI_DATASETS_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/%s/dataset_report?filters.assembly_version=all_assemblies" RANKS = [ @@ -51,20 +53,57 @@ class TaxonInfo: taxon_id: int organism_name: str rank: typing.Optional[str] - lineage: typing.List[int] + lineage: typing.List["TaxonInfo"] -def make_taxon_info(taxon_name: typing.Union[str, int]) -> TaxonInfo: +def make_taxon_info_from_goat(body: dict[str, str]) -> TaxonInfo: + rank = body["taxon_rank"] + if rank == "no rank": + rank = None + if "lineage" in body: + lineage = [make_taxon_info_from_goat(b) for b in body["lineage"]] + else: + lineage = [] + return TaxonInfo(int(body["taxon_id"]), body["scientific_name"], rank, lineage) + + +def fetch_taxon_info_from_goat(taxon_name: typing.Union[str, int]) -> TaxonInfo: + if isinstance(taxon_name, int): + taxon_id = taxon_name + record_id = "taxon-%d" % taxon_name + else: + # Resolve the taxon_id of the species + response = requests.get(GOAT_LOOKUP_API % taxon_name).json() + taxon_id = int(response["results"][0]["result"]["taxon_id"]) + record_id = response["results"][0]["id"] + + # Using API, get the taxon_ids of the species and all parents + response = requests.get(GOAT_RECORD_API % record_id).json() + body = response["records"][0]["record"] + return make_taxon_info_from_goat(body) + + +def fetch_taxon_info_from_ncbi(taxon_name: typing.Union[str, int], with_lineage=True) -> typing.Optional[TaxonInfo]: # Using API, get the taxon_ids of the species and all parents response = requests.get(NCBI_TAXONOMY_API % taxon_name).json() - body = response["taxonomy_nodes"][0]["taxonomy"] - return TaxonInfo(body["tax_id"], body["organism_name"], body.get("rank"), body["lineage"]) + if "taxonomy" in response["taxonomy_nodes"][0]: + body = response["taxonomy_nodes"][0]["taxonomy"] + if with_lineage: + lineage = [ + fetch_taxon_info_from_ncbi(t, with_lineage=False) for t in reversed(body["lineage"][2:]) + ] # skip root and cellular_organisms + else: + lineage = [] + return TaxonInfo(body["tax_id"], body["organism_name"], body.get("rank"), lineage) + + +def fetch_taxon_info(taxon_name: typing.Union[str, int]) -> TaxonInfo: + return fetch_taxon_info_from_ncbi(taxon_name) or fetch_taxon_info_from_goat(taxon_name) def get_classification(taxon_info: TaxonInfo) -> typing.Dict[str, str]: ancestors: typing.Dict[str, str] = {} - for anc_taxon_id in taxon_info.lineage[1:]: # skip the root taxon_id=1 - anc_taxon_info = make_taxon_info(anc_taxon_id) + for anc_taxon_info in taxon_info.lineage: if anc_taxon_info.rank: ancestors[anc_taxon_info.rank.lower()] = anc_taxon_info.organism_name # https://ncbiinsights.ncbi.nlm.nih.gov/2024/06/04/changes-ncbi-taxonomy-classifications/ @@ -92,9 +131,9 @@ def get_odb(taxon_info: TaxonInfo, lineage_tax_ids: str, requested_buscos: typin else: # Do the intersection to find the ancestors that have a BUSCO lineage odb_arr = [ - lineage_tax_ids_dict[taxon_id] - for taxon_id in reversed(taxon_info.lineage) - if taxon_id in lineage_tax_ids_dict + lineage_tax_ids_dict[anc_taxon_info.taxon_id] + for anc_taxon_info in taxon_info.lineage + if anc_taxon_info.taxon_id in lineage_tax_ids_dict ] return odb_arr @@ -129,7 +168,7 @@ def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int] def adjust_taxon_id(blastn: str, taxon_info: TaxonInfo) -> int: con = sqlite3.connect(os.path.join(blastn, "taxonomy4blast.sqlite3")) cur = con.cursor() - for taxon_id in [taxon_info.taxon_id] + list(reversed(taxon_info.lineage)): + for taxon_id in [taxon_info.taxon_id] + [anc_taxon_info.taxon_id for anc_taxon_info in taxon_info.lineage]: res = cur.execute("SELECT * FROM TaxidInfo WHERE taxid = ?", (taxon_id,)) if res.fetchone(): return taxon_id @@ -214,7 +253,7 @@ def main(args=None): assembly_info = {"level": "scaffold"} assembly_info["file"] = args.fasta - taxon_info = make_taxon_info(args.taxon_query) + taxon_info = fetch_taxon_info(args.taxon_query) classification = get_classification(taxon_info) odb_arr = get_odb(taxon_info, args.lineage_tax_ids, args.busco) taxon_id = adjust_taxon_id(args.blastn, taxon_info) diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index 1541c9b3..cd97fd99 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -130,7 +130,7 @@ workflow BLOBTOOLKIT { ch_versions = ch_versions.mix ( COVERAGE_STATS.out.versions ) // - // SUBWORKFLOW: Run BUSCO using lineages fetched from NCBI, then run diamond_blastp + // SUBWORKFLOW: Run BUSCO using lineages fetched from GoaT, then run diamond_blastp // BUSCO_DIAMOND ( PREPARE_GENOME.out.genome, From e9d3a645442b0c747c452e25f375e202de856390 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 22 Aug 2024 03:34:14 +0100 Subject: [PATCH 29/30] Added an option to skip filtering hits from the same species --- CHANGELOG.md | 11 +++++--- nextflow.config | 1 + nextflow_schema.json | 5 ++++ subworkflows/local/input_check.nf | 3 +- subworkflows/local/run_blastn.nf | 46 ++++++++++++++++++++----------- 5 files changed, 45 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c6330537..ff57d235 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,13 +15,16 @@ The pipeline has now been validated for draft (unpublished) assemblies. automatically selecting based on the taxonomy. - All parameters are now passed the regular Nextflow way. There is no support for the original Yaml configuration files of the Snakemake version. +- New option `--skip_taxon_filtering` to skip the taxon filtering in blast searches. + Mostly relevant for draft assemblies. ### Parameters -| Old parameter | New parameter | -| ------------- | ---------------- | -| --yaml | | -| | --busco_lineages | +| Old parameter | New parameter | +| ------------- | ---------------------- | +| --yaml | | +| | --busco_lineages | +| | --skip_taxon_filtering | > **NB:** Parameter has been **updated** if both old and new parameter information is present.
**NB:** Parameter has been **added** if just the new parameter information is present.
**NB:** Parameter has been **removed** if new parameter information isn't present. diff --git a/nextflow.config b/nextflow.config index a1ea3ae9..dc8cc891 100644 --- a/nextflow.config +++ b/nextflow.config @@ -33,6 +33,7 @@ params { blastp = null blastx = null blastn = null + skip_taxon_filtering = false // MultiQC options multiqc_config = null diff --git a/nextflow_schema.json b/nextflow_schema.json index cd0b7341..e838ab04 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -32,6 +32,11 @@ "description": "Turn on optional genome masking if needed.", "fa_icon": "fas fa-toggle-off" }, + "skip_taxon_filtering": { + "type": "boolean", + "description": "Skip filtering out hits from the same species in blast* searches.", + "fa_icon": "fas fa-toggle-off" + }, "fetchngs_samplesheet": { "type": "boolean", "description": "Turn on the conversion from a nf-core/fetchngs samplesheet.", diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 53d21d80..7a8fd112 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -87,9 +87,10 @@ workflow INPUT_CHECK { // - // Get the taxon ID + // Get the taxon ID if we do taxon filtering in blast* searches // ch_parsed_csv.taxon_id + | map { params.skip_taxon_filtering ? '' : it } | first | set { ch_taxon_id } diff --git a/subworkflows/local/run_blastn.nf b/subworkflows/local/run_blastn.nf index 7dc92fd7..d479e832 100644 --- a/subworkflows/local/run_blastn.nf +++ b/subworkflows/local/run_blastn.nf @@ -47,23 +47,37 @@ workflow RUN_BLASTN { | set { ch_chunks } // Run blastn search - // run blastn excluding taxon_id - BLASTN_TAXON ( ch_chunks, blastn, taxon_id ) - ch_versions = ch_versions.mix ( BLASTN_TAXON.out.versions.first() ) - - // check if blastn output table is empty - BLASTN_TAXON.out.txt - | branch { meta, txt -> - empty: txt.isEmpty() - not_empty: true - } - | set { ch_blastn_taxon_out } + if (params.skip_taxon_filtering) { + + // skip BLASTN_TAXON + ch_blast_blastn_input = ch_chunks + + // fake ch_blastn_taxon_out.not_empty + ch_blastn_taxon_out = [ + not_empty: Channel.empty() + ] + + } else { - // repeat the blastn search without excluding taxon_id - ch_blastn_taxon_out.empty - | join ( ch_chunks ) - | map { meta, txt, fasta -> [meta, fasta] } - | set { ch_blast_blastn_input } + // run blastn excluding taxon_id + BLASTN_TAXON ( ch_chunks, blastn, taxon_id ) + ch_versions = ch_versions.mix ( BLASTN_TAXON.out.versions.first() ) + + // check if blastn output table is empty + BLASTN_TAXON.out.txt + | branch { meta, txt -> + empty: txt.isEmpty() + not_empty: true + } + | set { ch_blastn_taxon_out } + + // repeat the blastn search without excluding taxon_id + ch_blastn_taxon_out.empty + | join ( ch_chunks ) + | map { meta, txt, fasta -> [meta, fasta] } + | set { ch_blast_blastn_input } + + } BLAST_BLASTN ( ch_blast_blastn_input, blastn, [] ) ch_versions = ch_versions.mix ( BLAST_BLASTN.out.versions.first() ) From 8c70c777ade843c965d603393119ed8cda6a6424 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 23 Aug 2024 16:59:10 +0100 Subject: [PATCH 30/30] Corrected the version as I want to be sure this is really ready for the big show --- CHANGELOG.md | 2 +- nextflow.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ff57d235..0cc1a361 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [[1.0.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/1.0.0)] – Bellsprout – [] +## – Bellsprout – [] The pipeline has now been validated for draft (unpublished) assemblies. diff --git a/nextflow.config b/nextflow.config index dc8cc891..319b9b3d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -246,7 +246,7 @@ manifest { description = """Quality assessment of genome assemblies""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '1.0.0' + version = '0.6.0-dev' doi = '10.5281/zenodo.7949058' }