From 0daf1e2b00c7ae1611d6d667fcb1fa2f15e8d36c Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 2 Jul 2024 14:12:09 +0100 Subject: [PATCH 01/22] Also generate TSV files --- bin/generate_config.py | 55 ++++++++++++++++++++++++++++--- modules/local/generate_config.nf | 4 +-- subworkflows/local/input_check.nf | 1 + 3 files changed, 54 insertions(+), 6 deletions(-) diff --git a/bin/generate_config.py b/bin/generate_config.py index e3d00dfa..ae532395 100755 --- a/bin/generate_config.py +++ b/bin/generate_config.py @@ -13,6 +13,7 @@ GOAT_LOOKUP_API = "https://goat.genomehubs.org/api/v2/lookup?searchTerm=%s&result=taxon&size=10&taxonomy=ncbi" GOAT_RECORD_API = "https://goat.genomehubs.org/api/v2/record?recordId=%s&result=taxon&size=10&taxonomy=ncbi" NCBI_DATASETS_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/%s/dataset_report?filters.assembly_version=all_assemblies" +NCBI_SEQUENCE_API = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/%s/sequence_reports" RANKS = [ "genus", @@ -34,8 +35,7 @@ def parse_args(args=None): parser.add_argument("--fasta", required=True, help="Path to the Fasta file of the assembly.") parser.add_argument("--taxon_query", required=True, help="Query string/integer for this taxon.") parser.add_argument("--lineage_tax_ids", required=True, help="Mapping between BUSCO lineages and taxon IDs.") - parser.add_argument("--yml_out", required=True, help="Output YML file.") - parser.add_argument("--csv_out", required=True, help="Output CSV file.") + parser.add_argument("--output_prefix", required=True, help="Prefix to name the output files.") parser.add_argument("--accession", help="Accession number of the assembly (optional).", default=None) parser.add_argument("--busco", help="Requested BUSCO lineages.", default=None) parser.add_argument("--blastn", required=True, help="Path to the NCBI Taxonomy database") @@ -165,6 +165,21 @@ def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int] return d +def get_sequence_report(accession: str): + response = requests.get(NCBI_SEQUENCE_API % accession).json() + if not response["reports"]: + print(f"Assembly not found: {accession}", file=sys.stderr) + sys.exit(1) + sequence_report = response["reports"] + for rec in sequence_report: + if rec["role"] == "assembled-molecule": + rec["assembly_level"] = rec["assigned_molecule_location_type"] + else: + rec["assembly_level"] = "na" + rec["chr_name"] = "na" + return sequence_report + + def adjust_taxon_id(blastn: str, taxon_info: TaxonInfo) -> int: con = sqlite3.connect(os.path.join(blastn, "taxonomy4blast.sqlite3")) cur = con.cursor() @@ -226,6 +241,7 @@ def print_yaml( }, "version": 2, } + out_dir = os.path.dirname(file_out) make_dir(out_dir) @@ -243,14 +259,42 @@ def print_csv(file_out, taxon_id: int, odb_arr: typing.List[str]): print("busco_lineage", odb_val, sep=",", file=fout) +def print_tsvs(output_prefix, sequence_report): + categories_tsv = f"{output_prefix}.categories.tsv" + synonyms_tsv = f"{output_prefix}.synonyms.tsv" + with open(categories_tsv, "w") as fhc: + with open(synonyms_tsv, "w") as fhs: + print("identifier", "assembly_role", "assembly_level", "assembly_unit", sep="\t", file=fhc) + print("identifier", "name", "assigned_name", "refseq_accession", sep="\t", file=fhs) + for rec in sequence_report: + print( + rec["genbank_accession"], + rec["role"], + rec["assembly_level"], + rec["assembly_unit"], + sep="\t", + file=fhc, + ) + print( + rec["genbank_accession"], + rec["sequence_name"], + rec["chr_name"], + 
rec.get("refseq_accession", "na"), + sep="\t", + file=fhs, + ) + + def main(args=None): args = parse_args(args) assembly_info: typing.Dict[str, typing.Union[str, int]] if args.accession: assembly_info = get_assembly_info(args.accession) + sequence_report = get_sequence_report(args.accession) else: assembly_info = {"level": "scaffold"} + sequence_report = None assembly_info["file"] = args.fasta taxon_info = fetch_taxon_info(args.taxon_query) @@ -258,8 +302,11 @@ def main(args=None): odb_arr = get_odb(taxon_info, args.lineage_tax_ids, args.busco) taxon_id = adjust_taxon_id(args.blastn, taxon_info) - print_yaml(args.yml_out, assembly_info, taxon_info, classification, odb_arr) - print_csv(args.csv_out, taxon_id, odb_arr) + if sequence_report: + print_tsvs(args.output_prefix, sequence_report) + + print_yaml(f"{args.output_prefix}.yaml", assembly_info, taxon_info, classification, odb_arr) + print_csv(f"{args.output_prefix}.csv", taxon_id, odb_arr) if __name__ == "__main__": diff --git a/modules/local/generate_config.nf b/modules/local/generate_config.nf index 3aab7ee9..ed73b601 100644 --- a/modules/local/generate_config.nf +++ b/modules/local/generate_config.nf @@ -14,6 +14,7 @@ process GENERATE_CONFIG { output: tuple val(meta), path("*.yaml"), emit: yaml + tuple val(meta), path("*.tsv") , emit: tsv tuple val(meta), path("*.csv") , emit: csv path "versions.yml" , emit: versions @@ -33,8 +34,7 @@ process GENERATE_CONFIG { $busco_param \\ $accession_params \\ --blastn $blastn \\ - --yml_out ${prefix}.yaml \\ - --csv_out ${prefix}.csv + --output_prefix ${prefix} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 7a8fd112..a5ad22bd 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -106,6 +106,7 @@ workflow INPUT_CHECK { emit: reads // channel: [ val(meta), path(datafile) ] config = GENERATE_CONFIG.out.yaml // channel: [ val(meta), path(yaml) ] + tsv = GENERATE_CONFIG.out.tsv // channel: [ val(meta), path(tsv) ] taxon_id = ch_taxon_id // channel: val(taxon_id) busco_lineages = ch_busco_lineages // channel: val([busco_lin]) versions = ch_versions // channel: [ versions.yml ] From 2cf12fd06dfdbce6a2a8ebd584096a03e6c749ba Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 2 Jul 2024 14:20:00 +0100 Subject: [PATCH 02/22] Example plugging the TSV files into blobtools commands --- modules/local/blobtoolkit/metadata.nf | 1 + subworkflows/local/blobtools.nf | 3 ++- workflows/blobtoolkit.nf | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/local/blobtoolkit/metadata.nf b/modules/local/blobtoolkit/metadata.nf index ffae2a8c..ebf14563 100644 --- a/modules/local/blobtoolkit/metadata.nf +++ b/modules/local/blobtoolkit/metadata.nf @@ -9,6 +9,7 @@ process BLOBTOOLKIT_METADATA { input: tuple val(meta), path(yaml) + tuple val(meta2), path(tsvs) output: tuple val(meta), path("*.metadata.yaml"), emit: yaml diff --git a/subworkflows/local/blobtools.nf b/subworkflows/local/blobtools.nf index 747bc9fa..8c19145a 100644 --- a/subworkflows/local/blobtools.nf +++ b/subworkflows/local/blobtools.nf @@ -9,6 +9,7 @@ include { BLOBTOOLKIT_UPDATEBLOBDIR } from '../../modules/local/blobtoolkit/upda workflow BLOBTOOLS { take: config // channel: [ val(meta), path(config) ] + tsvs // channel: [ val(meta), [path(tsv)] ] windowstats // channel: [ val(meta), path(window_stats_tsvs) ] busco // channel: [ val(meta), path(full_table) ] blastp // channel: [ val(meta), path(txt) ] 
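For reference, the two TSV files that PATCH 01 derives from the NCBI sequence report would look roughly like this; the accessions, sequence names and RefSeq identifiers below are invented for illustration (columns are tab-separated):

    <prefix>.categories.tsv
    identifier         assembly_role       assembly_level  assembly_unit
    OX000001.1         assembled-molecule  Chromosome      Primary Assembly
    CAXXXX010000001.1  unplaced-scaffold   na              Primary Assembly

    <prefix>.synonyms.tsv
    identifier         name         assigned_name  refseq_accession
    OX000001.1         SUPER_1      1              NC_000001.1
    CAXXXX010000001.1  scaffold_12  na             na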
@@ -24,7 +25,7 @@ workflow BLOBTOOLS { // // Create metadata summary file // - BLOBTOOLKIT_METADATA ( config ) + BLOBTOOLKIT_METADATA ( config, tsvs ) ch_versions = ch_versions.mix ( BLOBTOOLKIT_METADATA.out.versions.first() ) diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index cd97fd99..4db2475c 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -180,6 +180,7 @@ workflow BLOBTOOLKIT { // BLOBTOOLS ( INPUT_CHECK.out.config, + INPUT_CHECK.out.tsv, COLLATE_STATS.out.window_tsv, BUSCO_DIAMOND.out.all_tables, BUSCO_DIAMOND.out.blastp_txt.ifEmpty([[],[]]), From b8b1fa4e1023ade36ff67d762281b386e1ba0b6c Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 19 Sep 2024 16:07:02 +0100 Subject: [PATCH 03/22] Skip BLOBTOOLKIT_METADATA as we generate the yaml file correctly from the beginning --- bin/generate_config.py | 10 ++------ modules/local/blobtoolkit/metadata.nf | 34 --------------------------- subworkflows/local/blobtools.nf | 11 +-------- 3 files changed, 3 insertions(+), 52 deletions(-) delete mode 100644 modules/local/blobtoolkit/metadata.nf diff --git a/bin/generate_config.py b/bin/generate_config.py index ae532395..b4d2e0e9 100755 --- a/bin/generate_config.py +++ b/bin/generate_config.py @@ -194,16 +194,10 @@ def print_yaml( assembly_info: typing.Dict[str, typing.Union[str, int]], taxon_info: TaxonInfo, classification: typing.Dict[str, str], - odb_arr: typing.List[str], ): data = { "assembly": assembly_info, - "busco": { - "basal_lineages": BUSCO_BASAL_LINEAGES, - # "download_dir": - "lineages": odb_arr + [lin for lin in BUSCO_BASAL_LINEAGES if lin not in odb_arr], - }, - # "reads": {}, + "reads": {}, "revision": 1, "settings": { # Only settings.stats_windows is mandatory, everything else is superfluous @@ -305,7 +299,7 @@ def main(args=None): if sequence_report: print_tsvs(args.output_prefix, sequence_report) - print_yaml(f"{args.output_prefix}.yaml", assembly_info, taxon_info, classification, odb_arr) + print_yaml(f"{args.output_prefix}.yaml", assembly_info, taxon_info, classification) print_csv(f"{args.output_prefix}.csv", taxon_id, odb_arr) diff --git a/modules/local/blobtoolkit/metadata.nf b/modules/local/blobtoolkit/metadata.nf deleted file mode 100644 index ebf14563..00000000 --- a/modules/local/blobtoolkit/metadata.nf +++ /dev/null @@ -1,34 +0,0 @@ -process BLOBTOOLKIT_METADATA { - tag "$meta.id" - label 'process_single' - - if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { - exit 1, "BLOBTOOLKIT_METADATA module does not support Conda. Please use Docker / Singularity / Podman instead." 
- } - container "docker.io/genomehubs/blobtoolkit:4.3.9" - - input: - tuple val(meta), path(yaml) - tuple val(meta2), path(tsvs) - - output: - tuple val(meta), path("*.metadata.yaml"), emit: yaml - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - btk pipeline add-summary-to-metadata \\ - --config ${yaml} \\ - --out ${prefix}.metadata.yaml - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - blobtoolkit: \$(btk --version | cut -d' ' -f2 | sed 's/v//') - END_VERSIONS - """ -} diff --git a/subworkflows/local/blobtools.nf b/subworkflows/local/blobtools.nf index 8c19145a..a0f9e31a 100644 --- a/subworkflows/local/blobtools.nf +++ b/subworkflows/local/blobtools.nf @@ -2,7 +2,6 @@ // Create BlobTools dataset // -include { BLOBTOOLKIT_METADATA } from '../../modules/local/blobtoolkit/metadata' include { BLOBTOOLKIT_CREATEBLOBDIR } from '../../modules/local/blobtoolkit/createblobdir' include { BLOBTOOLKIT_UPDATEBLOBDIR } from '../../modules/local/blobtoolkit/updateblobdir' @@ -22,17 +21,10 @@ workflow BLOBTOOLS { ch_versions = Channel.empty() - // - // Create metadata summary file - // - BLOBTOOLKIT_METADATA ( config, tsvs ) - ch_versions = ch_versions.mix ( BLOBTOOLKIT_METADATA.out.versions.first() ) - - // // Create Blobtools dataset files // - BLOBTOOLKIT_CREATEBLOBDIR ( windowstats, busco, blastp, BLOBTOOLKIT_METADATA.out.yaml, taxdump ) + BLOBTOOLKIT_CREATEBLOBDIR ( windowstats, busco, blastp, config, taxdump ) ch_versions = ch_versions.mix ( BLOBTOOLKIT_CREATEBLOBDIR.out.versions.first() ) @@ -44,7 +36,6 @@ workflow BLOBTOOLS { emit: - metadata = BLOBTOOLKIT_METADATA.out.yaml // channel: [ val(meta), path(yaml) ] blobdir = BLOBTOOLKIT_UPDATEBLOBDIR.out.blobdir // channel: [ val(meta), path(dir) ] versions = ch_versions // channel: [ versions.yml ] } From d13d1ed2aca3b059dbfcac51d2b31f9e19721cb5 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 19 Sep 2024 16:35:25 +0100 Subject: [PATCH 04/22] Fixed the comment --- subworkflows/local/run_blastx.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/run_blastx.nf b/subworkflows/local/run_blastx.nf index 715e5ae2..cfcb0219 100644 --- a/subworkflows/local/run_blastx.nf +++ b/subworkflows/local/run_blastx.nf @@ -19,7 +19,7 @@ workflow RUN_BLASTX { // - // Create metadata summary file + // Split the sequences // BLOBTOOLKIT_CHUNK ( fasta, table ) ch_versions = ch_versions.mix ( BLOBTOOLKIT_CHUNK.out.versions.first() ) From 2c793aa61785914826569a2c6fca8e0fc5550216 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 19 Sep 2024 16:48:44 +0100 Subject: [PATCH 05/22] UPDATEBLOBDIR needs to pass the tsv files through certain arguments --- modules/local/blobtoolkit/updateblobdir.nf | 10 ++++++++-- modules/local/generate_config.nf | 9 +++++---- subworkflows/local/blobtools.nf | 5 +++-- subworkflows/local/input_check.nf | 3 ++- workflows/blobtoolkit.nf | 3 ++- 5 files changed, 20 insertions(+), 10 deletions(-) diff --git a/modules/local/blobtoolkit/updateblobdir.nf b/modules/local/blobtoolkit/updateblobdir.nf index 50167f8b..d736a03f 100644 --- a/modules/local/blobtoolkit/updateblobdir.nf +++ b/modules/local/blobtoolkit/updateblobdir.nf @@ -9,8 +9,10 @@ process BLOBTOOLKIT_UPDATEBLOBDIR { input: tuple val(meta), path(input, stageAs: "input_blobdir") - tuple val(meta1), path(blastx, stageAs: "blastx.txt") - tuple val(meta2), path(blastn, stageAs: "blastn.txt") + 
tuple val(meta2), path(synonyms_tsv) + tuple val(meta3), path(categories_tsv) + tuple val(meta4), path(blastx, stageAs: "blastx.txt") + tuple val(meta5), path(blastn, stageAs: "blastn.txt") path(taxdump) output: @@ -25,6 +27,9 @@ process BLOBTOOLKIT_UPDATEBLOBDIR { prefix = task.ext.prefix ?: "${meta.id}" def hits_blastx = blastx ? "--hits ${blastx}" : "" def hits_blastn = blastn ? "--hits ${blastn}" : "" + def syn = synonyms_tsv ? "--synonyms ${synonyms_tsv}" : "" + def cat = categories_tsv ? "--text ${categories_tsv}" : "" + def head = (synonyms_tsv || categories_tsv) ? "--text-header" : "" """ # In-place modifications are not great in Nextflow, so work on a copy of ${input} mkdir ${prefix} @@ -34,6 +39,7 @@ process BLOBTOOLKIT_UPDATEBLOBDIR { --taxrule bestdistorder=buscoregions \\ ${hits_blastx} \\ ${hits_blastn} \\ + ${syn} ${cat} ${head} \\ --threads ${task.cpus} \\ $args \\ ${prefix} diff --git a/modules/local/generate_config.nf b/modules/local/generate_config.nf index ed73b601..4644f762 100644 --- a/modules/local/generate_config.nf +++ b/modules/local/generate_config.nf @@ -13,10 +13,11 @@ process GENERATE_CONFIG { tuple val(meta2), path(blastn) output: - tuple val(meta), path("*.yaml"), emit: yaml - tuple val(meta), path("*.tsv") , emit: tsv - tuple val(meta), path("*.csv") , emit: csv - path "versions.yml" , emit: versions + tuple val(meta), path("*.yaml") , emit: yaml + tuple val(meta), path("*.csv") , emit: csv + tuple val(meta), path("*.synonyms.tsv") , emit: synonyms_tsv, optional: true + tuple val(meta), path("*.categories.tsv"), emit: categories_tsv, optional: true + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when diff --git a/subworkflows/local/blobtools.nf b/subworkflows/local/blobtools.nf index a0f9e31a..eea7f432 100644 --- a/subworkflows/local/blobtools.nf +++ b/subworkflows/local/blobtools.nf @@ -8,7 +8,8 @@ include { BLOBTOOLKIT_UPDATEBLOBDIR } from '../../modules/local/blobtoolkit/upda workflow BLOBTOOLS { take: config // channel: [ val(meta), path(config) ] - tsvs // channel: [ val(meta), [path(tsv)] ] + syn_tsv // channel: [ val(meta), [path(tsv)] ] + cat_tsv // channel: [ val(meta), [path(tsv)] ] windowstats // channel: [ val(meta), path(window_stats_tsvs) ] busco // channel: [ val(meta), path(full_table) ] blastp // channel: [ val(meta), path(txt) ] @@ -31,7 +32,7 @@ workflow BLOBTOOLS { // // Update Blobtools dataset files // - BLOBTOOLKIT_UPDATEBLOBDIR ( BLOBTOOLKIT_CREATEBLOBDIR.out.blobdir, blastx, blastn, taxdump ) + BLOBTOOLKIT_UPDATEBLOBDIR ( BLOBTOOLKIT_CREATEBLOBDIR.out.blobdir, syn_tsv, cat_tsv, blastx, blastn, taxdump ) ch_versions = ch_versions.mix ( BLOBTOOLKIT_UPDATEBLOBDIR.out.versions.first() ) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index a5ad22bd..f3521186 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -106,7 +106,8 @@ workflow INPUT_CHECK { emit: reads // channel: [ val(meta), path(datafile) ] config = GENERATE_CONFIG.out.yaml // channel: [ val(meta), path(yaml) ] - tsv = GENERATE_CONFIG.out.tsv // channel: [ val(meta), path(tsv) ] + synonyms_tsv = GENERATE_CONFIG.out.synonyms_tsv // channel: [ val(meta), path(tsv) ] + categories_tsv = GENERATE_CONFIG.out.categories_tsv // channel: [ val(meta), path(tsv) ] taxon_id = ch_taxon_id // channel: val(taxon_id) busco_lineages = ch_busco_lineages // channel: val([busco_lin]) versions = ch_versions // channel: [ versions.yml ] diff --git a/workflows/blobtoolkit.nf 
b/workflows/blobtoolkit.nf
index 4db2475c..62a66368 100644
--- a/workflows/blobtoolkit.nf
+++ b/workflows/blobtoolkit.nf
@@ -180,7 +180,8 @@ workflow BLOBTOOLKIT {
     //
     BLOBTOOLS (
         INPUT_CHECK.out.config,
-        INPUT_CHECK.out.tsv,
+        INPUT_CHECK.out.synonyms_tsv.ifEmpty([[],[]]),
+        INPUT_CHECK.out.categories_tsv.ifEmpty([[],[]]),
         COLLATE_STATS.out.window_tsv,
         BUSCO_DIAMOND.out.all_tables,
         BUSCO_DIAMOND.out.blastp_txt.ifEmpty([[],[]]),

From b55f76c5c0c47e256d2defbb8acfb55d7b4ea6de Mon Sep 17 00:00:00 2001
From: Matthieu Muffato
Date: Thu, 19 Sep 2024 17:33:25 +0100
Subject: [PATCH 06/22] Populate as much as possible of the metadata at the beginning

---
 bin/generate_config.py                  | 22 +++++++++++++++++++++-
 bin/update_versions.py                  | 11 -----------
 modules/local/blobtoolkit/updatemeta.nf |  3 ---
 modules/local/generate_config.nf        |  3 +++
 subworkflows/local/finalise_blobdir.nf  |  2 --
 subworkflows/local/input_check.nf       |  1 +
 workflows/blobtoolkit.nf                |  1 -
 7 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/bin/generate_config.py b/bin/generate_config.py
index b4d2e0e9..9171b71c 100755
--- a/bin/generate_config.py
+++ b/bin/generate_config.py
@@ -40,6 +40,9 @@ def parse_args(args=None):
     parser.add_argument("--busco", help="Requested BUSCO lineages.", default=None)
     parser.add_argument("--blastn", required=True, help="Path to the NCBI Taxonomy database")
     parser.add_argument("--version", action="version", version="%(prog)s 1.4")
+    parser.add_argument("--read_id", action="append", help="ID of a read set")
+    parser.add_argument("--read_type", action="append", help="Type of a read set")
+    parser.add_argument("--read_path", action="append", help="Path of a read set")
     return parser.parse_args(args)


@@ -189,11 +192,23 @@ def adjust_taxon_id(blastn: str, taxon_info: TaxonInfo) -> int:
     return taxon_id


+def datatype_to_platform(s):
+    if s == "ont":
+        return "OXFORD_NANOPORE"
+    elif s.startswith("pacbio"):
+        return "PACBIO_SMRT"
+    elif s in ["hic", "illumina"]:
+        return "ILLUMINA"
+    else:
+        return "OTHER"
+
+
 def print_yaml(
     file_out,
     assembly_info: typing.Dict[str, typing.Union[str, int]],
     taxon_info: TaxonInfo,
     classification: typing.Dict[str, str],
+    reads,
 ):
     data = {
         "assembly": assembly_info,
@@ -236,6 +251,9 @@ def print_yaml(
         "version": 2,
     }

+    for id, datatype, path in zip(args.read_id, args.read_type, args.read_path):
+        data["reads"][id] = {"file": path, "platform": datatype_to_platform(datatype)}
+
     out_dir = os.path.dirname(file_out)
     make_dir(out_dir)

diff --git a/bin/update_versions.py b/bin/update_versions.py
index 1d3491a3..f8ef9a9c 100755
--- a/bin/update_versions.py
+++ b/bin/update_versions.py
@@ -15,9 +15,6 @@ def parse_args(args=None):
     parser.add_argument("--meta_in", help="Input JSON file.", required=True)
     parser.add_argument("--meta_out", help="Output JSON file.", required=True)
     parser.add_argument("--software", help="Input YAML file.", required=True)
-    parser.add_argument("--read_id", action="append", help="ID of a read set")
-    parser.add_argument("--read_type", action="append", help="Type of a read set")
-    parser.add_argument("--read_path", action="append", help="Path of a read set")
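# Note: the three --read_* arguments move from update_versions.py to generate_config.py,
# which now records each read set in the config YAML at generation time. A sketch with
# invented values: --read_id run1 --read_type pacbio --read_path run1.cram would yield
#     reads:
#       run1: {file: run1.cram, platform: PACBIO_SMRT}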
parser.add_argument("--blastp", help="Path to the blastp database", required=True) parser.add_argument("--blastx", help="Path to the blastx database", required=True) parser.add_argument("--blastn", help="Path to the blastn database", required=True) @@ -60,14 +57,6 @@ def update_meta(args): infile["similarity"]["blastn"]["path"] = args.blastn infile["similarity"]["diamond_blastp"]["path"] = args.blastp infile["similarity"]["diamond_blastx"]["path"] = args.blastx - - infile["reads"] = {} - for id, datatype, path in zip(args.read_id, args.read_type, args.read_path): - infile["reads"][id] = { - "file": path, - "plaform": datatype_to_platform(datatype), - } - return infile diff --git a/modules/local/blobtoolkit/updatemeta.nf b/modules/local/blobtoolkit/updatemeta.nf index 0c3aaaaf..f39bee85 100644 --- a/modules/local/blobtoolkit/updatemeta.nf +++ b/modules/local/blobtoolkit/updatemeta.nf @@ -9,7 +9,6 @@ process BLOBTOOLKIT_UPDATEMETA { input: tuple val(meta), path(input) - val reads path versions // The following are passed as "val" because we just want to know the full paths. No staging necessary val blastp @@ -27,7 +26,6 @@ process BLOBTOOLKIT_UPDATEMETA { script: def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" - def input_reads = reads.collect{"--read_id ${it[0].id} --read_type ${it[0].datatype} --read_path ${it[1]}"}.join(' ') """ update_versions.py \\ ${args} \\ @@ -37,7 +35,6 @@ process BLOBTOOLKIT_UPDATEMETA { --blastx ${blastx} \\ --blastn ${blastn} \\ --taxdump ${taxdump} \\ - $input_reads \\ --meta_out ${prefix}.meta.json cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/generate_config.nf b/modules/local/generate_config.nf index 4644f762..b9a6f66f 100644 --- a/modules/local/generate_config.nf +++ b/modules/local/generate_config.nf @@ -11,6 +11,7 @@ process GENERATE_CONFIG { val busco_lin path lineage_tax_ids tuple val(meta2), path(blastn) + val reads output: tuple val(meta), path("*.yaml") , emit: yaml @@ -27,6 +28,7 @@ process GENERATE_CONFIG { def prefix = task.ext.prefix ?: "${meta.id}" def busco_param = busco_lin ? "--busco '${busco_lin}'" : "" def accession_params = params.accession ? 
"--accession ${params.accession}" : "" + def input_reads = reads.collect{"--read_id ${it[0].id} --read_type ${it[0].datatype} --read_path ${it[1]}"}.join(' ') """ generate_config.py \\ --fasta $fasta \\ @@ -35,6 +37,7 @@ process GENERATE_CONFIG { $busco_param \\ $accession_params \\ --blastn $blastn \\ + $input_reads \\ --output_prefix ${prefix} cat <<-END_VERSIONS > versions.yml diff --git a/subworkflows/local/finalise_blobdir.nf b/subworkflows/local/finalise_blobdir.nf index 352cb01b..a993c705 100644 --- a/subworkflows/local/finalise_blobdir.nf +++ b/subworkflows/local/finalise_blobdir.nf @@ -8,7 +8,6 @@ include { COMPRESSBLOBDIR } from '../../modules/local/compressblobdir' workflow FINALISE_BLOBDIR { take: blobdir // channel: [ val(meta), path(blobdir) ] - reads // channel: [ [meta, reads] ] software // channel: [ val(meta), path(software_yml) ] summary // channel: [ val(meta), path(summary_json) ] @@ -21,7 +20,6 @@ workflow FINALISE_BLOBDIR { // BLOBTOOLKIT_UPDATEMETA ( blobdir, - reads, software, params.blastp, params.blastx, diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index f3521186..6a1c4ad4 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -67,6 +67,7 @@ workflow INPUT_CHECK { busco_lin, lineage_tax_ids, blastn, + reads.collect(flat: false).ifEmpty([]), ) ch_versions = ch_versions.mix(GENERATE_CONFIG.out.versions.first()) diff --git a/workflows/blobtoolkit.nf b/workflows/blobtoolkit.nf index 62a66368..280278a7 100644 --- a/workflows/blobtoolkit.nf +++ b/workflows/blobtoolkit.nf @@ -209,7 +209,6 @@ workflow BLOBTOOLKIT { // FINALISE_BLOBDIR ( BLOBTOOLS.out.blobdir, - INPUT_CHECK.out.reads.collect(flat: false).ifEmpty([]), CUSTOM_DUMPSOFTWAREVERSIONS.out.yml, VIEW.out.summary ) From 41b101f9e1db99b26285040c1689d2a195ed84fb Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 20 Sep 2024 09:05:11 +0100 Subject: [PATCH 07/22] Introduced a new samplesheet parameter to track single/paired readsets --- assets/schema_input.json | 5 +++++ assets/test/samplesheet.csv | 10 +++++----- assets/test/samplesheet_raw.csv | 8 ++++---- assets/test/samplesheet_s3.csv | 10 +++++----- assets/test_full/full_samplesheet.csv | 6 +++--- bin/check_samplesheet.py | 24 +++++++++++++++++++++--- bin/generate_config.py | 18 ++++++++++++++---- bin/update_versions.py | 11 ----------- modules/local/generate_config.nf | 2 +- subworkflows/local/input_check.nf | 3 ++- 10 files changed, 60 insertions(+), 37 deletions(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index 26ed41cb..812541d8 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -23,6 +23,11 @@ "type": "string", "pattern": "^\\S+\\.(bam|cram|fa|fa.gz|fasta|fasta.gz|fq|fq.gz|fastq|fastq.gz)$", "errorMessage": "Data file for reads cannot contain spaces and must be BAM/CRAM/FASTQ/FASTA" + }, + "library_layout": { + "type": "string", + "pattern": "^(SINGLE|PAIRED)$", + "errorMessage": "The only valid layouts are SINGLE and PAIRED" } }, "required": ["datafile", "datatype", "sample"] diff --git a/assets/test/samplesheet.csv b/assets/test/samplesheet.csv index 5f8d4463..c8d555be 100644 --- a/assets/test/samplesheet.csv +++ b/assets/test/samplesheet.csv @@ -1,5 +1,5 @@ -sample,datatype,datafile -mMelMel3,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/hic/GCA_922984935.2.subset.unmasked.hic.mMelMel3.cram 
-mMelMel1,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel1.cram -mMelMel2,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel2.cram -mMelMel3,ont,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/ont/GCA_922984935.2.subset.unmasked.ont.mMelMel3.cram +sample,datatype,datafile,library_layout +mMelMel3,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/hic/GCA_922984935.2.subset.unmasked.hic.mMelMel3.cram,PAIRED +mMelMel1,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel1.cram,PAIRED +mMelMel2,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel2.cram,PAIRED +mMelMel3,ont,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/ont/GCA_922984935.2.subset.unmasked.ont.mMelMel3.cram,SINGLE diff --git a/assets/test/samplesheet_raw.csv b/assets/test/samplesheet_raw.csv index 830753a7..53a5a42e 100644 --- a/assets/test/samplesheet_raw.csv +++ b/assets/test/samplesheet_raw.csv @@ -1,4 +1,4 @@ -sample,datatype,datafile -mMelMel1,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel1/illumina/31231_3#1_subset.cram -mMelMel2,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel2/illumina/31231_4#1_subset.cram -mMelMel3,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel3/hic-arima2/35528_2#1_subset.cram +sample,datatype,datafile,library_layout +mMelMel1,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel1/illumina/31231_3#1_subset.cram,PAIRED +mMelMel2,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel2/illumina/31231_4#1_subset.cram,PAIRED +mMelMel3,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel3/hic-arima2/35528_2#1_subset.cram,PAIRED diff --git a/assets/test/samplesheet_s3.csv b/assets/test/samplesheet_s3.csv index dbb34181..532f584e 100644 --- a/assets/test/samplesheet_s3.csv +++ b/assets/test/samplesheet_s3.csv @@ -1,5 +1,5 @@ -sample,datatype,datafile -mMelMel3,hic,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/hic/GCA_922984935.2.subset.unmasked.hic.mMelMel3.cram -mMelMel1,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel1.cram -mMelMel2,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel2.cram -mMelMel3,ont,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/ont/GCA_922984935.2.subset.unmasked.ont.mMelMel3.cram +sample,datatype,datafile,library_layout 
+mMelMel3,hic,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/hic/GCA_922984935.2.subset.unmasked.hic.mMelMel3.cram,PAIRED +mMelMel1,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel1.cram,PAIRED +mMelMel2,illumina,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel2.cram,PAIRED +mMelMel3,ont,https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/ont/GCA_922984935.2.subset.unmasked.ont.mMelMel3.cram,SINGLE diff --git a/assets/test_full/full_samplesheet.csv b/assets/test_full/full_samplesheet.csv index 6a3ba69d..52f029e2 100644 --- a/assets/test_full/full_samplesheet.csv +++ b/assets/test_full/full_samplesheet.csv @@ -1,3 +1,3 @@ -sample,datatype,datafile -gfLaeSulp1,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/hic/GCA_927399515.1.unmasked.hic.gfLaeSulp1.cram -gfLaeSulp1,pacbio,/lustre/scratch123/tol/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/pacbio/GCA_927399515.1.unmasked.pacbio.gfLaeSulp1.cram +sample,datatype,datafile,library_layout +gfLaeSulp1,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/hic/GCA_927399515.1.unmasked.hic.gfLaeSulp1.cram,PAIRED +gfLaeSulp1,pacbio,/lustre/scratch123/tol/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/pacbio/GCA_927399515.1.unmasked.pacbio.gfLaeSulp1.cram,SINGLE diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 6b5392bf..bc468469 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -45,11 +45,17 @@ class RowChecker: "ont", ) + VALID_LAYOUTS = ( + "SINGLE", + "PAIRED", + ) + def __init__( self, sample_col="sample", type_col="datatype", file_col="datafile", + layout_col="library_layout", **kwargs, ): """ @@ -62,11 +68,14 @@ def __init__( the read data (default "datatype"). file_col (str): The name of the column that contains the file path for the read data (default "datafile"). + layout_col(str): The name of the column that contains the layout of the + library (i.e. "PAIRED" or "SINGLE"). 
""" super().__init__(**kwargs) self._sample_col = sample_col self._type_col = type_col self._file_col = file_col + self._layout_col = layout_col self._seen = set() self.modified = [] @@ -82,6 +91,7 @@ def validate_and_transform(self, row): self._validate_sample(row) self._validate_type(row) self._validate_file(row) + self._validate_layout(row) self._seen.add((row[self._sample_col], row[self._file_col])) self.modified.append(row) @@ -94,7 +104,7 @@ def _validate_sample(self, row): def _validate_type(self, row): """Assert that the data type matches expected values.""" - if not any(row[self._type_col] for datatype in self.VALID_DATATYPES): + if row[self._type_col] not in self.VALID_DATATYPES: raise AssertionError( f"The datatype is unrecognized: {row[self._type_col]}\n" f"It should be one of: {', '.join(self.VALID_DATATYPES)}" @@ -114,6 +124,14 @@ def _validate_data_format(self, filename): f"It should be one of: {', '.join(self.VALID_FORMATS)}" ) + def _validate_layout(self, row): + """Assert that the library layout matches expected values.""" + if not row[self._layout_col] in self.VALID_LAYOUTS: + raise AssertionError( + f"The library layout is unrecognized: {row[self._layout_col]}\n" + f"It should be one of: {', '.join(self.VALID_LAYOUTS)}" + ) + def validate_unique_samples(self): """ Assert that the combination of sample name and aligned filename is unique. @@ -178,7 +196,7 @@ def check_samplesheet(file_in, file_out): This function checks that the samplesheet follows the following structure, see also the `blobtoolkit samplesheet`_:: - sample,datatype,datafile + sample,datatype,datafile,library_layout sample1,hic,/path/to/file1.cram sample1,pacbio,/path/to/file2.cram sample1,ont,/path/to/file3.cram @@ -187,7 +205,7 @@ def check_samplesheet(file_in, file_out): https://raw.githubusercontent.com/sanger-tol/blobtoolkit/main/assets/test/samplesheet.csv """ - required_columns = {"sample", "datatype", "datafile"} + required_columns = {"sample", "datatype", "datafile", "library_layout"} # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. 
with file_in.open(newline="") as in_handle:
        reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle))
diff --git a/bin/generate_config.py b/bin/generate_config.py
index 9171b71c..a62a289b 100755
--- a/bin/generate_config.py
+++ b/bin/generate_config.py
@@ -42,6 +42,7 @@ def parse_args(args=None):
     parser.add_argument("--version", action="version", version="%(prog)s 1.4")
     parser.add_argument("--read_id", action="append", help="ID of a read set")
     parser.add_argument("--read_type", action="append", help="Type of a read set")
+    parser.add_argument("--read_layout", action="append", help="Layout of a read set")
     parser.add_argument("--read_path", action="append", help="Path of a read set")
     return parser.parse_args(args)

@@ -212,7 +213,10 @@ def print_yaml(
 ):
     data = {
         "assembly": assembly_info,
-        "reads": {},
+        "reads": {
+            "paired": [],
+            "single": [],
+        },
         "revision": 1,
         "settings": {
             # Only settings.stats_windows is mandatory, everything else is superfluous
@@ -251,8 +255,14 @@ def print_yaml(
         "version": 2,
     }

-    for id, datatype, path in zip(args.read_id, args.read_type, args.read_path):
-        data["reads"][id] = {"file": path, "platform": datatype_to_platform(datatype)}
+    for id, datatype, layout, path in reads:
+        data["reads"][layout.lower()].append(
+            {
+                "prefix": id,
+                "file": path,
+                "platform": datatype_to_platform(datatype),
+            }
+        )

     out_dir = os.path.dirname(file_out)
     make_dir(out_dir)
@@ -317,7 +327,7 @@ def main(args=None):
     if sequence_report:
         print_tsvs(args.output_prefix, sequence_report)

-    reads = zip(args.read_id, args.read_type, args.read_path)
+    reads = zip(args.read_id, args.read_type, args.read_layout, args.read_path)

     print_yaml(f"{args.output_prefix}.yaml", assembly_info, taxon_info, classification, reads)
     print_csv(f"{args.output_prefix}.csv", taxon_id, odb_arr)
diff --git a/bin/update_versions.py b/bin/update_versions.py
index f8ef9a9c..100bd331 100755
--- a/bin/update_versions.py
+++ b/bin/update_versions.py
@@ -23,17 +23,6 @@ def parse_args(args=None):
     return parser.parse_args(args)


-def datatype_to_platform(s):
-    if s == "ont":
-        return "OXFORD_NANOPORE"
-    elif s.startswith("pacbio"):
-        return "PACBIO_SMRT"
-    elif s in ["hic", "illumina"]:
-        return "ILLUMINA"
-    else:
-        return "OTHER"
-
-
 def update_meta(args):
     with open(args.meta_in) as fh:
         infile = json.load(fh)
diff --git a/modules/local/generate_config.nf b/modules/local/generate_config.nf
index b9a6f66f..677def5b 100644
--- a/modules/local/generate_config.nf
+++ b/modules/local/generate_config.nf
@@ -28,7 +28,7 @@ process GENERATE_CONFIG {
     def prefix = task.ext.prefix ?: "${meta.id}"
     def busco_param = busco_lin ? "--busco '${busco_lin}'" : ""
     def accession_params = params.accession ?
"--accession ${params.accession}" : "" - def input_reads = reads.collect{"--read_id ${it[0].id} --read_type ${it[0].datatype} --read_path ${it[1]}"}.join(' ') + def input_reads = reads.collect{"--read_id ${it[0].id} --read_type ${it[0].datatype} --read_layout ${it[0].layout} --read_path ${it[1]}"}.join(' ') """ generate_config.py \\ --fasta $fasta \\ diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 6a1c4ad4..4f2ecb62 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -120,7 +120,7 @@ def create_data_channels(LinkedHashMap row) { def meta = [:] meta.id = row.sample meta.datatype = row.datatype - + meta.layout = row.library_layout // add path(s) of the read file(s) to the meta map def data_meta = [] @@ -143,6 +143,7 @@ def create_data_channels_from_fetchngs(LinkedHashMap row) { // create meta map def meta = [:] meta.id = row.run_accession + meta.layout = row.library_layout // Same as https://github.com/blobtoolkit/blobtoolkit/blob/4.3.3/src/blobtoolkit-pipeline/src/lib/functions.py#L30-L39 // with the addition of "hic" From 80a100b69c598194806dea9f42192f377e2c2608 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 20 Sep 2024 09:27:57 +0100 Subject: [PATCH 08/22] Populate as much as possible of the metadata at the beginning --- bin/generate_config.py | 38 ++++++++++++++++++------- bin/update_versions.py | 10 ------- modules/local/blobtoolkit/updatemeta.nf | 9 ------ modules/local/generate_config.nf | 11 ++++++- subworkflows/local/finalise_blobdir.nf | 11 ++----- subworkflows/local/input_check.nf | 4 +++ 6 files changed, 43 insertions(+), 40 deletions(-) diff --git a/bin/generate_config.py b/bin/generate_config.py index a62a289b..c69e3f70 100755 --- a/bin/generate_config.py +++ b/bin/generate_config.py @@ -38,12 +38,16 @@ def parse_args(args=None): parser.add_argument("--output_prefix", required=True, help="Prefix to name the output files.") parser.add_argument("--accession", help="Accession number of the assembly (optional).", default=None) parser.add_argument("--busco", help="Requested BUSCO lineages.", default=None) - parser.add_argument("--blastn", required=True, help="Path to the NCBI Taxonomy database") + parser.add_argument("--nt", required=True, help="Path to the NT database") parser.add_argument("--version", action="version", version="%(prog)s 1.4") parser.add_argument("--read_id", action="append", help="ID of a read set") parser.add_argument("--read_type", action="append", help="Type of a read set") parser.add_argument("--read_layout", action="append", help="Layout of a read set") parser.add_argument("--read_path", action="append", help="Path of a read set") + parser.add_argument("--blastp", help="Path to the blastp database", required=True) + parser.add_argument("--blastx", help="Path to the blastx database", required=True) + parser.add_argument("--blastn", help="Path to the blastn database", required=True) + parser.add_argument("--taxdump", help="Path to the taxonomy database", required=True) return parser.parse_args(args) @@ -184,8 +188,8 @@ def get_sequence_report(accession: str): return sequence_report -def adjust_taxon_id(blastn: str, taxon_info: TaxonInfo) -> int: - con = sqlite3.connect(os.path.join(blastn, "taxonomy4blast.sqlite3")) +def adjust_taxon_id(nt: str, taxon_info: TaxonInfo) -> int: + con = sqlite3.connect(os.path.join(nt, "taxonomy4blast.sqlite3")) cur = con.cursor() for taxon_id in [taxon_info.taxon_id] + [anc_taxon_info.taxon_id for anc_taxon_info in taxon_info.lineage]: res 
= cur.execute("SELECT * FROM TaxidInfo WHERE taxid = ?", (taxon_id,)) @@ -210,6 +214,10 @@ def print_yaml( taxon_info: TaxonInfo, classification: typing.Dict[str, str], reads, + blastp, + blastx, + blastn, + taxdump, ): data = { "assembly": assembly_info, @@ -219,32 +227,30 @@ def print_yaml( }, "revision": 1, "settings": { - # Only settings.stats_windows is mandatory, everything else is superfluous "blast_chunk": 100000, "blast_max_chunks": 10, "blast_min_length": 1000, "blast_overlap": 0, "stats_chunk": 1000, "stats_windows": [0.1, 0.01, 100000, 1000000], - # "taxdump": , + "taxdump": taxdump, "tmp": "/tmp", }, "similarity": { - # Only the presence similarity.diamond_blastx seems mandatory, everything else is superfluous "blastn": { "name": "nt", - # "path": , + "path": blastn, }, "defaults": {"evalue": 1e-10, "import_evalue": 1e-25, "max_target_seqs": 10, "taxrule": "buscogenes"}, "diamond_blastp": { "import_max_target_seqs": 100000, "name": "reference_proteomes", - # "path": , + "path": blastp, "taxrule": "blastp=buscogenes", }, "diamond_blastx": { "name": "reference_proteomes", - # "path": , + "path": blastx, }, }, "taxon": { @@ -322,14 +328,24 @@ def main(args=None): taxon_info = fetch_taxon_info(args.taxon_query) classification = get_classification(taxon_info) odb_arr = get_odb(taxon_info, args.lineage_tax_ids, args.busco) - taxon_id = adjust_taxon_id(args.blastn, taxon_info) + taxon_id = adjust_taxon_id(args.nt, taxon_info) if sequence_report: print_tsvs(args.output_prefix, sequence_report) reads = zip(args.read_id, args.read_type, args.read_layout, args.read_path) - print_yaml(f"{args.output_prefix}.yaml", assembly_info, taxon_info, classification, reads) + print_yaml( + f"{args.output_prefix}.yaml", + assembly_info, + taxon_info, + classification, + reads, + args.blastp, + args.blastx, + args.blastn, + args.taxdump, + ) print_csv(f"{args.output_prefix}.csv", taxon_id, odb_arr) diff --git a/bin/update_versions.py b/bin/update_versions.py index 100bd331..92835084 100755 --- a/bin/update_versions.py +++ b/bin/update_versions.py @@ -15,10 +15,6 @@ def parse_args(args=None): parser.add_argument("--meta_in", help="Input JSON file.", required=True) parser.add_argument("--meta_out", help="Output JSON file.", required=True) parser.add_argument("--software", help="Input YAML file.", required=True) - parser.add_argument("--blastp", help="Path to the blastp database", required=True) - parser.add_argument("--blastx", help="Path to the blastx database", required=True) - parser.add_argument("--blastn", help="Path to the blastn database", required=True) - parser.add_argument("--taxdump", help="Path to the taxonomy database", required=True) parser.add_argument("--version", action="version", version="%(prog)s 1.2.0") return parser.parse_args(args) @@ -40,12 +36,6 @@ def update_meta(args): del new_dict["sanger-tol/blobtoolkit"] infile["settings"]["software_versions"] = new_dict - infile["settings"]["taxdump"] = args.taxdump - for k in ["blastn", "diamond_blastp", "diamond_blastx"]: - infile["similarity"].setdefault(k, {}) - infile["similarity"]["blastn"]["path"] = args.blastn - infile["similarity"]["diamond_blastp"]["path"] = args.blastp - infile["similarity"]["diamond_blastx"]["path"] = args.blastx return infile diff --git a/modules/local/blobtoolkit/updatemeta.nf b/modules/local/blobtoolkit/updatemeta.nf index f39bee85..a5556348 100644 --- a/modules/local/blobtoolkit/updatemeta.nf +++ b/modules/local/blobtoolkit/updatemeta.nf @@ -10,11 +10,6 @@ process BLOBTOOLKIT_UPDATEMETA { input: tuple 
val(meta), path(input) path versions - // The following are passed as "val" because we just want to know the full paths. No staging necessary - val blastp - val blastx - val blastn - val taxdump output: tuple val(meta), path("*.json"), emit: json @@ -31,10 +26,6 @@ process BLOBTOOLKIT_UPDATEMETA { ${args} \\ --meta_in ${input}/meta.json \\ --software ${versions} \\ - --blastp ${blastp} \\ - --blastx ${blastx} \\ - --blastn ${blastn} \\ - --taxdump ${taxdump} \\ --meta_out ${prefix}.meta.json cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/generate_config.nf b/modules/local/generate_config.nf index 677def5b..a6cac943 100644 --- a/modules/local/generate_config.nf +++ b/modules/local/generate_config.nf @@ -12,6 +12,11 @@ process GENERATE_CONFIG { path lineage_tax_ids tuple val(meta2), path(blastn) val reads + // The following are passed as "val" because we just want to know the full paths. No staging necessary + val blastp_path + val blastx_path + val blastn_path + val taxdump_path output: tuple val(meta), path("*.yaml") , emit: yaml @@ -36,8 +41,12 @@ process GENERATE_CONFIG { --lineage_tax_ids $lineage_tax_ids \\ $busco_param \\ $accession_params \\ - --blastn $blastn \\ + --nt $blastn \\ $input_reads \\ + --blastp ${blastp_path} \\ + --blastx ${blastx_path} \\ + --blastn ${blastn_path} \\ + --taxdump ${taxdump_path} \\ --output_prefix ${prefix} cat <<-END_VERSIONS > versions.yml diff --git a/subworkflows/local/finalise_blobdir.nf b/subworkflows/local/finalise_blobdir.nf index a993c705..04dfe8d2 100644 --- a/subworkflows/local/finalise_blobdir.nf +++ b/subworkflows/local/finalise_blobdir.nf @@ -16,16 +16,9 @@ workflow FINALISE_BLOBDIR { ch_versions = Channel.empty() // - // MODULE: Update meta json file + // MODULE: Update the software listed in the meta json file // - BLOBTOOLKIT_UPDATEMETA ( - blobdir, - software, - params.blastp, - params.blastx, - params.blastn, - params.taxdump, - ) + BLOBTOOLKIT_UPDATEMETA ( blobdir, software ) // // MODULE: Compress all the json files diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 4f2ecb62..69a4757a 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -68,6 +68,10 @@ workflow INPUT_CHECK { lineage_tax_ids, blastn, reads.collect(flat: false).ifEmpty([]), + params.blastp, + params.blastx, + params.blastn, + params.taxdump, ) ch_versions = ch_versions.mix(GENERATE_CONFIG.out.versions.first()) From bea3a779ed52b658540f817720913bb3d20e80b2 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 20 Sep 2024 09:33:25 +0100 Subject: [PATCH 09/22] Version bumps --- bin/generate_config.py | 2 +- bin/update_versions.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/generate_config.py b/bin/generate_config.py index c69e3f70..8bdec323 100755 --- a/bin/generate_config.py +++ b/bin/generate_config.py @@ -39,7 +39,6 @@ def parse_args(args=None): parser.add_argument("--accession", help="Accession number of the assembly (optional).", default=None) parser.add_argument("--busco", help="Requested BUSCO lineages.", default=None) parser.add_argument("--nt", required=True, help="Path to the NT database") - parser.add_argument("--version", action="version", version="%(prog)s 1.4") parser.add_argument("--read_id", action="append", help="ID of a read set") parser.add_argument("--read_type", action="append", help="Type of a read set") parser.add_argument("--read_layout", action="append", help="Layout of a read set") @@ -48,6 +47,7 @@ def 
parse_args(args=None): parser.add_argument("--blastx", help="Path to the blastx database", required=True) parser.add_argument("--blastn", help="Path to the blastn database", required=True) parser.add_argument("--taxdump", help="Path to the taxonomy database", required=True) + parser.add_argument("--version", action="version", version="%(prog)s 2.0") return parser.parse_args(args) diff --git a/bin/update_versions.py b/bin/update_versions.py index 92835084..21f379a5 100755 --- a/bin/update_versions.py +++ b/bin/update_versions.py @@ -15,7 +15,7 @@ def parse_args(args=None): parser.add_argument("--meta_in", help="Input JSON file.", required=True) parser.add_argument("--meta_out", help="Output JSON file.", required=True) parser.add_argument("--software", help="Input YAML file.", required=True) - parser.add_argument("--version", action="version", version="%(prog)s 1.2.0") + parser.add_argument("--version", action="version", version="%(prog)s 1.3.0") return parser.parse_args(args) From 4edafb4d3b33cb248eb11a04caf6838cb82abc45 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 20 Sep 2024 10:10:17 +0100 Subject: [PATCH 10/22] Updated the documentation --- CHANGELOG.md | 14 +++++++------- README.md | 23 +++++++++++----------- docs/output.md | 2 +- docs/usage.md | 52 ++++++++++++++++++++++++-------------------------- 4 files changed, 45 insertions(+), 46 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 050399ed..6640109a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -87,13 +87,13 @@ The pipeline has now been validated on dozens of genomes, up to 11 Gbp. Note, since the pipeline is using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference. Only `Docker` or `Singularity` containers are supported, `conda` is not supported. -| Dependency | Old version | New version | -| ----------- | ------------- | ------------- | -| blobtoolkit | 4.3.3 | 4.3.9 | -| blast | 2.14.0 | 2.15.0 | -| multiqc | 1.17 and 1.18 | 1.20 and 1.21 | -| samtools | 1.18 | 1.19.2 | -| seqtk | 1.3 | 1.4 | +| Dependency | Old version | New version | +| ----------- | ------------- | ----------------- | +| blobtoolkit | 4.3.3 | 4.3.9 | +| blast | 2.14.0 | 2.15.0 and 2.14.1 | +| multiqc | 1.17 and 1.18 | 1.20 and 1.21 | +| samtools | 1.18 | 1.19.2 | +| seqtk | 1.3 | 1.4 | > **NB:** Dependency has been **updated** if both old and new version information is present.
> **NB:** Dependency has been **added** if just the new version information is present.
> **NB:** Dependency has been **removed** if version information isn't present.

diff --git a/README.md b/README.md
index b55ab353..ed74da7c 100644
--- a/README.md
+++ b/README.md
@@ -20,8 +20,8 @@ It takes a samplesheet of BAM/CRAM/FASTQ/FASTA files as input, calculates genome
 4. Run BUSCO ([`busco`](https://busco.ezlab.org/))
 5. Extract BUSCO genes ([`blobtoolkit/extractbuscos`](https://github.com/blobtoolkit/blobtoolkit))
 6. Run Diamond BLASTp against extracted BUSCO genes ([`diamond/blastp`](https://github.com/bbuchfink/diamond))
-7. Run BLASTx against sequences with no hit ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/))
-8. Run BLASTn against sequences still with not hit ([`blast/blastx`](https://www.ncbi.nlm.nih.gov/books/NBK131777/))
+7. Run BLASTx against sequences with no hit ([`diamond/blastx`](https://github.com/bbuchfink/diamond))
+8. Run BLASTn against sequences still with no hit ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/))
 9. Count BUSCO genes ([`blobtoolkit/countbuscos`](https://github.com/blobtoolkit/blobtoolkit))
 10. Generate combined sequence stats across various window sizes ([`blobtoolkit/windowstats`](https://github.com/blobtoolkit/blobtoolkit))
 11. Imports analysis results into a BlobDir dataset ([`blobtoolkit/blobdir`](https://github.com/blobtoolkit/blobtoolkit))
@@ -37,13 +37,17 @@ First, prepare a samplesheet with your input data that looks as follows:
 `samplesheet.csv`:
 
 ```csv
-sample,datatype,datafile
-mMelMel3,hic,GCA_922984935.2.hic.mMelMel3.cram
-mMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram
-mMelMel3,ont,GCA_922984935.2.ont.mMelMel3.cram
+sample,datatype,datafile,library_layout
+mMelMel3,hic,GCA_922984935.2.hic.mMelMel3.cram,PAIRED
+mMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram,PAIRED
+mMelMel3,ont,GCA_922984935.2.ont.mMelMel3.cram,SINGLE
 ```
 
-Each row represents an aligned file. Rows with the same sample identifier are considered technical replicates. The datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (`ont`, `hic`, `pacbio`, `pacbio_clr`, `illumina`). The aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline.
+Each row represents an aligned file.
+Rows with the same sample identifier are considered technical replicates.
+The datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (`ont`, `hic`, `pacbio`, `pacbio_clr`, `illumina`).
+The library layout indicates whether the reads are paired or single.
+The aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline.
 
 Now, you can run the pipeline using:
@@ -77,9 +81,8 @@ sanger-tol/blobtoolkit was written in Nextflow by [Alexander Ramos Diaz](https:/
 
 We thank the following people for their assistance in the development of this pipeline:
 
-
-
 - [Guoying Qi](https://github.com/gq1)
+- [Bethan Yates](https://github.com/BethYates)
 
 ## Contributions and Support
@@ -89,8 +92,6 @@ If you would like to contribute to this pipeline, please see the [contributing g
 
 If you use sanger-tol/blobtoolkit for your analysis, please cite it using the following doi: [10.5281/zenodo.7949058](https://doi.org/10.5281/zenodo.7949058)
 
-
-
 An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.
 This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE).

diff --git a/docs/output.md b/docs/output.md
index e3204a1d..469f50ad 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -2,7 +2,7 @@

 ## Introduction

-This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline.
+This document describes the output produced by the pipeline.

 The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory.

diff --git a/docs/usage.md b/docs/usage.md
index 4d0ad33e..22df5508 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -10,7 +10,7 @@

 ## Samplesheet input

-You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.
+You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 4 columns, and a header row as shown in the examples below.

 ```bash
 --input '[path to samplesheet file]'
@@ -21,30 +21,31 @@ You will need to create a samplesheet with information about the samples you wou

 The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes:

 ```console
-sample,datatype,datafile
-sample1,hic,hic.cram
-sample2,illumina,illumina.cram
-sample2,illumina,illumina.cram
+sample,datatype,datafile,library_layout
+sample1,hic,hic.cram,PAIRED
+sample2,illumina,illumina.cram,PAIRED
+sample2,illumina,illumina.cram,PAIRED
 ```

 ### Full samplesheet

-The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below.
+The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 4 columns to match those defined in the table below.

 A final samplesheet file may look something like the one below.

 ```console
-sample,datatype,datafile
-sample1,hic,hic.cram
-sample2,illumina,illumina.cram
-sample3,ont,ont.cram
+sample,datatype,datafile,library_layout
+sample1,hic,hic.cram,PAIRED
+sample2,illumina,illumina.cram,PAIRED
+sample3,ont,ont.cram,SINGLE
 ```

-| Column     | Description                                                                                                                                                                            |
-| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `sample`   | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (\_).  |
-| `datatype` | Type of sequencing data. Must be one of `hic`, `illumina`, `pacbio`, `pacbio_clr` or `ont`.                                                                                            |
-| `datafile` | Full path to read data file.                                                                                                                                                           |
+| Column           | Description                                                                                                                                                                            |
+| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `sample`         | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (\_).  |
+| `datatype`       | Type of sequencing data. Must be one of `hic`, `illumina`, `pacbio`, `pacbio_clr` or `ont`.                                                                                            |
+| `datafile`       | Full path to read data file.                                                                                                                                                           |
+| `library_layout` | Layout of the library. Must be one of `SINGLE`, `PAIRED`.                                                                                                                              |

 An [example samplesheet](assets/test/samplesheet.csv) has been provided with the pipeline.

@@ -99,6 +100,10 @@ wget "ftp://ftp.ncbi.nlm.nih.gov/blast/db/v5/nt.???.tar.gz" -P $NT/ &&
 for file in $NT/*.tar.gz; do
   tar xf $file -C $NT && rm $file;
 done
+
+wget "https://ftp.ncbi.nlm.nih.gov/blast/db/v5/taxdb.tar.gz" &&
+tar xf taxdb.tar.gz -C $NT &&
+rm taxdb.tar.gz
 ```

 ### 3. UniProt reference proteomes database

@@ -228,7 +233,7 @@ List of tools for any given dataset can be fetched from the API, for example htt

 | Dependency        | Snakemake | Nextflow |
 | ----------------- | --------- | -------- |
 | blobtoolkit       | 4.3.2     | 4.3.9    |
-| blast             | 2.12.0    | 2.15.0   |
+| blast             | 2.12.0    | 2.14.1   |
 | blobtk            | 0.5.0     | 0.5.1    |
 | busco             | 5.3.2     | 5.5.0    |
 | diamond           | 2.0.15    | 2.1.8    |

@@ -269,9 +274,8 @@ If you wish to repeatedly use the same parameters for multiple runs, rather than

 Pipeline settings can be provided in a `yaml` or `json` file via `-params-file <file>`.

-:::warning
-Do not use `-c <file>` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args).
-:::
+> [!WARNING]
+> Do not use `-c <file>` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args).

 The above pipeline run specified with a params file in yaml format:

@@ -308,15 +312,11 @@ This version number will be logged in reports when you run the pipeline, so that

 To further assist in reproducbility, you can use share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter.

-:::tip
 If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles.
-:::

 ## Core Nextflow arguments

-:::note
-These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen).
-:::
+> These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen).

 ### `-profile`

@@ -324,9 +324,7 @@ Use this parameter to choose a configuration profile. Profiles can give configur

 Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below.
-:::info
-We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported.
-:::
+> We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported.

 The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. For more information and to see if your system is available in these configs please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation).

From 904a5128f0511f18125631a1d14e2e6106e7f277 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato
Date: Fri, 20 Sep 2024 10:17:47 +0100
Subject: [PATCH 11/22] Version bump

---
 CHANGELOG.md    | 9 +++++++++
 nextflow.config | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6640109a..98a73534 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,15 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [[1.0.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/1.0.0)] – Psyduck – [2024-XX-YY]
+
+The pipeline is now considered to be a complete and suitable replacement for the Snakemake version.
+
+- Fetch information about the chromosomes of the assemblies. Used to power
+  "grid plots".
+- Fill in accurate read information in the blobDir. Users are now required
+  to indicate whether the reads are paired or single.
+
 ## [[0.6.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.6.0)] – Bellsprout – [2024-09-13]

 The pipeline has now been validated for draft (unpublished) assemblies.

diff --git a/nextflow.config b/nextflow.config
index 18994df9..036a540c 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -249,7 +249,7 @@ manifest {
     description     = """Quality assessment of genome assemblies"""
     mainScript      = 'main.nf'
     nextflowVersion = '!>=23.04.0'
-    version         = '0.6.0'
+    version         = '1.0.0'
     doi             = '10.5281/zenodo.7949058'
 }

From 51985addbad8d35b83735cde41cecb8a9ff95300 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato
Date: Fri, 20 Sep 2024 10:24:38 +0100
Subject: [PATCH 12/22] Publish the windowmasker outputs too

---
 conf/modules.config | 10 ++++++++++
 docs/output.md      | 15 +++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/conf/modules.config b/conf/modules.config
index 1193cf5f..ceeb9c67 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -22,10 +22,20 @@ process {

     withName: "WINDOWMASKER_MKCOUNTS" {
         ext.args = "-infmt fasta -sformat obinary"
+        publishDir = [
+            path: { "${params.outdir}/repeats/windowmasker" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename.substring(0, filename.length() - 3) + "obinary" }
+        ]
     }

     withName: "WINDOWMASKER_USTAT" {
         ext.args = "-infmt fasta -dust T -outfmt fasta"
+        publishDir = [
+            path: { "${params.outdir}/repeats/windowmasker" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename }
+        ]
     }

     withName: "MINIMAP2_HIC" {

diff --git a/docs/output.md b/docs/output.md
index 469f50ad..8af0fd40 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -15,6 +15,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 - [BlobDir](#blobdir) - Output files viewable on a [BlobToolKit viewer](https://github.com/blobtoolkit/blobtoolkit)
 - [Static plots](#static-plots) - Static versions of the BlobToolKit plots
 - [BUSCO](#busco) - BUSCO results
+- [Repeat masking](#repeat-masking) - Masked repeats (optional)
 - [Read alignments](#read-alignments) - Aligned reads (optional)
 - [Read coverage](#read-coverage) - Read coverage tracks
 - [Base content](#base-content) - _k_-mer statistics (for k ≤ 4)
@@ -68,6 +69,20 @@ BUSCO results generated by the pipeline (all BUSCO lineages that match the claas

 </details>

+### Repeat masking
+
+Results from the repeat-masker step -- only if the pipeline is run with `--mask`.
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `repeats/`
+  - `windowmasker/`
+    - `<accession>.fasta`: masked assembly in Fasta format.
+    - `<accession>.obinary`: frequency counts of repeats, in windowmasker's own binary format.
+
+</details>
+ ### Read alignments Read alignments in BAM format -- only if the pipeline is run with `--align`. From f47dd166985f0564328450b53330237055c1c675 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 23 Sep 2024 09:40:28 +0100 Subject: [PATCH 13/22] Convert input BAM/CRAM files to Fasta on the fly This will save a lot of space --- modules.json | 6 -- modules/nf-core/minimap2/align/main.nf | 9 ++- .../minimap2/align/minimap2-align.diff | 22 +++++++ .../nf-core/samtools/fasta/environment.yml | 8 --- modules/nf-core/samtools/fasta/main.nf | 43 ------------- modules/nf-core/samtools/fasta/meta.yml | 60 ------------------- .../samtools/fasta/samtools-fasta.diff | 13 ---- subworkflows/local/minimap_alignment.nf | 19 +----- 8 files changed, 29 insertions(+), 151 deletions(-) delete mode 100644 modules/nf-core/samtools/fasta/environment.yml delete mode 100644 modules/nf-core/samtools/fasta/main.nf delete mode 100644 modules/nf-core/samtools/fasta/meta.yml delete mode 100644 modules/nf-core/samtools/fasta/samtools-fasta.diff diff --git a/modules.json b/modules.json index 3ce2a831..4af0bcd6 100644 --- a/modules.json +++ b/modules.json @@ -66,12 +66,6 @@ "git_sha": "0eab94fc1e48703c1b0a8704bd665f554905c39d", "installed_by": ["modules"] }, - "samtools/fasta": { - "branch": "master", - "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", - "installed_by": ["modules"], - "patch": "modules/nf-core/samtools/fasta/samtools-fasta.diff" - }, "samtools/flagstat": { "branch": "master", "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", diff --git a/modules/nf-core/minimap2/align/main.nf b/modules/nf-core/minimap2/align/main.nf index 7030554d..df7b1424 100644 --- a/modules/nf-core/minimap2/align/main.nf +++ b/modules/nf-core/minimap2/align/main.nf @@ -26,15 +26,18 @@ process MINIMAP2_ALIGN { def args = task.ext.args ?: '' def args2 = task.ext.args2 ?: '' def prefix = task.ext.prefix ?: "${meta.id}" + def bam_input = reads.getExtension() in ["bam", "cram"] ? "samtools fasta ${reads} | " : "" + if (bam_input && !reference) error "Fasta/q format is required for self-alignments" + def minimap_input = bam_input ? "-" : reads def bam_output = bam_format ? "-a | samtools sort -@ ${task.cpus} -o ${prefix}.bam ${args2}" : "-o ${prefix}.paf" def cigar_paf = cigar_paf_format && !bam_format ? "-c" : '' def set_cigar_bam = cigar_bam && bam_format ? "-L" : '' """ - minimap2 \\ + ${bam_input} minimap2 \\ $args \\ -t $task.cpus \\ - ${reference ?: reads} \\ - $reads \\ + ${reference ?: minimap_input} \\ + $minimap_input \\ $cigar_paf \\ $set_cigar_bam \\ $bam_output diff --git a/modules/nf-core/minimap2/align/minimap2-align.diff b/modules/nf-core/minimap2/align/minimap2-align.diff index 479818b3..fa1b7d53 100644 --- a/modules/nf-core/minimap2/align/minimap2-align.diff +++ b/modules/nf-core/minimap2/align/minimap2-align.diff @@ -8,5 +8,27 @@ Changes in module 'nf-core/minimap2/align' // Note: the versions here need to match the versions used in the mulled container below and minimap2/index conda "${moduleDir}/environment.yml" +@@ -27,15 +26,18 @@ + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" ++ def bam_input = reads.getExtension() in ["bam", "cram"] ? "samtools fasta ${reads} | " : "" ++ if (bam_input && !reference) error "Fasta/q format is required for self-alignments" ++ def minimap_input = bam_input ? "-" : reads + def bam_output = bam_format ? 
"-a | samtools sort -@ ${task.cpus} -o ${prefix}.bam ${args2}" : "-o ${prefix}.paf" + def cigar_paf = cigar_paf_format && !bam_format ? "-c" : '' + def set_cigar_bam = cigar_bam && bam_format ? "-L" : '' + """ +- minimap2 \\ ++ ${bam_input} minimap2 \\ + $args \\ + -t $task.cpus \\ +- ${reference ?: reads} \\ +- $reads \\ ++ ${reference ?: minimap_input} \\ ++ $minimap_input \\ + $cigar_paf \\ + $set_cigar_bam \\ + $bam_output ************************************************************ diff --git a/modules/nf-core/samtools/fasta/environment.yml b/modules/nf-core/samtools/fasta/environment.yml deleted file mode 100644 index 14585013..00000000 --- a/modules/nf-core/samtools/fasta/environment.yml +++ /dev/null @@ -1,8 +0,0 @@ -name: samtools_fasta -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - bioconda::samtools=1.19.2 - - bioconda::htslib=1.19.1 diff --git a/modules/nf-core/samtools/fasta/main.nf b/modules/nf-core/samtools/fasta/main.nf deleted file mode 100644 index 9aa03430..00000000 --- a/modules/nf-core/samtools/fasta/main.nf +++ /dev/null @@ -1,43 +0,0 @@ -process SAMTOOLS_FASTA { - tag "$meta.id" - label 'process_low' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.19.2--h50ea8bc_0' : - 'biocontainers/samtools:1.19.2--h50ea8bc_0' }" - - input: - tuple val(meta), path(input) - val(interleave) - - output: - tuple val(meta), path("*_{1,2}.fasta.gz") , optional:true, emit: fasta - tuple val(meta), path("*_interleaved.fasta.gz"), optional:true, emit: interleaved - tuple val(meta), path("*_singleton.fasta.gz") , optional:true, emit: singleton - tuple val(meta), path("*_other.fasta.gz") , optional:true, emit: other - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def output = ( interleave && ! meta.single_end ) ? "| gzip > ${prefix}_interleaved.fasta.gz" : - meta.single_end ? "-1 ${prefix}_1.fasta.gz -s ${prefix}_singleton.fasta.gz" : - "-1 ${prefix}_1.fasta.gz -2 ${prefix}_2.fasta.gz -s ${prefix}_singleton.fasta.gz" - """ - samtools \\ - fasta \\ - $args \\ - --threads ${task.cpus-1} \\ - $input \\ - $output - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/samtools/fasta/meta.yml b/modules/nf-core/samtools/fasta/meta.yml deleted file mode 100644 index eae26f01..00000000 --- a/modules/nf-core/samtools/fasta/meta.yml +++ /dev/null @@ -1,60 +0,0 @@ -name: "samtools_fasta" -description: Converts a SAM/BAM/CRAM file to FASTA -keywords: - - bam - - sam - - cram - - fasta -tools: - - "samtools": - description: "Tools for dealing with SAM, BAM and CRAM files" - homepage: "http://www.htslib.org" - documentation: "https://www.htslib.org/doc/samtools-fasta.html" - tool_dev_url: "https://github.com/samtools/samtools" - doi: "10.1093/bioinformatics/btp352" - licence: ["MIT"] -input: - # Only when we have meta - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - input: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - - interleave: - type: boolean - description: Set true for interleaved fasta files -output: - #Only when we have meta - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - fasta: - type: file - description: Compressed FASTA file(s) with reads with either the READ1 or READ2 flag set in separate files. - pattern: "*_{1,2}.fasta.gz" - - interleaved: - type: file - description: Compressed FASTA file with reads with either the READ1 or READ2 flag set in a combined file. Needs collated input file. - pattern: "*_interleaved.fasta.gz" - - singleton: - type: file - description: Compressed FASTA file with singleton reads - pattern: "*_singleton.fasta.gz" - - other: - type: file - description: Compressed FASTA file with reads with either both READ1 and READ2 flags set or unset - pattern: "*_other.fasta.gz" -authors: - - "@priyanka-surana" -maintainers: - - "@priyanka-surana" diff --git a/modules/nf-core/samtools/fasta/samtools-fasta.diff b/modules/nf-core/samtools/fasta/samtools-fasta.diff deleted file mode 100644 index e2374ed9..00000000 --- a/modules/nf-core/samtools/fasta/samtools-fasta.diff +++ /dev/null @@ -1,13 +0,0 @@ -Changes in module 'nf-core/samtools/fasta' ---- modules/nf-core/samtools/fasta/main.nf -+++ modules/nf-core/samtools/fasta/main.nf -@@ -32,7 +32,6 @@ - fasta \\ - $args \\ - --threads ${task.cpus-1} \\ -- -0 ${prefix}_other.fasta.gz \\ - $input \\ - $output - - -************************************************************ diff --git a/subworkflows/local/minimap_alignment.nf b/subworkflows/local/minimap_alignment.nf index 0c25f4c7..33d632c7 100644 --- a/subworkflows/local/minimap_alignment.nf +++ b/subworkflows/local/minimap_alignment.nf @@ -2,7 +2,6 @@ // Optional alignment subworkflow using Minimap2 // -include { SAMTOOLS_FASTA } from '../../modules/nf-core/samtools/fasta/main' include { MINIMAP2_ALIGN as MINIMAP2_HIC } from '../../modules/nf-core/minimap2/align/main' include { MINIMAP2_ALIGN as MINIMAP2_ILMN } from '../../modules/nf-core/minimap2/align/main' include { MINIMAP2_ALIGN as MINIMAP2_CCS } from '../../modules/nf-core/minimap2/align/main' @@ -20,24 +19,8 @@ workflow MINIMAP2_ALIGNMENT { ch_versions = Channel.empty() - // Convert BAM/CRAM reads to FASTA - input - | branch { - meta, reads -> - fasta: reads.toString().endsWith(".fasta") || reads.toString().endsWith(".fasta.gz") || reads.toString().endsWith(".fa") || reads.toString().endsWith(".fa.gz") - fastq: reads.toString().endsWith(".fastq") || reads.toString().endsWith(".fastq.gz") || reads.toString().endsWith(".fq") || reads.toString().endsWith(".fq.gz") - bamcram: true - } - | set { ch_reads_by_type } - - SAMTOOLS_FASTA ( ch_reads_by_type.bamcram, true ) - ch_versions = ch_versions.mix(SAMTOOLS_FASTA.out.versions.first()) - - // Branch input by sequencing type - SAMTOOLS_FASTA.out.interleaved - | mix ( ch_reads_by_type.fasta ) - | mix ( ch_reads_by_type.fastq ) + input | branch { meta, reads -> hic: meta.datatype == "hic" From 43254bc6dbc7a071fc1fc5d516aa6544f42cf280 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 26 Sep 2024 15:43:55 +0100 Subject: [PATCH 14/22] Bumped the version down --- CHANGELOG.md | 2 +- nextflow.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md 
b/CHANGELOG.md
index 98a73534..e55b7bfd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,7 +3,7 @@

 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

-## [[1.0.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/1.0.0)] – Psyduck – [2024-XX-YY]
+## [[0.7.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.7.0)] – Psyduck – [2024-XX-YY]

 The pipeline is now considered to be a complete and suitable replacement for the Snakemake version.

diff --git a/nextflow.config b/nextflow.config
index 036a540c..caac9e54 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -249,7 +249,7 @@ manifest {
     description     = """Quality assessment of genome assemblies"""
     mainScript      = 'main.nf'
     nextflowVersion = '!>=23.04.0'
-    version         = '1.0.0'
+    version         = '0.7.0'
     doi             = '10.5281/zenodo.7949058'
 }

From c89b1b97b24dc244e0329dc1b1b99ea0908c392f Mon Sep 17 00:00:00 2001
From: Matthieu Muffato
Date: Thu, 26 Sep 2024 16:11:39 +0100
Subject: [PATCH 15/22] Pin the version of editorconfig like in the readmapping pipeline

---
 .github/workflows/linting.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
index 8d12b98a..b13dea44 100644
--- a/.github/workflows/linting.yml
+++ b/.github/workflows/linting.yml
@@ -19,7 +19,7 @@ jobs:
       - uses: actions/setup-node@v4

       - name: Install editorconfig-checker
-        run: npm install -g editorconfig-checker
+        run: npm install -g editorconfig-checker@3.0.2

       - name: Run ECLint check
         run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.md\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile')

From 152282bc0ec0e76532b5b4b1a8a79d5d2f16a559 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato
Date: Thu, 26 Sep 2024 16:24:02 +0100
Subject: [PATCH 16/22] Exclude sqlite databases from ECLint

---
 .github/workflows/linting.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
index b13dea44..ca21ecfe 100644
--- a/.github/workflows/linting.yml
+++ b/.github/workflows/linting.yml
@@ -22,7 +22,7 @@ jobs:
         run: npm install -g editorconfig-checker@3.0.2

       - name: Run ECLint check
-        run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.md\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile')
+        run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.md\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile\|.sqlite3')

 Prettier:
   runs-on: ubuntu-latest

From 476b7362427a744e6afde4d8121a8e5daafb6639 Mon Sep 17 00:00:00 2001
From: Tyler Chafin
Date: Mon, 30 Sep 2024 14:44:27 +0100
Subject: [PATCH 17/22] add exclusions for nf-core lint

---
 .nf-core.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.nf-core.yml b/.nf-core.yml
index 85e18745..0ad948cc 100644
--- a/.nf-core.yml
+++ b/.nf-core.yml
@@ -12,6 +12,9 @@ lint:
     - LICENCE
     - lib/NfcoreTemplate.groovy
     - CODE_OF_CONDUCT.md
+    - assets/sendmail_template.txt
+    - assets/email_template.html
+    - assets/email_template.txt
     - assets/nf-core-blobtoolkit_logo_light.png
     - docs/images/nf-core-blobtoolkit_logo_light.png
     - docs/images/nf-core-blobtoolkit_logo_dark.png
     - .github/ISSUE_TEMPLATE/bug_report.yml
     - .github/PULL_REQUEST_TEMPLATE.md
     - .github/workflows/linting.yml
     - .github/workflows/branch.yml
+    - .github/CONTRIBUTING.md
+    - .github/workflows/linting_comment.yml
   multiqc_config:
     - report_comment
   nextflow_config:

From 303b221fa2cf8002a79dcf12813b92406974d5bb Mon Sep 17 00:00:00 2001
From: Tyler Chafin
Date: Mon, 30 Sep 2024 15:07:46 +0100
Subject: [PATCH 18/22] Update CITATIONS.md

---
 CITATIONS.md | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/CITATIONS.md b/CITATIONS.md
index 8b960779..81557aad 100644
--- a/CITATIONS.md
+++ b/CITATIONS.md
@@ -12,6 +12,10 @@

 ## Pipeline tools

+- [BLAST+](https://blast.ncbi.nlm.nih.gov/doc/blast-help/downloadblastdata.html)
+
+  > Camacho, Christiam, et al. “BLAST+: architecture and applications.” BMC Bioinformatics, vol. 10, no. 421, Dec. 2009, https://doi.org/10.1186/1471-2105-10-421
+
 - [BlobToolKit](https://github.com/blobtoolkit/blobtoolkit)

   > Challis, Richard, et al. “BlobToolKit – Interactive Quality Assessment of Genome Assemblies.” G3 Genes|Genomes|Genetics, vol. 10, no. 4, Apr. 2020, pp. 1361–74, https://doi.org/10.1534/g3.119.400908.

@@ -26,9 +30,7 @@

 - [Fasta_windows](https://github.com/tolkit/fasta_windows)

-- [GoaT](https://goat.genomehubs.org)
-
-  > Challis, Richard, et al. “Genomes on a Tree (GoaT): A versatile, scalable search engine for genomic and sequencing project metadata across the eukaryotic tree of life.” Wellcome Open Research, vol. 8, no. 24, 2023, https://doi.org/10.12688/wellcomeopenres.18658.1.
+  > Brown, Max, et al. "Fasta_windows v0.2.3". GitHub, 2021. https://github.com/tolkit/fasta_windows

 - [Minimap2](https://github.com/lh3/minimap2)

@@ -42,6 +44,10 @@

   > Danecek, Petr, et al. “Twelve Years of SAMtools and BCFtools.” GigaScience, vol. 10, no. 2, Jan. 2021, https://doi.org/10.1093/gigascience/giab008.

+- [SeqTK](https://github.com/lh3/seqtk)
+
+  > Li, Heng. "SeqTK v1.4" GitHub, 2023, https://github.com/lh3/seqtk
+
 ## Software packaging/containerisation tools

 - [Anaconda](https://anaconda.com)

From 27b0a5190e944fd642f3d7fd32a9c5473f116424 Mon Sep 17 00:00:00 2001
From: Tyler Chafin
Date: Mon, 30 Sep 2024 15:12:57 +0100
Subject: [PATCH 19/22] prettier linting

---
 CITATIONS.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CITATIONS.md b/CITATIONS.md
index 81557aad..a3a1a070 100644
--- a/CITATIONS.md
+++ b/CITATIONS.md
@@ -15,7 +15,7 @@
 - [BLAST+](https://blast.ncbi.nlm.nih.gov/doc/blast-help/downloadblastdata.html)

   > Camacho, Christiam, et al. “BLAST+: architecture and applications.” BMC Bioinformatics, vol. 10, no. 421, Dec. 2009, https://doi.org/10.1186/1471-2105-10-421
-
+
 - [BlobToolKit](https://github.com/blobtoolkit/blobtoolkit)

   > Challis, Richard, et al. “BlobToolKit – Interactive Quality Assessment of Genome Assemblies.” G3 Genes|Genomes|Genetics, vol. 10, no. 4, Apr. 2020, pp. 1361–74, https://doi.org/10.1534/g3.119.400908.

@@ -30,7 +30,7 @@

 - [Fasta_windows](https://github.com/tolkit/fasta_windows)

-  > Brown, Max, et al. "Fasta_windows v0.2.3". GitHub, 2021. https://github.com/tolkit/fasta_windows
+  > Brown, Max, et al. "Fasta_windows v0.2.3". GitHub, 2021. https://github.com/tolkit/fasta_windows

 - [Minimap2](https://github.com/lh3/minimap2)

@@ -46,7 +46,7 @@

 - [SeqTK](https://github.com/lh3/seqtk)

-  > Li, Heng. "SeqTK v1.4" GitHub, 2023, https://github.com/lh3/seqtk
+  > Li, Heng. "SeqTK v1.4" GitHub, 2023, https://github.com/lh3/seqtk

 ## Software packaging/containerisation tools

From 4eef0fa1745f1d46f13c8721dd2f7cf840a88fc2 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato
Date: Wed, 2 Oct 2024 08:25:47 +0100
Subject: [PATCH 20/22] The release is going to happen today

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e55b7bfd..20ef6a3c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,7 +3,7 @@

 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

-## [[0.7.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.7.0)] – Psyduck – [2024-XX-YY]
+## [[0.7.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.7.0)] – Psyduck – [2024-10-02]

 The pipeline is now considered to be a complete and suitable replacement for the Snakemake version.

From 0d8a5a9ceded4f1cce84cb88e4b753493cbc84d7 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato
Date: Wed, 2 Oct 2024 08:26:42 +0100
Subject: [PATCH 21/22] Decreased the runtime requirements for illumina alignments

---
 conf/base.config | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/conf/base.config b/conf/base.config
index 75fa0d06..f24b6b99 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -66,11 +66,11 @@ process {
         time   = { check_max( 1.h  * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) }
     }

-    // Temporarily the same settings as CCS
+    // Based on a handful of runs
     withName: '.*:MINIMAP2_ALIGNMENT:MINIMAP2_(HIC|ILMN)' {
         cpus   = { log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) }
         memory = { check_max( 800.MB * log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) + 14.GB * Math.ceil( Math.pow(meta2.genome_size / 1000000000, 0.6)) * task.attempt, 'memory' ) }
-        time   = { check_max( 3.h  * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) }
+        time   = { check_max( 1.h  * Math.ceil( meta.read_count / 75000000 ) * task.attempt, 'time' ) }
     }

     withName: 'WINDOWSTATS_INPUT' {

From 1909646894ddcaf283bfdd86ceec82d5e0867bc2 Mon Sep 17 00:00:00 2001
From: Matthieu Muffato
Date: Wed, 2 Oct 2024 08:28:18 +0100
Subject: [PATCH 22/22] Wording of the changelog

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 20ef6a3c..23549f6f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,7 +10,7 @@ The pipeline is now considered to be a complete and suitable replacement for the

 - Fetch information about the chromosomes of the assemblies. Used to power
   "grid plots".
 - Fill in accurate read information in the blobDir. Users are now required
-  to indicate whether the reads are paired or single.
+  to indicate in the samplesheet whether the reads are paired or single.