From a2820ce1b6a02b31f5f2d9b1b35b7d83771415bd Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Mon, 13 May 2024 13:58:16 -0700 Subject: [PATCH] Update provenance format (#34) * Update provenance, and indenting * Fix provenance output * Remove versioned_outdir * Remove versioned_outdir --- bin/get_kraken_db_metadata.py | 46 ++++++ environments/environment.yml | 1 + main.nf | 79 +++++----- modules/provenance.nf | 46 +++--- modules/taxon_abundance.nf | 288 ++++++++++++++++++---------------- nextflow.config | 70 ++++----- 6 files changed, 305 insertions(+), 225 deletions(-) create mode 100755 bin/get_kraken_db_metadata.py diff --git a/bin/get_kraken_db_metadata.py b/bin/get_kraken_db_metadata.py new file mode 100755 index 0000000..d302386 --- /dev/null +++ b/bin/get_kraken_db_metadata.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 + +import argparse +import json +import os + +import yaml + +def main(args): + db_metadata_path = os.path.join(args.db, 'metadata.json') + db_metadata = {} + if os.path.exists(db_metadata_path): + with open(db_metadata_path, 'r') as f: + db_metadata = json.load(f) + + if not db_metadata: + exit() + + provenance = {} + if args.provenance: + with open(args.provenance, 'r') as f: + provenance = yaml.safe_load(f) + + database_provenance = {} + database_name = db_metadata.get('dbname', None) + if database_name: + database_provenance['database_name'] = database_name + database_version = db_metadata.get('version', None) + if database_version: + database_provenance['database_version'] = database_version + + for provenance_record in provenance: + if provenance_record.get('process_name') == 'kraken2': + provenance_record['databases'] = [] + provenance_record['databases'].append(database_provenance) + + with open(args.provenance, 'w') as f: + yaml.dump(provenance, f) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Get metadata from Kraken database') + parser.add_argument('--provenance', help='Path to provenance file') + parser.add_argument('--db', help='Path to Kraken database') + args = parser.parse_args() + main(args) diff --git a/environments/environment.yml b/environments/environment.yml index 08f89ea..afdfd1e 100644 --- a/environments/environment.yml +++ b/environments/environment.yml @@ -5,6 +5,7 @@ channels: - defaults dependencies: - python=3 + - pyyaml=6.0.1 - fastp=0.20.1 - kraken2=2.1.2 - bracken=2.6.1 diff --git a/main.nf b/main.nf index b3aa1ba..6827067 100644 --- a/main.nf +++ b/main.nf @@ -18,23 +18,26 @@ include { collect_provenance } from './modules/provenance.nf' workflow { - ch_start_time = Channel.of(LocalDateTime.now()) - ch_pipeline_name = Channel.of(workflow.manifest.name) - ch_pipeline_version = Channel.of(workflow.manifest.version) - ch_pipeline_provenance = pipeline_provenance(ch_pipeline_name.combine(ch_pipeline_version).combine(ch_start_time)) + ch_workflow_metadata = Channel.value([ + workflow.sessionId, + workflow.runName, + workflow.manifest.name, + workflow.manifest.version, + workflow.start, + ]) - if (params.samplesheet_input != 'NO_FILE') { - ch_fastq = Channel.fromPath(params.samplesheet_input).splitCsv(header: true).map{ it -> [it['ID'], it['R1'], it['R2']] } - } else { - ch_fastq = Channel.fromFilePairs( params.fastq_search_path, flat: true ).map{ it -> [it[0].split('_')[0], it[1], it[2]] }.unique{ it -> it[0] } - } + if (params.samplesheet_input != 'NO_FILE') { + ch_fastq = Channel.fromPath(params.samplesheet_input).splitCsv(header: true).map{ it -> [it['ID'], it['R1'], it['R2']] } + } else { + ch_fastq = 
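For reference, the transformation the new get_kraken_db_metadata.py applies can be sketched on in-memory data instead of real files. The `dbname` and `version` keys are the ones the script reads from the database's metadata.json; the values below are invented for illustration, and the YAML snippet stands in for a real kraken2 provenance file.

```python
# Minimal sketch of the metadata-injection step in get_kraken_db_metadata.py.
# Database name/version values are invented; key names match the script.
import yaml

db_metadata = {"dbname": "k2_standard", "version": "2024-01-12"}  # stand-in for <db>/metadata.json

provenance = yaml.safe_load("""
- process_name: kraken2
  tools:
    - tool_name: kraken2
      tool_version: 2.1.2
""")

database_provenance = {}
if db_metadata.get("dbname"):
    database_provenance["database_name"] = db_metadata["dbname"]
if db_metadata.get("version"):
    database_provenance["database_version"] = db_metadata["version"]

# Attach the database record to the kraken2 entry, as the script does in-place.
for record in provenance:
    if record.get("process_name") == "kraken2":
        record["databases"] = [database_provenance]

print(yaml.dump(provenance, default_flow_style=False))
```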
Channel.fromFilePairs( params.fastq_search_path, flat: true ).map{ it -> [it[0].split('_')[0], it[1], it[2]] }.unique{ it -> it[0] } + } - ch_kraken_db = Channel.fromPath( "${params.kraken_db}", type: 'dir') - ch_bracken_db = Channel.fromPath( "${params.bracken_db}", type: 'dir') - ch_taxonomic_levels = Channel.of(params.taxonomic_level.split(',')).unique() + ch_kraken_db = Channel.fromPath( "${params.kraken_db}", type: 'dir') + ch_bracken_db = Channel.fromPath( "${params.bracken_db}", type: 'dir') + ch_taxonomic_levels = Channel.of(params.taxonomic_level.split(',')).unique() - main: + main: hash_files(ch_fastq.map{ it -> [it[0], [it[1], it[2]]] }.combine(Channel.of("fastq-input"))) fastp(ch_fastq) @@ -42,40 +45,46 @@ workflow { kraken2(fastp.out.reads.combine(ch_kraken_db)) if (!params.skip_bracken) { - bracken(kraken2.out.report.combine(ch_bracken_db).combine(ch_taxonomic_levels)) - abundance_top_5(bracken.out.abundances) - ch_abundances = bracken.out.abundances + bracken(kraken2.out.report.combine(ch_bracken_db).combine(ch_taxonomic_levels)) + abundance_top_5(bracken.out.abundances) + ch_abundances = bracken.out.abundances } else { - abundance_top_5_kraken(kraken2.out.report.combine(ch_taxonomic_levels)) - ch_abundances = kraken_abundances(kraken2.out.report.combine(ch_taxonomic_levels)) + abundance_top_5_kraken(kraken2.out.report.combine(ch_taxonomic_levels)) + ch_abundances = kraken_abundances(kraken2.out.report.combine(ch_taxonomic_levels)) } if (params.extract_reads) { - ch_to_extract = ch_abundances.map{ it -> it[1] }.splitCsv(header: true).filter{ it -> Float.parseFloat(it['fraction_total_reads']) > params.extract_reads_threshold }.map{ it -> [it['sample_id'], it['taxonomy_id']] } - extract_reads(ch_fastq.join(kraken2.out.report).combine(ch_to_extract, by: 0)) + ch_to_extract = ch_abundances.map{ it -> it[1] }.splitCsv(header: true).filter{ it -> Float.parseFloat(it['fraction_total_reads']) > params.extract_reads_threshold }.map{ it -> [it['sample_id'], it['taxonomy_id']] } + extract_reads(ch_fastq.join(kraken2.out.report).combine(ch_to_extract, by: 0)) } if (params.collect_outputs) { - fastp.out.csv.map{ it -> it[1] }.collectFile(name: params.collected_outputs_prefix + "_fastp.csv", storeDir: params.outdir, keepHeader: true, skip: 1, sort: { it -> it.readLines()[1] }) - if (!params.skip_bracken) { - ch_abundances.collectFile(storeDir: params.outdir, keepHeader: true, skip: 1, sort: { it -> it.readLines()[1].split(',')[0] }){ it -> [params.collected_outputs_prefix + "_" + it[2] + "_bracken_abundances.csv", it[1]] } - abundance_top_5.out.collectFile(storeDir: params.outdir, keepHeader: true, skip: 1, sort: { it -> it.readLines()[1] }){ it -> [params.collected_outputs_prefix + "_" + it[2] + "_bracken_abundances_top_5.csv", it[1]] } - } else { - ch_abundances.collectFile(storeDir: params.outdir, keepHeader: true, skip: 1, sort: { it -> it.readLines()[1].split(',')[0] }){ it -> [params.collected_outputs_prefix + "_" + it[2] + "_kraken_abundances.csv", it[1]] } - abundance_top_5_kraken.out.collectFile(storeDir: params.outdir, keepHeader: true, skip: 1, sort: { it -> it.readLines()[1] }){ it -> [params.collected_outputs_prefix + "_" + it[2] + "_kraken_abundances_top_5.csv", it[1]] } - } + fastp.out.csv.map{ it -> it[1] }.collectFile(name: params.collected_outputs_prefix + "_fastp.csv", storeDir: params.outdir, keepHeader: true, skip: 1, sort: { it -> it.readLines()[1] }) + if (!params.skip_bracken) { + ch_abundances.collectFile(storeDir: params.outdir, keepHeader: true, skip: 1, sort: { 
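The extract_reads branch above selects taxa by thresholding fraction_total_reads in the abundance CSVs. A plain-Python mirror of that filter, with invented rows and a threshold lowered for illustration (the strict `>` comparison against the default of 1.0 passes nothing until the threshold is lowered):

```python
# Mirror of the ch_to_extract filter in main.nf: keep (sample_id, taxonomy_id)
# pairs whose fraction_total_reads exceeds the threshold. Rows are invented.
import csv
import io

abundances_csv = """sample_id,name,taxonomy_id,fraction_total_reads
S1,Escherichia coli,562,0.72
S1,Klebsiella pneumoniae,573,0.005
"""

threshold = 0.01  # stands in for params.extract_reads_threshold
to_extract = [
    (row["sample_id"], row["taxonomy_id"])
    for row in csv.DictReader(io.StringIO(abundances_csv))
    if float(row["fraction_total_reads"]) > threshold
]
print(to_extract)  # [('S1', '562')]
```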
it -> it.readLines()[1].split(',')[0] }){ it -> [params.collected_outputs_prefix + "_" + it[2] + "_bracken_abundances.csv", it[1]] } + abundance_top_5.out.collectFile(storeDir: params.outdir, keepHeader: true, skip: 1, sort: { it -> it.readLines()[1] }){ it -> [params.collected_outputs_prefix + "_" + it[2] + "_bracken_abundances_top_5.csv", it[1]] } + } else { + ch_abundances.collectFile(storeDir: params.outdir, keepHeader: true, skip: 1, sort: { it -> it.readLines()[1].split(',')[0] }){ it -> [params.collected_outputs_prefix + "_" + it[2] + "_kraken_abundances.csv", it[1]] } + abundance_top_5_kraken.out.collectFile(storeDir: params.outdir, keepHeader: true, skip: 1, sort: { it -> it.readLines()[1] }){ it -> [params.collected_outputs_prefix + "_" + it[2] + "_kraken_abundances_top_5.csv", it[1]] } + } } - ch_provenance = fastp.out.provenance - - ch_provenance = ch_provenance.join(kraken2.out.provenance).map{ it -> [it[0], [it[1], it[2]]] } + // Collect Provenance + // The basic idea is to build up a channel with the following structure: + // [sample_id, [provenance_file_1.yml, provenance_file_2.yml, provenance_file_3.yml...]] + // At each step, we add another provenance file to the list using the << operator... + // ...and then concatenate them all together in the 'collect_provenance' process. + ch_sample_ids = ch_fastq.map{ it -> it[0] } + ch_provenance = ch_sample_ids + ch_pipeline_provenance = pipeline_provenance(ch_workflow_metadata) + ch_provenance = ch_provenance.combine(ch_pipeline_provenance).map{ it -> [it[0], [it[1]]] } + ch_provenance = ch_provenance.join(hash_files.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + ch_provenance = ch_provenance.join(fastp.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + ch_provenance = ch_provenance.join(kraken2.out.provenance).map{ it -> [it[0], it[1] << it[2]] } if (!params.skip_bracken) { - ch_provenance = ch_provenance.join(bracken.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + ch_provenance = ch_provenance.join(bracken.out.provenance).map{ it -> [it[0], it[1] << it[2]] } } - - ch_provenance = ch_provenance.join(hash_files.out.provenance).map{ it -> [it[0], it[1] << it[2]] } - ch_provenance = ch_provenance.join(ch_fastq.map{ it -> it[0] }.combine(ch_pipeline_provenance)).map{ it -> [it[0], it[1] << it[2]] } collect_provenance(ch_provenance) } diff --git a/modules/provenance.nf b/modules/provenance.nf index 60fe7ab..a6b123f 100644 --- a/modules/provenance.nf +++ b/modules/provenance.nf @@ -1,37 +1,41 @@ process collect_provenance { - tag { sample_id } + tag { sample_id } - executor 'local' + executor 'local' - publishDir params.versioned_outdir ? 
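The provenance-collection chain introduced above is easier to follow outside of channel syntax. In plain Python terms it builds one (sample_id, [files...]) pair per sample, appending one file per joined process — a sketch, with illustrative file names:

```python
# Emulation of the provenance channel: start each sample with the pipeline
# provenance file, then append one per-process file per join. Names invented.
samples = ["S1", "S2"]
provenance = {s: ["pipeline_provenance.yml"] for s in samples}

for step in ["hash_files", "fastp", "kraken2", "bracken"]:
    for s in samples:
        provenance[s].append(f"{s}_{step}_provenance.yml")  # the `<<` append

print(provenance["S1"])
# ['pipeline_provenance.yml', 'S1_hash_files_provenance.yml',
#  'S1_fastp_provenance.yml', 'S1_kraken2_provenance.yml',
#  'S1_bracken_provenance.yml']
```

collect_provenance then simply concatenates each sample's list into a single timestamped YAML file.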
"${params.outdir}/${sample_id}/${params.pipeline_short_name}-v${params.pipeline_minor_version}-output" : "${params.outdir}/${sample_id}", pattern: "${sample_id}_*_provenance.yml", mode: 'copy' + publishDir "${params.outdir}/${sample_id}", pattern: "${sample_id}_*_provenance.yml", mode: 'copy' - input: - tuple val(sample_id), path(provenance_files) + input: + tuple val(sample_id), path(provenance_files) - output: - tuple val(sample_id), file("${sample_id}_*_provenance.yml") + output: + tuple val(sample_id), file("${sample_id}_*_provenance.yml") - script: - """ - cat ${provenance_files} > ${sample_id}_\$(date +%Y%m%d%H%M%S)_provenance.yml - """ + script: + """ + cat ${provenance_files} > ${sample_id}_\$(date +%Y%m%d%H%M%S)_provenance.yml + """ } process pipeline_provenance { - tag { pipeline_name + " / " + pipeline_version } + tag { pipeline_name + " / " + pipeline_version } - executor 'local' + executor 'local' - input: - tuple val(pipeline_name), val(pipeline_version), val(analysis_start) + input: + tuple val(session_id), val(run_name), val(pipeline_name), val(pipeline_version), val(timestamp_analysis_start) - output: - file("pipeline_provenance.yml") + output: + file("pipeline_provenance.yml") - script: - """ - printf -- "- pipeline_name: ${pipeline_name}\\n pipeline_version: ${pipeline_version}\\n- timestamp_analysis_start: ${analysis_start}\\n" > pipeline_provenance.yml - """ + script: + """ + printf -- "- pipeline_name: ${pipeline_name}\\n" >> pipeline_provenance.yml + printf -- " pipeline_version: ${pipeline_version}\\n" >> pipeline_provenance.yml + printf -- " nextflow_session_id: ${session_id}\\n" >> pipeline_provenance.yml + printf -- " nextflow_run_name: ${run_name}\\n" >> pipeline_provenance.yml + printf -- " timestamp_analysis_start: ${timestamp_analysis_start}\\n" >> pipeline_provenance.yml + """ } diff --git a/modules/taxon_abundance.nf b/modules/taxon_abundance.nf index 5b4dc65..1bad0f8 100644 --- a/modules/taxon_abundance.nf +++ b/modules/taxon_abundance.nf @@ -1,191 +1,211 @@ process fastp { - tag { sample_id } + tag { sample_id } - publishDir params.versioned_outdir ? 
"${params.outdir}/${sample_id}/${params.pipeline_short_name}-v${params.pipeline_minor_version}-output" : "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_fastp.{json,csv}" + publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_fastp.{json,csv}" - input: - tuple val(sample_id), path(reads_1), path(reads_2) + input: + tuple val(sample_id), path(reads_1), path(reads_2) - output: - tuple val(sample_id), path("${sample_id}_fastp.json"), emit: json + output: + tuple val(sample_id), path("${sample_id}_fastp.json"), emit: json tuple val(sample_id), path("${sample_id}_fastp.csv"), emit: csv - tuple val(sample_id), path("${sample_id}_trimmed_R1.fastq.gz"), path("${sample_id}_trimmed_R2.fastq.gz"), emit: reads - tuple val(sample_id), path("${sample_id}_fastp_provenance.yml"), emit: provenance - - script: - """ - printf -- "- process_name: fastp\\n" > ${sample_id}_fastp_provenance.yml - printf -- " tool_name: fastp\\n tool_version: \$(fastp --version 2>&1 | cut -d ' ' -f 2)\\n" >> ${sample_id}_fastp_provenance.yml - fastp -i ${reads_1} -I ${reads_2} -o ${sample_id}_trimmed_R1.fastq.gz -O ${sample_id}_trimmed_R2.fastq.gz - mv fastp.json ${sample_id}_fastp.json - fastp_json_to_csv.py -s ${sample_id} ${sample_id}_fastp.json > ${sample_id}_fastp.csv - """ + tuple val(sample_id), path("${sample_id}_trimmed_R1.fastq.gz"), path("${sample_id}_trimmed_R2.fastq.gz"), emit: reads + tuple val(sample_id), path("${sample_id}_fastp_provenance.yml"), emit: provenance + + script: + """ + printf -- "- process_name: fastp\\n" >> ${sample_id}_fastp_provenance.yml + printf -- " tools:\\n" >> ${sample_id}_fastp_provenance.yml + printf -- " - tool_name: fastp\\n" >> ${sample_id}_fastp_provenance.yml + printf -- " tool_version: \$(fastp --version 2>&1 | cut -d ' ' -f 2)\\n" >> ${sample_id}_fastp_provenance.yml + + fastp \ + -i ${reads_1} \ + -I ${reads_2} \ + -o ${sample_id}_trimmed_R1.fastq.gz \ + -O ${sample_id}_trimmed_R2.fastq.gz + + mv fastp.json ${sample_id}_fastp.json + + fastp_json_to_csv.py -s ${sample_id} ${sample_id}_fastp.json > ${sample_id}_fastp.csv + """ } process kraken2 { - tag { sample_id } + tag { sample_id } - publishDir params.versioned_outdir ? 
"${params.outdir}/${sample_id}/${params.pipeline_short_name}-v${params.pipeline_minor_version}-output" : "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_kraken2_report.txt" + publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_kraken2_report.txt" - input: - tuple val(sample_id), path(reads_1), path(reads_2), path(kraken2_db) + input: + tuple val(sample_id), path(reads_1), path(reads_2), path(kraken2_db) - output: - tuple val(sample_id), path("${sample_id}_kraken2_output.tsv"), path("${sample_id}_kraken2_report.txt"), emit: report - tuple val(sample_id), path("${sample_id}_kraken2_provenance.yml"), emit: provenance + output: + tuple val(sample_id), path("${sample_id}_kraken2_output.tsv"), path("${sample_id}_kraken2_report.txt"), emit: report + tuple val(sample_id), path("${sample_id}_kraken2_provenance.yml"), emit: provenance - script: - """ - printf -- "- process_name: kraken2\\n" > ${sample_id}_kraken2_provenance.yml - printf -- " tool_name: kraken2\\n tool_version: \$(kraken2 --version | grep 'version' | cut -d ' ' -f 3)\\n" >> ${sample_id}_kraken2_provenance.yml - printf -- " database_path: \$(readlink -f ${kraken2_db})\\n" >> ${sample_id}_kraken2_provenance.yml - - kraken2 \ - --db ${kraken2_db} \ - --threads ${task.cpus} \ - --confidence ${params.confidence} \ - --output ${sample_id}_kraken2_output.tsv \ - --report ${sample_id}_kraken2_report.txt \ - --paired ${reads_1} ${reads_2} + script: + """ + printf -- "- process_name: kraken2\\n" >> ${sample_id}_kraken2_provenance.yml + printf -- " tools:\\n" >> ${sample_id}_kraken2_provenance.yml + printf -- " - tool_name: kraken2\\n" >> ${sample_id}_kraken2_provenance.yml + printf -- " tool_version: \$(kraken2 --version | grep 'version' | cut -d ' ' -f 3)\\n" >> ${sample_id}_kraken2_provenance.yml + printf -- " parameters:\\n" >> ${sample_id}_kraken2_provenance.yml + printf -- " - name: confidence\\n" >> ${sample_id}_kraken2_provenance.yml + printf -- " value: ${params.confidence}\\n" >> ${sample_id}_kraken2_provenance.yml + + get_kraken_db_metadata.py --db ${kraken2_db} --provenance ${sample_id}_kraken2_provenance.yml + + kraken2 \ + --db ${kraken2_db} \ + --threads ${task.cpus} \ + --confidence ${params.confidence} \ + --output ${sample_id}_kraken2_output.tsv \ + --report ${sample_id}_kraken2_report.txt \ + --paired ${reads_1} ${reads_2} """ } process bracken { - tag { sample_id + " / " + taxonomic_level } + tag { sample_id + " / " + taxonomic_level } - errorStrategy 'ignore' - - publishDir params.versioned_outdir ? 
"${params.outdir}/${sample_id}/${params.pipeline_short_name}-v${params.pipeline_minor_version}-output" : "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_*_bracken_abundances.csv" - - input: - tuple val(sample_id), path(kraken2_output), path(kraken2_report), path(bracken_db), val(taxonomic_level) - - output: - tuple val(sample_id), path("${sample_id}_${taxonomic_level}_bracken_abundances.csv"), val(taxonomic_level), emit: abundances - tuple val(sample_id), path("${sample_id}_bracken_provenance.yml"), emit: provenance - - script: - """ - printf -- "- process_name: bracken\\n" > ${sample_id}_bracken_provenance.yml - printf -- " tool_name: bracken\\n tool_version: 2.6.1\\n" >> ${sample_id}_bracken_provenance.yml - printf -- " database_path: \$(readlink -f ${bracken_db})\\n" >> ${sample_id}_bracken_provenance.yml - printf -- " taxonomic_level: ${taxonomic_level}\\n" >> ${sample_id}_bracken_provenance.yml - - bracken -d ${bracken_db} \ - -i ${kraken2_report} \ - -w ${sample_id}_${taxonomic_level}_bracken.txt \ - -o ${sample_id}_${taxonomic_level}_bracken_abundances_unsorted.tsv \ - -r ${params.read_length} \ - -l ${taxonomic_level} - - paste <(echo "sample_id") <(head -n 1 ${sample_id}_${taxonomic_level}_bracken_abundances_unsorted.tsv) | tr \$'\\t' ',' > bracken_abundances_header.csv - - adjust_bracken_percentages_for_unclassified_reads.py \ - -k ${kraken2_report} \ - -b ${sample_id}_${taxonomic_level}_bracken.txt \ - -a ${sample_id}_${taxonomic_level}_bracken_abundances_unsorted.tsv \ - > ${sample_id}_${taxonomic_level}_bracken_abundances_unsorted_with_unclassified.csv - - tail -n+2 ${sample_id}_${taxonomic_level}_bracken_abundances_unsorted_with_unclassified.csv | \ - sort -t ',' -nrk 7,7 | \ - awk -F ',' 'BEGIN {OFS=FS}; {print "${sample_id}",\$0}' > ${sample_id}_${taxonomic_level}_bracken_abundances_data.csv - - cat bracken_abundances_header.csv ${sample_id}_${taxonomic_level}_bracken_abundances_data.csv > ${sample_id}_${taxonomic_level}_bracken_abundances.csv - """ + errorStrategy 'ignore' + + publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_*_bracken_abundances.csv" + + input: + tuple val(sample_id), path(kraken2_output), path(kraken2_report), path(bracken_db), val(taxonomic_level) + + output: + tuple val(sample_id), path("${sample_id}_${taxonomic_level}_bracken_abundances.csv"), val(taxonomic_level), emit: abundances + tuple val(sample_id), path("${sample_id}_bracken_provenance.yml"), emit: provenance + + script: + """ + printf -- "- process_name: bracken\\n" >> ${sample_id}_bracken_provenance.yml + printf -- " tools:\\n" >> ${sample_id}_bracken_provenance.yml + printf -- " - tool_name: bracken\\n" >> ${sample_id}_bracken_provenance.yml + printf -- " tool_version: 2.6.1\\n" >> ${sample_id}_bracken_provenance.yml + printf -- " parameters:\\n" >> ${sample_id}_bracken_provenance.yml + printf -- " - name: read_length\\n" >> ${sample_id}_bracken_provenance.yml + printf -- " value: ${params.read_length}\\n" >> ${sample_id}_bracken_provenance.yml + printf -- " - name: taxonomic_level\\n" >> ${sample_id}_bracken_provenance.yml + printf -- " value: ${taxonomic_level}\\n" >> ${sample_id}_bracken_provenance.yml + + bracken -d ${bracken_db} \ + -i ${kraken2_report} \ + -w ${sample_id}_${taxonomic_level}_bracken.txt \ + -o ${sample_id}_${taxonomic_level}_bracken_abundances_unsorted.tsv \ + -r ${params.read_length} \ + -l ${taxonomic_level} + + paste <(echo "sample_id") <(head -n 1 ${sample_id}_${taxonomic_level}_bracken_abundances_unsorted.tsv) | 
tr \$'\\t' ',' > bracken_abundances_header.csv + + adjust_bracken_percentages_for_unclassified_reads.py \ + -k ${kraken2_report} \ + -b ${sample_id}_${taxonomic_level}_bracken.txt \ + -a ${sample_id}_${taxonomic_level}_bracken_abundances_unsorted.tsv \ + > ${sample_id}_${taxonomic_level}_bracken_abundances_unsorted_with_unclassified.csv + + tail -n+2 ${sample_id}_${taxonomic_level}_bracken_abundances_unsorted_with_unclassified.csv | \ + sort -t ',' -nrk 7,7 | \ + awk -F ',' 'BEGIN {OFS=FS}; {print "${sample_id}",\$0}' > ${sample_id}_${taxonomic_level}_bracken_abundances_data.csv + + cat bracken_abundances_header.csv ${sample_id}_${taxonomic_level}_bracken_abundances_data.csv > ${sample_id}_${taxonomic_level}_bracken_abundances.csv + """ } process abundance_top_5 { - tag { sample_id + " / " + taxonomic_level } + tag { sample_id + " / " + taxonomic_level } - executor 'local' + executor 'local' - publishDir params.versioned_outdir ? "${params.outdir}/${sample_id}/${params.pipeline_short_name}-v${params.pipeline_minor_version}-output" : "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_*_top_5.csv" + publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_*_top_5.csv" - input: - tuple val(sample_id), path(bracken_abundances), val(taxonomic_level) + input: + tuple val(sample_id), path(bracken_abundances), val(taxonomic_level) - output: - tuple val(sample_id), path("${sample_id}_${taxonomic_level}_top_5.csv"), val(taxonomic_level) + output: + tuple val(sample_id), path("${sample_id}_${taxonomic_level}_top_5.csv"), val(taxonomic_level) - script: - """ - bracken_top_n_linelist.py ${bracken_abundances} -n 5 -s ${sample_id} > ${sample_id}_${taxonomic_level}_top_5.csv - """ + script: + """ + bracken_top_n_linelist.py ${bracken_abundances} -n 5 -s ${sample_id} > ${sample_id}_${taxonomic_level}_top_5.csv + """ } process abundance_top_5_kraken { - tag { sample_id + " / " + taxonomic_level } - - executor 'local' + tag { sample_id + " / " + taxonomic_level } - publishDir params.versioned_outdir ? "${params.outdir}/${sample_id}/${params.pipeline_short_name}-v${params.pipeline_minor_version}-output" : "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_*_top_5.csv" + executor 'local' - input: - tuple val(sample_id), path(kraken_output), path(kraken_report), val(taxonomic_level) + publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_*_top_5.csv" - output: - tuple val(sample_id), path("${sample_id}_${taxonomic_level}_top_5.csv"), val(taxonomic_level) + input: + tuple val(sample_id), path(kraken_output), path(kraken_report), val(taxonomic_level) - script: - """ - kraken_top_n_linelist.py ${kraken_report} -n 5 -s ${sample_id} --taxonomy-lvl ${taxonomic_level} > ${sample_id}_${taxonomic_level}_top_5.csv - """ + output: + tuple val(sample_id), path("${sample_id}_${taxonomic_level}_top_5.csv"), val(taxonomic_level) + script: + """ + kraken_top_n_linelist.py ${kraken_report} -n 5 -s ${sample_id} --taxonomy-lvl ${taxonomic_level} > ${sample_id}_${taxonomic_level}_top_5.csv + """ } process kraken_abundances { - tag { sample_id + " / " + taxonomic_level } - - executor 'local' + tag { sample_id + " / " + taxonomic_level } - publishDir params.versioned_outdir ? 
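The header/sort/awk sequence in the bracken process builds a CSV sorted on the seventh column (fraction_total_reads), descending, with the sample ID prefixed onto each row. The same step in Python, on invented rows:

```python
# Mirror of `sort -t ',' -nrk 7,7` plus the awk sample_id prefix.
# Rows follow bracken's column order; the values are invented.
rows = [
    "Klebsiella pneumoniae,573,S,1000,10,1010,0.01",
    "Escherichia coli,562,S,70000,2000,72000,0.72",
]
sample_id = "S1"
rows.sort(key=lambda row: float(row.split(",")[6]), reverse=True)
print([f"{sample_id},{row}" for row in rows])
```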
"${params.outdir}/${sample_id}/${params.pipeline_short_name}-v${params.pipeline_minor_version}-output" : "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_*_kraken2_abundances.csv" + executor 'local' - input: - tuple val(sample_id), path(kraken_output), path(kraken_report), val(taxonomic_level) + publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_*_kraken2_abundances.csv" - output: - tuple val(sample_id), path("${sample_id}_${taxonomic_level}_kraken2_abundances.csv"), val(taxonomic_level) + input: + tuple val(sample_id), path(kraken_output), path(kraken_report), val(taxonomic_level) - script: - """ - kraken_abundances.py ${kraken_report} -s ${sample_id} --taxonomy-lvl ${taxonomic_level} > ${sample_id}_${taxonomic_level}_kraken2_abundances.csv - """ + output: + tuple val(sample_id), path("${sample_id}_${taxonomic_level}_kraken2_abundances.csv"), val(taxonomic_level) + script: + """ + kraken_abundances.py ${kraken_report} -s ${sample_id} --taxonomy-lvl ${taxonomic_level} > ${sample_id}_${taxonomic_level}_kraken2_abundances.csv + """ } process extract_reads { - tag { sample_id + ' / ' + taxid } + tag { sample_id + ' / ' + taxid } - publishDir params.versioned_outdir ? "${params.outdir}/${sample_id}/${params.pipeline_short_name}-v${params.pipeline_minor_version}-output/extracted_reads_by_taxid" : "${params.outdir}/${sample_id}/extracted_reads_by_taxid", mode: 'copy', pattern: "${taxid}" + publishDir "${params.outdir}/${sample_id}/extracted_reads_by_taxid", mode: 'copy', pattern: "${taxid}" - input: - tuple val(sample_id), path(reads_1), path(reads_2), path(kraken2_output), path(kraken2_report), val(taxid) + input: + tuple val(sample_id), path(reads_1), path(reads_2), path(kraken2_output), path(kraken2_report), val(taxid) - output: - tuple val(sample_id), val(taxid), path("${taxid}", type: "dir") + output: + tuple val(sample_id), val(taxid), path("${taxid}", type: "dir") - script: - """ - mkdir ${taxid} - extract_kraken_reads.py \ - -k ${kraken2_output} \ - -r ${kraken2_report} \ - -1 ${reads_1} \ - -2 ${reads_2} \ - --taxid ${taxid} \ - --fastq-output \ - -o ${taxid}/${sample_id}-taxid-${taxid}_R1.fastq \ - -o2 ${taxid}/${sample_id}-taxid-${taxid}_R2.fastq \ - --include-children - gzip ${taxid}/*.fastq - """ -} \ No newline at end of file + script: + """ + mkdir ${taxid} + + extract_kraken_reads.py \ + -k ${kraken2_output} \ + -r ${kraken2_report} \ + -1 ${reads_1} \ + -2 ${reads_2} \ + --taxid ${taxid} \ + --fastq-output \ + -o ${taxid}/${sample_id}-taxid-${taxid}_R1.fastq \ + -o2 ${taxid}/${sample_id}-taxid-${taxid}_R2.fastq \ + --include-children + + gzip ${taxid}/*.fastq + """ +} diff --git a/nextflow.config b/nextflow.config index 8ddc335..da2f9e2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,32 +1,32 @@ manifest { - author = 'Dan Fornika' - name = 'BCCDC-PHL/taxon-abundance' - description = 'Taxon Abundance' - mainScript = 'main.nf' - nextflowVersion = '>=20.01.0' - version = '0.1.6' + author = 'Dan Fornika' + name = 'BCCDC-PHL/taxon-abundance' + description = 'Taxon Abundance' + mainScript = 'main.nf' + nextflowVersion = '>=20.01.0' + version = '0.1.7' } params { - profile = false - cache = '' - illumina_suffixes = ['*_R{1,2}_001', '*_R{1,2}', '*_{1,2}' ] - fastq_exts = ['.fastq.gz', '.fq.gz', '.fastq', '.fq'] - fastq_search_path = makeFastqSearchPath( illumina_suffixes, fastq_exts ) - samplesheet_input = 'NO_FILE' - kraken_db = '/data/ref_databases/kraken2/latest_standard' - bracken_db = 
'/data/ref_databases/kraken2/latest_standard' - confidence = 0.0 - taxonomic_level = "S" - read_length = 150 - versioned_outdir = false - extract_reads = false - extract_reads_threshold = 1.0 - skip_bracken = false - collect_outputs = false - collected_outputs_prefix = 'collected' - pipeline_short_name = parsePipelineName(manifest.toMap().get('name')) - pipeline_minor_version = parseMinorVersion(manifest.toMap().get('version')) + profile = false + cache = '' + illumina_suffixes = ['*_R{1,2}_001', '*_R{1,2}', '*_{1,2}' ] + fastq_exts = ['.fastq.gz', '.fq.gz', '.fastq', '.fq'] + fastq_search_path = makeFastqSearchPath( illumina_suffixes, fastq_exts ) + samplesheet_input = 'NO_FILE' + kraken_db = '/data/ref_databases/kraken2/latest_standard' + bracken_db = '/data/ref_databases/kraken2/latest_standard' + confidence = 0.0 + taxonomic_level = "S" + read_length = 150 + versioned_outdir = false + extract_reads = false + extract_reads_threshold = 1.0 + skip_bracken = false + collect_outputs = false + collected_outputs_prefix = 'collected' + pipeline_short_name = parsePipelineName(manifest.toMap().get('name')) + pipeline_minor_version = parseMinorVersion(manifest.toMap().get('version')) } def parseMinorVersion(version) { @@ -51,17 +51,17 @@ def makeFastqSearchPath ( illumina_suffixes, fastq_exts ) { } profiles { - conda { - process.conda = "$baseDir/environments/environment.yml" - if (params.cache){ - conda.cacheDir = params.cache - } - } + conda { + process.conda = "$baseDir/environments/environment.yml" + if (params.cache){ + conda.cacheDir = params.cache + } + } } process { - withName: kraken2 { - cpus = 8 - memory = '64 GB' - } + withName: kraken2 { + cpus = 8 + memory = '72 GB' + } }
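Putting the pieces together: collect_provenance concatenates pipeline_provenance.yml with the per-process files in the order they were joined, so each sample's final provenance file parses as a single YAML list. A sketch of a combined file (the hash_files and bracken records are omitted since their formats aren't fully shown in this diff; all values invented):

```python
# Parsing a combined per-sample provenance file written by collect_provenance.
# All field values are invented; the record order follows main.nf.
import yaml

combined = yaml.safe_load("""
- pipeline_name: BCCDC-PHL/taxon-abundance
  pipeline_version: 0.1.7
  nextflow_run_name: agitated_pasteur
- process_name: fastp
  tools:
    - tool_name: fastp
      tool_version: 0.20.1
- process_name: kraken2
  tools:
    - tool_name: kraken2
      tool_version: 2.1.2
  databases:
    - database_name: k2_standard
      database_version: "2024-01-12"
""")

for record in combined:
    print(record.get("process_name", "pipeline"))
```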