From 206a8e7219a33b0b02bf4d95731d8592acfbe265 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Fri, 15 Dec 2023 13:53:58 -0800 Subject: [PATCH] Alt insert size metrics (#9) * Add samtools stats-based insert size calculation * Incorporate alignment qc metrics from qualimap and samtools * use unix dialect for csv --- README.md | 3 + bin/combine_alignment_qc.py | 94 ++++++++++++++ bin/parse_samtools_stats_summary.py | 132 ++++++++++++++++++++ bin/qualimap_bamqc_genome_results_to_csv.py | 12 -- main.nf | 11 +- modules/alignment_variants.nf | 88 +++++++++++-- nextflow.config | 2 +- 7 files changed, 317 insertions(+), 25 deletions(-) create mode 100755 bin/combine_alignment_qc.py create mode 100755 bin/parse_samtools_stats_summary.py diff --git a/README.md b/README.md index 38cbf65..45f406a 100644 --- a/README.md +++ b/README.md @@ -89,10 +89,13 @@ flowchart TD filtlong --> minimap2 minimap2 --> alignments alignments --> qualimap(qualimap_bamqc) + alignments --> samtools_stats(samtools_stats) alignments --> freebayes(freebayes) alignments --> samtools_mpileup(samtools_mpileup) samtools_mpileup --> generate_low_coverage_bed(generate_low_coverage_bed) samtools_mpileup --> percent_coverage_by_depth(percent_coverage_by_depth) + qualimap --> combined_alignment_qc(combined_alignment_qc) + samtools_stats --> combined_alignment_qc ``` ## Outputs diff --git a/bin/combine_alignment_qc.py b/bin/combine_alignment_qc.py new file mode 100755 index 0000000..28afbc6 --- /dev/null +++ b/bin/combine_alignment_qc.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 + +import argparse +import csv +import json +import sys + +def parse_samtools_stats_summary(samtools_stats_summary_file): + samtools_stats_summary_data = [] + with open(samtools_stats_summary_file, 'r') as f: + reader = csv.DictReader(f, delimiter=',') + for row in reader: + record = {} + for k, v in row.items(): + new_k = k + '__samtools' + record[new_k] = v + samtools_stats_summary_data.append(record) + + first_record = samtools_stats_summary_data[0] + + return first_record + + +def parse_qualimap_bamqc_genome_results(qualimap_bamqc_genome_results_file): + qualimap_bamqc_genome_results_data = [] + with open(qualimap_bamqc_genome_results_file) as f: + reader = csv.DictReader(f, delimiter=',') + for row in reader: + record = {} + for k, v in row.items(): + new_k = k + '__qualimap' + record[new_k] = v + qualimap_bamqc_genome_results_data.append(record) + + first_record = qualimap_bamqc_genome_results_data[0] + + return first_record + + +def main(args): + samtools_stats_summary_data = parse_samtools_stats_summary(args.samtools_stats_summary) + qualimap_bamqc_genome_results_data = parse_qualimap_bamqc_genome_results(args.qualimap_bamqc_genome_results) + + samtools_fields = [ + 'reads_mapped', + 'reads_mapped_and_paired', + 'reads_unmapped', + 'reads_properly_paired', + 'reads_paired', + 'reads_duplicated', + 'error_rate', + 'non-primary_alignments', + 'supplementary_alignments', + 'average_length', + 'average_first_fragment_length', + 'average_last_fragment_length', + 'insert_size_average', + 'insert_size_standard_deviation', + ] + + qualimap_fields = [ + 'mean_depth_coverage', + 'stdev_depth_coverage', + ] + + selected_samtools_data = { k: samtools_stats_summary_data[k + '__samtools'] for k in samtools_fields } + selected_qualimap_data = { k: qualimap_bamqc_genome_results_data[ k + '__qualimap'] for k in qualimap_fields } + combined_data = selected_samtools_data.copy() + combined_data.update(selected_qualimap_data) + + output_fields = [] + + if args.sample_id: + combined_data['sample_id'] = args.sample_id + output_fields.append('sample_id') + if args.read_type: + combined_data['read_type'] = args.read_type + output_fields.append('read_type') + + output_fields += samtools_fields + output_fields += qualimap_fields + + writer = csv.DictWriter(sys.stdout, dialect='unix', fieldnames=output_fields, quoting=csv.QUOTE_MINIMAL, extrasaction='ignore') + writer.writeheader() + writer.writerow(combined_data) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--samtools-stats-summary', required=True) + parser.add_argument('--qualimap-bamqc-genome-results', required=True) + parser.add_argument('--sample-id') + parser.add_argument('--read-type') + args = parser.parse_args() + main(args) diff --git a/bin/parse_samtools_stats_summary.py b/bin/parse_samtools_stats_summary.py new file mode 100755 index 0000000..08a6c2f --- /dev/null +++ b/bin/parse_samtools_stats_summary.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 + +import argparse +import csv +import json +import sys + + +def parse_samtools_stats_summary(samtools_stats_summary_file): + output_data = {} + int_fields = [ + 'raw_total_sequences', + 'filtered_sequences', + 'sequences', + 'first_fragments', + 'last_fragments', + 'reads_mapped', + 'reads_mapped_and_paired', + 'reads_unmapped', + 'reads_properly_paired', + 'reads_paired', + 'reads_duplicated', + 'reads_MQ0', + 'reads_QC_failed', + 'non-primary_alignments', + 'supplementary_alignments', + 'total_length', + 'total_first_fragment_length', + 'total_last_fragment_length', + 'bases_mapped', + 'bases_mapped_cigar', + 'bases_trimmed', + 'bases_duplicated', + 'mismatches', + 'maximum_length', + 'maximum_first_fragment_length', + 'maximum_last_fragment_length', + 'inward_oriented_pairs', + 'outward_oriented_pairs', + 'pairs_with_other_orientation', + 'pairs_on_different_chromosomes', + ] + float_fields = [ + 'error_rate', + 'average_length', + 'average_first_fragment_length', + 'average_last_fragment_length', + 'average_quality', + 'insert_size_average', + 'insert_size_standard_deviation', + ] + with open(samtools_stats_summary_file, 'r') as f: + for line in f: + line = line.strip() + line_split = line.split('\t') + key = line_split[0].strip().rstrip(':').replace(' ', '_').replace('(', '').replace(')', '') + if key == '1st_fragments': + key = 'first_fragments' + if key.endswith('%'): + key = key.replace('_%', '') + elif key in int_fields: + try: + output_data[key] = int(line_split[1]) + except ValueError: + output_data[key] = None + elif key == 'is_sorted': + output_data[key] = line_split[1] == '1' + elif key in float_fields: + try: + output_data[key] = float(line_split[1]) + except ValueError: + output_data[key] = None + else: + output_data[key] = line_split[1] + + return output_data + + +def main(args): + samtools_stats_summary_data = parse_samtools_stats_summary(args.input) + + if args.sample_id: + samtools_stats_summary_data['sample_id'] = args.sample_id + + output_fields = [ + 'raw_total_sequences', + 'filtered_sequences', + 'first_fragments', + 'last_fragments', + 'reads_mapped', + 'reads_mapped_and_paired', + 'reads_unmapped', + 'reads_properly_paired', + 'reads_paired', + 'reads_duplicated', + 'reads_MQ0', + 'reads_QC_failed', + 'non-primary_alignments', + 'supplementary_alignments', + 'total_length', + 'total_first_fragment_length', + 'total_last_fragment_length', + 'bases_mapped', + 'bases_mapped_cigar', + 'bases_trimmed', + 'bases_duplicated', + 'mismatches', + 'error_rate', + 'average_length', + 'average_first_fragment_length', + 'average_last_fragment_length', + 'average_quality', + 'insert_size_average', + 'insert_size_standard_deviation', + 'inward_oriented_pairs', + 'outward_oriented_pairs', + 'pairs_with_other_orientation', + 'pairs_on_different_chromosomes', + ] + if args.sample_id: + output_fields = ['sample_id'] + output_fields + + writer = csv.DictWriter(sys.stdout, fieldnames=output_fields, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL, extrasaction='ignore') + writer.writeheader() + writer.writerow(samtools_stats_summary_data) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Parse samtools stats summary file') + parser.add_argument('-i', '--input', type=str, required=True, help='samtools stats summary file') + parser.add_argument('-s', '--sample-id', type=str, required=True, help='sample id') + args = parser.parse_args() + main(args) diff --git a/bin/qualimap_bamqc_genome_results_to_csv.py b/bin/qualimap_bamqc_genome_results_to_csv.py index c149b36..d52ec42 100755 --- a/bin/qualimap_bamqc_genome_results_to_csv.py +++ b/bin/qualimap_bamqc_genome_results_to_csv.py @@ -24,15 +24,6 @@ def main(args): if line.startswith('number of secondary alignments'): num_secondary_alignments = int(line.split('=')[1].strip().replace(',', '')) output_data['num_secondary_alignments'] = num_secondary_alignments - if line.startswith('mean insert size'): - mean_insert_size = line.split('=')[1].strip().replace(',', '') - output_data['mean_insert_size'] = float(mean_insert_size) - if line.startswith('median insert size'): - median_insert_size = line.split('=')[1].strip().replace(',', '') - output_data['median_insert_size'] = float(median_insert_size) - if line.startswith('std insert size'): - stdev_insert_size = line.split('=')[1].strip().replace(',', '') - output_data['stdev_insert_size'] = float(stdev_insert_size) if line.startswith('duplication rate'): duplication_rate = line.split('=')[1].strip().replace('%', '') output_data['duplication_rate_percent'] = round(float(duplication_rate), 2) @@ -91,9 +82,6 @@ def main(args): 'num_reads', 'num_mapped_reads', 'percent_mapped_reads', - 'mean_insert_size', - 'median_insert_size', - 'stdev_insert_size', 'mean_mapping_quality', 'error_rate', 'number_of_mismatches', diff --git a/main.nf b/main.nf index 5cb5ddc..62c9166 100644 --- a/main.nf +++ b/main.nf @@ -18,6 +18,8 @@ include { minimap2 } from './modules/alignment_variants.nf include { freebayes } from './modules/alignment_variants.nf' include { qualimap_bamqc } from './modules/alignment_variants.nf' include { samtools_mpileup } from './modules/alignment_variants.nf' +include { samtools_stats } from './modules/alignment_variants.nf' +include { combine_alignment_qc } from './modules/alignment_variants.nf' include { generate_low_coverage_bed } from './modules/alignment_variants.nf' include { percent_coverage_by_depth } from './modules/alignment_variants.nf' include { pipeline_provenance } from './modules/provenance.nf' @@ -84,6 +86,10 @@ workflow { samtools_mpileup(ch_alignments.join(ch_ref)) + samtools_stats(ch_alignments) + + combine_alignment_qc(qualimap_bamqc.out.alignment_qc.join(samtools_stats.out.stats_summary_csv, by: [0, 1])) + ch_depths = samtools_mpileup.out.depths generate_low_coverage_bed(ch_depths) @@ -93,7 +99,9 @@ workflow { // Collect multi-sample outputs if (params.collect_outputs) { fastp.out.fastp_csv.map{ it -> it[1] }.collectFile(keepHeader: true, sort: { it.text }, name: "${params.collected_outputs_prefix}_fastp.csv", storeDir: "${params.outdir}") - qualimap_bamqc.out.alignment_qc.map{ it -> it[1] }.collectFile(keepHeader: true, sort: { it.text }, name: "${params.collected_outputs_prefix}_qualimap_alignment_qc.csv", storeDir: "${params.outdir}") + qualimap_bamqc.out.alignment_qc.map{ it -> it[2] }.collectFile(keepHeader: true, sort: { it.text }, name: "${params.collected_outputs_prefix}_qualimap_alignment_qc.csv", storeDir: "${params.outdir}") + samtools_stats.out.stats_summary_csv.map{ it -> it[2] }.collectFile(keepHeader: true, sort: { it.text }, name: "${params.collected_outputs_prefix}_samtools_stats_summary.csv", storeDir: "${params.outdir}") + combine_alignment_qc.out.map{ it -> it[2] }.collectFile(keepHeader: true, sort: { it.text }, name: "${params.collected_outputs_prefix}_combined_alignment_qc.csv", storeDir: "${params.outdir}") } // Collect Provenance @@ -112,6 +120,7 @@ workflow { ch_provenance = ch_provenance.join(minimap2.out.provenance).map{ it -> [it[0], it[1] << it[2]] } ch_provenance = ch_provenance.join(qualimap_bamqc.out.provenance).map{ it -> [it[0], it[1] << it[2]] } ch_provenance = ch_provenance.join(samtools_mpileup.out.provenance).map{ it -> [it[0], it[1] << it[2]] } + ch_provenance = ch_provenance.join(samtools_stats.out.provenance).map{ it -> [it[0], it[1] << it[2]] } ch_provenance = ch_provenance.join(freebayes.out.provenance).map{ it -> [it[0], it[1] << it[2]] } collect_provenance(ch_provenance) diff --git a/modules/alignment_variants.nf b/modules/alignment_variants.nf index a6c798d..36246a7 100644 --- a/modules/alignment_variants.nf +++ b/modules/alignment_variants.nf @@ -27,7 +27,7 @@ process bwa_mem { tuple val(sample_id), path(reads_1), path(reads_2), path(ref), path(ref_index) output: - tuple val(sample_id), path("${sample_id}_${short_long}.{bam,bam.bai}"), val(short_long), emit: alignment + tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}.{bam,bam.bai}"), emit: alignment tuple val(sample_id), path(ref), emit: ref tuple val(sample_id), path("${sample_id}_bwa_mem_provenance.yml"), emit: provenance @@ -89,7 +89,7 @@ process minimap2 { tuple val(sample_id), path(reads), path(ref), path(ref_index) output: - tuple val(sample_id), path("${sample_id}_${short_long}.{bam,bam.bai}"), val(short_long), emit: alignment + tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}.{bam,bam.bai}"), emit: alignment tuple val(sample_id), path("${sample_id}_minimap2_provenance.yml"), emit: provenance script: @@ -140,12 +140,12 @@ process qualimap_bamqc { publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_${short_long}_qualimap_report.pdf" input: - tuple val(sample_id), file(alignment), val(short_long) + tuple val(sample_id), val(short_long), file(alignment) output: - tuple val(sample_id), path("${sample_id}_${short_long}_qualimap_alignment_qc.csv"), emit: alignment_qc - tuple val(sample_id), path("${sample_id}_${short_long}_qualimap_report.pdf"), emit: report - tuple val(sample_id), path("${sample_id}_${short_long}_qualimap_genome_results.txt"), emit: genome_results + tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_qualimap_alignment_qc.csv"), emit: alignment_qc + tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_qualimap_report.pdf"), emit: report + tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_qualimap_genome_results.txt"), emit: genome_results tuple val(sample_id), path("${sample_id}_${short_long}_qualimap_bamqc_provenance.yml"), emit: provenance script: @@ -182,6 +182,72 @@ process qualimap_bamqc { } +process samtools_stats { + + tag { sample_id + ' / ' + short_long } + + publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_${short_long}_samtools_stats*" + + input: + tuple val(sample_id), val(short_long), path(alignment) + + output: + tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_samtools_stats.txt"), emit: stats + tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_samtools_stats_summary.txt"), emit: stats_summary + tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_samtools_stats_summary.csv"), emit: stats_summary_csv + tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_samtools_stats_insert_sizes.tsv"), emit: insert_sizes + tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_samtools_stats_coverage_distribution.tsv"), emit: coverage_distribution + tuple val(sample_id), path("${sample_id}_${short_long}_samtools_stats_provenance.yml"), emit: provenance + + script: + """ + printf -- "- process_name: samtools_stats\\n" >> ${sample_id}_${short_long}_samtools_stats_provenance.yml + printf -- " tools:\\n" >> ${sample_id}_${short_long}_samtools_stats_provenance.yml + printf -- " - tool_name: samtools\\n" >> ${sample_id}_${short_long}_samtools_stats_provenance.yml + printf -- " tool_version: \$(samtools --version | head -n 1 | cut -d ' ' -f 2)\\n" >> ${sample_id}_${short_long}_samtools_stats_provenance.yml + printf -- " subcommand: stats\\n" >> ${sample_id}_${short_long}_samtools_stats_provenance.yml + + samtools stats \ + --threads ${task.cpus} \ + ${alignment[0]} > ${sample_id}_${short_long}_samtools_stats.txt + + grep '^SN' ${sample_id}_${short_long}_samtools_stats.txt | cut -f 2- > ${sample_id}_${short_long}_samtools_stats_summary.txt + + parse_samtools_stats_summary.py -i ${sample_id}_${short_long}_samtools_stats_summary.txt -s ${sample_id} > ${sample_id}_${short_long}_samtools_stats_summary.csv + + echo "insert_size,pairs_total,inward_oriented_pairs,outward_oriented_pairs,other_pairs" | tr ',' '\t' > ${sample_id}_${short_long}_samtools_stats_insert_sizes.tsv + grep '^IS' ${sample_id}_${short_long}_samtools_stats.txt | cut -f 2- >> ${sample_id}_${short_long}_samtools_stats_insert_sizes.tsv + + echo "coverage,depth" | tr ',' '\t' > ${sample_id}_${short_long}_samtools_stats_coverage_distribution.tsv + grep '^COV' ${sample_id}_${short_long}_samtools_stats.txt | cut -f 2- >> ${sample_id}_${short_long}_samtools_stats_coverage_distribution.tsv + """ +} + + +process combine_alignment_qc { + + tag { sample_id + ' / ' + short_long } + + publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_${short_long}_combined_alignment_qc.csv" + + input: + tuple val(sample_id), val(short_long), path(qualimap_genome_results_csv), path(samtools_stats_summary_csv) + + output: + tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_combined_alignment_qc.csv") + + script: + """ + combine_alignment_qc.py \ + --sample-id ${sample_id} \ + --read-type ${short_long} \ + --qualimap-bamqc-genome-results ${qualimap_genome_results_csv} \ + --samtools-stats-summary ${samtools_stats_summary_csv} \ + > ${sample_id}_${short_long}_combined_alignment_qc.csv + """ +} + + process samtools_mpileup { tag { sample_id + ' / ' + short_long } @@ -189,10 +255,10 @@ process samtools_mpileup { publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_${short_long}_depths.tsv" input: - tuple val(sample_id), path(alignment), val(short_long), path(ref) + tuple val(sample_id), val(short_long), path(alignment), path(ref) output: - tuple val(sample_id), path("${sample_id}_${short_long}_depths.tsv"), val(short_long), emit: depths + tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_depths.tsv"), emit: depths tuple val(sample_id), path("${sample_id}_${short_long}_samtools_mpileup_provenance.yml"), emit: provenance script: @@ -230,7 +296,7 @@ process generate_low_coverage_bed { publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_${short_long}_low_coverage_regions.bed" input: - tuple val(sample_id), path(depths), val(short_long) + tuple val(sample_id), val(short_long), path(depths) output: tuple val(sample_id), path("${sample_id}_${short_long}_low_coverage_regions.bed") @@ -251,7 +317,7 @@ process percent_coverage_by_depth { publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_${short_long}_percent_coverage_by_depth.csv" input: - tuple val(sample_id), path(depths), val(short_long) + tuple val(sample_id), val(short_long), path(depths) output: tuple val(sample_id), path("${sample_id}_${short_long}_percent_coverage_by_depth.csv") @@ -274,7 +340,7 @@ process freebayes { publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_${short_long}_freebayes.vcf" input: - tuple val(sample_id), path(alignment), val(short_long), path(ref) + tuple val(sample_id), val(short_long), path(alignment), path(ref) output: tuple val(sample_id), path("${sample_id}_${short_long}_freebayes.vcf"), emit: variants diff --git a/nextflow.config b/nextflow.config index 6790356..61558a3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -4,7 +4,7 @@ manifest { description = 'Alignment and variant calling pipeline' mainScript = 'main.nf' nextflowVersion = '>=20.01.0' - version = '0.1.3' + version = '0.1.4' } params {