diff --git a/bin/make_pileup.py b/bin/make_pileup.py new file mode 100755 index 0000000..be25fea --- /dev/null +++ b/bin/make_pileup.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 +import argparse +import pysam + +def write_pileup(bamfile_path, output_path, window_size=10, min_base_quality=0, min_mapping_quality=0): + """ + Generate pileup statistics for each position in each reference sequence from a BAM file. + + Args: + bamfile_path (str): Path to the BAM file + output_path (str): Path to output TSV file + window_size (int): Size of sliding window + min_base_quality (int): Minimum base quality to consider (default: 0) + min_mapping_quality (int): Minimum mapping quality to consider (default: 0) + plot_path (str, optional): Path to save the plot(s) + plot_mode (str): 'facet' for single stacked plot, 'separate' for individual plots + """ + + # Prepare a list to collect data for DataFrame and plotting + pileup_data = [] + + with pysam.AlignmentFile(bamfile_path, "rb") as bamfile, open(output_path, 'w') as outfile: + outfile.write("\t".join(["contig", "position", "depth", "A", "C", "G", "T", "del", "N", 'base', 'max_freq', 'window']) + '\n') + + # Iterate through each reference sequence in the BAM file + for reference in bamfile.references: + sliding_window = [] + # Generate pileup for current reference sequence + for plup_column in bamfile.pileup( + reference, + ignore_orphans=False, + min_base_quality=min_base_quality, + min_mapping_quality=min_mapping_quality + ): + freqs = {'A': 0, 'T': 0, 'G': 0, 'C': 0, '-': 0, 'N': 0} + + # Count bases at current position + for plup_read in plup_column.pileups: + if plup_read.is_del: + freqs['-'] += 1 + else: + base = plup_read.alignment.query_sequence[plup_read.query_position] + freqs[base] += 1 + + # Calculate depth (excluding refskips) + base_freqs = [freqs[base] for base in "ACGT-"] + depth = sum(base_freqs) + top_base = max(freqs, key=lambda x : freqs.get(x)) + max_freq = max(base_freqs) / depth if depth != 0 else 0 + max_freq = round(max_freq, 2) + + if len(sliding_window) > int(window_size): + sliding_window.pop(0) + + # Prepare output for TSV + outputs = [ + plup_column.reference_name, + plup_column.pos + 1, # 1-based coordinates + depth, + freqs['A'], + freqs['C'], + freqs['G'], + freqs['T'], + freqs['-'], + freqs['N'], + top_base, + max_freq, + "".join(sliding_window) + ] + outputs = [str(x) for x in outputs] + + outfile.write("\t".join(outputs) + '\n') + sliding_window.append(top_base) + + # Collect data for DataFrame + pileup_data.append({ + 'contig': plup_column.reference_name, + 'position': plup_column.pos + 1, + 'max_freq': max_freq, + 'window': "".join(sliding_window), + 'top_base': top_base, + 'depth': depth + }) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-i', '--input', required=True, help='BAM file of reads aligned to reference') + parser.add_argument('-b', '--min-base-qual', type=int, default=0, help='Minimum base quality to include base in pileup. Default: 0') + parser.add_argument('-m', '--min-map-qual', type=int, default=0, help='Minimum mapping quality to include a read in pileup. Default: 0') + parser.add_argument('-w', '--window-size', type=int, default=10, help='Number of bases to display in the sliding window column. Default: 10') + parser.add_argument('-o', '--output', required=True, help='Output path of pileup in TSV format') + + return parser.parse_args() + +if __name__ == '__main__': + args = parse_args() + write_pileup( + args.input, + args.output, + args.window_size, + args.min_base_qual, + args.min_map_qual + ) diff --git a/bin/plot_pileup.py b/bin/plot_pileup.py new file mode 100755 index 0000000..573479c --- /dev/null +++ b/bin/plot_pileup.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 +import seaborn as sns +import pandas as pd +import matplotlib.pyplot as plt +import argparse +import os + +def load_pileup(pileup_path): + df = pd.read_csv(pileup_path, sep='\t') + return df + +def create_facet_plot(pileup_df, plot_path): + """ + Create dot plot(s) of max_freq across genome positions for multiple contigs. + + Args: + pileup_df (DataFrame): Pandas DataFrame containing pileup information + plot_path (str): Path to save the plot(s) + sample_name (str): Name of the BAM file for plot title + plot_mode (str): 'facet' for single stacked plot, 'separate' for individual plots + """ + # Convert to DataFrame + + # Get unique contigs + contigs = pileup_df['contig'].unique() + + plt.figure(figsize=(20, 4 * len(contigs))) # Adjust height based on number of contigs + + for i, contig in enumerate(contigs, 1): + plt.subplot(len(contigs), 1, i) + + contig_data = pileup_df[pileup_df['contig'] == contig] + + # Separate normal and low-frequency points + normal_points = contig_data[contig_data['max_freq'] >= 0.9] + low_freq_points = contig_data[contig_data['max_freq'] < 0.9] + + # Plot normal points + plt.scatter(normal_points['position'], normal_points['max_freq'], + alpha=0.3, color='gray', s=10) + + # Plot and label low-frequency points with annotations + for _, row in low_freq_points.iterrows(): + plt.scatter(row['position'], row['max_freq'], + color='red', s=50, edgecolors='black', linewidth=1, zorder=5) + plt.annotate( + f"Pos: {row['position']}\nFreq: {row['max_freq']:.2f}\nDepth: {row['depth']}\nWindow: {row['window']}", + (row['position'], row['max_freq']), + xytext=(10, 10), + textcoords='offset points', + fontsize=8, + bbox=dict(boxstyle="round,pad=0.3", fc="yellow", ec="black", lw=1, alpha=0.7), + arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=0.2") + ) + + plt.title(f'Max Base Frequency - {contig}', fontsize=10) + plt.xlabel('Genome Position') + plt.ylabel('Max Base Frequency') + plt.ylim(0.2, 1.02) + plt.grid(True, linestyle='--', alpha=0.7) + + plt.tight_layout() + plt.savefig(plot_path, dpi=300, bbox_inches='tight') + plt.close() + +def create_separate_plots(pileup_df, plot_path, sample_name): + + def extract_contig_name(contig): + """ + Extract contig name between first and second '|' characters. + If '|' is not found twice, return the original contig name. + """ + parts = contig.split('|') + return parts[1] if len(parts) >= 2 else contig + + # Create separate plots for each contig + contigs = pileup_df['contig'].unique() + + for contig in contigs: + print(contig) + plt.figure(figsize=(20, 6)) + + contig_data = pileup_df[pileup_df['contig'] == contig] + + # Separate normal and low-frequency points + normal_points = contig_data[contig_data['max_freq'] >= 0.9] + low_freq_points = contig_data[contig_data['max_freq'] < 0.9] + + # Plot normal points + plt.scatter(normal_points['position'], normal_points['max_freq'], + alpha=0.3, color='gray', s=10) + + # Plot and label low-frequency points with annotations + for _, row in low_freq_points.iterrows(): + plt.scatter(row['position'], row['max_freq'], + color='red', s=50, edgecolors='black', linewidth=1, zorder=5) + plt.annotate( + f"Pos: {row['position']}\nFreq: {row['max_freq']:.2f}\nDepth: {row['depth']}\nWindow: {row['window']}", + (row['position'], row['max_freq']), + xytext=(10, 10), + textcoords='offset points', + fontsize=8, + bbox=dict(boxstyle="round,pad=0.3", fc="yellow", ec="black", lw=1, alpha=0.7), + arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=0.2") + ) + + # Extract simplified contig name + simple_contig_name = extract_contig_name(contig) + + plt.title(f'Max Base Frequency Across {simple_contig_name}\n{sample_name}', fontsize=16) + plt.xlabel('Genome Position', fontsize=12) + plt.ylabel('Max Base Frequency', fontsize=12) + plt.ylim(0.2, 1.02) + plt.grid(True, linestyle='--', alpha=0.7) + + # Save individual contig plot with unique filename using simplified contig name + individual_plot_path = plot_path.replace('.png', f'_{simple_contig_name}.png') + plt.tight_layout() + plt.savefig(individual_plot_path, dpi=300, bbox_inches='tight') + plt.close() + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-i', '--input', required=True, help='Pileup in TSV format') + parser.add_argument('-o', '--output', required=True, help='Output path for dot plot PNG') + parser.add_argument('-m', '--plot-mode', choices=['facet', 'separate'], default='facet', + help='Plot mode: "facet" for stacked plot, "separate" for individual contig plots. Default: facet') + + return parser.parse_args() + +if __name__ == '__main__': + args = parse_args() + + sample_name = os.path.basename(args.input).split("_")[0] + + pileup_df = load_pileup(args.input) + + if args.plot_mode == 'facet': + print("Generating facet plot...") + create_facet_plot(pileup_df, args.output) + + else: + print("Generating separate plots...") + create_separate_plots(pileup_df, args.output, sample_name) + + diff --git a/environments/environment.yml b/environments/environment.yml index a3c01e4..628195c 100644 --- a/environments/environment.yml +++ b/environments/environment.yml @@ -8,3 +8,4 @@ dependencies: - multiqc=1.15 - fastqc=0.12.1 - genoflu=1.05 + - pysam=0.22 diff --git a/main.nf b/main.nf index c52d108..8326759 100644 --- a/main.nf +++ b/main.nf @@ -21,6 +21,8 @@ include { snp_calling } from './modules/snp_calling.nf' include { pull_genoflu } from './modules/genoflu.nf' include { checkout_genoflu } from './modules/genoflu.nf' include { genoflu } from './modules/genoflu.nf' +include { make_pileup } from './modules/pileup.nf' +include { plot_pileup } from './modules/pileup.nf' // prints to the screen and to the log @@ -93,6 +95,9 @@ workflow { // Run FluViewer fluviewer(normalize_depth.out.normalized_reads.combine(ch_db)) + make_pileup(fluviewer.out.alignment) + plot_pileup(make_pileup.out.pileup) + //Collect al the relevant filesfor multiqc ch_fastqc_collected = fastqc.out.zip.map{ it -> [it[1], it[2]]}.collect() multiqc(fastp.out.json.mix( cutadapt.out.log, ch_fastqc_collected ).collect().ifEmpty([]) ) diff --git a/modules/fluviewer.nf b/modules/fluviewer.nf index d0c705f..decd9bd 100644 --- a/modules/fluviewer.nf +++ b/modules/fluviewer.nf @@ -45,8 +45,7 @@ process fluviewer { tuple val(sample_id), path(reads_1), path(reads_2), path(db) output: - tuple val(sample_id), path("${sample_id}*.bam"), emit: alignment - tuple val(sample_id), path("${sample_id}*.bam.bai"), emit: alignmentindex, optional: true + tuple val(sample_id), path("${sample_id}*.bam"), path("${sample_id}*.bam.bai"), emit: alignment tuple val(sample_id), path("${sample_id}*report.tsv"), emit: reports, optional: true tuple val(sample_id), path("${sample_id}*_consensus.fa"), emit: consensus_seqs, optional: true tuple val(sample_id), path("${sample_id}_HA_consensus.fa"), emit: ha_consensus_seq, optional: true diff --git a/modules/pileup.nf b/modules/pileup.nf new file mode 100644 index 0000000..7a4b9bd --- /dev/null +++ b/modules/pileup.nf @@ -0,0 +1,40 @@ +process make_pileup { + + tag { sample_id } + + publishDir "${params.outdir}/${sample_id}", pattern: "*pileup.tsv", mode:'copy' + + input: + tuple val(sample_id), path(bam), path(bam_index) + + output: + tuple val(sample_id), path("${sample_id}_pileup.tsv"), emit: pileup + + script: + """ + make_pileup.py --input ${bam} --output ${sample_id}_pileup.tsv + """ +} + +process plot_pileup { + + tag { sample_id } + + conda "${projectDir}/environments/fluviewer.yml" + + publishDir "${params.outdir}/${sample_id}/plots", pattern: "*png", mode:'copy' + + input: + tuple val(sample_id), path(pileup) + + output: + tuple val(sample_id), path("${sample_id}_*combined.png"), emit: combined + tuple val(sample_id), path("${sample_id}_*{NA,HA,PB2,PB1,PA,NS,M,NP}.png"), emit: separate + + script: + """ + plot_pileup.py --input ${pileup} --output ${sample_id}_pileup_combined.png -m facet + + plot_pileup.py --input ${pileup} --output ${sample_id}_pileup.png -m separate + """ +} \ No newline at end of file