Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add pileup process to pipeline #49

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 104 additions & 0 deletions bin/make_pileup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#!/usr/bin/env python3
import argparse
import pysam

def write_pileup(bamfile_path, output_path, window_size=10, min_base_quality=0, min_mapping_quality=0):
"""
Generate pileup statistics for each position in each reference sequence from a BAM file.

Args:
bamfile_path (str): Path to the BAM file
output_path (str): Path to output TSV file
window_size (int): Size of sliding window
min_base_quality (int): Minimum base quality to consider (default: 0)
min_mapping_quality (int): Minimum mapping quality to consider (default: 0)
plot_path (str, optional): Path to save the plot(s)
plot_mode (str): 'facet' for single stacked plot, 'separate' for individual plots
"""

# Prepare a list to collect data for DataFrame and plotting
pileup_data = []

with pysam.AlignmentFile(bamfile_path, "rb") as bamfile, open(output_path, 'w') as outfile:
outfile.write("\t".join(["contig", "position", "depth", "A", "C", "G", "T", "del", "N", 'base', 'max_freq', 'window']) + '\n')

# Iterate through each reference sequence in the BAM file
for reference in bamfile.references:
sliding_window = []
# Generate pileup for current reference sequence
for plup_column in bamfile.pileup(
reference,
ignore_orphans=False,
min_base_quality=min_base_quality,
min_mapping_quality=min_mapping_quality
):
freqs = {'A': 0, 'T': 0, 'G': 0, 'C': 0, '-': 0, 'N': 0}

# Count bases at current position
for plup_read in plup_column.pileups:
if plup_read.is_del:
freqs['-'] += 1
else:
base = plup_read.alignment.query_sequence[plup_read.query_position]
freqs[base] += 1

# Calculate depth (excluding refskips)
base_freqs = [freqs[base] for base in "ACGT-"]
depth = sum(base_freqs)
top_base = max(freqs, key=lambda x : freqs.get(x))
max_freq = max(base_freqs) / depth if depth != 0 else 0
max_freq = round(max_freq, 2)

if len(sliding_window) > int(window_size):
sliding_window.pop(0)

# Prepare output for TSV
outputs = [
plup_column.reference_name,
plup_column.pos + 1, # 1-based coordinates
depth,
freqs['A'],
freqs['C'],
freqs['G'],
freqs['T'],
freqs['-'],
freqs['N'],
top_base,
max_freq,
"".join(sliding_window)
]
outputs = [str(x) for x in outputs]

outfile.write("\t".join(outputs) + '\n')
sliding_window.append(top_base)

# Collect data for DataFrame
pileup_data.append({
'contig': plup_column.reference_name,
'position': plup_column.pos + 1,
'max_freq': max_freq,
'window': "".join(sliding_window),
'top_base': top_base,
'depth': depth
})


def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', required=True, help='BAM file of reads aligned to reference')
parser.add_argument('-b', '--min-base-qual', type=int, default=0, help='Minimum base quality to include base in pileup. Default: 0')
parser.add_argument('-m', '--min-map-qual', type=int, default=0, help='Minimum mapping quality to include a read in pileup. Default: 0')
parser.add_argument('-w', '--window-size', type=int, default=10, help='Number of bases to display in the sliding window column. Default: 10')
parser.add_argument('-o', '--output', required=True, help='Output path of pileup in TSV format')

return parser.parse_args()

if __name__ == '__main__':
args = parse_args()
write_pileup(
args.input,
args.output,
args.window_size,
args.min_base_qual,
args.min_map_qual
)
147 changes: 147 additions & 0 deletions bin/plot_pileup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
#!/usr/bin/env python3
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import argparse
import os

def load_pileup(pileup_path):
df = pd.read_csv(pileup_path, sep='\t')
return df

def create_facet_plot(pileup_df, plot_path):
"""
Create dot plot(s) of max_freq across genome positions for multiple contigs.

Args:
pileup_df (DataFrame): Pandas DataFrame containing pileup information
plot_path (str): Path to save the plot(s)
sample_name (str): Name of the BAM file for plot title
plot_mode (str): 'facet' for single stacked plot, 'separate' for individual plots
"""
# Convert to DataFrame

# Get unique contigs
contigs = pileup_df['contig'].unique()

plt.figure(figsize=(20, 4 * len(contigs))) # Adjust height based on number of contigs

for i, contig in enumerate(contigs, 1):
plt.subplot(len(contigs), 1, i)

contig_data = pileup_df[pileup_df['contig'] == contig]

# Separate normal and low-frequency points
normal_points = contig_data[contig_data['max_freq'] >= 0.9]
low_freq_points = contig_data[contig_data['max_freq'] < 0.9]

# Plot normal points
plt.scatter(normal_points['position'], normal_points['max_freq'],
alpha=0.3, color='gray', s=10)

# Plot and label low-frequency points with annotations
for _, row in low_freq_points.iterrows():
plt.scatter(row['position'], row['max_freq'],
color='red', s=50, edgecolors='black', linewidth=1, zorder=5)
plt.annotate(
f"Pos: {row['position']}\nFreq: {row['max_freq']:.2f}\nDepth: {row['depth']}\nWindow: {row['window']}",
(row['position'], row['max_freq']),
xytext=(10, 10),
textcoords='offset points',
fontsize=8,
bbox=dict(boxstyle="round,pad=0.3", fc="yellow", ec="black", lw=1, alpha=0.7),
arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=0.2")
)

plt.title(f'Max Base Frequency - {contig}', fontsize=10)
plt.xlabel('Genome Position')
plt.ylabel('Max Base Frequency')
plt.ylim(0.2, 1.02)
plt.grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.close()

def create_separate_plots(pileup_df, plot_path, sample_name):

def extract_contig_name(contig):
"""
Extract contig name between first and second '|' characters.
If '|' is not found twice, return the original contig name.
"""
parts = contig.split('|')
return parts[1] if len(parts) >= 2 else contig

# Create separate plots for each contig
contigs = pileup_df['contig'].unique()

for contig in contigs:
print(contig)
plt.figure(figsize=(20, 6))

contig_data = pileup_df[pileup_df['contig'] == contig]

# Separate normal and low-frequency points
normal_points = contig_data[contig_data['max_freq'] >= 0.9]
low_freq_points = contig_data[contig_data['max_freq'] < 0.9]

# Plot normal points
plt.scatter(normal_points['position'], normal_points['max_freq'],
alpha=0.3, color='gray', s=10)

# Plot and label low-frequency points with annotations
for _, row in low_freq_points.iterrows():
plt.scatter(row['position'], row['max_freq'],
color='red', s=50, edgecolors='black', linewidth=1, zorder=5)
plt.annotate(
f"Pos: {row['position']}\nFreq: {row['max_freq']:.2f}\nDepth: {row['depth']}\nWindow: {row['window']}",
(row['position'], row['max_freq']),
xytext=(10, 10),
textcoords='offset points',
fontsize=8,
bbox=dict(boxstyle="round,pad=0.3", fc="yellow", ec="black", lw=1, alpha=0.7),
arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=0.2")
)

# Extract simplified contig name
simple_contig_name = extract_contig_name(contig)

plt.title(f'Max Base Frequency Across {simple_contig_name}\n{sample_name}', fontsize=16)
plt.xlabel('Genome Position', fontsize=12)
plt.ylabel('Max Base Frequency', fontsize=12)
plt.ylim(0.2, 1.02)
plt.grid(True, linestyle='--', alpha=0.7)

# Save individual contig plot with unique filename using simplified contig name
individual_plot_path = plot_path.replace('.png', f'_{simple_contig_name}.png')
plt.tight_layout()
plt.savefig(individual_plot_path, dpi=300, bbox_inches='tight')
plt.close()


def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', required=True, help='Pileup in TSV format')
parser.add_argument('-o', '--output', required=True, help='Output path for dot plot PNG')
parser.add_argument('-m', '--plot-mode', choices=['facet', 'separate'], default='facet',
help='Plot mode: "facet" for stacked plot, "separate" for individual contig plots. Default: facet')

return parser.parse_args()

if __name__ == '__main__':
args = parse_args()

sample_name = os.path.basename(args.input).split("_")[0]

pileup_df = load_pileup(args.input)

if args.plot_mode == 'facet':
print("Generating facet plot...")
create_facet_plot(pileup_df, args.output)

else:
print("Generating separate plots...")
create_separate_plots(pileup_df, args.output, sample_name)


1 change: 1 addition & 0 deletions environments/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ dependencies:
- multiqc=1.15
- fastqc=0.12.1
- genoflu=1.05
- pysam=0.22
5 changes: 5 additions & 0 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ include { snp_calling } from './modules/snp_calling.nf'
include { pull_genoflu } from './modules/genoflu.nf'
include { checkout_genoflu } from './modules/genoflu.nf'
include { genoflu } from './modules/genoflu.nf'
include { make_pileup } from './modules/pileup.nf'
include { plot_pileup } from './modules/pileup.nf'


// prints to the screen and to the log
Expand Down Expand Up @@ -93,6 +95,9 @@ workflow {
// Run FluViewer
fluviewer(normalize_depth.out.normalized_reads.combine(ch_db))

make_pileup(fluviewer.out.alignment)
plot_pileup(make_pileup.out.pileup)

//Collect al the relevant filesfor multiqc
ch_fastqc_collected = fastqc.out.zip.map{ it -> [it[1], it[2]]}.collect()
multiqc(fastp.out.json.mix( cutadapt.out.log, ch_fastqc_collected ).collect().ifEmpty([]) )
Expand Down
3 changes: 1 addition & 2 deletions modules/fluviewer.nf
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,7 @@ process fluviewer {
tuple val(sample_id), path(reads_1), path(reads_2), path(db)

output:
tuple val(sample_id), path("${sample_id}*.bam"), emit: alignment
tuple val(sample_id), path("${sample_id}*.bam.bai"), emit: alignmentindex, optional: true
tuple val(sample_id), path("${sample_id}*.bam"), path("${sample_id}*.bam.bai"), emit: alignment
tuple val(sample_id), path("${sample_id}*report.tsv"), emit: reports, optional: true
tuple val(sample_id), path("${sample_id}*_consensus.fa"), emit: consensus_seqs, optional: true
tuple val(sample_id), path("${sample_id}_HA_consensus.fa"), emit: ha_consensus_seq, optional: true
Expand Down
40 changes: 40 additions & 0 deletions modules/pileup.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
process make_pileup {

tag { sample_id }

publishDir "${params.outdir}/${sample_id}", pattern: "*pileup.tsv", mode:'copy'

input:
tuple val(sample_id), path(bam), path(bam_index)

output:
tuple val(sample_id), path("${sample_id}_pileup.tsv"), emit: pileup

script:
"""
make_pileup.py --input ${bam} --output ${sample_id}_pileup.tsv
"""
}

process plot_pileup {

tag { sample_id }

conda "${projectDir}/environments/fluviewer.yml"

publishDir "${params.outdir}/${sample_id}/plots", pattern: "*png", mode:'copy'

input:
tuple val(sample_id), path(pileup)

output:
tuple val(sample_id), path("${sample_id}_*combined.png"), emit: combined
tuple val(sample_id), path("${sample_id}_*{NA,HA,PB2,PB1,PA,NS,M,NP}.png"), emit: separate

script:
"""
plot_pileup.py --input ${pileup} --output ${sample_id}_pileup_combined.png -m facet

plot_pileup.py --input ${pileup} --output ${sample_id}_pileup.png -m separate
"""
}
Loading