BCCDC-PHL · jpalmer37 · Oct 28, 2024 · Oct 29, 2024 · Oct 29, 2024
diff --git a/bin/convert_gb_to_fa.py b/bin/convert_gb_to_fa.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python3
+from Bio import SeqIO
+import argparse
+
+def convert(input_file, output_file):
+    SeqIO.convert(input_file, "genbank", output_file, "fasta")
+
+def get_args():
+    parser = argparse.ArgumentParser(description="Convert Genbank files to FASTA format", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('-i', '--input', required=True, help='Input Genbank file') 
+    parser.add_argument('-o', '--output', required=True,help='Output FASTA file')
+    return parser.parse_args()
+
+def main():
+    args = get_args()
+    convert(args.input, args.output)
+
+if __name__ == '__main__':
+    main()
diff --git a/bin/plot-coverage.py b/bin/plot-coverage.py
@@ -0,0 +1,226 @@
+#!/usr/bin/env python3
+
+import argparse
+import csv
+import json
+import logging
+import os
+import subprocess
+
+from pathlib import Path
+from typing import Optional
+
+import matplotlib
+import matplotlib.pyplot as plt
+import matplotlib.ticker as mtick
+import pandas as pd
+import seaborn as sns
+
+from matplotlib.patches import Rectangle
+
+def parse_bed(bed_path: Path) -> dict:
+    """
+    Parse BED file
+
+    :param bed: BED file
+    :type bed: Path
+    :return: Dictionary of genes by name
+    :rtype: dict
+    """
+    genes_by_name = {}
+    with open(bed_path, 'r') as f:
+        for line in f:
+            line_split = line.strip().split('\t')
+            chrom = line_split[0]
+            start = int(line_split[1])
+            end = int(line_split[2])
+            name = line_split[3]
+            strand = line_split[5]
+            genes_by_name[name] = {
+                'name': name,
+                'chrom': chrom,
+                'start': start,
+                'end': end,
+                'strand': strand,
+            }
+
+    return genes_by_name
+
+
+def parse_depths(pileup_path: Path) -> list[dict]:
+    """
+    Parse pileup file
+
+    :param pileup_path: Pileup file
+    :type pileup_path: Path
+    :return: List of dictionaries with the keys chrom, pos, ref, and depth
+    :rtype: list[dict]
+    """
+    depths = []
+    num_positions = 0
+    previous_chrom = None
+    with open(pileup_path, 'r') as f:
+        for line in f:
+            num_positions += 1
+            line_split = line.strip().split("\t")
+            chrom = line_split[0]
+            pos = int(line_split[1])
+            depth = int(line_split[2])
+            output = {
+                "chrom": chrom,
+                "pos": pos,
+                "depth": depth
+            }
+            depths.append(output)
+            if previous_chrom is None:
+                previous_chrom = chrom
+            elif previous_chrom != chrom:
+                num_positions = 0
+                previous_chrom = chrom
+            if num_positions % 100_000 == 0:
+                logging.info(f"Processed {num_positions:,} positions from {chrom}...")
+
+    return depths
+
+
+def plot_coverage(depths: pd.DataFrame, bed: dict, sample_name: str="Sample", threshold: int=10, y_limit: int=500, log_scale: bool=False, width_inches_per_mb=3, height_inches_per_chrom=2) -> sns.FacetGrid:
+    """
+    Plot coverage
+
+    :param depths: DataFrame of depths (columns: chrom, pos, depth)
+    :type depths: pandas.DataFrame
+    :param bed: Dictionary of bed regions by name
+    :type bed: dict
+    :param sample_name: Sample name
+    :type sample_name: str
+    :param threshold: Threshold for depth of coverage
+    :type threshold: int
+    :param y_limit: Maximum y-axis value in plot (default: 500)
+    :type y_limit: int
+    :param log_scale: Log scale y-axis (default: False)
+    :type log_scale: bool
+    :param width_inches_per_mb: Width inches per megabase (default: 12)
+    :type width_inches_per_mb: float
+    :param height_inches_per_chrom: Height inches per chromosome (default: 8)
+    :type height_inches_per_chrom: float
+    :return: Seaborn FacetGrid object
+    :rtype: seaborn.FacetGrid
+    """
+    try:
+        percent_coverage_above_threshold = (
+            sum(1 if x > threshold else 0 for x in depths.depth)
+            / depths.shape[0]
+            * 100
+        )
+        logging.info(f"Percent bases with coverage above {threshold}X: {percent_coverage_above_threshold: .1f}%")
+    except ZeroDivisionError:
+        percent_coverage_above_threshold = 0
+
+    # Filter out any chroms with fewer than 100 positions
+    chrom_counts = depths.chrom.value_counts()
+    chroms_to_keep = chrom_counts[chrom_counts >= 100].index
+    depths = depths[depths.chrom.isin(chroms_to_keep)]
+    num_chroms = depths.chrom.nunique()
+    longest_chrom = depths.chrom.value_counts().idxmax()
+    longest_chrom_length = depths[depths.chrom == longest_chrom].pos.max()
+    longest_chrom_length_mb = longest_chrom_length / 1_000_000
+    sns.set_style("whitegrid")
+    dpi = 300
+    width_pixels = round(longest_chrom_length_mb * width_inches_per_mb * dpi)
+    height_pixels = round(height_inches_per_chrom * num_chroms * dpi)
+    logging.info(f"Plot size: {width_pixels} x {height_pixels} pixels")
+    try:
+        g = sns.FacetGrid(depths, row="chrom", height=height_inches_per_chrom, aspect=(width_inches_per_mb * longest_chrom_length_mb)/height_inches_per_chrom, gridspec_kws={"hspace": 0.5})
+    except ValueError as e:
+        logging.error(e)
+        logging.error("Try reducing the width_inches_per_mb and height_inches_per_chrom parameters.")
+        exit(1)
+    g.fig.set_size_inches(longest_chrom_length_mb * width_inches_per_mb, height_inches_per_chrom * num_chroms)
+    g.map(sns.lineplot, "pos", "depth", linewidth=0.5)
+    g.map(plt.axhline, y=threshold, color="red", linestyle="--", linewidth=0.5)
+    g.set_xlabels("Position")
+    for ax in g.axes.flatten():
+        ax.tick_params(labelbottom=True)
+    g.set(xlim=(0, longest_chrom_length))
+    if log_scale:
+        g.set(yscale="log")
+        g.set(ylim=(1, y_limit))
+        g.set_ylabels("Depth of coverage (log scale)")
+    else:
+        g.set(ylim=(0, y_limit))
+        g.set_ylabels("Depth of coverage")
+    # set titles for each facet with chrom name
+    g.set_titles(row_template="{row_name}")
+
+    # line_plot.set_xticks(range(0, depths.shape[0], 100000), minor=True)
+
+    plt.close()
+
+    return g
+
+
+
+def main(args):
+
+    logging.basicConfig(
+        format="%(asctime)s %(levelname)s: %(message)s",
+        level=logging.INFO,
+        datefmt='%Y-%m-%dT%H:%M:%S'
+    )
+
+    all_depths_df = pd.read_csv(args.depths, sep="\t")
+    # Check if depths are empty
+    if all_depths_df.empty:
+        logging.error("No depth of coverage data found in input file. Exiting...")
+        exit(0)
+
+    logging.info("Calculating tumbling window depths")
+    # create an empty dataframe to fill with tumbling window depths:
+    tumbling_window_depths = pd.DataFrame(columns=["chrom", "pos", "depth"])
+    for chrom, group in all_depths_df.groupby("chrom"):
+        # for each chromosome, iterate over the depths in tumbling windows:
+        for i in range(0, len(group), args.window):
+            # get a slice of the dataframe for the tumbling window:
+            window = group.iloc[i:i+args.window]
+            # calculate the middle position of the window:
+            middle_pos = window.iloc[int(len(window)/2)].pos
+            # calculate the mean depth of the window:
+            mean_depth = window.depth.mean()
+            record = {
+                "chrom": chrom,
+                "pos": middle_pos,
+                "depth": mean_depth
+            }
+            # Avoid FutureWarning about empty or all-NA entries:
+            if len(tumbling_window_depths) == 0:
+                tumbling_window_depths = pd.DataFrame([record])
+            else:
+                tumbling_window_depths = pd.concat([tumbling_window_depths, pd.DataFrame([record])], ignore_index=True)
+    tumbling_window_depths = tumbling_window_depths.reset_index(drop=True)
+
+    bed = {}
+    if args.bed is not None:
+        bed = parse_bed(args.bed)
+
+    logging.info("Plotting coverage...")
+
+    coverage_plot = plot_coverage(tumbling_window_depths, bed, args.sample_id, args.threshold, args.window, args.log_scale, args.width_inches_per_mb, args.height_inches_per_chrom)
+    logging.info(f"Saving plot to {args.output}...")
+    coverage_plot.savefig(args.output, dpi=100)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Plot coverage')
+    parser.add_argument('-d', '--depths', help='Input .bam file')
+    parser.add_argument('-r', '--ref', help='Input ref .fasta file', required=True)
+    parser.add_argument('--bed', type=Path, help='bed file (optional)')
+    parser.add_argument('-s', '--sample-id', default="Sample", help='Sample ID')
+    parser.add_argument('-t', '--threshold', type=int, default=10, help='Threshold for depth of coverage')
+    parser.add_argument('--window', type=int, default=1000, help='Tumbling window for coverage')
+    parser.add_argument('--log-scale', action='store_true', help='Log scale y-axis')
+    parser.add_argument('--width-inches-per-mb', type=float, default=3, help='Width inches per megabase (default: 3)')
+    parser.add_argument('--height-inches-per-chrom', type=float, default=2, help='Height inches per chromosome (default: 2)')
+    parser.add_argument('--y-limit', type=int, default=500, help='Maximum y-axis value in plot (default: 500)')
+    parser.add_argument('-o', '--output', default="./coverage.png", help='Output file')
+    args = parser.parse_args()
+    main(args)
diff --git a/environments/environment.yml b/environments/environment.yml
@@ -2,10 +2,13 @@ name: snippy-nf
 channels:
   - conda-forge
   - bioconda
-  - defaults
 dependencies:
   - python=3
   - csvkit=1.0.5
   - snippy=4.6.0
   - fastp=0.20.1
   - qualimap=2.2.2d
+  - pandas=2.2.3
+  - seaborn=0.13.2
+  - matplotlib=3.9.1
+  - biopython=1.84
diff --git a/main.nf b/main.nf
@@ -6,6 +6,9 @@ include { fastp }          from './modules/snippy.nf'
 include { snippy }         from './modules/snippy.nf'
 include { count_variants } from './modules/snippy.nf'
 include { qualimap_bamqc } from './modules/snippy.nf'
+include { samtools_mpileup } from './modules/snippy.nf'
+include { plot_coverage } from './modules/snippy.nf'
+include { convert_gb_to_fa } from './modules/snippy.nf'
 
 workflow {
   ch_ref = Channel.fromPath( "${params.ref}", type: 'file')
@@ -21,4 +24,13 @@ workflow {
     snippy(fastp.out.reads.combine(ch_ref))
     qualimap_bamqc(snippy.out.alignment)
     count_variants(snippy.out.variants_csv.combine(ch_ref))
+
+    if (file(params.ref).name =~ /.+\.[Gg]b.*/){
+      ch_ref_fa = convert_gb_to_fa(ch_ref)
+    }else {
+      ch_ref_fa = ch_ref
+    }
+
+    samtools_mpileup(snippy.out.alignment.combine(ch_ref_fa))
+    plot_coverage(samtools_mpileup.out.depths.combine(ch_ref_fa))
 }
diff --git a/modules/snippy.nf b/modules/snippy.nf
@@ -133,3 +133,88 @@ process count_variants {
   paste -d ',' sample_id ref_id num_snps num_indel num_mnp num_complex >> ${sample_id}_variant_counts.csv
   """
 }
+
+process convert_gb_to_fa {
+    publishDir "${params.outdir}/reference", mode: 'copy', pattern: "*fa"
+
+    input:
+    path(reference)
+
+    output:
+    path("reference.fa"), emit: ref
+
+    script:
+    """
+    convert_gb_to_fa.py --input ${reference} --output reference.fa
+    """
+}
+
+process samtools_mpileup {
+
+    tag { sample_id  }
+
+    publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_depths.tsv"
+
+    input:
+    tuple val(sample_id), path(alignment), path(alignment_index), path(ref)
+
+    output:
+    tuple val(sample_id), path("${sample_id}_depths.tsv"), emit: depths
+    tuple val(sample_id), path("${sample_id}_samtools_mpileup_provenance.yml"), emit: provenance
+
+    script:
+    """
+    printf -- "- process_name: samtools_mpileup\\n" >> ${sample_id}_samtools_mpileup_provenance.yml
+    printf -- "  tools:\\n"                         >> ${sample_id}_samtools_mpileup_provenance.yml
+    printf -- "    - tool_name: samtools\\n"        >> ${sample_id}_samtools_mpileup_provenance.yml
+    printf -- "      tool_version: \$(samtools --version | head -n 1 | cut -d ' ' -f 2)\\n" >> ${sample_id}_samtools_mpileup_provenance.yml
+    printf -- "      subcommand: mpileup\\n"        >> ${sample_id}_samtools_mpileup_provenance.yml
+    printf -- "      parameters:\\n"                >> ${sample_id}_samtools_mpileup_provenance.yml
+    printf -- "        - parameter: -a\\n"          >> ${sample_id}_samtools_mpileup_provenance.yml
+    printf -- "          value: null\\n"            >> ${sample_id}_samtools_mpileup_provenance.yml
+    printf -- "        - parameter: --min-BQ\\n"    >> ${sample_id}_samtools_mpileup_provenance.yml
+    printf -- "          value: 0\\n"               >> ${sample_id}_samtools_mpileup_provenance.yml
+    printf -- "        - parameter: --count-orphans\\n" >> ${sample_id}_samtools_mpileup_provenance.yml
+    printf -- "          value: null\\n"                >> ${sample_id}_samtools_mpileup_provenance.yml
+
+    samtools faidx ${ref}
+
+    printf "chrom\tpos\tref\tdepth\n" > ${sample_id}_depths.tsv
+
+    samtools mpileup -a \
+    --fasta-ref ${ref} \
+    --min-BQ 0 \
+    --count-orphans \
+    ${alignment[0]} | cut -f 1-4 >> ${sample_id}_depths.tsv
+    """
+}
+
+process plot_coverage {
+
+    tag { sample_id  }
+
+    publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_coverage.png"
+
+    input:
+    tuple val(sample_id), path(depths), path(ref)
+
+    output:
+    tuple val(sample_id), path("${sample_id}_coverage.png"), optional: true
+
+    script:
+    log_scale = params.coverage_plot_log_scale ? "--log-scale" : ""
+    """
+    plot-coverage.py \
+    --sample-id ${sample_id} \
+    --ref ${ref} \
+    --depths ${depths} \
+    --threshold ${params.min_depth} \
+    --y-limit ${params.coverage_plot_y_limit} \
+    --width-inches-per-mb ${params.coverage_plot_width_inches_per_mb} \
+    --height-inches-per-chrom ${params.coverage_plot_height_inches_per_chrom} \
+    --window ${params.coverage_plot_window_size} \
+    ${log_scale} \
+    --output ${sample_id}_coverage.png
+
+    """
+}
diff --git a/nextflow.config b/nextflow.config
@@ -21,6 +21,15 @@ params {
   outdir = 'results'
   pipeline_short_name = parsePipelineName(manifest.toMap().get('name'))
   pipeline_minor_version = parseMinorVersion(manifest.toMap().get('version'))
+
+  // plotting parameters 
+  min_depth = 10
+  coverage_by_depth_limit = 500
+  coverage_plot_y_limit = 500
+  coverage_plot_log_scale = false
+  coverage_plot_width_inches_per_mb = 3
+  coverage_plot_height_inches_per_chrom = 2
+  coverage_plot_window_size = 1000
 }
 
 def parseMinorVersion(version) {