Skip to content

Commit

Permalink
Add coverage plot (#27)
Browse files Browse the repository at this point in the history
* Add coverage plot

* Check for empty alignment

* Add more support for customizing coverage plot
  • Loading branch information
dfornika authored Jul 5, 2024
1 parent ae43481 commit ec4f531
Show file tree
Hide file tree
Showing 5 changed files with 302 additions and 5 deletions.
226 changes: 226 additions & 0 deletions bin/plot-coverage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
#!/usr/bin/env python3

import argparse
import csv
import json
import logging
import os
import subprocess

from pathlib import Path
from typing import Optional

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import pandas as pd
import seaborn as sns

from matplotlib.patches import Rectangle

def parse_bed(bed_path: Path) -> dict:
    """
    Parse a BED file into a dictionary of regions keyed by region name.

    :param bed_path: Path to BED file (tab-delimited; at least 3 columns:
        chrom, start, end; column 4 (name) and column 6 (strand) are optional)
    :type bed_path: Path
    :return: Dictionary of regions by name; each value has the keys
        name, chrom, start, end, strand
    :rtype: dict
    """
    genes_by_name = {}
    with open(bed_path, 'r') as f:
        for line in f:
            line = line.strip()
            # Skip blank lines and track/browser/comment header lines
            if not line or line.startswith(('#', 'track', 'browser')):
                continue
            line_split = line.split('\t')
            chrom = line_split[0]
            start = int(line_split[1])
            end = int(line_split[2])
            # Columns 4 (name) and 6 (strand) are optional in the BED spec;
            # fall back to a positional name and '.' strand when absent, so
            # 3- or 4-column BED files no longer raise IndexError.
            name = line_split[3] if len(line_split) > 3 else f"{chrom}:{start}-{end}"
            strand = line_split[5] if len(line_split) > 5 else '.'
            genes_by_name[name] = {
                'name': name,
                'chrom': chrom,
                'start': start,
                'end': end,
                'strand': strand,
            }

    return genes_by_name


def parse_depths(pileup_path: Path) -> list[dict]:
    """
    Parse a samtools-depth-style file (tab-delimited: chrom, pos, depth).

    :param pileup_path: Path to depths file
    :type pileup_path: Path
    :return: List of dictionaries with the keys chrom, pos, and depth
    :rtype: list[dict]
    """
    depths = []
    num_positions = 0
    previous_chrom = None
    with open(pileup_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            line_split = line.split("\t")
            chrom = line_split[0]
            if chrom != previous_chrom:
                # Restart the per-chromosome progress counter BEFORE counting
                # this line: the current line is position 1 of the new
                # chromosome. (Previously the reset happened after the
                # increment, which both miscounted the first position and
                # triggered a spurious "Processed 0 positions" log message,
                # since 0 % 100_000 == 0.)
                num_positions = 0
                previous_chrom = chrom
            num_positions += 1
            depths.append({
                "chrom": chrom,
                "pos": int(line_split[1]),
                "depth": int(line_split[2]),
            })
            if num_positions % 100_000 == 0:
                logging.info(f"Processed {num_positions:,} positions from {chrom}...")

    return depths


def plot_coverage(depths: pd.DataFrame, bed: dict, sample_name: str="Sample", threshold: int=10, y_limit: int=500, log_scale: bool=False, width_inches_per_mb=3, height_inches_per_chrom=2) -> sns.FacetGrid:
    """
    Plot depth of coverage as one line-plot facet per chromosome.

    :param depths: DataFrame of depths (columns: chrom, pos, depth)
    :type depths: pandas.DataFrame
    :param bed: Dictionary of bed regions by name.
        NOTE(review): currently unused in this function body.
    :type bed: dict
    :param sample_name: Sample name.
        NOTE(review): currently unused in this function body.
    :type sample_name: str
    :param threshold: Threshold for depth of coverage; drawn as a dashed red
        horizontal line on each facet (default: 10)
    :type threshold: int
    :param y_limit: Maximum y-axis value in plot (default: 500)
    :type y_limit: int
    :param log_scale: Log scale y-axis (default: False)
    :type log_scale: bool
    :param width_inches_per_mb: Width inches per megabase (default: 3)
    :type width_inches_per_mb: float
    :param height_inches_per_chrom: Height inches per chromosome (default: 2)
    :type height_inches_per_chrom: float
    :return: Seaborn FacetGrid object (the figure is closed before returning;
        callers save it via FacetGrid.savefig)
    :rtype: seaborn.FacetGrid
    """
    # Log the fraction of positions above the threshold; an empty DataFrame
    # makes depths.shape[0] zero, which is caught below.
    try:
        percent_coverage_above_threshold = (
            sum(1 if x > threshold else 0 for x in depths.depth)
            / depths.shape[0]
            * 100
        )
        logging.info(f"Percent bases with coverage above {threshold}X: {percent_coverage_above_threshold: .1f}%")
    except ZeroDivisionError:
        percent_coverage_above_threshold = 0

    # Filter out any chroms with fewer than 100 positions
    chrom_counts = depths.chrom.value_counts()
    chroms_to_keep = chrom_counts[chrom_counts >= 100].index
    depths = depths[depths.chrom.isin(chroms_to_keep)]
    num_chroms = depths.chrom.nunique()
    # Size the figure from the longest chromosome so all facets share one x-scale
    longest_chrom = depths.chrom.value_counts().idxmax()
    longest_chrom_length = depths[depths.chrom == longest_chrom].pos.max()
    longest_chrom_length_mb = longest_chrom_length / 1_000_000
    sns.set_style("whitegrid")
    dpi = 300
    # Pixel dimensions are computed for the log message only; very large
    # figures can make FacetGrid raise ValueError below.
    width_pixels = round(longest_chrom_length_mb * width_inches_per_mb * dpi)
    height_pixels = round(height_inches_per_chrom * num_chroms * dpi)
    logging.info(f"Plot size: {width_pixels} x {height_pixels} pixels")
    try:
        g = sns.FacetGrid(depths, row="chrom", height=height_inches_per_chrom, aspect=(width_inches_per_mb * longest_chrom_length_mb)/height_inches_per_chrom, gridspec_kws={"hspace": 0.5})
    except ValueError as e:
        logging.error(e)
        logging.error("Try reducing the width_inches_per_mb and height_inches_per_chrom parameters.")
        exit(1)
    g.fig.set_size_inches(longest_chrom_length_mb * width_inches_per_mb, height_inches_per_chrom * num_chroms)
    # One coverage line per chromosome, plus a dashed red line at the threshold
    g.map(sns.lineplot, "pos", "depth", linewidth=0.5)
    g.map(plt.axhline, y=threshold, color="red", linestyle="--", linewidth=0.5)
    g.set_xlabels("Position")
    # Show x tick labels on every facet, not only the bottom one
    for ax in g.axes.flatten():
        ax.tick_params(labelbottom=True)
    g.set(xlim=(0, longest_chrom_length))
    if log_scale:
        # A log-scaled axis cannot include 0, so the y-axis starts at 1
        g.set(yscale="log")
        g.set(ylim=(1, y_limit))
        g.set_ylabels("Depth of coverage (log scale)")
    else:
        g.set(ylim=(0, y_limit))
        g.set_ylabels("Depth of coverage")
    # set titles for each facet with chrom name
    g.set_titles(row_template="{row_name}")

    # line_plot.set_xticks(range(0, depths.shape[0], 100000), minor=True)

    # Close the pyplot-managed figure; the returned FacetGrid keeps a reference
    plt.close()

    return g



def main(args):
    """
    Compute tumbling-window mean depths from a samtools depth table and
    save a per-chromosome coverage plot.

    :param args: Parsed command-line arguments (depths, bed, sample_id,
        threshold, window, y_limit, log_scale, width_inches_per_mb,
        height_inches_per_chrom, output)
    :type args: argparse.Namespace
    """

    logging.basicConfig(
        format="%(asctime)s %(levelname)s: %(message)s",
        level=logging.INFO,
        datefmt='%Y-%m-%dT%H:%M:%S'
    )

    all_depths_df = pd.read_csv(args.depths, sep="\t")
    # Check if depths are empty. Exit with status 0 so the pipeline treats
    # the missing plot as an optional output rather than a failure.
    if all_depths_df.empty:
        logging.error("No depth of coverage data found in input file. Exiting...")
        exit(0)

    logging.info("Calculating tumbling window depths")
    # Collect one record per window, then build the DataFrame in a single
    # step. This avoids the quadratic cost of growing a DataFrame with
    # repeated pd.concat calls, and sidesteps the FutureWarning about
    # concatenating empty/all-NA frames that the old code worked around.
    window_records = []
    for chrom, group in all_depths_df.groupby("chrom"):
        # for each chromosome, iterate over the depths in tumbling windows:
        for i in range(0, len(group), args.window):
            window = group.iloc[i:i + args.window]
            # represent the window by its middle position and mean depth:
            middle_pos = window.iloc[len(window) // 2].pos
            mean_depth = window.depth.mean()
            window_records.append({
                "chrom": chrom,
                "pos": middle_pos,
                "depth": mean_depth,
            })
    tumbling_window_depths = pd.DataFrame(window_records, columns=["chrom", "pos", "depth"])

    bed = {}
    if args.bed is not None:
        bed = parse_bed(args.bed)

    logging.info("Plotting coverage...")

    # Pass plot options by keyword. BUG FIX: the previous positional call
    # passed args.window where plot_coverage expects y_limit, so the
    # --y-limit option was silently ignored and the window size was used
    # as the y-axis limit.
    coverage_plot = plot_coverage(
        tumbling_window_depths,
        bed,
        sample_name=args.sample_id,
        threshold=args.threshold,
        y_limit=args.y_limit,
        log_scale=args.log_scale,
        width_inches_per_mb=args.width_inches_per_mb,
        height_inches_per_chrom=args.height_inches_per_chrom,
    )
    logging.info(f"Saving plot to {args.output}...")
    coverage_plot.savefig(args.output, dpi=100)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Plot coverage')
    # Help text fix: the input is a tab-delimited depths table (samtools
    # depth output: chrom, pos, depth), not a .bam file — main() reads it
    # with pd.read_csv(..., sep="\t").
    parser.add_argument('-d', '--depths', help='Input depths file (tab-delimited: chrom, pos, depth)')
    parser.add_argument('-r', '--ref', help='Input ref .fasta file', required=True)
    parser.add_argument('--bed', type=Path, help='bed file (optional)')
    parser.add_argument('-s', '--sample-id', default="Sample", help='Sample ID')
    parser.add_argument('-t', '--threshold', type=int, default=10, help='Threshold for depth of coverage')
    parser.add_argument('--window', type=int, default=1000, help='Tumbling window for coverage')
    parser.add_argument('--log-scale', action='store_true', help='Log scale y-axis')
    parser.add_argument('--width-inches-per-mb', type=float, default=3, help='Width inches per megabase (default: 3)')
    parser.add_argument('--height-inches-per-chrom', type=float, default=2, help='Height inches per chromosome (default: 2)')
    parser.add_argument('--y-limit', type=int, default=500, help='Maximum y-axis value in plot (default: 500)')
    parser.add_argument('-o', '--output', default="./coverage.png", help='Output file')
    args = parser.parse_args()
    main(args)
9 changes: 9 additions & 0 deletions environments/plot_coverage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Conda environment for the plot_coverage process (selected via the
# withName: plot_coverage block in nextflow.config).
name: alignment-variants-plot-coverage
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- python=3
# seaborn is the plotting dependency used by bin/plot-coverage.py;
# presumably it pulls in matplotlib and pandas transitively — verify.
- seaborn=0.13.2
- samtools=1.20
31 changes: 27 additions & 4 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ include { samtools_stats } from './modules/alignment_variants.nf
include { combine_alignment_qc } from './modules/alignment_variants.nf'
include { generate_low_coverage_bed } from './modules/alignment_variants.nf'
include { percent_coverage_by_depth } from './modules/alignment_variants.nf'
include { plot_coverage } from './modules/alignment_variants.nf'
include { pipeline_provenance } from './modules/provenance.nf'
include { collect_provenance } from './modules/provenance.nf'

Expand Down Expand Up @@ -99,12 +100,34 @@ workflow {

percent_coverage_by_depth(ch_depths)

plot_coverage(ch_depths.join(ch_ref))

// Collect multi-sample outputs
if (params.collect_outputs) {
fastp.out.fastp_csv.map{ it -> it[1] }.collectFile(keepHeader: true, sort: { it.text }, name: "${params.collected_outputs_prefix}_fastp.csv", storeDir: "${params.outdir}")
qualimap_bamqc.out.alignment_qc.map{ it -> it[2] }.collectFile(keepHeader: true, sort: { it.text }, name: "${params.collected_outputs_prefix}_qualimap_alignment_qc.csv", storeDir: "${params.outdir}")
samtools_stats.out.stats_summary_csv.map{ it -> it[2] }.collectFile(keepHeader: true, sort: { it.text }, name: "${params.collected_outputs_prefix}_samtools_stats_summary.csv", storeDir: "${params.outdir}")
combine_alignment_qc.out.map{ it -> it[2] }.collectFile(keepHeader: true, sort: { it.text }, name: "${params.collected_outputs_prefix}_combined_alignment_qc.csv", storeDir: "${params.outdir}")
fastp.out.fastp_csv.map{ it -> it[1] }.collectFile(
keepHeader: true,
sort: { it.text },
name: "${params.collected_outputs_prefix}_fastp.csv",
storeDir: "${params.outdir}"
)
qualimap_bamqc.out.alignment_qc.map{ it -> it[2] }.collectFile(
keepHeader: true,
sort: { it.text },
name: "${params.collected_outputs_prefix}_qualimap_alignment_qc.csv",
storeDir: "${params.outdir}"
)
samtools_stats.out.stats_summary_csv.map{ it -> it[2] }.collectFile(
keepHeader: true,
sort: { it.text },
name: "${params.collected_outputs_prefix}_samtools_stats_summary.csv",
storeDir: "${params.outdir}"
)
combine_alignment_qc.out.map{ it -> it[2] }.collectFile(
keepHeader: true,
sort: { it.text },
name: "${params.collected_outputs_prefix}_combined_alignment_qc.csv",
storeDir: "${params.outdir}"
)
}

// Collect Provenance
Expand Down
31 changes: 31 additions & 0 deletions modules/alignment_variants.nf
Original file line number Diff line number Diff line change
Expand Up @@ -403,3 +403,34 @@ process freebayes {
> ${sample_id}_${short_long}_freebayes.vcf
"""
}


// Generate a per-sample, per-chromosome depth-of-coverage plot from a
// depths table using bin/plot-coverage.py. The output is declared
// optional because the script exits without producing a plot when the
// depths input is empty.
process plot_coverage {

tag { sample_id + ' / ' + short_long }

publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_${short_long}_coverage.png"

input:
tuple val(sample_id), val(short_long), path(depths), path(ref)

output:
// optional: true — no .png is emitted when the input has no coverage data
tuple val(sample_id), path("${sample_id}_${short_long}_coverage.png"), optional: true

script:
// --log-scale is a boolean flag: include it only when the param is set
log_scale = params.coverage_plot_log_scale ? "--log-scale" : ""
"""
plot-coverage.py \
--sample-id ${sample_id} \
--ref ${ref} \
--depths ${depths} \
--threshold ${params.min_depth} \
--y-limit ${params.coverage_plot_y_limit} \
--width-inches-per-mb ${params.coverage_plot_width_inches_per_mb} \
--height-inches-per-chrom ${params.coverage_plot_height_inches_per_chrom} \
--window ${params.coverage_plot_window_size} \
${log_scale} \
--output ${sample_id}_${short_long}_coverage.png
"""
}
10 changes: 9 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ manifest {
description = 'Alignment and variant calling pipeline'
mainScript = 'main.nf'
nextflowVersion = '>=20.01.0'
version = '0.1.9'
version = '0.1.10'
}

params {
Expand All @@ -27,6 +27,11 @@ params {
min_alternate_fraction_for_variant_calling = 0.1
qualimap_coverage_histogram_limit = 100
coverage_by_depth_limit = 500
coverage_plot_y_limit = 500
coverage_plot_log_scale = false
coverage_plot_width_inches_per_mb = 3
coverage_plot_height_inches_per_chrom = 2
coverage_plot_window_size = 1000
qualimap_memory = '4G'
skip_alignment_cleaning = false
align_long_reads = false
Expand Down Expand Up @@ -79,4 +84,7 @@ process {
cpus = 16
memory = '36G'
}
withName: plot_coverage {
conda = "$baseDir/environments/plot_coverage.yml"
}
}

0 comments on commit ec4f531

Please sign in to comment.