From 85b5f748f2d5b69c57b47dc8d676264cde3d5f71 Mon Sep 17 00:00:00 2001
From: skchronicles <kuhnsa3@gmail.com>
Date: Tue, 4 Jun 2024 17:20:38 -0400
Subject: [PATCH] Adding last step to visualize coverage along the genome

---
 workflow/Snakefile           |  4 +++
 workflow/envs/mpox.yaml      |  1 +
 workflow/rules/visualize.smk | 58 +++++++++++++++++++++++++++++++++++-
 3 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index ca2a33b..4731492 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -136,6 +136,10 @@ rule all:
             join(workpath, "{name}", "bams", "{name}.raw_coverage.bw"),
             name=provided(samples, plot_coverage)
         ),
+        expand(
+            join(workpath, "{name}", "plots", "{name}.coverage_plot.pdf"),
+            name=provided(samples, plot_coverage)
+        ),
 
 
 # Import rules 
diff --git a/workflow/envs/mpox.yaml b/workflow/envs/mpox.yaml
index ccad65a..ed460fb 100644
--- a/workflow/envs/mpox.yaml
+++ b/workflow/envs/mpox.yaml
@@ -15,3 +15,4 @@ dependencies:
  - gawk
  - deeptools
  - samtools
+ - pygenometracks 
\ No newline at end of file
diff --git a/workflow/rules/visualize.smk b/workflow/rules/visualize.smk
index eb76e5a..6c4516f 100644
--- a/workflow/rules/visualize.smk
+++ b/workflow/rules/visualize.smk
@@ -22,7 +22,7 @@ rule bigwig:
         cpm_bw = join(workpath, "{name}", "bams", "{name}.cpm_normalized.bw"),
         raw_bw = join(workpath, "{name}", "bams", "{name}.raw_coverage.bw"),
     params:
-        rname  = 'bamcoverage',
+        rname  = 'bigwig',
     conda: depending(conda_yaml_or_named_env, use_conda)
     container: depending(config['images']['mpox-seek'], use_singularity)
     shell: 
@@ -47,3 +47,59 @@ rule bigwig:
             -bs 1 \\
             -p 1
         """
+
+
+# Data processing rules to visualize
+# coverage along the genome
+rule plot_coverage:
+    """
+    Data visualization step to plot the coverage of the reads along the genome
+    for each sample. The raw and normalized bigwig files will be plotted for each
+    sample. The normalized bigwig allows for comparison of coverage across samples,
+    while the raw bigwig file can be used to visualize the raw read coverage of each
+    sample to view the observed sequencing depth along the genome.
+    @Input:
+        CPM normalized bigwig file
+        Raw coverage bigwig file
+    @Output:
+        Coverage PNG plot
+    """
+    input:
+        raw_bw = join(workpath, "{name}", "bams", "{name}.raw_coverage.bw"),
+    output:
+        ini = join(workpath, "{name}", "plots", "{name}.coverage_plot.ini"),
+        pdf = join(workpath, "{name}", "plots", "{name}.coverage_plot.pdf"),
+    params:
+        rname  = 'plotcoverage',
+        ref_fa  = config['references']['mpox_pcr_sequence'],
+    conda: depending(conda_yaml_or_named_env, use_conda)
+    container: depending(config['images']['mpox-seek'], use_singularity)
+    shell: 
+        """
+        # Create a config/ini file for 
+        # DeepTools pyGenomeTracks:
+        # https://pygenometracks.readthedocs.io/en/latest/index.html
+        make_tracks_file \\
+            --trackFiles {input.raw_bw} \\
+            -o {output.ini}
+
+        # Update config file to increase 
+        # height and number of bins and 
+        # remove suffix from track name
+        sed -i 's/\.raw_coverage$//g' \\
+            {output.ini}
+        sed -i 's/height = 2/height = 4/g' \\
+            {output.ini}
+        sed -i 's/number_of_bins = 700/number_of_bins = 1000/g' \\
+            {output.ini}
+
+        # Plot coverage along the genome,
+        # pyGenomeTracks requires a region
+        # is provided, so we will use the
+        # full length of the reference genome
+        region=$(samtools faidx {params.ref_fa} -o /dev/stdout | head -1 | awk -F '\\t' '{{print $1":1-"$2}}')
+        pyGenomeTracks \\
+            --tracks {output.ini} \\
+            -o {output.pdf} \\
+            --region "$region"
+        """
\ No newline at end of file