Skip to content

Commit

Permalink
try out click version
Browse files Browse the repository at this point in the history
  • Loading branch information
gordonkoehn committed Sep 13, 2024
1 parent 59dbfa3 commit d4f05f8
Show file tree
Hide file tree
Showing 2 changed files with 237 additions and 1 deletion.
236 changes: 236 additions & 0 deletions scripts/amplicon_covs_click.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
import click
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns


def get_samples_paths(main_samples_path, samplestsv):
sam_paths_list = []
with open(samplestsv, "r") as f:
for line in f:
tmp = line.rstrip("\n").split("\t")
sam_paths_list.append(
main_samples_path
+ "/"
+ tmp[0]
+ "/"
+ tmp[1]
+ "/alignments/coverage.tsv.gz"
)
return sam_paths_list


def load_bedfile(bed):
bedfile = pd.read_table(bed, header=None)
bedfile["sense"] = [re.search("(LEFT|RIGHT)", i).group(1) for i in bedfile[3]]
bedfile["primer_num"] = [
int(re.search("_([0-9]+)_", i).group(1)) for i in bedfile[3]
]
bedfile["pool"] = [
int(re.search("([1-2])$", i).group(1)) for i in bedfile[4].astype("str")
]
bedfile = bedfile[[re.search("alt", i) is None for i in bedfile[3]]]
return bedfile


def make_amplicons_df(bedfile):
amplicons = []
for i in np.unique(bedfile["primer_num"]):
pr_num = i
seq_start = bedfile[
(bedfile["primer_num"] == pr_num) & (bedfile["sense"] == "LEFT")
][2].values[0]
primer_start = bedfile[
(bedfile["primer_num"] == pr_num) & (bedfile["sense"] == "LEFT")
][1].values[0]
seq_end = bedfile[
(bedfile["primer_num"] == pr_num) & (bedfile["sense"] == "RIGHT")
][1].values[0]
primer_end = bedfile[
(bedfile["primer_num"] == pr_num) & (bedfile["sense"] == "RIGHT")
][2].values[0]
pool = bedfile[bedfile["primer_num"] == pr_num]["pool"].values[1]
amplicons.append([pool, pr_num, primer_start, seq_start, seq_end, primer_end])

amplicons_df = pd.DataFrame(
np.array(amplicons),
columns=[
"pool",
"primer_num",
"primer_start",
"seq_start",
"seq_end",
"primer_end",
],
)

q_starts = []
q_stops = []
for i in range(amplicons_df.shape[0]):
if i > 0:
query_start = amplicons_df.iloc[i - 1]["primer_end"] + 5
else:
query_start = amplicons_df.iloc[i]["primer_start"]
if i < amplicons_df.shape[0] - 1:
query_stop = amplicons_df.iloc[i + 1]["primer_start"] - 5
else:
query_stop = amplicons_df.iloc[i]["seq_end"]
q_starts.append(query_start)
q_stops.append(query_stop)
amplicons_df["query_start"] = q_starts
amplicons_df["query_end"] = q_stops
return amplicons_df


def get_amplicon_cov(cov_df, start, stop, length=20):
amplicon_slice = cov_df.iloc[np.r_[start:length, (stop - length) : stop], [2]]
return np.median(amplicon_slice)


def get_count_reads(cov_df, amplicons_df):
cov = amplicons_df.apply(
lambda x: get_amplicon_cov(cov_df, x["query_start"], x["query_end"]), axis=1
)
return cov


def make_cov_heatmap(cov_df, output=None):
plt.figure(figsize=(15, 8 * 2.5))
split_at = round(cov_df.shape[0] / 2)
plt.subplot(1, 2, 1)
ax = sns.heatmap(
cov_df.iloc[0:split_at, 1:],
cmap="Reds",
vmin=0,
square=True,
cbar_kws={"shrink": 0.2, "anchor": (0.0, 0.8)},
)
sns.heatmap(
cov_df.iloc[0:split_at, 1:],
cmap=plt.get_cmap("binary"),
vmin=0,
vmax=2,
mask=cov_df.iloc[0:split_at, 1:] > 0,
cbar=False,
ax=ax,
)
plt.xlabel("amplicon")
plt.ylabel("sample")
plt.title("Samples 0:{}".format(split_at))
plt.subplot(1, 2, 2)
ax = sns.heatmap(
cov_df.iloc[split_at:, 1:],
cmap="Reds",
vmin=0,
square=True,
cbar_kws={"shrink": 0.2, "anchor": (0.0, 0.8)},
)
sns.heatmap(
cov_df.iloc[split_at:, 1:],
cmap=plt.get_cmap("binary"),
vmin=0,
vmax=2,
mask=cov_df.iloc[split_at:, 1:] > 0,
cbar=False,
ax=ax,
)
plt.xlabel("amplicon")
plt.ylabel("sample")
plt.title("Samples {}:{}".format(split_at, cov_df.shape[0] - 1))
if output is not None:
plt.savefig(output)


@click.command()
@click.option(
"-r",
"--bedfile-addr",
required=True,
type=click.Path(exists=True),
help="Bedfile of the articV3 primers.",
)
@click.option(
"-s",
"--samp-file",
default="/cluster/project/pangolin/working/samples.tsv",
type=click.Path(exists=True),
help="TSV file like samples.tsv.",
)
@click.option(
"-f",
"--samp-path",
default="/cluster/project/pangolin/working/samples",
type=click.Path(exists=True),
help="Main path to samples.",
)
@click.option(
"-o", "--outdir", default=os.getcwd(), type=click.Path(), help="Output directory."
)
@click.option("-p", "--makeplots", is_flag=True, help="Output plots.")
@click.option("-v", "--verbose", is_flag=True, help="Verbose output.")
def main(bedfile_addr, samp_file, samp_path, outdir, makeplots, verbose):
"""
Script to calculate primer rebalancings according to november 2020 version 5
of the ARTIC V3 protocol for sars-cov-2 sequencing.
"""
if not os.path.exists(outdir):
os.makedirs(outdir)

if verbose:
click.echo("Loading primers bedfile.")
amplicons_df = make_amplicons_df(load_bedfile(bedfile_addr))

if verbose:
click.echo("Reading list of coverage files.")
sam_list = get_samples_paths(samp_path, samp_file)

if verbose:
click.echo("Loading and parsing coverage files.")
all_covs = []
indexes = []
with click.progressbar(sam_list, label="Parsing coverage files") as bar:
for sam in bar:
try:
temp_cov_df = pd.read_csv(sam, sep="\t", compression="gzip")
temp_frac_read_df = pd.DataFrame(
get_count_reads(temp_cov_df, amplicons_df)
).T
indexes.append(sam.split("/")[-4])
all_covs.append(temp_frac_read_df)
except FileNotFoundError:
if verbose:
click.echo(f"WARNING: file {sam} not found.")

all_covs = pd.concat(all_covs, axis=0)
all_covs = all_covs.reset_index(drop=True)
all_covs_frac = all_covs.div(all_covs.sum(axis=1), axis=0)

all_covs = pd.concat(
[pd.DataFrame({"sample": indexes}), all_covs.reset_index(drop=True)],
axis=1,
ignore_index=False,
)
all_covs_frac = pd.concat(
[pd.DataFrame({"sample": indexes}), all_covs_frac.reset_index(drop=True)],
axis=1,
ignore_index=False,
)

if verbose:
click.echo("Outputting .csv's")
all_covs.to_csv(os.path.join(outdir, "amplicons_coverages.csv"), index=False)
all_covs_frac.to_csv(
os.path.join(outdir, "amplicons_coverages_norm.csv"), index=False
)

if makeplots:
if verbose:
click.echo("Outputting plots.")
make_cov_heatmap(all_covs, os.path.join(outdir, "cov_heatmap.pdf"))


if __name__ == "__main__":
main()
2 changes: 1 addition & 1 deletion workflows/amplicon_cov.smk
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ rule relative_amplicon_coverage_allbatches:
shell:
"""
mkdir -p {params.output_dir}
python ../scripts/amplicon_covs.py \
python ../scripts/amplicon_covs_click.py \
-s {input.sample_list} \
-f {input.samples} \
-r {params.primers_fp} \
Expand Down

0 comments on commit d4f05f8

Please sign in to comment.