From d4f05f8176fb163bc785b671e3263c3c520b21a3 Mon Sep 17 00:00:00 2001
From: Gordon J. Köhn
Date: Fri, 13 Sep 2024 10:18:47 +0200
Subject: [PATCH] try out click version

---
 scripts/amplicon_covs_click.py | 236 +++++++++++++++++++++++++++++++++
 workflows/amplicon_cov.smk     |   2 +-
 2 files changed, 237 insertions(+), 1 deletion(-)
 create mode 100644 scripts/amplicon_covs_click.py

diff --git a/scripts/amplicon_covs_click.py b/scripts/amplicon_covs_click.py
new file mode 100644
index 0000000..bf7f8f6
--- /dev/null
+++ b/scripts/amplicon_covs_click.py
@@ -0,0 +1,236 @@
+import click
+import numpy as np
+import pandas as pd
+import os
+import re
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+
+def get_samples_paths(main_samples_path, samplestsv):
+    sam_paths_list = []
+    with open(samplestsv, "r") as f:
+        for line in f:
+            tmp = line.rstrip("\n").split("\t")
+            sam_paths_list.append(
+                main_samples_path
+                + "/"
+                + tmp[0]
+                + "/"
+                + tmp[1]
+                + "/alignments/coverage.tsv.gz"
+            )
+    return sam_paths_list
+
+
+def load_bedfile(bed):
+    bedfile = pd.read_table(bed, header=None)
+    bedfile["sense"] = [re.search("(LEFT|RIGHT)", i).group(1) for i in bedfile[3]]
+    bedfile["primer_num"] = [
+        int(re.search("_([0-9]+)_", i).group(1)) for i in bedfile[3]
+    ]
+    bedfile["pool"] = [
+        int(re.search("([1-2])$", i).group(1)) for i in bedfile[4].astype("str")
+    ]
+    bedfile = bedfile[[re.search("alt", i) is None for i in bedfile[3]]]
+    return bedfile
+
+
+def make_amplicons_df(bedfile):
+    amplicons = []
+    for i in np.unique(bedfile["primer_num"]):
+        pr_num = i
+        seq_start = bedfile[
+            (bedfile["primer_num"] == pr_num) & (bedfile["sense"] == "LEFT")
+        ][2].values[0]
+        primer_start = bedfile[
+            (bedfile["primer_num"] == pr_num) & (bedfile["sense"] == "LEFT")
+        ][1].values[0]
+        seq_end = bedfile[
+            (bedfile["primer_num"] == pr_num) & (bedfile["sense"] == "RIGHT")
+        ][1].values[0]
+        primer_end = bedfile[
+            (bedfile["primer_num"] == pr_num) & (bedfile["sense"] == "RIGHT")
+        ][2].values[0]
+        pool = bedfile[bedfile["primer_num"] == pr_num]["pool"].values[1]
+        amplicons.append([pool, pr_num, primer_start, seq_start, seq_end, primer_end])
+
+    amplicons_df = pd.DataFrame(
+        np.array(amplicons),
+        columns=[
+            "pool",
+            "primer_num",
+            "primer_start",
+            "seq_start",
+            "seq_end",
+            "primer_end",
+        ],
+    )
+
+    q_starts = []
+    q_stops = []
+    for i in range(amplicons_df.shape[0]):
+        if i > 0:
+            query_start = amplicons_df.iloc[i - 1]["primer_end"] + 5
+        else:
+            query_start = amplicons_df.iloc[i]["primer_start"]
+        if i < amplicons_df.shape[0] - 1:
+            query_stop = amplicons_df.iloc[i + 1]["primer_start"] - 5
+        else:
+            query_stop = amplicons_df.iloc[i]["seq_end"]
+        q_starts.append(query_start)
+        q_stops.append(query_stop)
+    amplicons_df["query_start"] = q_starts
+    amplicons_df["query_end"] = q_stops
+    return amplicons_df
+
+
+def get_amplicon_cov(cov_df, start, stop, length=20):
+    amplicon_slice = cov_df.iloc[np.r_[start : start + length, (stop - length) : stop], [2]]
+    return np.median(amplicon_slice)
+
+
+def get_count_reads(cov_df, amplicons_df):
+    cov = amplicons_df.apply(
+        lambda x: get_amplicon_cov(cov_df, x["query_start"], x["query_end"]), axis=1
+    )
+    return cov
+
+
+def make_cov_heatmap(cov_df, output=None):
+    plt.figure(figsize=(15, 8 * 2.5))
+    split_at = round(cov_df.shape[0] / 2)
+    plt.subplot(1, 2, 1)
+    ax = sns.heatmap(
+        cov_df.iloc[0:split_at, 1:],
+        cmap="Reds",
+        vmin=0,
+        square=True,
+        cbar_kws={"shrink": 0.2, "anchor": (0.0, 0.8)},
+    )
+    sns.heatmap(
+        cov_df.iloc[0:split_at, 1:],
+        cmap=plt.get_cmap("binary"),
+        vmin=0,
+        vmax=2,
+        mask=cov_df.iloc[0:split_at, 1:] > 0,
+        cbar=False,
+        ax=ax,
+    )
+    plt.xlabel("amplicon")
+    plt.ylabel("sample")
+    plt.title("Samples 0:{}".format(split_at))
+    plt.subplot(1, 2, 2)
+    ax = sns.heatmap(
+        cov_df.iloc[split_at:, 1:],
+        cmap="Reds",
+        vmin=0,
+        square=True,
+        cbar_kws={"shrink": 0.2, "anchor": (0.0, 0.8)},
+    )
+    sns.heatmap(
+        cov_df.iloc[split_at:, 1:],
+        cmap=plt.get_cmap("binary"),
+        vmin=0,
+        vmax=2,
+        mask=cov_df.iloc[split_at:, 1:] > 0,
+        cbar=False,
+        ax=ax,
+    )
+    plt.xlabel("amplicon")
+    plt.ylabel("sample")
+    plt.title("Samples {}:{}".format(split_at, cov_df.shape[0] - 1))
+    if output is not None:
+        plt.savefig(output)
+
+
+@click.command()
+@click.option(
+    "-r",
+    "--bedfile-addr",
+    required=True,
+    type=click.Path(exists=True),
+    help="Bedfile of the ARTIC V3 primers.",
+)
+@click.option(
+    "-s",
+    "--samp-file",
+    default="/cluster/project/pangolin/working/samples.tsv",
+    type=click.Path(exists=True),
+    help="TSV file like samples.tsv.",
+)
+@click.option(
+    "-f",
+    "--samp-path",
+    default="/cluster/project/pangolin/working/samples",
+    type=click.Path(exists=True),
+    help="Main path to samples.",
+)
+@click.option(
+    "-o", "--outdir", default=os.getcwd(), type=click.Path(), help="Output directory."
+)
+@click.option("-p", "--makeplots", is_flag=True, help="Output plots.")
+@click.option("-v", "--verbose", is_flag=True, help="Verbose output.")
+def main(bedfile_addr, samp_file, samp_path, outdir, makeplots, verbose):
+    """
+    Script to calculate primer rebalancings according to the November 2020 version 5
+    of the ARTIC V3 protocol for SARS-CoV-2 sequencing.
+    """
+    if not os.path.exists(outdir):
+        os.makedirs(outdir)
+
+    if verbose:
+        click.echo("Loading primers bedfile.")
+    amplicons_df = make_amplicons_df(load_bedfile(bedfile_addr))
+
+    if verbose:
+        click.echo("Reading list of coverage files.")
+    sam_list = get_samples_paths(samp_path, samp_file)
+
+    if verbose:
+        click.echo("Loading and parsing coverage files.")
+    all_covs = []
+    indexes = []
+    with click.progressbar(sam_list, label="Parsing coverage files") as bar:
+        for sam in bar:
+            try:
+                temp_cov_df = pd.read_csv(sam, sep="\t", compression="gzip")
+                temp_frac_read_df = pd.DataFrame(
+                    get_count_reads(temp_cov_df, amplicons_df)
+                ).T
+                indexes.append(sam.split("/")[-4])
+                all_covs.append(temp_frac_read_df)
+            except FileNotFoundError:
+                if verbose:
+                    click.echo(f"WARNING: file {sam} not found.")
+
+    all_covs = pd.concat(all_covs, axis=0)
+    all_covs = all_covs.reset_index(drop=True)
+    all_covs_frac = all_covs.div(all_covs.sum(axis=1), axis=0)
+
+    all_covs = pd.concat(
+        [pd.DataFrame({"sample": indexes}), all_covs.reset_index(drop=True)],
+        axis=1,
+        ignore_index=False,
+    )
+    all_covs_frac = pd.concat(
+        [pd.DataFrame({"sample": indexes}), all_covs_frac.reset_index(drop=True)],
+        axis=1,
+        ignore_index=False,
+    )
+
+    if verbose:
+        click.echo("Outputting .csv's")
+    all_covs.to_csv(os.path.join(outdir, "amplicons_coverages.csv"), index=False)
+    all_covs_frac.to_csv(
+        os.path.join(outdir, "amplicons_coverages_norm.csv"), index=False
+    )
+
+    if makeplots:
+        if verbose:
+            click.echo("Outputting plots.")
+        make_cov_heatmap(all_covs, os.path.join(outdir, "cov_heatmap.pdf"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/workflows/amplicon_cov.smk b/workflows/amplicon_cov.smk
index 6955cda..ea93a53 100644
--- a/workflows/amplicon_cov.smk
+++ b/workflows/amplicon_cov.smk
@@ -25,7 +25,7 @@ rule relative_amplicon_coverage_allbatches:
     shell:
         """
         mkdir -p {params.output_dir}
-        python ../scripts/amplicon_covs.py \
+        python ../scripts/amplicon_covs_click.py \
         -s {input.sample_list} \
         -f {input.samples} \
         -r {params.primers_fp} \
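
The Snakemake rule above passes -s, -f and -r; the full click interface of the new script additionally exposes -o/--outdir, -p/--makeplots and -v/--verbose. A minimal manual invocation could look like the sketch below; the primer bedfile, samples TSV, samples directory and output directory are placeholder paths, not paths from this repository:

    # Hypothetical local run of the new click-based script; replace the
    # placeholder paths (primers.bed, samples.tsv, samples/, out/) as needed.
    python scripts/amplicon_covs_click.py \
        -r primers.bed \
        -s samples.tsv \
        -f samples \
        -o out \
        -p -v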
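
The samples file passed with -s is read by get_samples_paths as plain tab-separated lines: the first two columns are joined with the -f base path into <samp-path>/<col1>/<col2>/alignments/coverage.tsv.gz, and the first column (recovered later via sam.split("/")[-4]) becomes the "sample" column of the output CSVs. A made-up two-row example of that layout, with invented sample and batch names:

    # hypothetical samples.tsv (tab-separated, no header)
    sampleA	batch01
    sampleB	batch01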