diff --git a/README.md b/README.md index dd0df7a..1d80559 100644 --- a/README.md +++ b/README.md @@ -13,4 +13,4 @@ Output: To run this workflow adapt file paths in config and sample names. Run the pipeline on Euler with run_workflow.sh. -Note, at the moment the workflow is SARS-CoV-2 specific, however, reference files and gene annotations can be easily swapped. +Note, at the moment the workflow is SARS-CoV-2 specific, however, reference files and gene annotations can be easily swapped. diff --git a/workflow/Snakefile b/workflow/Snakefile index c454216..bbcdc98 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -22,6 +22,7 @@ rule all: input: f"results/mutations_of_interest.csv", f"results/all_mutations.csv", + f"results/all_mutations.annotated.csv", # note that here deletions are excluded rule copy_coverage_file: input: @@ -164,7 +165,6 @@ rule run_viloca: fname_bad_samples= f"results/samples.bad_coverage.csv", output: fname_vcf=f"results/{{sample}}/snvs.vcf", - #fname_raw=f"results/{{sample}}/variant_calling/snv/raw_snv.tsv", dname_work=directory( f"results/{{sample}}/variant_calling/" ), @@ -193,38 +193,41 @@ rule annotate_vcf: script: "./scripts/annotate_vcf.py" -rule mutations_of_interest: + +rule collect_all_annotated_mutations: input: - fname_snvs_vcf=f"results/{{sample}}/snvs.annotated.vcf", - params: - fname_mutation_definitions=f"resources/mutation_definitions.yaml", + vcf_list=[ + f"results/{sample}/snvs.annotated.vcf" + for sample in all_samples + ], output: - fname_mutations=f"results/{{sample}}/mutations_of_interest.csv", + fname_result_csv=f"results/all_mutations.annotated.csv", + params: + params=all_samples, conda: "envs/annotate_vcf.yaml" script: - "./scripts/scan_mutations.py" - + "./scripts/collect_all_annotated_mutations.py" -rule collect_mutations_of_interest: +rule mutations_of_interest: input: - csv_list=[ - f"results/{sample}/mutations_of_interest.csv" - for sample in all_samples - ], - output: - fname_result_csv=f"results/mutations_of_interest.csv", + fname_all_mutations=f"results/all_mutations.annotated.csv", + fname_bad_samples=f"results/samples.bad_coverage.csv", params: - params=all_samples, + fname_mutation_definitions=f"resources/mutation_definitions.yaml", + all_samples=all_samples, + output: + fname_mutations=f"results/mutations_of_interest.csv", conda: - "envs/viloca.yaml" + "envs/annotate_vcf.yaml" script: - "scripts/collect_mutations.py" + "./scripts/scan_mutations.py" -rule collect_all_annotated_mutations: + +rule collect_all_mutations: input: vcf_list=[ - f"results/{sample}/snvs.annotated.vcf" + f"results/{sample}/snvs.vcf" for sample in all_samples ], output: diff --git a/workflow/notebook/visualize_mutations.ipynb b/workflow/notebook/visualize_mutations.ipynb index 3de7d3d..d83b21b 100644 --- a/workflow/notebook/visualize_mutations.ipynb +++ b/workflow/notebook/visualize_mutations.ipynb @@ -23,7 +23,7 @@ "source": [ "# import sample name mapping to date and location \n", "\n", - "fname_sample_names = \"timeline.tsv\"\n", + "fname_sample_names = \"../../resources/timeline.tsv\"\n", "\n", "df_mapping = pd.read_csv(fname_sample_names, sep=\"\\t\")\n", "df_mapping['my_sample_name'] = df_mapping[\"sample\"] + \"/\"+df_mapping[\"batch\"]" @@ -31,14 +31,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "c1747b50", "metadata": {}, "outputs": [], "source": [ "# import mutations of interest\n", "\n", - "fname_mut = \"mutations_of_interest.csv\"\n", + "fname_mut = \"../../results/mutations_of_interest.csv\"\n", "\n", "df = pd.read_csv(fname_mut)\n", "df = df.drop([\"Unnamed: 0.1\", \"index\", \"Unnamed: 0\", \"INFO\", \"INFO_list\"], axis=1)" @@ -167,7 +167,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "1217f6d2", "metadata": {}, "outputs": [ @@ -203,7 +203,16 @@ "plt.title(\"Mutation frequency\", fontsize=20)\n", "plt.suptitle(\"* = non-synonymous mutations\", x= 0.7,y=-0.01, fontsize=10)\n", "plt.tight_layout()\n", - "plt.savefig(\"mutation_calls_on_potential_drug_resistance_sites.pdf\")" + "plt.savefig(\"mutation_calls_on_potential_drug_resistance_sites.pdf\")\n", + "plt.savefig(\"mutation_calls_on_potential_drug_resistance_sites.svg\")" + ] + }, + { + "cell_type": "markdown", + "id": "5b91ee24", + "metadata": {}, + "source": [ + "### Plotting the posterior" ] }, { diff --git a/workflow/scripts/scan_mutations.py b/workflow/scripts/scan_mutations.py index 3ff7674..86bb9e6 100644 --- a/workflow/scripts/scan_mutations.py +++ b/workflow/scripts/scan_mutations.py @@ -1,6 +1,6 @@ """ Input: - -- snvs.vcf outputted by VILOCA + -- csv files of all mutations -- yaml with mutation definitions Output: @@ -8,7 +8,6 @@ """ import yaml import pandas as pd -from fuc import pyvcf def parse_yaml(fname_yaml): @@ -19,32 +18,36 @@ def parse_yaml(fname_yaml): return dict_mut +def main(fname_muts, fname_csv, fname_yaml, all_samples, fname_bad_samples): - - -def main(fname_vcf, fname_csv, fname_yaml): + # get bad samples --> those should not occurr in the results as they haven't + # been processed and we cannot say anything about them + bad_samples = pd.read_csv(fname_bad_samples)['sample'].values.tolist() # read mutations dict_mut = parse_yaml(fname_yaml) positions_of_interest = [int(x) for x in list(dict_mut.keys())] - if fname_vcf.split(".")[-1]=="vcf": - # vcf into dataframe - df_vcf = pyvcf.VcfFrame.from_file(fname_vcf).df - df_mut = df_vcf[df_vcf['POS'].isin(positions_of_interest)] + df_muts = pd.read_csv(fname_muts) + + # filter for positions of interest + df_muts = df_muts[df_muts['POS'].isin(positions_of_interest)] - elif fname_vcf.split(".")[-1]=="tsv": - df_vcf = pd.read_csv(fname_vcf, sep='\t') - if df_vcf.shape[0]>0: - df_mut = df_vcf[df_vcf['Pos'].isin(positions_of_interest)] - else: - df_mut = df_vcf + # for samples that don't have positins of interest we need to add an empty lind + for sample_name in all_samples: + if sample_name not in bad_samples: + if sample_name not in df_muts['sample'].unique(): + # create empty row + df_muts = df_muts.append({'sample':sample_name}, ignore_index=True) + #df_muts = pd.concat([df_muts, pd.DataFrame({"sample": sample_name})]) - df_mut.to_csv(fname_csv) + df_muts.to_csv(fname_csv) if __name__ == "__main__": main( - snakemake.input.fname_snvs_vcf, + snakemake.input.fname_all_mutations, snakemake.output.fname_mutations, snakemake.params.fname_mutation_definitions, + snakemake.params.all_samples, + snakemake.input.fname_bad_samples, )