Skip to content

Commit

Permalink
more details
Browse files Browse the repository at this point in the history
  • Loading branch information
LaraFuhrmann committed Jul 11, 2023
2 parents 9565512 + 2b14f0d commit c39beea
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 43 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ Output:

To run this workflow adapt file paths in config and sample names. Run the pipeline on Euler with run_workflow.sh.

Note, at the moment the workflow is SARS-CoV-2 specific, however, reference files and gene annotations can be easily swapped.
Note, at the moment the workflow is SARS-CoV-2 specific, however, reference files and gene annotations can be easily swapped.
43 changes: 23 additions & 20 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ rule all:
input:
f"results/mutations_of_interest.csv",
f"results/all_mutations.csv",
f"results/all_mutations.annotated.csv", # note that here deletions are excluded

rule copy_coverage_file:
input:
Expand Down Expand Up @@ -164,7 +165,6 @@ rule run_viloca:
fname_bad_samples= f"results/samples.bad_coverage.csv",
output:
fname_vcf=f"results/{{sample}}/snvs.vcf",
#fname_raw=f"results/{{sample}}/variant_calling/snv/raw_snv.tsv",
dname_work=directory(
f"results/{{sample}}/variant_calling/"
),
Expand Down Expand Up @@ -193,38 +193,41 @@ rule annotate_vcf:
script:
"./scripts/annotate_vcf.py"

rule mutations_of_interest:

rule collect_all_annotated_mutations:
input:
fname_snvs_vcf=f"results/{{sample}}/snvs.annotated.vcf",
params:
fname_mutation_definitions=f"resources/mutation_definitions.yaml",
vcf_list=[
f"results/{sample}/snvs.annotated.vcf"
for sample in all_samples
],
output:
fname_mutations=f"results/{{sample}}/mutations_of_interest.csv",
fname_result_csv=f"results/all_mutations.annotated.csv",
params:
params=all_samples,
conda:
"envs/annotate_vcf.yaml"
script:
"./scripts/scan_mutations.py"

"./scripts/collect_all_annotated_mutations.py"

rule collect_mutations_of_interest:
rule mutations_of_interest:
input:
csv_list=[
f"results/{sample}/mutations_of_interest.csv"
for sample in all_samples
],
output:
fname_result_csv=f"results/mutations_of_interest.csv",
fname_all_mutations=f"results/all_mutations.annotated.csv",
fname_bad_samples=f"results/samples.bad_coverage.csv",
params:
params=all_samples,
fname_mutation_definitions=f"resources/mutation_definitions.yaml",
all_samples=all_samples,
output:
fname_mutations=f"results/mutations_of_interest.csv",
conda:
"envs/viloca.yaml"
"envs/annotate_vcf.yaml"
script:
"scripts/collect_mutations.py"
"./scripts/scan_mutations.py"

rule collect_all_annotated_mutations:

rule collect_all_mutations:
input:
vcf_list=[
f"results/{sample}/snvs.annotated.vcf"
f"results/{sample}/snvs.vcf"
for sample in all_samples
],
output:
Expand Down
19 changes: 14 additions & 5 deletions workflow/notebook/visualize_mutations.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -23,22 +23,22 @@
"source": [
"# import sample name mapping to date and location \n",
"\n",
"fname_sample_names = \"timeline.tsv\"\n",
"fname_sample_names = \"../../resources/timeline.tsv\"\n",
"\n",
"df_mapping = pd.read_csv(fname_sample_names, sep=\"\\t\")\n",
"df_mapping['my_sample_name'] = df_mapping[\"sample\"] + \"/\"+df_mapping[\"batch\"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"id": "c1747b50",
"metadata": {},
"outputs": [],
"source": [
"# import mutations of interest\n",
"\n",
"fname_mut = \"mutations_of_interest.csv\"\n",
"fname_mut = \"../../results/mutations_of_interest.csv\"\n",
"\n",
"df = pd.read_csv(fname_mut)\n",
"df = df.drop([\"Unnamed: 0.1\", \"index\", \"Unnamed: 0\", \"INFO\", \"INFO_list\"], axis=1)"
Expand Down Expand Up @@ -167,7 +167,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 11,
"id": "1217f6d2",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -203,7 +203,16 @@
"plt.title(\"Mutation frequency\", fontsize=20)\n",
"plt.suptitle(\"* = non-synonymous mutations\", x= 0.7,y=-0.01, fontsize=10)\n",
"plt.tight_layout()\n",
"plt.savefig(\"mutation_calls_on_potential_drug_resistance_sites.pdf\")"
"plt.savefig(\"mutation_calls_on_potential_drug_resistance_sites.pdf\")\n",
"plt.savefig(\"mutation_calls_on_potential_drug_resistance_sites.svg\")"
]
},
{
"cell_type": "markdown",
"id": "5b91ee24",
"metadata": {},
"source": [
"### Plotting the posterior"
]
},
{
Expand Down
37 changes: 20 additions & 17 deletions workflow/scripts/scan_mutations.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
"""
Input:
-- snvs.vcf outputted by VILOCA
-- csv files of all mutations
-- yaml with mutation definitions
Output:
-- per sample csv file with the mutation calls and their respective frequency
"""
import yaml
import pandas as pd
from fuc import pyvcf

def parse_yaml(fname_yaml):

Expand All @@ -19,32 +18,36 @@ def parse_yaml(fname_yaml):
return dict_mut


def main(fname_muts, fname_csv, fname_yaml, all_samples, fname_bad_samples):



def main(fname_vcf, fname_csv, fname_yaml):
# get bad samples --> those should not occurr in the results as they haven't
# been processed and we cannot say anything about them
bad_samples = pd.read_csv(fname_bad_samples)['sample'].values.tolist()

# read mutations
dict_mut = parse_yaml(fname_yaml)
positions_of_interest = [int(x) for x in list(dict_mut.keys())]

if fname_vcf.split(".")[-1]=="vcf":
# vcf into dataframe
df_vcf = pyvcf.VcfFrame.from_file(fname_vcf).df
df_mut = df_vcf[df_vcf['POS'].isin(positions_of_interest)]
df_muts = pd.read_csv(fname_muts)

# filter for positions of interest
df_muts = df_muts[df_muts['POS'].isin(positions_of_interest)]

elif fname_vcf.split(".")[-1]=="tsv":
df_vcf = pd.read_csv(fname_vcf, sep='\t')
if df_vcf.shape[0]>0:
df_mut = df_vcf[df_vcf['Pos'].isin(positions_of_interest)]
else:
df_mut = df_vcf
# for samples that don't have positins of interest we need to add an empty lind
for sample_name in all_samples:
if sample_name not in bad_samples:
if sample_name not in df_muts['sample'].unique():
# create empty row
df_muts = df_muts.append({'sample':sample_name}, ignore_index=True)
#df_muts = pd.concat([df_muts, pd.DataFrame({"sample": sample_name})])

df_mut.to_csv(fname_csv)
df_muts.to_csv(fname_csv)

if __name__ == "__main__":
main(
snakemake.input.fname_snvs_vcf,
snakemake.input.fname_all_mutations,
snakemake.output.fname_mutations,
snakemake.params.fname_mutation_definitions,
snakemake.params.all_samples,
snakemake.input.fname_bad_samples,
)

0 comments on commit c39beea

Please sign in to comment.