more details

cbg-ethz · Jul 11, 2023 · c39beea · c39beea
2 parents 9565512 + 2b14f0d
commit c39beea
Show file tree

Hide file tree

Showing 4 changed files with 58 additions and 43 deletions.
diff --git a/README.md b/README.md
@@ -13,4 +13,4 @@ Output:
 
 To run this workflow adapt file paths in config and sample names. Run the pipeline on Euler with run_workflow.sh.
 
-Note, at the moment the workflow is SARS-CoV-2 specific, however, reference files and gene annotations can be easily swapped. 
+Note, at the moment the workflow is SARS-CoV-2 specific, however, reference files and gene annotations can be easily swapped.
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -22,6 +22,7 @@ rule all:
     input:
         f"results/mutations_of_interest.csv",
         f"results/all_mutations.csv",
+        f"results/all_mutations.annotated.csv", # note that here deletions are excluded
 
 rule copy_coverage_file:
     input:
@@ -164,7 +165,6 @@ rule run_viloca:
         fname_bad_samples= f"results/samples.bad_coverage.csv",
     output:
         fname_vcf=f"results/{{sample}}/snvs.vcf",
-        #fname_raw=f"results/{{sample}}/variant_calling/snv/raw_snv.tsv",
         dname_work=directory(
             f"results/{{sample}}/variant_calling/"
         ),
@@ -193,38 +193,41 @@ rule annotate_vcf:
     script:
         "./scripts/annotate_vcf.py"
 
-rule mutations_of_interest:
+
+rule collect_all_annotated_mutations:
     input:
-        fname_snvs_vcf=f"results/{{sample}}/snvs.annotated.vcf",
-    params:
-        fname_mutation_definitions=f"resources/mutation_definitions.yaml",
+        vcf_list=[
+            f"results/{sample}/snvs.annotated.vcf"
+            for sample in all_samples
+        ],
     output:
-        fname_mutations=f"results/{{sample}}/mutations_of_interest.csv",
+        fname_result_csv=f"results/all_mutations.annotated.csv",
+    params:
+        params=all_samples,
     conda:
         "envs/annotate_vcf.yaml"
     script:
-        "./scripts/scan_mutations.py"
-
+        "./scripts/collect_all_annotated_mutations.py"
 
-rule collect_mutations_of_interest:
+rule mutations_of_interest:
     input:
-        csv_list=[
-            f"results/{sample}/mutations_of_interest.csv"
-            for sample in all_samples
-        ],
-    output:
-        fname_result_csv=f"results/mutations_of_interest.csv",
+        fname_all_mutations=f"results/all_mutations.annotated.csv",
+        fname_bad_samples=f"results/samples.bad_coverage.csv",
     params:
-        params=all_samples,
+        fname_mutation_definitions=f"resources/mutation_definitions.yaml",
+        all_samples=all_samples,
+    output:
+        fname_mutations=f"results/mutations_of_interest.csv",
     conda:
-        "envs/viloca.yaml"
+        "envs/annotate_vcf.yaml"
     script:
-        "scripts/collect_mutations.py"
+        "./scripts/scan_mutations.py"
 
-rule collect_all_annotated_mutations:
+
+rule collect_all_mutations:
     input:
         vcf_list=[
-            f"results/{sample}/snvs.annotated.vcf"
+            f"results/{sample}/snvs.vcf"
             for sample in all_samples
         ],
     output:

diff --git a/workflow/notebook/visualize_mutations.ipynb b/workflow/notebook/visualize_mutations.ipynb
@@ -23,22 +23,22 @@
    "source": [
     "# import sample name mapping to date and location \n",
     "\n",
-    "fname_sample_names = \"timeline.tsv\"\n",
+    "fname_sample_names = \"../../resources/timeline.tsv\"\n",
     "\n",
     "df_mapping = pd.read_csv(fname_sample_names, sep=\"\\t\")\n",
     "df_mapping['my_sample_name'] = df_mapping[\"sample\"] + \"/\"+df_mapping[\"batch\"]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "id": "c1747b50",
    "metadata": {},
    "outputs": [],
    "source": [
     "# import mutations of interest\n",
     "\n",
-    "fname_mut = \"mutations_of_interest.csv\"\n",
+    "fname_mut = \"../../results/mutations_of_interest.csv\"\n",
     "\n",
     "df = pd.read_csv(fname_mut)\n",
     "df = df.drop([\"Unnamed: 0.1\", \"index\", \"Unnamed: 0\", \"INFO\", \"INFO_list\"], axis=1)"
@@ -167,7 +167,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "id": "1217f6d2",
    "metadata": {},
    "outputs": [
@@ -203,7 +203,16 @@
     "plt.title(\"Mutation frequency\", fontsize=20)\n",
     "plt.suptitle(\"* = non-synonymous mutations\", x= 0.7,y=-0.01, fontsize=10)\n",
     "plt.tight_layout()\n",
-    "plt.savefig(\"mutation_calls_on_potential_drug_resistance_sites.pdf\")"
+    "plt.savefig(\"mutation_calls_on_potential_drug_resistance_sites.pdf\")\n",
+    "plt.savefig(\"mutation_calls_on_potential_drug_resistance_sites.svg\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5b91ee24",
+   "metadata": {},
+   "source": [
+    "### Plotting the posterior"
    ]
   },
   {

diff --git a/workflow/scripts/scan_mutations.py b/workflow/scripts/scan_mutations.py
@@ -1,14 +1,13 @@
 """
 Input:
-    -- snvs.vcf outputted by VILOCA
+    -- csv files of all mutations
     -- yaml with mutation definitions
 
 Output:
     -- per sample csv file with the mutation calls and their respective frequency
 """
 import yaml
 import pandas as pd
-from fuc import pyvcf
 
 def parse_yaml(fname_yaml):
 
@@ -19,32 +18,36 @@ def parse_yaml(fname_yaml):
     return dict_mut
 
 
+def main(fname_muts, fname_csv, fname_yaml, all_samples, fname_bad_samples):
 
-
-
-def main(fname_vcf, fname_csv, fname_yaml):
+    # get bad samples --> those should not occurr in the results as they haven't
+    # been processed and we cannot say anything about them
+    bad_samples = pd.read_csv(fname_bad_samples)['sample'].values.tolist()
 
     # read mutations
     dict_mut = parse_yaml(fname_yaml)
     positions_of_interest = [int(x) for x in list(dict_mut.keys())]
 
-    if fname_vcf.split(".")[-1]=="vcf":
-        # vcf into dataframe
-        df_vcf = pyvcf.VcfFrame.from_file(fname_vcf).df
-        df_mut = df_vcf[df_vcf['POS'].isin(positions_of_interest)]
+    df_muts = pd.read_csv(fname_muts)
+
+    # filter for positions of interest
+    df_muts = df_muts[df_muts['POS'].isin(positions_of_interest)]
 
-    elif  fname_vcf.split(".")[-1]=="tsv":
-        df_vcf = pd.read_csv(fname_vcf, sep='\t')
-        if df_vcf.shape[0]>0:
-            df_mut = df_vcf[df_vcf['Pos'].isin(positions_of_interest)]
-        else:
-            df_mut = df_vcf
+    # for samples that don't have positins of interest we need to add an empty lind
+    for sample_name in all_samples:
+        if sample_name not in bad_samples:
+            if sample_name not in df_muts['sample'].unique():
+                # create empty row
+                df_muts = df_muts.append({'sample':sample_name}, ignore_index=True)
+                #df_muts = pd.concat([df_muts, pd.DataFrame({"sample": sample_name})])
 
-    df_mut.to_csv(fname_csv)
+    df_muts.to_csv(fname_csv)
 
 if __name__ == "__main__":
     main(
-        snakemake.input.fname_snvs_vcf,
+        snakemake.input.fname_all_mutations,
         snakemake.output.fname_mutations,
         snakemake.params.fname_mutation_definitions,
+        snakemake.params.all_samples,
+        snakemake.input.fname_bad_samples,
     )
Original file line number	Diff line number	Diff line change
Expand Up		@@ -13,4 +13,4 @@ Output:

		To run this workflow adapt file paths in config and sample names. Run the pipeline on Euler with run_workflow.sh.

		Note, at the moment the workflow is SARS-CoV-2 specific, however, reference files and gene annotations can be easily swapped.
		Note, at the moment the workflow is SARS-CoV-2 specific, however, reference files and gene annotations can be easily swapped.