Commit 6abc677

Merge pull request #11 from blab/rewrite

First pass at making snakemake workflow for innovation model

trvrb authored Dec 24, 2024
2 parents 78b806f + 09b7ac6

Showing 29 changed files with 11,835 additions and 18 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -2,6 +2,7 @@
# (or at least out of _this_ git repo).
benchmarks/
logs/
data/
results/
build/
auspice/
Expand Down
63 changes: 61 additions & 2 deletions README.md
@@ -1,5 +1,64 @@
# Analysis of SARS-CoV-2 antigenic drift and evolutionary fitness

## Installation

There is a Nextstrain workflow forked from v12 of the canonical Nextstrain [ncov](https://github.com/nextstrain/ncov) workflow in the [`ncov-workflow`](ncov-workflow/) directory.
All dependencies are provided by the Nextstrain runtime.
See [docs.nextstrain.org](https://docs.nextstrain.org/en/latest/install.html) for installation instructions.

## Provision metadata locally

Windowed analyses require access to SARS-CoV-2 metadata.
This can be acquired via

```bash
aws s3 cp s3://nextstrain-ncov-private/metadata.tsv.gz data/gisaid_metadata.tsv.gz
gzip -c -d data/gisaid_metadata.tsv.gz \
  | tsv-select -H -f strain,date,date_submitted,country,clade_nextstrain,Nextclade_pango,QC_overall_status \
  | gzip -c > data/gisaid_metadata_filtered.tsv.gz
```

Non-windowed analyses will provision sequence counts from [forecasts-ncov](https://github.com/nextstrain/forecasts-ncov).
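As a rough sketch, the provisioned sequence-count files can be treated as a long table of per-location, per-variant daily counts. The column names here are an assumption based on typical forecasts-ncov outputs, not taken from this repository:

```python
import pandas as pd

# Assumed shape of a provisioned sequence-count file: one row per
# location / variant / day. Column names are illustrative.
seq_counts = pd.DataFrame({
    "location": ["USA", "USA", "USA", "USA"],
    "variant": ["XBB.1.5", "XBB.1.5", "BQ.1", "BQ.1"],
    "date": ["2023-01-01", "2023-01-02", "2023-01-01", "2023-01-02"],
    "sequences": [120, 135, 80, 60],
})

# Total counts per variant over the window
totals = seq_counts.groupby("variant")["sequences"].sum()
print(totals.to_dict())
```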


## Workflow

Once metadata is provisioned locally, run the workflow with

```bash
nextstrain build . all
```

This will generate sequence count files for every analysis specified in `./config/config.yaml`.
Under `analysis_period` in the config file, each entry specifies an analysis name, minimum and maximum dates for the analysis, the pivot variant of interest, lineages to forcibly include, and predictors to use for the regression-prior model, e.g.

```yaml
analysis_period:
  xbb15:
    min_date: "2023-01-01"
    max_date: "2023-12-01"
    pivot: "XBB.1.5"
    force_include: "defaults/xbb15/force_include_lineages.txt"
    predictor_names:
      - "spike pseudovirus DMS human sera escape relative to XBB.1.5"
      - "spike pseudovirus DMS ACE2 binding relative to XBB.1.5"
      - "RBD yeast-display DMS ACE2 affinity relative to XBB.1.5"
      - "RBD yeast-display DMS RBD expression relative to XBB.1.5"
      - "RBD yeast-display DMS escape relative to XBB.1.5"
```

### Sequence counts and variant relationships

The workflow produces sequence count files for both windowed and non-windowed analyses, collapses lineages into their parents based on count thresholds specified in `./config/config.yaml`, and generates variant relationship files so that the innovation model can be fit.

Collapsed sequence counts follow the form `data/{analysis_period}/collapsed_seq_counts.tsv` and variant relationships follow the form `data/{analysis_period}/pango_variant_relationships.tsv`.
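The collapsing step described above can be sketched in a few lines. This is an illustration under assumed column names and a single-pass parent lookup, not the workflow's actual script (which may collapse recursively until all variants clear the threshold):

```python
import pandas as pd

# Toy sequence counts; lineages below the threshold fold into their parent.
counts = pd.DataFrame({
    "variant": ["XBB.1.5", "XBB.1.5.1", "XBB.2", "BQ.1"],
    "sequences": [5000, 50, 30, 900],
})

COLLAPSE_THRESHOLD = 200  # mirrors `collapse_threshold` in config.yaml

def parent(lineage: str) -> str:
    # Assumed naming rule: dropping the last ".N" component yields the parent.
    return lineage.rsplit(".", 1)[0] if "." in lineage else lineage

counts["collapsed"] = counts.apply(
    lambda row: parent(row["variant"])
    if row["sequences"] < COLLAPSE_THRESHOLD
    else row["variant"],
    axis=1,
)
collapsed = counts.groupby("collapsed", as_index=False)["sequences"].sum()
print(dict(zip(collapsed["collapsed"], collapsed["sequences"])))
```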

### MLR innovation model

The collapsed sequence count files and variant relationships are used to estimate relative fitness using the uninformed (normal-prior) innovation model.

For each analysis period, this produces `results/{analysis_period}/growth_advantages.tsv` and `results/{analysis_period}/growth_advantages_delta.tsv`.

The model posteriors will also be saved under `results/{analysis_period}/posteriors/data_{location}.pkl` and `results/{analysis_period}/posteriors/samples_{location}.pkl`.

If predictor names are provided, they are used to fit the regression-prior innovation model.
These predictor-informed results are stored under `results/{analysis_period}/informed/growth_advantages.tsv` and `results/{analysis_period}/informed/growth_advantages_delta.tsv`.
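The saved posteriors can be read back with the standard `pickle` module. The layout below is an assumption made for illustration (a dict of posterior draws keyed by parameter and variant); inspect the real files for their actual structure:

```python
import os
import pickle
import statistics
import tempfile

# Stand-in for a file like results/{analysis_period}/posteriors/samples_{location}.pkl;
# the contents here are hypothetical.
samples = {
    "growth_advantage": {
        "XBB.1.5": [1.0, 1.5, 2.0],
        "BQ.1": [0.5, 1.0, 1.5],
    }
}

with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "samples_global.pkl")
    with open(path, "wb") as fh:
        pickle.dump(samples, fh)
    with open(path, "rb") as fh:
        loaded = pickle.load(fh)

# Posterior mean growth advantage per variant
posterior_mean = {
    variant: statistics.mean(draws)
    for variant, draws in loaded["growth_advantage"].items()
}
print(posterior_mean)  # → {'XBB.1.5': 1.5, 'BQ.1': 1.0}
```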
59 changes: 59 additions & 0 deletions Snakefile
@@ -0,0 +1,59 @@
import os
import pandas as pd

if not config:
    configfile: "config/config.yaml"

wildcard_constraints:
    data_provenance="[A-Za-z0-9_-]+",           # Allow letters, numbers, underscores, and dashes
    analysis_period="[A-Za-z0-9_-]+",           # Allow letters, numbers, underscores, and dashes
    geo_resolution="[A-Za-z0-9_-]+",            # Allow letters, numbers, underscores, and dashes
    variant_classifications="[A-Za-z0-9_-]+"    # Allow letters, numbers, underscores, and dashes

def _get_date_range(ap):
    obs_date_min = config["analysis_period"].get(ap, {}).get("obs_date_min")
    obs_date_max = config["analysis_period"].get(ap, {}).get("obs_date_max")
    obs_date_interval = config["analysis_period"].get(ap, {}).get("interval")
    date_range = pd.date_range(
        start=obs_date_min,
        end=obs_date_max,
        freq=obs_date_interval,
    ).strftime("%Y-%m-%d").tolist()

    return date_range

def _get_all_input(w):
    data_provenances = config["data_provenances"] if isinstance(config["data_provenances"], list) else [config["data_provenances"]]
    variant_classifications = config["variant_classifications"] if isinstance(config["variant_classifications"], list) else [config["variant_classifications"]]
    geo_resolutions = config["geo_resolutions"] if isinstance(config["geo_resolutions"], list) else [config["geo_resolutions"]]
    analysis_periods = config["analysis_period"]

    # date_ranges = {analysis: generate_dates(analysis_periods[analysis]) for analysis in analysis_periods}
    all_input = [
        # Non-windowed analyses
        *expand(
            "results/{analysis_period}/growth_advantages.tsv",
            analysis_period=[ap for ap, cfg in analysis_periods.items() if not cfg.get("windowed", False)],
        ),
        *expand(
            "results/{analysis_period}/informed/growth_advantages.tsv",
            analysis_period=[ap for ap, cfg in analysis_periods.items() if (not cfg.get("windowed", False)) and cfg.get("predictor_names", False)],
        )
    ]
    # Windowed analyses
    for ap, cfg in analysis_periods.items():
        if cfg.get("windowed", False):
            all_input += expand(
                "results/{analysis_period}/growth_advantages_{obs_date}.tsv",
                analysis_period=[ap],
                obs_date=_get_date_range(ap)
            )
    return all_input

rule all:
    input: _get_all_input

include: "workflow/snakemake_rules/prepare_data.smk"
include: "workflow/snakemake_rules/retrieve_phenotypes.smk"
include: "workflow/snakemake_rules/run_models.smk"
include: "workflow/snakemake_rules/run_models_over_period.smk"
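For the `xbb15_windowed` settings in `config/config.yaml` (`obs_date_min` 2023-01-01, `obs_date_max` 2023-12-01, `interval` "2M"), the `_get_date_range` helper above enumerates month-end observation dates at a two-month interval. A standalone check of that pandas behavior:

```python
import pandas as pd

# Mirrors _get_date_range for the xbb15_windowed analysis period;
# "2M" is pandas' every-other-month-end frequency alias.
dates = pd.date_range(
    start="2023-01-01",
    end="2023-12-01",
    freq="2M",
).strftime("%Y-%m-%d").tolist()
print(dates)
# → ['2023-01-31', '2023-03-31', '2023-05-31',
#    '2023-07-31', '2023-09-30', '2023-11-30']
```

Each of these dates becomes an `{obs_date}` wildcard in `results/{analysis_period}/growth_advantages_{obs_date}.tsv`.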
54 changes: 54 additions & 0 deletions config/config.yaml
@@ -0,0 +1,54 @@
data_provenances:
  - gisaid
variant_classifications:
  - pango_lineages
geo_resolutions:
  - global
analysis_periods:
  - xbb15

# Params for the prepare data scripts
# Define params for each data_provenance / variant_classification / geo_resolution combination
# Include `max_date` if you don't want to use today as the max date
prepare_data:
  gisaid:
    pango_lineages:
      global:
        location_min_seq: 1000
        excluded_locations: "defaults/global_excluded_locations.txt"
        clade_min_seq: 1
        collapse_threshold: 200

# In case we want to run models for various different pandemic periods
analysis_period:
  ba2:
    min_date: "2021-11-01"
    max_date: "2022-11-01"
    pivot: "BA.2"
  xbb15:
    min_date: "2023-01-01"
    max_date: "2023-12-01"
    pivot: "XBB.1.5"
    force_include: "defaults/xbb15/force_include_lineages.txt"
    predictor_names:
      - "spike pseudovirus DMS human sera escape relative to XBB.1.5"
      - "spike pseudovirus DMS ACE2 binding relative to XBB.1.5"
      - "RBD yeast-display DMS ACE2 affinity relative to XBB.1.5"
      - "RBD yeast-display DMS RBD expression relative to XBB.1.5"
      - "RBD yeast-display DMS escape relative to XBB.1.5"
  xbb15_test:
    min_date: "2024-01-01"
    max_date: "2024-06-30"
    pivot: "XBB.1.5"
    force_include: "defaults/xbb15/force_include_lineages.txt"
  jn1:
    min_date: "2024-01-01"
    max_date: "2024-10-01"
    pivot: "JN.1"
  xbb15_windowed:
    windowed: true
    obs_date_min: "2023-01-01"
    obs_date_max: "2023-12-01"
    interval: "2M"
    pivot: "XBB.1.5"
    num_days_context: 90
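A minimal sanity check over `analysis_period` entries can catch swapped or malformed dates early. Field names mirror the config above; the check itself is illustrative and not part of the workflow:

```python
from datetime import date

# Hypothetical in-memory copy of a few analysis_period entries.
analysis_period = {
    "ba2": {"min_date": "2021-11-01", "max_date": "2022-11-01", "pivot": "BA.2"},
    "xbb15": {"min_date": "2023-01-01", "max_date": "2023-12-01", "pivot": "XBB.1.5"},
    "jn1": {"min_date": "2024-01-01", "max_date": "2024-10-01", "pivot": "JN.1"},
}

for name, cfg in analysis_period.items():
    # date.fromisoformat requires zero-padded dates, e.g. "2024-06-30"
    lo = date.fromisoformat(cfg["min_date"])
    hi = date.fromisoformat(cfg["max_date"])
    assert lo < hi, f"{name}: min_date must precede max_date"
    assert cfg["pivot"], f"{name}: pivot variant is required"

print("analysis_period config OK")
```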
5 changes: 5 additions & 0 deletions defaults/global_excluded_locations.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Austria
Czech Republic
Lithuania
Luxembourg
Slovakia
3 changes: 3 additions & 0 deletions defaults/xbb15/force_include_lineages.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
XBB
XBB.1.5
XBB.2
13 changes: 11 additions & 2 deletions manuscript/ncov-escape.bib
@@ -6,8 +6,6 @@
%% Saved with string encoding Unicode (UTF-8)
@article{hadfield2018nextstrain,
author = {Hadfield, James and Megill, Colin and Bell, Sidney M and Huddleston, John and Potter, Barney and Callender, Charlton and Sagulenko, Pavel and Bedford, Trevor and Neher, Richard A},
date-added = {2022-11-22 14:10:43 -0800},
@@ -18,3 +16,14 @@
title = {Nextstrain: real-time tracking of pathogen evolution},
volume = {34},
year = {2018}}

@article{khare2021gisaid,
author = {Khare, Shruti and Gurry, C{\'e}line and Freitas, Lucas and Schultz, Mark B and Bach, Gunter and Diallo, Amadou and Akite, Nancy and Ho, Joses and Lee, Raphael TC and Yeo, Winston and others},
date-added = {2023-11-30 14:24:46 +0100},
date-modified = {2023-11-30 14:24:59 +0100},
journal = {China CDC weekly},
pages = {1049},
title = {GISAID's role in pandemic response},
volume = {3},
year = {2021}}
