Adding Snakemake Unit Test in Github CI (#20)

cbg-ethz · Sep 26, 2024 · 1a5801f · 1a5801f
1 parent 74cc508
commit 1a5801f
Show file tree

Hide file tree

Showing 9 changed files with 154 additions and 89 deletions.
diff --git a/.github/workflows/test-snake.yml b/.github/workflows/test-snake.yml
@@ -49,9 +49,10 @@ jobs:
       - name: Install dependencies
         run: |
           pip install -e .
+          pip install pytest
 
-      - name: Snakemake Testing
+      - name: Snakemake Unit Testing
         run: |
-          snakemake --cores 1 --snakefile workflow/Snakefile --directory .test --verbose
+          pytest workflow/.tests 
 
-      
+      # TODO: add dry-run testing
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -32,11 +32,21 @@ repos:
     rev: v0.10.2 
     hooks:
       - id: snakefmt
+
   - repo: local
     hooks:
-      - id: snakemake-dryrun
-        name: Snakemake Dry Run
-        entry: bash -c 'cd workflow && poetry run snakemake -n'
+      - id: snakemake-unit-testing
+        name: Snakemake Unit Testing
+        entry: bash -c 'poetry run pytest workflow/.tests'
         language: system
-        files: (Snakefile|\.smk$)
-        pass_filenames: false
+        types: [python]
+
+# TODO: enable dry-run testing
+  # - repo: local
+  #   hooks:
+  #     - id: snakemake-dryrun
+  #       name: Snakemake Dry Run
+  #       entry: bash -c 'cd workflow && poetry run snakemake -n'
+  #       language: system
+  #       files: (Snakefile|\.smk$)
+  #       pass_filenames: false
diff --git a/config/amplicon_cov.smk b/config/amplicon_cov.smk
@@ -1,5 +1,5 @@
 # Inputs
-sample_list_dir : "/cluster/project/pangolin/work-amplicon-coverage/test_data/"
-sample_dir : "/cluster/project/pangolin/work-amplicon-coverage/test_data/samples"
+sample_list_dir: "/cluster/project/pangolin/work-amplicon-coverage/test_data/"
+sample_dir: "/cluster/project/pangolin/work-amplicon-coverage/test_data/samples"
 # Outputs
-output_dir : "/cluster/home/koehng/temp/amplicon_cov/"
+output_dir: "/cluster/home/koehng/temp/amplicon_cov/"
diff --git a/pyproject.toml b/pyproject.toml
@@ -14,6 +14,7 @@ seaborn = "^0.13.2"
 pandas-stubs = "^2.2.2.240807"
 click = "^8.1.7"
 snakemake = "^8.20.4"
+interrogate = "^1.7.0"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.2.1"

diff --git a/workflow/.tests/unit/common.py b/workflow/.tests/unit/common.py
@@ -5,6 +5,7 @@
 from pathlib import Path
 import subprocess as sp
 import os
+import pandas as pd
 
 import sys
 
@@ -74,3 +75,26 @@ def compare_files(self, generated_file, expected_file):
         Compare the generated file with the expected file.
         """
         sp.check_output(["cmp", generated_file, expected_file])
+
+
+def compare_csv_files(
+    file1_path: str, file2_path: str, tolerance: float = 1e-4
+) -> bool:
+    """
+    Return True if two CSV files match within ``tolerance`` (used as rtol and atol); raise otherwise.
+    """
+    df1 = pd.read_csv(file1_path, skiprows=[1])
+    df2 = pd.read_csv(file2_path, skiprows=[1])
+
+    if df1.shape != df2.shape:
+        raise ValueError("DataFrames have different shapes")
+
+    # check that the data frames contain the same data types
+    assert df1.dtypes.equals(df2.dtypes)
+
+    # check that the data frames contain the same data
+    pd.testing.assert_frame_equal(
+        df1, df2, check_exact=False, rtol=tolerance, atol=tolerance
+    )
+
+    return True
diff --git a/workflow/.tests/unit/test_make_price_data.py b/workflow/.tests/unit/test_make_price_data.py
@@ -1,14 +1,23 @@
+"""
+This script tests the make_price_data rule.
+"""
+
 import os
 import sys
 import subprocess as sp
 from tempfile import TemporaryDirectory
 import shutil
 from pathlib import Path
 
+from common import compare_csv_files
+
 sys.path.insert(0, os.path.dirname(__file__))
 
 
 def test_make_price_data():
+    """
+    Test the make_price_data rule.
+    """
     with TemporaryDirectory() as tmpdir:
         workdir = Path(tmpdir) / "workdir"
         workdir.mkdir(exist_ok=True)
@@ -55,17 +64,12 @@ def test_make_price_data():
         assert (workdir / "results" / "statistics.csv").exists()
 
         # Compare output with expected result
-        result = sp.run(
-            [
-                "diff",
-                str(workdir / "results" / "statistics.csv"),
-                str(expected_path / "statistics.csv"),
-            ],
-            capture_output=True,
-            text=True,
+        files_match = compare_csv_files(
+            str(workdir / "results" / "statistics.csv"),
+            str(expected_path / "statistics.csv"),
         )
 
-        assert result.returncode == 0, f"Files are different:\n{result.stdout}"
+        assert files_match, "Files are different within the specified tolerance"
 
 
 ### Main

diff --git a/workflow/rules/amplicon_cov.smk b/workflow/rules/amplicon_cov.smk
@@ -9,15 +9,15 @@ rule relative_amplicon_coverage_per_batch:
     Calculate the relative amplicon coverage for all samples in the batch specific samples{batch}.tsv file.
     """
     input:
-        sample_list = config['sample_list_dir'] + "samples{batch}.tsv",
-        samples = config['sample_dir']
+        sample_list=config["sample_list_dir"] + "samples{batch}.tsv",
+        samples=config["sample_dir"],
     output:
-        heatmap = config["output_dir"] + "{batch}/cov_heatmap.pdf",
+        heatmap=config["output_dir"] + "{batch}/cov_heatmap.pdf",
     params:
-        primers_fp ="../resources/amplicon_cov/articV3primers.bed",
-        output_dir = config["output_dir"] + "{batch}/"
+        primers_fp="../resources/amplicon_cov/articV3primers.bed",
+        output_dir=config["output_dir"] + "{batch}/",
     log:
-        config["output_dir"] + "relative_amplicon_coverage_per_batch/{batch}.log"
+        config["output_dir"] + "relative_amplicon_coverage_per_batch/{batch}.log",
     shell:
         """
         mkdir -p {params.output_dir}
@@ -33,19 +33,20 @@ rule relative_amplicon_coverage_per_batch:
 
 rule get_samples_per_batch:
     input:
-        samples_list = config['sample_list_dir'] + "samples.tsv"
+        samples_list=config["sample_list_dir"] + "samples.tsv",
     output:
-        samples_batch = config['sample_list_dir'] + "samples{batch}.tsv",
+        samples_batch=config["sample_list_dir"] + "samples{batch}.tsv",
     log:
-        config["output_dir"] + "get_samples_per_batch_{batch}.log"
+        config["output_dir"] + "get_samples_per_batch_{batch}.log",
     shell:
         """
         grep {wildcards.batch} {input.samples_list} > {output.samples_batch}
         """
 
+
 rule get_coverage_for_batch:
     """
     Calculate the relative amplicon coverage for all samples in the batch specific samples{batch}.tsv file.
     """
     input:
-        samples = f"{config['output_dir']}20240705_AAFH52MM5/cov_heatmap.pdf",
+        samples=f"{config['output_dir']}20240705_AAFH52MM5/cov_heatmap.pdf",