Merge pull request #1244: Add type checking (mypy)

nextstrain · Jun 30, 2023 · 9e8c9e0 · 9e8c9e0
2 parents 45f54da + a5c173a
commit 9e8c9e0
Show file tree

Hide file tree

Showing 14 changed files with 115 additions and 10 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -6,7 +6,12 @@
 
 * export, frequencies, refine, traits: Add a new flag `--metadata-id-columns` to customize the possible metadata ID columns. Previously, this was only available in `augur filter`. [#1240][] (@victorlin)
 
+### Bug fixes
+
+* parse: Fix a bug where `--fix-dates` was always applied, with a default of `--fix-dates=monthfirst`. Now, running without `--fix-dates` will leave dates as-is. [#1247][] (@victorlin)
+
 [#1240]: https://github.com/nextstrain/augur/pull/1240
+[#1247]: https://github.com/nextstrain/augur/issues/1247
 
 ## 22.0.3 (14 June 2023)
 

diff --git a/augur/filter/include_exclude_rules.py b/augur/filter/include_exclude_rules.py
@@ -680,7 +680,7 @@ def apply_filters(metadata, exclude_by: List[FilterOption], include_by: List[Fil
     strains_to_keep = set(metadata.index.values)
     strains_to_filter = []
     strains_to_force_include = []
-    distinct_strains_to_force_include = set()
+    distinct_strains_to_force_include: Set = set()
 
     # Track strains that should be included regardless of filters.
     for include_function, include_kwargs in include_by:
@@ -713,9 +713,9 @@ def apply_filters(metadata, exclude_by: List[FilterOption], include_by: List[Fil
             if filter_function is filter_by_query:
                 try:
                     # pandas ≥1.5.0 only
-                    UndefinedVariableError = pd.errors.UndefinedVariableError
+                    UndefinedVariableError = pd.errors.UndefinedVariableError  # type: ignore
                 except AttributeError:
-                    UndefinedVariableError = pd.core.computation.ops.UndefinedVariableError
+                    UndefinedVariableError = pd.core.computation.ops.UndefinedVariableError  # type: ignore
                 if isinstance(e, UndefinedVariableError):
                     raise AugurError(f"Query contains a column that does not exist in metadata.") from e
                 raise AugurError(f"Error when applying query. Ensure the syntax is valid per <https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query>.") from e

diff --git a/augur/filter/subsample.py b/augur/filter/subsample.py
@@ -475,7 +475,7 @@ def _calculate_fractional_sequences_per_group(
     0.4844
     """
     lo = 1e-5
-    hi = target_max_value
+    hi = float(target_max_value)
 
     while (hi / lo) > 1.1:
         mid = (lo + hi) / 2

diff --git a/augur/io/metadata.py b/augur/io/metadata.py
@@ -456,10 +456,15 @@ def write_records_to_tsv(records, output_file):
 
 def _get_delimiter(path: str, valid_delimiters: Iterable[str]):
     """Get the delimiter of a file given a list of valid delimiters."""
+
+    for delimiter in valid_delimiters:
+        if len(delimiter) != 1:
+            raise AugurError(f"Delimiters must be single-character strings. {delimiter!r} does not satisfy that condition.")
+
     with open_file(path) as file:
         try:
             # Infer the delimiter from the first line.
-            return csv.Sniffer().sniff(file.readline(), valid_delimiters).delimiter
+            return csv.Sniffer().sniff(file.readline(), "".join(valid_delimiters)).delimiter
         except csv.Error as error:
             # This assumes all csv.Errors imply a delimiter issue. That might
             # change in a future Python version.

diff --git a/augur/io/shell_command_runner.py b/augur/io/shell_command_runner.py
@@ -8,7 +8,7 @@
     from signal import SIGKILL
 except ImportError:
     # A non-POSIX platform
-    SIGKILL = None
+    SIGKILL = None  # type: ignore[assignment]
 
 
 def run_shell_command(cmd, raise_errors=False, extra_env=None):

diff --git a/augur/parse.py b/augur/parse.py
@@ -127,7 +127,7 @@ def parse_sequence(sequence, fields, strain_key="strain", separator="|", prettif
                                             etal='lower' if field.startswith('author') else None)
 
     # parse dates and convert to a canonical format
-    if fix_dates and 'date' in metadata:
+    if fix_dates_format and 'date' in metadata:
         metadata['date'] = fix_dates(
             metadata['date'],
             dayfirst=fix_dates_format=='dayfirst'

diff --git a/augur/titer_model.py b/augur/titer_model.py
@@ -1,3 +1,6 @@
+# Prevent mypy from erroring on "flu" not being defined.
+# mypy: disable-error-code="name-defined"
+
 import os
 import logging
 import numpy as np

diff --git a/augur/utils.py b/augur/utils.py
@@ -278,7 +278,7 @@ def available_cpu_cores(fallback: int = 1) -> int:
     **computer**, if determinable, otherwise the *fallback* number (which
     defaults to 1).
     """
-    try:
+    if hasattr(os, "sched_getaffinity"):
         # Note that this is the correct function to use, not os.cpu_count(), as
         # described in the latter's documentation.
         #
@@ -290,7 +290,7 @@ def available_cpu_cores(fallback: int = 1) -> int:
         # naively use all 24, we'd end up with two threads across the 12 cores.
         # This would degrade performance rather than improve it!
         return len(os.sched_getaffinity(0))
-    except:
+    else:
         # cpu_count() returns None if the value is indeterminable.
         return os.cpu_count() or fallback
 

diff --git a/augur/validate.py b/augur/validate.py
@@ -137,7 +137,7 @@ def format_path(path: Iterable[Union[str, int]]) -> str:
     '.x.y[42].z'
     """
     def valid_identifier(x) -> bool:
-        return isinstance(x, str) and re.search(r'^[a-zA-Z$_][a-zA-Z0-9_$]*$', x)
+        return isinstance(x, str) and re.search(r'^[a-zA-Z$_][a-zA-Z0-9_$]*$', x) is not None
 
     def fmt(x) -> str:
         return (f"[{x}]"  if isinstance(x, int)  else

diff --git a/docs/contribute/DEV_DOCS.md b/docs/contribute/DEV_DOCS.md
@@ -140,6 +140,19 @@ Troubleshooting tip: As tests run on the development code in the augur repositor
 We use continuous integration with GitHub Actions to run tests on every pull request submitted to the project.
 We use [codecov](https://codecov.io/) to automatically produce test coverage for new contributions and the project as a whole.
 
+### Type annotations
+
+Our goal is to gradually add [type annotations][] to our code so that we can catch errors earlier and be explicit about the interfaces expected and provided.  Annotation pairs well with the functional approach taken by the package.
+
+During development you can run static type checks using [mypy][]:
+
+    $ mypy
+    # No output is good!
+
+There are also many [editor integrations for mypy][].
+
+[editor integrations for mypy]: https://github.com/python/mypy#integrations
+
 ### Releasing
 
 Versions for this project, Augur, from 3.0.0 onwards aim to follow the

diff --git a/mypy.ini b/mypy.ini
@@ -0,0 +1,49 @@
+[mypy]
+# Don't set python_version. Instead, use the default behavior of checking for
+# compatibility against the version of Python used to run mypy.
+files = augur/
+
+# Require functions with an annotated return type to be explicit about
+# potentially returning None (via Optional[…]).
+strict_optional = False
+
+# In the future maybe we can contribute typing stubs for these modules (either
+# complete stubs in the python/typeshed repo or partial stubs just in
+# this repo), but for now that's more work than we want to invest.  These
+# sections let us ignore missing stubs for specific modules without hiding all
+# missing errors like (--ignore-missing-imports).
+[mypy-treetime.*]
+ignore_missing_imports = True
+
+[mypy-isodate.*]
+ignore_missing_imports = True
+
+[mypy-Bio.*]
+ignore_missing_imports = True
+
+[mypy-BCBio.*]
+ignore_missing_imports = True
+
+[mypy-matplotlib.*]
+ignore_missing_imports = True
+
+[mypy-seaborn.*]
+ignore_missing_imports = True
+
+[mypy-cvxopt.*]
+ignore_missing_imports = True
+
+[mypy-ipdb.*]
+ignore_missing_imports = True
+
+[mypy-jsonschema.*]
+ignore_missing_imports = True
+
+[mypy-pyfastx.*]
+ignore_missing_imports = True
+
+[mypy-networkx.*]
+ignore_missing_imports = True
+
+[mypy-scipy.*]
+ignore_missing_imports = True
diff --git a/setup.py b/setup.py
@@ -70,7 +70,9 @@
             "cram >=0.7",
             "deepdiff >=4.3.2",
             "freezegun >=0.3.15",
+            "mypy",
             "nextstrain-sphinx-theme >=2022.5",
+            "pandas-stubs >=1.0.0, ==1.*",
             "pylint >=1.7.6",
             "pytest >=5.4.1",
             "pytest-cov >=2.8.1",
@@ -83,6 +85,8 @@
             "sphinx-markdown-tables >= 0.0.9",
             "sphinx-rtd-theme >=0.4.3",
             "sphinx-autodoc-typehints >=1.21.4",
+            "types-jsonschema >=3.0.0, ==3.*",
+            "types-setuptools",
             "wheel >=0.32.3",
             "ipdb >=0.10.1"
         ]

diff --git a/tests/functional/parse.t b/tests/functional/parse.t
@@ -66,4 +66,21 @@ Error on the first duplicate.
   ERROR: Duplicate found for 'SEQ1'.
   [2]
 
+Run without --fix-dates. The date is left unchanged.
+
+  $ cat >$TMP/data.fasta <<~~
+  > >SEQ1|05/01/2020
+  > AAA
+  > ~~
+  $ ${AUGUR} parse \
+  >   --sequences $TMP/data.fasta \
+  >   --output-sequences "$TMP/sequences.fasta" \
+  >   --output-metadata "$TMP/metadata.tsv" \
+  >   --fields strain date
+
+  $ cat "$TMP/metadata.tsv"
+  strain	date
+  SEQ1	05/01/2020
+  $ rm -f "$TMP/metadata.tsv"
+
   $ popd > /dev/null
diff --git a/tests/test_mypy.py b/tests/test_mypy.py
@@ -0,0 +1,9 @@
+from pathlib import Path
+from subprocess import run
+
+topdir = Path(__file__).resolve().parent.parent
+
+def test_mypy():
+    # Check the exit status ourselves for nicer test output on failure
+    result = run(["mypy"], cwd = topdir)
+    assert result.returncode == 0, "mypy exited with errors"