diff --git a/CHANGES.md b/CHANGES.md index d33443a30..69eaf3769 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -6,7 +6,12 @@ * export, frequencies, refine, traits: Add a new flag `--metadata-id-columns` to customize the possible metadata ID columns. Previously, this was only available in `augur filter`. [#1240][] (@victorlin) +### Bug fixes + +* parse: Fix a bug where `--fix-dates` was always applied, with a default of `--fix-dates=monthfirst`. Now, running without `--fix-dates` will leave dates as-is. [#1247][] (@victorlin) + [#1240]: https://github.com/nextstrain/augur/pull/1240 +[#1247]: https://github.com/nextstrain/augur/issues/1247 ## 22.0.3 (14 June 2023) diff --git a/augur/filter/include_exclude_rules.py b/augur/filter/include_exclude_rules.py index c7c018555..e5b9e09be 100644 --- a/augur/filter/include_exclude_rules.py +++ b/augur/filter/include_exclude_rules.py @@ -680,7 +680,7 @@ def apply_filters(metadata, exclude_by: List[FilterOption], include_by: List[Fil strains_to_keep = set(metadata.index.values) strains_to_filter = [] strains_to_force_include = [] - distinct_strains_to_force_include = set() + distinct_strains_to_force_include: Set = set() # Track strains that should be included regardless of filters. for include_function, include_kwargs in include_by: @@ -713,9 +713,9 @@ def apply_filters(metadata, exclude_by: List[FilterOption], include_by: List[Fil if filter_function is filter_by_query: try: # pandas ≥1.5.0 only - UndefinedVariableError = pd.errors.UndefinedVariableError + UndefinedVariableError = pd.errors.UndefinedVariableError # type: ignore except AttributeError: - UndefinedVariableError = pd.core.computation.ops.UndefinedVariableError + UndefinedVariableError = pd.core.computation.ops.UndefinedVariableError # type: ignore if isinstance(e, UndefinedVariableError): raise AugurError(f"Query contains a column that does not exist in metadata.") from e raise AugurError(f"Error when applying query. Ensure the syntax is valid per .") from e diff --git a/augur/filter/subsample.py b/augur/filter/subsample.py index cd9bcb72c..588d06791 100644 --- a/augur/filter/subsample.py +++ b/augur/filter/subsample.py @@ -475,7 +475,7 @@ def _calculate_fractional_sequences_per_group( 0.4844 """ lo = 1e-5 - hi = target_max_value + hi = float(target_max_value) while (hi / lo) > 1.1: mid = (lo + hi) / 2 diff --git a/augur/io/metadata.py b/augur/io/metadata.py index aedb005f2..ded54ff80 100644 --- a/augur/io/metadata.py +++ b/augur/io/metadata.py @@ -456,10 +456,15 @@ def write_records_to_tsv(records, output_file): def _get_delimiter(path: str, valid_delimiters: Iterable[str]): """Get the delimiter of a file given a list of valid delimiters.""" + + for delimiter in valid_delimiters: + if len(delimiter) != 1: + raise AugurError(f"Delimiters must be single-character strings. {delimiter!r} does not satisfy that condition.") + with open_file(path) as file: try: # Infer the delimiter from the first line. - return csv.Sniffer().sniff(file.readline(), valid_delimiters).delimiter + return csv.Sniffer().sniff(file.readline(), "".join(valid_delimiters)).delimiter except csv.Error as error: # This assumes all csv.Errors imply a delimiter issue. That might # change in a future Python version. diff --git a/augur/io/shell_command_runner.py b/augur/io/shell_command_runner.py index a1538c4de..b6df32f3b 100644 --- a/augur/io/shell_command_runner.py +++ b/augur/io/shell_command_runner.py @@ -8,7 +8,7 @@ from signal import SIGKILL except ImportError: # A non-POSIX platform - SIGKILL = None + SIGKILL = None # type: ignore[assignment] def run_shell_command(cmd, raise_errors=False, extra_env=None): diff --git a/augur/parse.py b/augur/parse.py index 19b2e2d38..86efbf8ac 100644 --- a/augur/parse.py +++ b/augur/parse.py @@ -127,7 +127,7 @@ def parse_sequence(sequence, fields, strain_key="strain", separator="|", prettif etal='lower' if field.startswith('author') else None) # parse dates and convert to a canonical format - if fix_dates and 'date' in metadata: + if fix_dates_format and 'date' in metadata: metadata['date'] = fix_dates( metadata['date'], dayfirst=fix_dates_format=='dayfirst' diff --git a/augur/titer_model.py b/augur/titer_model.py index f92f34858..af3b0aacf 100644 --- a/augur/titer_model.py +++ b/augur/titer_model.py @@ -1,3 +1,6 @@ +# Prevent mypy from erroring on "flu" not being defined. +# mypy: disable-error-code="name-defined" + import os import logging import numpy as np diff --git a/augur/utils.py b/augur/utils.py index 8537d6619..288312936 100644 --- a/augur/utils.py +++ b/augur/utils.py @@ -278,7 +278,7 @@ def available_cpu_cores(fallback: int = 1) -> int: **computer**, if determinable, otherwise the *fallback* number (which defaults to 1). """ - try: + if hasattr(os, "sched_getaffinity"): # Note that this is the correct function to use, not os.cpu_count(), as # described in the latter's documentation. # @@ -290,7 +290,7 @@ def available_cpu_cores(fallback: int = 1) -> int: # naively use all 24, we'd end up with two threads across the 12 cores. # This would degrade performance rather than improve it! return len(os.sched_getaffinity(0)) - except: + else: # cpu_count() returns None if the value is indeterminable. return os.cpu_count() or fallback diff --git a/augur/validate.py b/augur/validate.py index 4860eca3f..3c42c7b8f 100644 --- a/augur/validate.py +++ b/augur/validate.py @@ -137,7 +137,7 @@ def format_path(path: Iterable[Union[str, int]]) -> str: '.x.y[42].z' """ def valid_identifier(x) -> bool: - return isinstance(x, str) and re.search(r'^[a-zA-Z$_][a-zA-Z0-9_$]*$', x) + return isinstance(x, str) and re.search(r'^[a-zA-Z$_][a-zA-Z0-9_$]*$', x) is not None def fmt(x) -> str: return (f"[{x}]" if isinstance(x, int) else diff --git a/docs/contribute/DEV_DOCS.md b/docs/contribute/DEV_DOCS.md index 8600ec3b4..e3d0223aa 100644 --- a/docs/contribute/DEV_DOCS.md +++ b/docs/contribute/DEV_DOCS.md @@ -140,6 +140,19 @@ Troubleshooting tip: As tests run on the development code in the augur repositor We use continuous integration with GitHub Actions to run tests on every pull request submitted to the project. We use [codecov](https://codecov.io/) to automatically produce test coverage for new contributions and the project as a whole. +### Type annotations + +Our goal is to gradually add [type annotations][] to our code so that we can catch errors earlier and be explicit about the interfaces expected and provided. Annotation pairs well with the functional approach taken by the package. + +During development you can run static type checks using [mypy][]: + + $ mypy + # No output is good! + +There are also many [editor integrations for mypy][]. + +[editor integrations for mypy]: https://github.com/python/mypy#integrations + ### Releasing Versions for this project, Augur, from 3.0.0 onwards aim to follow the diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 000000000..1ada82fd7 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,49 @@ +[mypy] +# Don't set python_version. Instead, use the default behavior of checking for +# compatibility against the version of Python used to run mypy. +files = augur/ + +# Require functions with an annotated return type to be explicit about +# potentially returning None (via Optional[…]). +strict_optional = False + +# In the future maybe we can contribute typing stubs for these modules (either +# complete stubs in the python/typeshed repo or partial stubs just in +# this repo), but for now that's more work than we want to invest. These +# sections let us ignore missing stubs for specific modules without hiding all +# missing errors like (--ignore-missing-imports). +[mypy-treetime.*] +ignore_missing_imports = True + +[mypy-isodate.*] +ignore_missing_imports = True + +[mypy-Bio.*] +ignore_missing_imports = True + +[mypy-BCBio.*] +ignore_missing_imports = True + +[mypy-matplotlib.*] +ignore_missing_imports = True + +[mypy-seaborn.*] +ignore_missing_imports = True + +[mypy-cvxopt.*] +ignore_missing_imports = True + +[mypy-ipdb.*] +ignore_missing_imports = True + +[mypy-jsonschema.*] +ignore_missing_imports = True + +[mypy-pyfastx.*] +ignore_missing_imports = True + +[mypy-networkx.*] +ignore_missing_imports = True + +[mypy-scipy.*] +ignore_missing_imports = True diff --git a/setup.py b/setup.py index 631eefcd2..5a80e9d1d 100644 --- a/setup.py +++ b/setup.py @@ -70,7 +70,9 @@ "cram >=0.7", "deepdiff >=4.3.2", "freezegun >=0.3.15", + "mypy", "nextstrain-sphinx-theme >=2022.5", + "pandas-stubs >=1.0.0, ==1.*", "pylint >=1.7.6", "pytest >=5.4.1", "pytest-cov >=2.8.1", @@ -83,6 +85,8 @@ "sphinx-markdown-tables >= 0.0.9", "sphinx-rtd-theme >=0.4.3", "sphinx-autodoc-typehints >=1.21.4", + "types-jsonschema >=3.0.0, ==3.*", + "types-setuptools", "wheel >=0.32.3", "ipdb >=0.10.1" ] diff --git a/tests/functional/parse.t b/tests/functional/parse.t index e76e11421..0f4a1ac89 100644 --- a/tests/functional/parse.t +++ b/tests/functional/parse.t @@ -66,4 +66,21 @@ Error on the first duplicate. ERROR: Duplicate found for 'SEQ1'. [2] +Run without --fix-dates. The date is left unchanged. + + $ cat >$TMP/data.fasta <<~~ + > >SEQ1|05/01/2020 + > AAA + > ~~ + $ ${AUGUR} parse \ + > --sequences $TMP/data.fasta \ + > --output-sequences "$TMP/sequences.fasta" \ + > --output-metadata "$TMP/metadata.tsv" \ + > --fields strain date + + $ cat "$TMP/metadata.tsv" + strain date + SEQ1 05/01/2020 + $ rm -f "$TMP/metadata.tsv" + $ popd > /dev/null diff --git a/tests/test_mypy.py b/tests/test_mypy.py new file mode 100644 index 000000000..1c7af7cbc --- /dev/null +++ b/tests/test_mypy.py @@ -0,0 +1,9 @@ +from pathlib import Path +from subprocess import run + +topdir = Path(__file__).resolve().parent.parent + +def test_mypy(): + # Check the exit status ourselves for nicer test output on failure + result = run(["mypy"], cwd = topdir) + assert result.returncode == 0, "mypy exited with errors"