From 348613b8062331847e4a671ecf74832321ef4594 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Tue, 5 Mar 2024 15:48:38 -0800 Subject: [PATCH] Clarify behavior for options that use strain ID Add reference to --metadata-id-columns which affects the behavior of these options. --- augur/filter/__init__.py | 16 +++++++---- tests/functional/filter/cram/filter-help.t | 32 ++++++++++++---------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/augur/filter/__init__.py b/augur/filter/__init__.py index 903d7899f..3b7ef9089 100644 --- a/augur/filter/__init__.py +++ b/augur/filter/__init__.py @@ -66,7 +66,8 @@ def register_arguments(parser): "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").""") metadata_filter_group.add_argument('--exclude', type=str, nargs="+", metavar="FILE", default=argparse.SUPPRESS, - help="File(s) with list of strains to exclude.") + help="""File(s) with list of strain IDs to exclude. The ID column is + determined by --metadata-id-columns.""") metadata_filter_group.add_argument('--exclude-where', nargs='+', metavar="CONDITION", default=argparse.SUPPRESS, help="""Exclude strains matching these conditions. Ex: \"host=rat\" or \"host!=rat\". Multiple values are processed as OR (matching any of @@ -75,8 +76,9 @@ def register_arguments(parser): help="""Exclude all strains by default. Use this with the include arguments to select a specific subset of strains.""") metadata_filter_group.add_argument('--include', type=str, nargs="+", metavar="FILE", default=argparse.SUPPRESS, - help="""File(s) with list of strains to include regardless of - priorities, subsampling, or absence of an entry in --sequences.""") + help="""File(s) with list of strain IDs to include regardless of + priorities, subsampling, or absence of an entry in --sequences. 
The + ID column is determined by --metadata-id-columns.""") metadata_filter_group.add_argument('--include-where', nargs='+', metavar="CONDITION", default=argparse.SUPPRESS, help="""Include strains with these values. ex: host=rat. Multiple values are processed as OR (having any of those specified will be @@ -128,13 +130,14 @@ def register_arguments(parser): subsample_group.add_argument('--priority', type=str, metavar="FILE", default=argparse.SUPPRESS, help="""Tab-delimited file with list of priority scores for strains - (e.g., "<strain>\\t<priority>") and no header. When scores are + (e.g., "<strain ID>\\t<priority>") and no header. When scores are provided, Augur converts scores to floating point values, sorts strains within each subsampling group from highest to lowest priority, and selects the top N strains per group where N is the calculated or requested number of strains per group. Higher numbers indicate higher priority. Since priorities represent relative - values between strains, these values can be arbitrary.""") + values between strains, these values can be arbitrary. The ID + column is determined by --metadata-id-columns.""") subsample_group.add_argument('--subsample-seed', type=int, metavar="N", default=argparse.SUPPRESS, help="""Random number generator seed to allow reproducible subsampling (with same input data).""") @@ -149,7 +152,8 @@ def register_arguments(parser): output_group.add_argument('--output-metadata', metavar="FILE", default=argparse.SUPPRESS, help="Metadata for strains that passed filters.") output_group.add_argument('--output-strains', metavar="FILE", default=argparse.SUPPRESS, - help="List of strains that passed filters (no header).") + help="""List of strain IDs that passed filters (no header). The ID + column is determined by --metadata-id-columns.""") output_group.add_argument('--output-log', metavar="FILE", default=argparse.SUPPRESS, help="""Tab-delimited file with one row for each filtered strain and the reason it was filtered. 
Keyword arguments used for a given filter diff --git a/tests/functional/filter/cram/filter-help.t b/tests/functional/filter/cram/filter-help.t index 663a6301d..8042a5e19 100644 --- a/tests/functional/filter/cram/filter-help.t +++ b/tests/functional/filter/cram/filter-help.t @@ -92,7 +92,8 @@ Show help text "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01"). --exclude FILE [FILE ...] - File(s) with list of strains to exclude. + File(s) with list of strain IDs to exclude. The ID + column is determined by --metadata-id-columns. --exclude-where CONDITION [CONDITION ...] Exclude strains matching these conditions. Ex: "host=rat" or "host!=rat". Multiple values are @@ -102,9 +103,10 @@ Show help text include arguments to select a specific subset of strains. --include FILE [FILE ...] - File(s) with list of strains to include regardless of - priorities, subsampling, or absence of an entry in - --sequences. + File(s) with list of strain IDs to include regardless + of priorities, subsampling, or absence of an entry in + --sequences. The ID column is determined by + --metadata-id-columns. --include-where CONDITION [CONDITION ...] Include strains with these values. ex: host=rat. Multiple values are processed as OR (having any of @@ -151,15 +153,16 @@ Show help text max-sequences` is provided. (default: True) --no-probabilistic-sampling --priority FILE Tab-delimited file with list of priority scores for - strains (e.g., "<strain>\t<priority>") and no header. - When scores are provided, Augur converts scores to - floating point values, sorts strains within each - subsampling group from highest to lowest priority, and - selects the top N strains per group where N is the - calculated or requested number of strains per group. - Higher numbers indicate higher priority. Since - priorities represent relative values between strains, - these values can be arbitrary. + strains (e.g., "<strain ID>\t<priority>") and no + header. 
When scores are provided, Augur converts + scores to floating point values, sorts strains within + each subsampling group from highest to lowest + priority, and selects the top N strains per group + where N is the calculated or requested number of + strains per group. Higher numbers indicate higher + priority. Since priorities represent relative values + between strains, these values can be arbitrary. The ID + column is determined by --metadata-id-columns. --subsample-seed N Random number generator seed to allow reproducible subsampling (with same input data). @@ -173,7 +176,8 @@ Show help text --output-metadata FILE Metadata for strains that passed filters. --output-strains FILE - List of strains that passed filters (no header). + List of strain IDs that passed filters (no header). + The ID column is determined by --metadata-id-columns. --output-log FILE Tab-delimited file with one row for each filtered strain and the reason it was filtered. Keyword arguments used for a given filter are reported in JSON