-
Notifications
You must be signed in to change notification settings - Fork 128
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1506 from nextstrain/james/curate-rename
Port `transform-field-names` to `augur curate rename`
- Loading branch information
Showing
7 changed files
with
260 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
""" | ||
Renames fields / columns of the input data | ||
""" | ||
|
||
from typing import Iterable, Literal, Union, List, Tuple | ||
import argparse | ||
from augur.io.print import print_err | ||
from augur.errors import AugurError | ||
|
||
def register_parser(parent_subparsers): | ||
parser = parent_subparsers.add_parser("rename", | ||
parents = [parent_subparsers.shared_parser], | ||
help = __doc__) | ||
|
||
required = parser.add_argument_group(title="REQUIRED") | ||
required.add_argument("--field-map", nargs="+", required=True, | ||
help="Rename fields/columns via '{old_field_name}={new_field_name}'. " + | ||
"If the new field already exists, then the renaming of the old field will be skipped. " + | ||
"Multiple entries with the same '{old_field_name}' will duplicate the field/column. " + | ||
"Skips the field if the old field name is the same as the new field name (case-sensitive).") | ||
|
||
optional = parser.add_argument_group(title="OPTIONAL") | ||
optional.add_argument("--force", action="store_true", | ||
help="Force renaming of old field even if the new field already exists. " + | ||
"Please keep in mind this will overwrite the value of the new field.") | ||
|
||
return parser | ||
|
||
|
||
def parse_field_map(field_map_arg: List[str]) -> List[Tuple[str,str]]: | ||
seen_new = set() # keep track of the new field names | ||
|
||
field_map = [] | ||
for field in field_map_arg: | ||
fields = [n.strip() for n in field.split('=')] | ||
if len(fields)!=2: | ||
raise AugurError(f"The field-map {field!r} must contain a single '=' character.") | ||
old_name, new_name = fields | ||
|
||
# Sanity check the requests to catch typos etc | ||
if not old_name: | ||
raise AugurError(f"The field-map {field!r} doesn't specify a name for the existing field.") | ||
if not new_name: | ||
raise AugurError(f"The field-map {field!r} doesn't specify a name for the new field.") | ||
if new_name in seen_new: | ||
raise AugurError(f"Asked to rename multiple fields to {new_name!r}.") | ||
seen_new.add(new_name) | ||
|
||
field_map.append((old_name, new_name)) | ||
return field_map | ||
|
||
|
||
def transform_columns(existing_fields: List[str], field_map: List[Tuple[str,str]], force: bool) -> List[Tuple[str,str]]: | ||
""" | ||
Calculate the mapping of old column names to new column names | ||
""" | ||
# check that all columns to be renamed exist | ||
for idx,names in enumerate(field_map[:]): | ||
old_name, new_name = names | ||
if old_name not in existing_fields: | ||
print_err(f"WARNING: Asked to rename field {old_name!r} (to {new_name!r}) but it doesn't exist in the input data.") | ||
field_map.pop(idx) | ||
|
||
# iterate through field_map and remove rename requests if they would drop an existing column | ||
# doing this ahead-of-time allows us to preserve the order of fields using a simpler implementation | ||
if not force: | ||
for idx, fields in enumerate(field_map[:]): | ||
old_field, new_field = fields | ||
if new_field in existing_fields and new_field!=old_field: | ||
print_err( | ||
f"WARNING: skipping rename of {old_field} because record", | ||
f"already has a field named {new_field}." | ||
) | ||
field_map.pop(idx) | ||
|
||
names_to_change, new_names = set([f[0] for f in field_map]), set([f[1] for f in field_map]) | ||
|
||
m = [] | ||
for field in existing_fields: | ||
if field in names_to_change: | ||
m += [(field,new_field) for old_field, new_field in field_map if old_field==field] | ||
elif field in new_names: | ||
pass # another column is renamed to this name, so we drop it | ||
else: | ||
m.append((field, field)) # no change to field name | ||
return m | ||
|
||
|
||
def run(args: argparse.Namespace, records: Iterable[dict]) -> Iterable[dict]: | ||
col_map: Union[Literal[False], List[Tuple[str,str]]] = False | ||
for record in records: | ||
if not col_map: # initialise using first record | ||
col_map = transform_columns(list(record.keys()), parse_field_map(args.field_map), args.force) | ||
yield({new_field:record[old_field] for old_field, new_field in col_map}) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
|
||
Setup | ||
|
||
$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}" | ||
|
||
$ cat >records.ndjson <<~~ | ||
> {"strain": "s_1", "country": "c_1", "accession": "a_1"} | ||
> {"strain": "s_2", "country": "c_2", "accession": "a_2"} | ||
> {"strain": "s_3", "country": "c_3", "accession": "a_3"} | ||
> ~~ | ||
|
||
Rename the strain column to accession -- we don't move the strain column to where accession was, | ||
We simply rename it and drop the (old) accession column. The alternate (which I don't like) produces: | ||
{"country": "c_1", "accession": "s_1"} | ||
{"country": "c_2", "accession": "s_2"} | ||
{"country": "c_3", "accession": "s_3"} | ||
|
||
$ $AUGUR curate rename --field-map "strain=accession" --force < <(cat records.ndjson) | ||
{"accession": "s_1", "country": "c_1"} | ||
{"accession": "s_2", "country": "c_2"} | ||
{"accession": "s_3", "country": "c_3"} | ||
|
||
Similarly, renaming accession to strain re-names the column "in-place" and drops the (old) strain column | ||
|
||
$ $AUGUR curate rename --field-map "accession=strain" --force < <(cat records.ndjson) | ||
{"country": "c_1", "strain": "a_1"} | ||
{"country": "c_2", "strain": "a_2"} | ||
{"country": "c_3", "strain": "a_3"} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
Setup | ||
|
||
$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}" | ||
|
||
$ cat >records.ndjson <<~~ | ||
> {"accession": "record_1", "country": "country_1"} | ||
> {"accession": "record_2", "country": "country_2"} | ||
> ~~ | ||
|
||
|
||
Asking to rename the same column multiple times results in duplication of the column. | ||
Additional columns are inserted next to the existing one, and the order of the new columns | ||
matches the field-map | ||
|
||
$ $AUGUR curate rename --field-map "accession=id" "accession=genbank_accession" < <(cat records.ndjson) | ||
{"id": "record_1", "genbank_accession": "record_1", "country": "country_1"} | ||
{"id": "record_2", "genbank_accession": "record_2", "country": "country_2"} | ||
|
||
|
||
We can use the same name to keep the original column | ||
|
||
$ $AUGUR curate rename --field-map "accession=id" "accession=accession" < <(cat records.ndjson) | ||
{"id": "record_1", "accession": "record_1", "country": "country_1"} | ||
{"id": "record_2", "accession": "record_2", "country": "country_2"} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
Setup | ||
|
||
$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}" | ||
|
||
|
||
$ cat >records.ndjson <<~~ | ||
> {"accession": "record_1", "country": ""} | ||
> {"accession": "record_2", "country": "country_2"} | ||
> ~~ | ||
|
||
Test that --force is required if a key exists, irregardless of its associated value | ||
(This was not the behaviour in the precursor command `transform-field-names`) | ||
|
||
$ $AUGUR curate rename --field-map "accession=country" < <(cat records.ndjson) 1> out1.ndjson | ||
WARNING: skipping rename of accession because record already has a field named country. | ||
|
||
$ diff records.ndjson out1.ndjson | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
Setup | ||
|
||
$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}" | ||
|
||
The tests here use NDJSON I/O and don't explicitly test TSV I/O as we rely | ||
on the general curate infrastructure to enforce that each row has the same | ||
fields. See <https://github.com/nextstrain/augur/issues/1510> for more | ||
|
||
$ cat >records.ndjson <<~~ | ||
> {"accession": "record_1", "country": "country_1"} | ||
> {"accession": "record_2", "country": "country_2"} | ||
> ~~ | ||
|
||
The --field-map argument is requried (error text not checked as it includes the entire argparse usage text) | ||
|
||
$ $AUGUR curate rename < <(cat records.ndjson) 2>/dev/null | ||
[2] | ||
|
||
Rename "accession" to "strain" -- the order should be preserved (i.e. strain is first column) | ||
|
||
$ $AUGUR curate rename --field-map "accession=strain" < <(cat records.ndjson) | ||
{"strain": "record_1", "country": "country_1"} | ||
{"strain": "record_2", "country": "country_2"} | ||
|
||
|
||
Rename "accession" to "country" - single error message is reported as we won't overwrite without --force | ||
and we don't change the data | ||
|
||
$ $AUGUR curate rename --field-map "accession=country" < <(cat records.ndjson) > out2.ndjson | ||
WARNING: skipping rename of accession because record already has a field named country. | ||
|
||
$ diff records.ndjson out2.ndjson | ||
|
||
Rename "accession" to "country" using --force | ||
|
||
$ $AUGUR curate rename --field-map "accession=country" --force < <(cat records.ndjson) | ||
{"country": "record_1"} | ||
{"country": "record_2"} | ||
|
||
Asking to rename multiple columns to the same new name is an error! | ||
|
||
$ $AUGUR curate rename --field-map "accession=foo" "country=foo" < <(cat records.ndjson) | ||
ERROR: Asked to rename multiple fields to 'foo'. | ||
[2] | ||
|
||
Asking to rename a non-existant column raises a warning. Using a warning not an error allows the command | ||
to be idempotent. | ||
|
||
$ $AUGUR curate rename --field-map "strain=foo" < <(cat records.ndjson) 1> out3.ndjson | ||
WARNING: Asked to rename field 'strain' (to 'foo') but it doesn't exist in the input data. | ||
|
||
$ diff records.ndjson out3.ndjson | ||
|
||
|
||
Rename will re-order fields to match the first observed record (with any necessary changes applied) | ||
This produces NDJSON output which more closely matches TSV output. | ||
|
||
$ cat >records.unordered.ndjson <<~~ | ||
> {"accession": "record_1", "country": "country_1"} | ||
> {"country": "country_2", "accession": "record_2"} | ||
> ~~ | ||
|
||
$ $AUGUR curate rename --field-map "accession=id" --force < <(cat records.unordered.ndjson) | ||
{"id": "record_1", "country": "country_1"} | ||
{"id": "record_2", "country": "country_2"} | ||
|
||
|
||
Using --field-map without a single '=' char is an error | ||
$ $AUGUR curate rename --field-map "accession" < <(cat records.ndjson) | ||
ERROR: The field-map 'accession' must contain a single '=' character. | ||
[2] | ||
|
||
Using --field-map with more than a single '=' char is an error | ||
$ $AUGUR curate rename --field-map "accession=id=strain" < <(cat records.ndjson) | ||
ERROR: The field-map 'accession=id=strain' must contain a single '=' character. | ||
[2] | ||
|
||
Using --field-map with spaces surrounding field names is OK (as long as you quote the arg appropriately) | ||
$ $AUGUR curate rename --field-map " accession = strain " < <(cat records.ndjson) | ||
{"strain": "record_1", "country": "country_1"} | ||
{"strain": "record_2", "country": "country_2"} | ||
|
||
Using --field-map with an empty "old field" doesn't make sense | ||
$ $AUGUR curate rename --field-map "=accession" < <(cat records.ndjson) | ||
ERROR: The field-map '=accession' doesn't specify a name for the existing field. | ||
[2] | ||
|
||
Using --field-map with an empty "new field" doesn't (yet) make sense | ||
$ $AUGUR curate rename --field-map "accession=" < <(cat records.ndjson) | ||
ERROR: The field-map 'accession=' doesn't specify a name for the new field. | ||
[2] |