Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stop quoting TSV outputs from augur curate #1493

Merged
merged 4 commits into from
Jun 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@
### Bug Fixes

* filter: Improve speed of checking duplicates in metadata, especially for large files. [#1466][] (@victorlin)
* curate: Stop adding double quotes to the metadata TSV output when field values have internal quotes. [#1493][] (@joverlee521)

[#1466]: https://github.com/nextstrain/augur/pull/1466
[#1490]: https://github.com/nextstrain/augur/pull/1490
[#1491]: https://github.com/nextstrain/augur/pull/1491
[#1493]: https://github.com/nextstrain/augur/pull/1493

## 24.4.0 (15 May 2024)

Expand Down
4 changes: 3 additions & 1 deletion augur/io/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,9 @@ def write_records_to_tsv(records, output_file):
output_columns,
extrasaction='ignore',
delimiter='\t',
lineterminator='\n'
lineterminator='\n',
quoting=csv.QUOTE_NONE,
quotechar=None,
)
tsv_writer.writeheader()
tsv_writer.writerow(first_record)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
Setup

$ source "$TESTDIR"/_setup.sh

Test that the curate command preserves internal quotes in metadata outputs.
Use the `passthru` subcommand because it does not do any data transformations.

Create NDJSON with internal quotes

$ cat >records.ndjson <<~~
> {"strain": "sequence_A", "submitting_lab": "SRC VB \"Vector\", Molecular Biology of Genomes"}
> ~~

Test passthru with output to TSV.
This should not add any quotes around the field with internal quotes.

$ cat records.ndjson \
> | ${AUGUR} curate passthru \
> --output-metadata output-metadata.tsv

$ cat output-metadata.tsv
strain\tsubmitting_lab (esc)
sequence_A\tSRC VB "Vector", Molecular Biology of Genomes (esc)

Run the output TSV through augur curate passthru again.
The second output should be identical to the first, showing that the TSV round-trips unchanged.

$ ${AUGUR} curate passthru \
> --metadata output-metadata.tsv \
> --output-metadata output-metadata-2.tsv

$ diff -u output-metadata.tsv output-metadata-2.tsv
21 changes: 11 additions & 10 deletions tests/io/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ def expected_record():
return {
'strain': 'SEQ_A',
'date': '2020-10-03',
'country': 'USA'
'country': 'USA',
'lab': 'A Virology Lab "Vector"'
}

@pytest.fixture
Expand All @@ -36,29 +37,29 @@ class TestReadMetadataToDict:
def test_read_table_to_dict_with_csv(self, tmpdir, expected_record):
path = str(tmpdir / 'metadata.csv')
with open(path, 'w') as fh:
fh.write('strain,date,country\n')
fh.write('SEQ_A,2020-10-03,USA\n')
fh.write('strain,date,country,lab\n')
fh.write('SEQ_A,2020-10-03,USA,A Virology Lab "Vector"\n')

record = next(read_table_to_dict(path, (',')))
assert record == expected_record

def test_read_table_to_dict_with_csv_from_stdin(self, mp_context, expected_record):
stdin = StringIO('strain,date,country\nSEQ_A,2020-10-03,USA\n')
stdin = StringIO('strain,date,country,lab\nSEQ_A,2020-10-03,USA,A Virology Lab "Vector"\n')
mp_context.setattr('sys.stdin', stdin)
record = next(read_table_to_dict(sys.stdin, (',')))
assert record == expected_record

def test_read_table_to_dict_with_tsv(self, tmpdir, expected_record):
path = str(tmpdir / 'metadata.tsv')
with open(path, 'w') as fh:
fh.write('strain\tdate\tcountry\n')
fh.write('SEQ_A\t2020-10-03\tUSA\n')
fh.write('strain\tdate\tcountry\tlab\n')
fh.write('SEQ_A\t2020-10-03\tUSA\tA Virology Lab "Vector"\n')

record = next(read_table_to_dict(path, ('\t')))
assert record == expected_record

def test_read_table_to_dict_with_tsv_from_stdin(self, mp_context, expected_record):
stdin = StringIO('strain\tdate\tcountry\nSEQ_A\t2020-10-03\tUSA\n')
stdin = StringIO('strain\tdate\tcountry\tlab\nSEQ_A\t2020-10-03\tUSA\tA Virology Lab "Vector"\n')
mp_context.setattr('sys.stdin', stdin)
record = next(read_table_to_dict(sys.stdin, ('\t')))
assert record == expected_record
Expand Down Expand Up @@ -457,15 +458,15 @@ def test_read_metadata_with_sequences_with_extra_and_dup_warn_both(self, capsys,
@pytest.fixture
def output_records():
return iter([
{"strain": "SEQ_A", "country": "USA", "date": "2020-10-01"},
{"strain": "SEQ_A", "country": "\"USA\"", "date": "2020-10-01"},
{"strain": "SEQ_T", "country": "USA", "date": "2020-10-02"}
])

@pytest.fixture
def expected_output_tsv():
return (
"strain\tcountry\tdate\n"
"SEQ_A\tUSA\t2020-10-01\n"
'SEQ_A\t"USA"\t2020-10-01\n'
"SEQ_T\tUSA\t2020-10-02\n"
)

Expand Down Expand Up @@ -564,7 +565,7 @@ def test_blank_lines(self, tmpdir):
',,\n',
'5,2,3\n',
])

m = Metadata(path, delimiters=',', id_columns=['a'])
assert list(m.rows()) == [
{'a': '1', 'b': '2', 'c': '3'},
Expand Down
Loading