Skip to content

Commit

Permalink
Merge pull request #1493 from nextstrain/curate-internal-quotes
Browse files (browse the repository at this point in the history)
Stop quoting TSV outputs from augur curate
  • Loading branch information
joverlee521 authored Jun 27, 2024
2 parents dc97cde + 23b80e5 commit 0adfe3d
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 11 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@
### Bug Fixes

* filter: Improve speed of checking duplicates in metadata, especially for large files. [#1466][] (@victorlin)
* curate: Stop wrapping field values in double quotes in the metadata TSV output when the values contain internal quotes. [#1493][] (@joverlee521)

[#1466]: https://github.com/nextstrain/augur/pull/1466
[#1490]: https://github.com/nextstrain/augur/pull/1490
[#1491]: https://github.com/nextstrain/augur/pull/1491
[#1493]: https://github.com/nextstrain/augur/pull/1493

## 24.4.0 (15 May 2024)

Expand Down
4 changes: 3 additions & 1 deletion augur/io/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,9 @@ def write_records_to_tsv(records, output_file):
output_columns,
extrasaction='ignore',
delimiter='\t',
lineterminator='\n'
lineterminator='\n',
quoting=csv.QUOTE_NONE,
quotechar=None,
)
tsv_writer.writeheader()
tsv_writer.writerow(first_record)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
Setup

$ source "$TESTDIR"/_setup.sh

Testing metadata outputs with internal quotes for the curate command.
Running the `passthru` subcommand since it does not do any data transformations.

Create NDJSON with internal quotes

$ cat >records.ndjson <<~~
> {"strain": "sequence_A", "submitting_lab": "SRC VB \"Vector\", Molecular Biology of Genomes"}
> ~~

Test passthru with output to TSV.
This should not add any quotes around the field with internal quotes.

$ cat records.ndjson \
> | ${AUGUR} curate passthru \
> --output-metadata output-metadata.tsv

$ cat output-metadata.tsv
strain\tsubmitting_lab (esc)
sequence_A\tSRC VB "Vector", Molecular Biology of Genomes (esc)

Run the output TSV through augur curate passthru again.
The new output should still be identical to the first output.

$ ${AUGUR} curate passthru \
> --metadata output-metadata.tsv \
> --output-metadata output-metadata-2.tsv

$ diff -u output-metadata.tsv output-metadata-2.tsv
21 changes: 11 additions & 10 deletions tests/io/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ def expected_record():
return {
'strain': 'SEQ_A',
'date': '2020-10-03',
'country': 'USA'
'country': 'USA',
'lab': 'A Virology Lab "Vector"'
}

@pytest.fixture
Expand All @@ -36,29 +37,29 @@ class TestReadMetadataToDict:
def test_read_table_to_dict_with_csv(self, tmpdir, expected_record):
path = str(tmpdir / 'metadata.csv')
with open(path, 'w') as fh:
fh.write('strain,date,country\n')
fh.write('SEQ_A,2020-10-03,USA\n')
fh.write('strain,date,country,lab\n')
fh.write('SEQ_A,2020-10-03,USA,A Virology Lab "Vector"\n')

record = next(read_table_to_dict(path, (',')))
assert record == expected_record

def test_read_table_to_dict_with_csv_from_stdin(self, mp_context, expected_record):
stdin = StringIO('strain,date,country\nSEQ_A,2020-10-03,USA\n')
stdin = StringIO('strain,date,country,lab\nSEQ_A,2020-10-03,USA,A Virology Lab "Vector"\n')
mp_context.setattr('sys.stdin', stdin)
record = next(read_table_to_dict(sys.stdin, (',')))
assert record == expected_record

def test_read_table_to_dict_with_tsv(self, tmpdir, expected_record):
path = str(tmpdir / 'metadata.tsv')
with open(path, 'w') as fh:
fh.write('strain\tdate\tcountry\n')
fh.write('SEQ_A\t2020-10-03\tUSA\n')
fh.write('strain\tdate\tcountry\tlab\n')
fh.write('SEQ_A\t2020-10-03\tUSA\tA Virology Lab "Vector"\n')

record = next(read_table_to_dict(path, ('\t')))
assert record == expected_record

def test_read_table_to_dict_with_tsv_from_stdin(self, mp_context, expected_record):
stdin = StringIO('strain\tdate\tcountry\nSEQ_A\t2020-10-03\tUSA\n')
stdin = StringIO('strain\tdate\tcountry\tlab\nSEQ_A\t2020-10-03\tUSA\tA Virology Lab "Vector"\n')
mp_context.setattr('sys.stdin', stdin)
record = next(read_table_to_dict(sys.stdin, ('\t')))
assert record == expected_record
Expand Down Expand Up @@ -457,15 +458,15 @@ def test_read_metadata_with_sequences_with_extra_and_dup_warn_both(self, capsys,
@pytest.fixture
def output_records():
return iter([
{"strain": "SEQ_A", "country": "USA", "date": "2020-10-01"},
{"strain": "SEQ_A", "country": "\"USA\"", "date": "2020-10-01"},
{"strain": "SEQ_T", "country": "USA", "date": "2020-10-02"}
])

@pytest.fixture
def expected_output_tsv():
return (
"strain\tcountry\tdate\n"
"SEQ_A\tUSA\t2020-10-01\n"
'SEQ_A\t"USA"\t2020-10-01\n'
"SEQ_T\tUSA\t2020-10-02\n"
)

Expand Down Expand Up @@ -564,7 +565,7 @@ def test_blank_lines(self, tmpdir):
',,\n',
'5,2,3\n',
])

m = Metadata(path, delimiters=',', id_columns=['a'])
assert list(m.rows()) == [
{'a': '1', 'b': '2', 'c': '3'},
Expand Down

0 comments on commit 0adfe3d

Please sign in to comment.