diff --git a/CHANGES.md b/CHANGES.md index 1fabd06d8..cd0088f0e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -10,10 +10,12 @@ ### Bug Fixes * filter: Improve speed of checking duplicates in metadata, especially for large files. [#1466][] (@victorlin) +* curate: Stop adding double quotes to the metadata TSV output when field values have internal quotes. [#1493][] (@joverlee521) [#1466]: https://github.com/nextstrain/augur/pull/1466 [#1490]: https://github.com/nextstrain/augur/pull/1490 [#1491]: https://github.com/nextstrain/augur/pull/1491 +[#1493]: https://github.com/nextstrain/augur/pull/1493 ## 24.4.0 (15 May 2024) diff --git a/augur/io/metadata.py b/augur/io/metadata.py index 32747eceb..bcaef5b79 100644 --- a/augur/io/metadata.py +++ b/augur/io/metadata.py @@ -483,7 +483,9 @@ def write_records_to_tsv(records, output_file): output_columns, extrasaction='ignore', delimiter='\t', - lineterminator='\n' + lineterminator='\n', + quoting=csv.QUOTE_NONE, + quotechar=None, ) tsv_writer.writeheader() tsv_writer.writerow(first_record) diff --git a/tests/functional/curate/cram/metadata-output-with-internal-quotes.t b/tests/functional/curate/cram/metadata-output-with-internal-quotes.t new file mode 100644 index 000000000..0c8541798 --- /dev/null +++ b/tests/functional/curate/cram/metadata-output-with-internal-quotes.t @@ -0,0 +1,32 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +Testing metadata outputs with internal quotes for the curate command. +Running the `passthru` subcommand since it does not do any data transformations. + +Create NDJSON with internal quotes + + $ cat >records.ndjson <<~~ + > {"strain": "sequence_A", "submitting_lab": "SRC VB \"Vector\", Molecular Biology of Genomes"} + > ~~ + +Test passthru with output to TSV. +This should not add any quotes around the field with internal quotes. + + $ cat records.ndjson \ + > | ${AUGUR} curate passthru \ + > --output-metadata output-metadata.tsv + + $ cat output-metadata.tsv + strain\tsubmitting_lab (esc) + sequence_A\tSRC VB "Vector", Molecular Biology of Genomes (esc) + +Run the output TSV through augur curate passthru again. +The new output should still be identical to the first output. + + $ ${AUGUR} curate passthru \ + > --metadata output-metadata.tsv \ + > --output-metadata output-metadata-2.tsv + + $ diff -u output-metadata.tsv output-metadata-2.tsv diff --git a/tests/io/test_metadata.py b/tests/io/test_metadata.py index 28ff8e12a..45ec13f10 100644 --- a/tests/io/test_metadata.py +++ b/tests/io/test_metadata.py @@ -13,7 +13,8 @@ def expected_record(): return { 'strain': 'SEQ_A', 'date': '2020-10-03', - 'country': 'USA' + 'country': 'USA', + 'lab': 'A Virology Lab "Vector"' } @pytest.fixture @@ -36,14 +37,14 @@ class TestReadMetadataToDict: def test_read_table_to_dict_with_csv(self, tmpdir, expected_record): path = str(tmpdir / 'metadata.csv') with open(path, 'w') as fh: - fh.write('strain,date,country\n') - fh.write('SEQ_A,2020-10-03,USA\n') + fh.write('strain,date,country,lab\n') + fh.write('SEQ_A,2020-10-03,USA,A Virology Lab "Vector"\n') record = next(read_table_to_dict(path, (','))) assert record == expected_record def test_read_table_to_dict_with_csv_from_stdin(self, mp_context, expected_record): - stdin = StringIO('strain,date,country\nSEQ_A,2020-10-03,USA\n') + stdin = StringIO('strain,date,country,lab\nSEQ_A,2020-10-03,USA,A Virology Lab "Vector"\n') mp_context.setattr('sys.stdin', stdin) record = next(read_table_to_dict(sys.stdin, (','))) assert record == expected_record @@ -51,14 +52,14 @@ def test_read_table_to_dict_with_csv_from_stdin(self, mp_context, expected_recor def test_read_table_to_dict_with_tsv(self, tmpdir, expected_record): path = str(tmpdir / 'metadata.tsv') with open(path, 'w') as fh: - fh.write('strain\tdate\tcountry\n') - fh.write('SEQ_A\t2020-10-03\tUSA\n') + fh.write('strain\tdate\tcountry\tlab\n') + fh.write('SEQ_A\t2020-10-03\tUSA\tA Virology Lab "Vector"\n') record = next(read_table_to_dict(path, ('\t'))) assert record == expected_record def test_read_table_to_dict_with_tsv_from_stdin(self, mp_context, expected_record): - stdin = StringIO('strain\tdate\tcountry\nSEQ_A\t2020-10-03\tUSA\n') + stdin = StringIO('strain\tdate\tcountry\tlab\nSEQ_A\t2020-10-03\tUSA\tA Virology Lab "Vector"\n') mp_context.setattr('sys.stdin', stdin) record = next(read_table_to_dict(sys.stdin, ('\t'))) assert record == expected_record @@ -457,7 +458,7 @@ def test_read_metadata_with_sequences_with_extra_and_dup_warn_both(self, capsys, @pytest.fixture def output_records(): return iter([ - {"strain": "SEQ_A", "country": "USA", "date": "2020-10-01"}, + {"strain": "SEQ_A", "country": "\"USA\"", "date": "2020-10-01"}, {"strain": "SEQ_T", "country": "USA", "date": "2020-10-02"} ]) @@ -465,7 +466,7 @@ def output_records(): def expected_output_tsv(): return ( "strain\tcountry\tdate\n" - "SEQ_A\tUSA\t2020-10-01\n" + 'SEQ_A\t"USA"\t2020-10-01\n' "SEQ_T\tUSA\t2020-10-02\n" ) @@ -564,7 +565,7 @@ def test_blank_lines(self, tmpdir): ',,\n', '5,2,3\n', ]) - + m = Metadata(path, delimiters=',', id_columns=['a']) assert list(m.rows()) == [ {'a': '1', 'b': '2', 'c': '3'},