From f0bd23e354ed5d0067d7e25543d059a3c0514a7b Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Wed, 26 Jun 2024 15:13:39 -0700 Subject: [PATCH 1/4] test_metadata: show current behavior with internal quotes Shows that the undesired behavior with internal quotes (described in the linked issue) comes from the `write_records_to_tsv` function. --- tests/io/test_metadata.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tests/io/test_metadata.py b/tests/io/test_metadata.py index 28ff8e12a..301e841e2 100644 --- a/tests/io/test_metadata.py +++ b/tests/io/test_metadata.py @@ -13,7 +13,8 @@ def expected_record(): return { 'strain': 'SEQ_A', 'date': '2020-10-03', - 'country': 'USA' + 'country': 'USA', + 'lab': 'A Virology Lab "Vector"' } @pytest.fixture @@ -36,14 +37,14 @@ class TestReadMetadataToDict: def test_read_table_to_dict_with_csv(self, tmpdir, expected_record): path = str(tmpdir / 'metadata.csv') with open(path, 'w') as fh: - fh.write('strain,date,country\n') - fh.write('SEQ_A,2020-10-03,USA\n') + fh.write('strain,date,country,lab\n') + fh.write('SEQ_A,2020-10-03,USA,A Virology Lab "Vector"\n') record = next(read_table_to_dict(path, (','))) assert record == expected_record def test_read_table_to_dict_with_csv_from_stdin(self, mp_context, expected_record): - stdin = StringIO('strain,date,country\nSEQ_A,2020-10-03,USA\n') + stdin = StringIO('strain,date,country,lab\nSEQ_A,2020-10-03,USA,A Virology Lab "Vector"\n') mp_context.setattr('sys.stdin', stdin) record = next(read_table_to_dict(sys.stdin, (','))) assert record == expected_record @@ -51,14 +52,14 @@ def test_read_table_to_dict_with_csv_from_stdin(self, mp_context, expected_recor def test_read_table_to_dict_with_tsv(self, tmpdir, expected_record): path = str(tmpdir / 'metadata.tsv') with open(path, 'w') as fh: - fh.write('strain\tdate\tcountry\n') - fh.write('SEQ_A\t2020-10-03\tUSA\n') + fh.write('strain\tdate\tcountry\tlab\n') + fh.write('SEQ_A\t2020-10-03\tUSA\tA Virology Lab "Vector"\n') record 
= next(read_table_to_dict(path, ('\t'))) assert record == expected_record def test_read_table_to_dict_with_tsv_from_stdin(self, mp_context, expected_record): - stdin = StringIO('strain\tdate\tcountry\nSEQ_A\t2020-10-03\tUSA\n') + stdin = StringIO('strain\tdate\tcountry\tlab\nSEQ_A\t2020-10-03\tUSA\tA Virology Lab "Vector"\n') mp_context.setattr('sys.stdin', stdin) record = next(read_table_to_dict(sys.stdin, ('\t'))) assert record == expected_record @@ -457,7 +458,7 @@ def test_read_metadata_with_sequences_with_extra_and_dup_warn_both(self, capsys, @pytest.fixture def output_records(): return iter([ - {"strain": "SEQ_A", "country": "USA", "date": "2020-10-01"}, + {"strain": "SEQ_A", "country": "\"USA\"", "date": "2020-10-01"}, {"strain": "SEQ_T", "country": "USA", "date": "2020-10-02"} ]) @@ -465,7 +466,7 @@ def output_records(): def expected_output_tsv(): return ( "strain\tcountry\tdate\n" - "SEQ_A\tUSA\t2020-10-01\n" + 'SEQ_A\t"""USA"""\t2020-10-01\n' "SEQ_T\tUSA\t2020-10-02\n" ) @@ -564,7 +565,7 @@ def test_blank_lines(self, tmpdir): ',,\n', '5,2,3\n', ]) - + m = Metadata(path, delimiters=',', id_columns=['a']) assert list(m.rows()) == [ {'a': '1', 'b': '2', 'c': '3'}, From 915672e9db021588349ede4e23f21003455eb705 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Wed, 26 Jun 2024 16:47:19 -0700 Subject: [PATCH 2/4] write_records_to_tsv: Stop quoting output TSV MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to TSV specs,¹ the only restriction on special characters is that tabs are not allowed within a field. This is different from the CSV specs,² which require double quotes around fields that contain special characters. Since this function only produces TSVs, follow the TSV specs and stop adding quotes. 
Resolves ¹ ² --- augur/io/metadata.py | 4 +++- tests/io/test_metadata.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/augur/io/metadata.py b/augur/io/metadata.py index 32747eceb..bcaef5b79 100644 --- a/augur/io/metadata.py +++ b/augur/io/metadata.py @@ -483,7 +483,9 @@ def write_records_to_tsv(records, output_file): output_columns, extrasaction='ignore', delimiter='\t', - lineterminator='\n' + lineterminator='\n', + quoting=csv.QUOTE_NONE, + quotechar=None, ) tsv_writer.writeheader() tsv_writer.writerow(first_record) diff --git a/tests/io/test_metadata.py b/tests/io/test_metadata.py index 301e841e2..45ec13f10 100644 --- a/tests/io/test_metadata.py +++ b/tests/io/test_metadata.py @@ -466,7 +466,7 @@ def output_records(): def expected_output_tsv(): return ( "strain\tcountry\tdate\n" - 'SEQ_A\t"""USA"""\t2020-10-01\n' + 'SEQ_A\t"USA"\t2020-10-01\n' "SEQ_T\tUSA\t2020-10-02\n" ) From d02b11b4b4900113adc7b7bf575d3a994989e0be Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Wed, 26 Jun 2024 14:58:11 -0700 Subject: [PATCH 3/4] curate: Add test for metadata output with internal quotes --- .../metadata-output-with-internal-quotes.t | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 tests/functional/curate/cram/metadata-output-with-internal-quotes.t diff --git a/tests/functional/curate/cram/metadata-output-with-internal-quotes.t b/tests/functional/curate/cram/metadata-output-with-internal-quotes.t new file mode 100644 index 000000000..0c8541798 --- /dev/null +++ b/tests/functional/curate/cram/metadata-output-with-internal-quotes.t @@ -0,0 +1,32 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +Testing metadata outputs with internal quotes for the curate command. +Running the `passthru` subcommand since it does not do any data transformations. 
+ +Create NDJSON with internal quotes + + $ cat >records.ndjson <<~~ + > {"strain": "sequence_A", "submitting_lab": "SRC VB \"Vector\", Molecular Biology of Genomes"} + > ~~ + +Test passthru with output to TSV. +This should not add any quotes around the field with internal quotes. + + $ cat records.ndjson \ + > | ${AUGUR} curate passthru \ + > --output-metadata output-metadata.tsv + + $ cat output-metadata.tsv + strain\tsubmitting_lab (esc) + sequence_A\tSRC VB "Vector", Molecular Biology of Genomes (esc) + +Run the output TSV through augur curate passthru again. +The new output should still be identical to the first output. + + $ ${AUGUR} curate passthru \ + > --metadata output-metadata.tsv \ + > --output-metadata output-metadata-2.tsv + + $ diff -u output-metadata.tsv output-metadata-2.tsv From 23b80e5fad731c0ed1b00bc38e6b34ab168baa7c Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Thu, 27 Jun 2024 13:33:55 -0700 Subject: [PATCH 4/4] Update changelog --- CHANGES.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 1fabd06d8..cd0088f0e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -10,10 +10,12 @@ ### Bug Fixes * filter: Improve speed of checking duplicates in metadata, especially for large files. [#1466][] (@victorlin) +* curate: Stop adding double quotes to the metadata TSV output when field values have internal quotes. [#1493][] (@joverlee521) [#1466]: https://github.com/nextstrain/augur/pull/1466 [#1490]: https://github.com/nextstrain/augur/pull/1490 [#1491]: https://github.com/nextstrain/augur/pull/1491 +[#1493]: https://github.com/nextstrain/augur/pull/1493 ## 24.4.0 (15 May 2024)