From 803949ced1dddf4a3b37120fa855e8fb5dc7168a Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Thu, 1 Aug 2024 16:13:19 -0700 Subject: [PATCH 1/3] Revert "write_records_to_tsv: Stop quoting output TSV" This reverts commit 915672e9db021588349ede4e23f21003455eb705. Per discussion in #1566, keep CSV-like TSV where quotes may be added or removed, but parsed values should be equivalent. --- augur/io/metadata.py | 2 -- tests/io/test_metadata.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/augur/io/metadata.py b/augur/io/metadata.py index de474162d..95ae7b84b 100644 --- a/augur/io/metadata.py +++ b/augur/io/metadata.py @@ -549,8 +549,6 @@ def write_records_to_tsv(records, output_file): extrasaction='ignore', delimiter='\t', lineterminator='\n', - quoting=csv.QUOTE_NONE, - quotechar=None, ) tsv_writer.writeheader() tsv_writer.writerow(first_record) diff --git a/tests/io/test_metadata.py b/tests/io/test_metadata.py index 6ddd27e92..548db3328 100644 --- a/tests/io/test_metadata.py +++ b/tests/io/test_metadata.py @@ -458,7 +458,7 @@ def output_records(): def expected_output_tsv(): return ( "strain\tcountry\tdate\n" - 'SEQ_A\t"USA"\t2020-10-01\n' + 'SEQ_A\t"""USA"""\t2020-10-01\n' "SEQ_T\tUSA\t2020-10-02\n" ) From afcac54560d74538f1513905674c6b92c2bcbcf5 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Thu, 1 Aug 2024 16:38:21 -0700 Subject: [PATCH 2/3] curate: fix endless additional quotes Resolves the endless addition of quotes when output is passed back through curate. We are expecting the CSV-like double quoting when there are internal quotes. If the field value is already correctly double quoted, then there should not be any additional quotes. 
--- augur/io/metadata.py | 3 ++- .../curate/cram/metadata-output-with-internal-quotes.t | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/augur/io/metadata.py b/augur/io/metadata.py index 95ae7b84b..041829cd8 100644 --- a/augur/io/metadata.py +++ b/augur/io/metadata.py @@ -273,7 +273,8 @@ def visible_worksheet(s: calamine.SheetMetadata) -> bool: # change in a future Python version. raise InvalidDelimiter from error - metadata_reader = csv.DictReader(handle, dialect=dialect) + # Only use the dialect delimiter and keep all other default format params + metadata_reader = csv.DictReader(handle, delimiter=dialect.delimiter) columns, records = metadata_reader.fieldnames, iter(metadata_reader) diff --git a/tests/functional/curate/cram/metadata-output-with-internal-quotes.t b/tests/functional/curate/cram/metadata-output-with-internal-quotes.t index 0c8541798..1e3725f7e 100644 --- a/tests/functional/curate/cram/metadata-output-with-internal-quotes.t +++ b/tests/functional/curate/cram/metadata-output-with-internal-quotes.t @@ -12,7 +12,7 @@ Create NDJSON with internal quotes > ~~ Test passthru with output to TSV. -This should not add any quotes around the field with internal quotes. +This should add double quotes around the internal quotes to match CSV-like quoting. $ cat records.ndjson \ > | ${AUGUR} curate passthru \ @@ -20,10 +20,10 @@ This should not add any quotes around the field with internal quotes. $ cat output-metadata.tsv strain\tsubmitting_lab (esc) - sequence_A\tSRC VB "Vector", Molecular Biology of Genomes (esc) + sequence_A\t"SRC VB ""Vector"", Molecular Biology of Genomes" (esc) Run the output TSV through augur curate passthru again. -The new output should still be identical to the first output. +The new output should still be identical to the first output because it is already double quoted. 
$ ${AUGUR} curate passthru \ > --metadata output-metadata.tsv \ From 3f94f3e4b6738f0cfaf5beba535764e0cc30f766 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 12 Nov 2024 15:20:36 -0800 Subject: [PATCH 3/3] Update changelog --- CHANGES.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 7548d8271..297f92ba2 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,13 @@ ## __NEXT__ +### Features + +- curate: change output metadata to [RFC 4180 CSV-like TSVs][] to match the TSV format output by other Augur subcommands and the Nextstrain ecosystem as discussed in [#1566][]. [#1565][] (@joverlee521) + +[#1565]: https://github.com/nextstrain/augur/pull/1565 +[#1566]: https://github.com/nextstrain/augur/issues/1566 +[RFC 4180 CSV-like TSVs]: https://datatracker.ietf.org/doc/html/rfc4180 ## 26.1.0 (12 November 2024)