From 6f39fcc50defc3dd80b68be27057da7e90d015cf Mon Sep 17 00:00:00 2001 From: Pascal Christoph Date: Fri, 18 Oct 2024 15:15:17 +0200 Subject: [PATCH] Fix escaping by using RFC compliant parser (#496) By configuring the CSVReader with an RFC-compliant parser the escaping is fixed. - update opencsv dependency to version 5.9 - add test --- metafacture-csv/build.gradle | 2 +- .../java/org/metafacture/csv/CsvDecoder.java | 26 +++++++++++++++---- .../org/metafacture/csv/CsvDecoderTest.java | 18 +++++++++++++ 3 files changed, 40 insertions(+), 6 deletions(-) diff --git a/metafacture-csv/build.gradle b/metafacture-csv/build.gradle index b63fe775..ee029ff1 100644 --- a/metafacture-csv/build.gradle +++ b/metafacture-csv/build.gradle @@ -19,7 +19,7 @@ description = 'Modules for processing comma-separated values' dependencies { api project(':metafacture-framework') - implementation 'com.opencsv:opencsv:3.10' + implementation 'com.opencsv:opencsv:5.9' testImplementation "junit:junit:${versions.junit}" testImplementation "org.mockito:mockito-core:${versions.mockito}" } diff --git a/metafacture-csv/src/main/java/org/metafacture/csv/CsvDecoder.java b/metafacture-csv/src/main/java/org/metafacture/csv/CsvDecoder.java index 06bd6a69..45b2ac4b 100644 --- a/metafacture-csv/src/main/java/org/metafacture/csv/CsvDecoder.java +++ b/metafacture-csv/src/main/java/org/metafacture/csv/CsvDecoder.java @@ -1,5 +1,5 @@ /* - * Copyright 2013, 2014 Deutsche Nationalbibliothek + * Copyright 2013-2024 Deutsche Nationalbibliothek and hbz * * Licensed under the Apache License, Version 2.0 the "License"; * you may not use this file except in compliance with the License. @@ -24,6 +24,10 @@ import org.metafacture.framework.helpers.DefaultObjectPipe; import com.opencsv.CSVReader; +import com.opencsv.CSVReaderBuilder; +import com.opencsv.RFC4180Parser; +import com.opencsv.RFC4180ParserBuilder; +import com.opencsv.exceptions.CsvException; import java.io.IOException; import java.io.StringReader; @@ -48,6 +52,7 @@ public final class CsvDecoder extends DefaultObjectPipe private String[] header = new String[0]; private int count; private boolean hasHeader; + private RFC4180Parser parser; /** * Creates an instance of {@link CsvDecoder} with a given separator. @@ -56,6 +61,7 @@ public final class CsvDecoder extends DefaultObjectPipe */ public CsvDecoder(final String separator) { this.separator = separator.charAt(0); + initializeCsvParser(); } /** @@ -65,6 +71,7 @@ public CsvDecoder(final String separator) { */ public CsvDecoder(final char separator) { this.separator = separator; + initializeCsvParser(); } /** @@ -72,6 +79,13 @@ public CsvDecoder(final char separator) { * {@value #DEFAULT_SEP}. */ public CsvDecoder() { + initializeCsvParser(); + } + + private void initializeCsvParser() { + this.parser = new RFC4180ParserBuilder() + .withSeparator(separator) + .build(); } @Override @@ -105,18 +119,19 @@ else if (parts.length == header.length) { } } - private String[] parseCsv(final String string) { + private String[] parseCsv(final String csv) { String[] parts = new String[0]; try { - final CSVReader reader = new CSVReader(new StringReader(string), - separator); + final CSVReader reader = new CSVReaderBuilder(new StringReader(csv)) + .withCSVParser(parser) + .build(); final List lines = reader.readAll(); if (lines.size() > 0) { parts = lines.get(0); } reader.close(); } - catch (final IOException e) { + catch (final IOException | CsvException e) { e.printStackTrace(); } return parts; @@ -139,5 +154,6 @@ public void setHasHeader(final boolean hasHeader) { */ public void setSeparator(final String separator) { this.separator = separator.charAt(0); + initializeCsvParser(); } } diff --git a/metafacture-csv/src/test/java/org/metafacture/csv/CsvDecoderTest.java b/metafacture-csv/src/test/java/org/metafacture/csv/CsvDecoderTest.java index ed095383..4958775c 100644 --- a/metafacture-csv/src/test/java/org/metafacture/csv/CsvDecoderTest.java +++ b/metafacture-csv/src/test/java/org/metafacture/csv/CsvDecoderTest.java @@ -89,4 +89,22 @@ public void testTabSeparated() { ordered.verify(receiver).endRecord(); } + /** + * In: "a","b\t","c\\t","\","\cd\" + * Out: a, b , c\\t, \, \cd\ + */ + @Test + public void issue496_escaping() { + decoder.setHasHeader(false); + decoder.process("\"a\",\"b\t\",\"c\\t\",\"\\\",\"\\cd\\\""); + final InOrder ordered = inOrder(receiver); + ordered.verify(receiver).startRecord("1"); + ordered.verify(receiver).literal("0", "a"); + ordered.verify(receiver).literal("1", "b\t"); + ordered.verify(receiver).literal("2", "c\\t"); + ordered.verify(receiver).literal("3", "\\"); + ordered.verify(receiver).literal("4", "\\cd\\"); + ordered.verify(receiver).endRecord(); + } + }