From f7fe185602b9f9ebe61b17037d13d9ba2fd225de Mon Sep 17 00:00:00 2001 From: Nicolas Laval Date: Mon, 11 Mar 2024 12:48:49 +0100 Subject: [PATCH] Init loadCSV improvements --- pom.xml | 8 ++++++ .../fr/insee/trevas/jupyter/SparkUtils.java | 19 +++++++++---- .../java/fr/insee/trevas/jupyter/Utils.java | 26 +++++++++++++++++ src/test/java/CSVTest.java | 28 +++++++++++++++++++ src/test/resources/ds1.csv | 6 ++++ src/test/resources/ds2.csv | 6 ++++ 6 files changed, 88 insertions(+), 5 deletions(-) create mode 100644 src/main/java/fr/insee/trevas/jupyter/Utils.java create mode 100644 src/test/java/CSVTest.java create mode 100644 src/test/resources/ds1.csv create mode 100644 src/test/resources/ds2.csv diff --git a/pom.xml b/pom.xml index 8d2f25b..ecf14cc 100644 --- a/pom.xml +++ b/pom.xml @@ -251,6 +251,14 @@ + + org.apache.maven.plugins + maven-surefire-plugin + 3.1.2 + + --add-exports java.base/sun.nio.ch=ALL-UNNAMED + + diff --git a/src/main/java/fr/insee/trevas/jupyter/SparkUtils.java b/src/main/java/fr/insee/trevas/jupyter/SparkUtils.java index 3761bb1..1206d63 100644 --- a/src/main/java/fr/insee/trevas/jupyter/SparkUtils.java +++ b/src/main/java/fr/insee/trevas/jupyter/SparkUtils.java @@ -3,16 +3,18 @@ import fr.insee.vtl.engine.VtlScriptEngine; import fr.insee.vtl.spark.SparkDataset; -import java.nio.file.Files; -import java.nio.file.Path; -import javax.script.ScriptEngine; -import javax.script.ScriptEngineManager; import org.apache.spark.SparkConf; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; +import javax.script.ScriptEngine; +import javax.script.ScriptEngineManager; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Map; + public class SparkUtils { public static VtlScriptEngine buildSparkEngine(SparkSession spark) { @@ -70,8 +72,15 @@ public static SparkDataset readSasDataset(SparkSession spark, String path) throw public static SparkDataset readCSVDataset(SparkSession spark, String path) throws Exception { Dataset dataset; + + String url = Utils.getURL(path); + Map options = Utils.getQueryMap(path); + try { - dataset = spark.read().option("sep", ";").option("header", "true").csv(path); + dataset = spark.read().option("sep", ";") + .option("header", "true") + .options(options) + .csv(url); } catch (Exception e) { throw new Exception(e); } diff --git a/src/main/java/fr/insee/trevas/jupyter/Utils.java b/src/main/java/fr/insee/trevas/jupyter/Utils.java new file mode 100644 index 0000000..90a7d60 --- /dev/null +++ b/src/main/java/fr/insee/trevas/jupyter/Utils.java @@ -0,0 +1,26 @@ +package fr.insee.trevas.jupyter; + +import java.util.HashMap; +import java.util.Map; + +public class Utils { + + public static String getURL(String path) { + return path.split("\\?")[0]; + } + + public static Map getQueryMap(String path) { + if (path == null || !path.contains("\\?")) { + return Map.of(); + } + String[] params = path.split("\\?")[1].split("&"); + Map map = new HashMap<>(); + + for (String param : params) { + String name = param.split("=")[0]; + String value = param.split("=")[1]; + map.put(name, value); + } + return map; + } +} diff --git a/src/test/java/CSVTest.java b/src/test/java/CSVTest.java new file mode 100644 index 0000000..83e3892 --- /dev/null +++ b/src/test/java/CSVTest.java @@ -0,0 +1,28 @@ +import fr.insee.trevas.jupyter.SparkUtils; +import fr.insee.vtl.model.Dataset; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +public class CSVTest { + + private SparkSession spark; + + @BeforeEach + public void setUp() { + spark = SparkSession.builder() + .appName("test") + .master("local") + .getOrCreate(); + } + + @Test + public void readCSVDatasetTest() throws Exception { + Dataset ds1 = SparkUtils.readCSVDataset(spark, "src/test/resources/ds1.csv"); + assertThat(ds1.getDataPoints().get(1).get("name")).isEqualTo("B"); + Dataset ds2 = SparkUtils.readCSVDataset(spark, "src/test/resources/ds2.csv?sep=\"|\"&delimiter=\"\""); + assertThat(ds2.getDataPoints().get(1).get("name")).isEqualTo("B"); + } +} diff --git a/src/test/resources/ds1.csv b/src/test/resources/ds1.csv new file mode 100644 index 0000000..80c8845 --- /dev/null +++ b/src/test/resources/ds1.csv @@ -0,0 +1,6 @@ +"id";"name" +"1";"A" +"2";"B" +"3";"C" +"4";"D" +"5";"E" \ No newline at end of file diff --git a/src/test/resources/ds2.csv b/src/test/resources/ds2.csv new file mode 100644 index 0000000..465f65d --- /dev/null +++ b/src/test/resources/ds2.csv @@ -0,0 +1,6 @@ +id|name +6|F +7|G +8|H +9|I +10|J \ No newline at end of file