From 42a2a48233d88062913306b5a8531850bc4cf351 Mon Sep 17 00:00:00 2001 From: Anthony Khong Date: Wed, 6 Jan 2021 15:16:57 +0700 Subject: [PATCH] Upgraded Spark to 3.1.0 + fixed tests (#305) * Upgraded Spark to 3.1.0 + fixed tests * Less stringent check on num partitions * Version bump to v0.0.38 * Add Clojure version to prevent overfetching --- docker/deps.edn | 15 ++++++++------- docker/project.clj | 14 +++++++------- docker/spark-project.clj | 14 +++++++------- examples/geni-clj-app/deps.edn | 17 +++++++++-------- lein-template/project.clj | 2 +- .../resources/leiningen/new/geni/project.clj | 14 +++++++------- project.clj | 14 +++++++------- resources/GENI_REPL_RELEASED_VERSION | 2 +- src/clojure/zero_one/geni/core/polymorphic.clj | 7 ++++--- test/zero_one/geni/dataset_test.clj | 2 +- test/zero_one/geni/ml_frequent_pattern_test.clj | 3 ++- test/zero_one/geni/ml_test.clj | 1 + test/zero_one/geni/rdd_test.clj | 2 +- test/zero_one/geni/sql_functions_test.clj | 10 +++++----- 14 files changed, 61 insertions(+), 56 deletions(-) diff --git a/docker/deps.edn b/docker/deps.edn index 4c902748..c0bbc812 100644 --- a/docker/deps.edn +++ b/docker/deps.edn @@ -1,12 +1,13 @@ {:deps - {io.netty/netty-all {:mvn/version "4.1.53.Final"} + {org.clojure/clojure {:mvn/version "1.10.1"} + io.netty/netty-all {:mvn/version "4.1.53.Final"} ;; Spark - org.apache.spark/spark-avro_2.12 {:mvn/version "3.0.1"} - org.apache.spark/spark-core_2.12 {:mvn/version "3.0.1"} - org.apache.spark/spark-hive_2.12 {:mvn/version "3.0.1"} - org.apache.spark/spark-mllib_2.12 {:mvn/version "3.0.1"} - org.apache.spark/spark-sql_2.12 {:mvn/version "3.0.1"} - org.apache.spark/spark-streaming_2.12 {:mvn/version "3.0.1"} + org.apache.spark/spark-avro_2.12 {:mvn/version "3.1.0"} + org.apache.spark/spark-core_2.12 {:mvn/version "3.1.0"} + org.apache.spark/spark-hive_2.12 {:mvn/version "3.1.0"} + org.apache.spark/spark-mllib_2.12 {:mvn/version "3.1.0"} + org.apache.spark/spark-sql_2.12 {:mvn/version "3.1.0"} + org.apache.spark/spark-streaming_2.12 {:mvn/version "3.1.0"} ;; Databases mysql/mysql-connector-java {:mvn/version "8.0.21"} org.postgresql/postgresql {:mvn/version "42.2.16"} diff --git a/docker/project.clj b/docker/project.clj index d8159b57..4ea52224 100644 --- a/docker/project.clj +++ b/docker/project.clj @@ -3,12 +3,12 @@ ; This breaks cljcdoc: https://github.com/cljdoc/cljdoc/issues/407 ; Frozen until issue is resolved. ;[com.github.fommil.netlib/all "1.1.2" :extension "pom"] - [org.apache.spark/spark-avro_2.12 "3.0.1"] - [org.apache.spark/spark-core_2.12 "3.0.1"] - [org.apache.spark/spark-hive_2.12 "3.0.1"] - [org.apache.spark/spark-mllib_2.12 "3.0.1"] - [org.apache.spark/spark-sql_2.12 "3.0.1"] - [org.apache.spark/spark-streaming_2.12 "3.0.1"] + [org.apache.spark/spark-avro_2.12 "3.1.0"] + [org.apache.spark/spark-core_2.12 "3.1.0"] + [org.apache.spark/spark-hive_2.12 "3.1.0"] + [org.apache.spark/spark-mllib_2.12 "3.1.0"] + [org.apache.spark/spark-sql_2.12 "3.1.0"] + [org.apache.spark/spark-streaming_2.12 "3.1.0"] ; Arrow [org.apache.arrow/arrow-memory-netty "2.0.0"] [org.apache.arrow/arrow-memory-core "2.0.0"] @@ -22,7 +22,7 @@ [ml.dmlc/xgboost4j-spark_2.12 "1.2.0"] [ml.dmlc/xgboost4j_2.12 "1.2.0"]]) -(defproject zero.one/geni "0.0.37" +(defproject zero.one/geni "0.0.38" :jvm-opts ["-Duser.country=US" "-Duser.language=en"] :description "A Clojure dataframe library that runs on Spark" :url "https://github.com/zero-one-group/geni" diff --git a/docker/spark-project.clj b/docker/spark-project.clj index 149015e0..6f3eea6c 100644 --- a/docker/spark-project.clj +++ b/docker/spark-project.clj @@ -20,12 +20,12 @@ [techascent/tech.ml.dataset "5.00-alpha-25" :exclusions [ch.qos.logback/logback-classic]] ;; Spark - [org.apache.spark/spark-avro_2.12 "3.0.1"] - [org.apache.spark/spark-core_2.12 "3.0.1"] - [org.apache.spark/spark-hive_2.12 "3.0.1"] - [org.apache.spark/spark-mllib_2.12 "3.0.1"] - [org.apache.spark/spark-sql_2.12 "3.0.1"] - [org.apache.spark/spark-streaming_2.12 "3.0.1"] + [org.apache.spark/spark-avro_2.12 "3.1.0"] + [org.apache.spark/spark-core_2.12 "3.1.0"] + [org.apache.spark/spark-hive_2.12 "3.1.0"] + [org.apache.spark/spark-mllib_2.12 "3.1.0"] + [org.apache.spark/spark-sql_2.12 "3.1.0"] + [org.apache.spark/spark-streaming_2.12 "3.1.0"] [com.github.fommil.netlib/all "1.1.2" :extension "pom"] ; Arrow [org.apache.arrow/arrow-memory-netty "2.0.0"] @@ -39,7 +39,7 @@ ; EDN [metosin/jsonista "0.3.0"] ;; Optional: Dataproc - [org.apache.spark/spark-yarn_2.12 "3.0.1"] + [org.apache.spark/spark-yarn_2.12 "3.1.0"] ;; Optional: Spark XGBoost [ml.dmlc/xgboost4j-spark_2.12 "1.2.0"] [ml.dmlc/xgboost4j_2.12 "1.2.0"]]) diff --git a/examples/geni-clj-app/deps.edn b/examples/geni-clj-app/deps.edn index 0c29398a..267b191d 100644 --- a/examples/geni-clj-app/deps.edn +++ b/examples/geni-clj-app/deps.edn @@ -1,12 +1,13 @@ {:deps - {io.netty/netty-all {:mvn/version "4.1.53.Final"} + {org.clojure/clojure {:mvn/version "1.10.1"} + io.netty/netty-all {:mvn/version "4.1.53.Final"} ;; Spark - org.apache.spark/spark-avro_2.12 {:mvn/version "3.0.1"} - org.apache.spark/spark-core_2.12 {:mvn/version "3.0.1"} - org.apache.spark/spark-hive_2.12 {:mvn/version "3.0.1"} - org.apache.spark/spark-mllib_2.12 {:mvn/version "3.0.1"} - org.apache.spark/spark-sql_2.12 {:mvn/version "3.0.1"} - org.apache.spark/spark-streaming_2.12 {:mvn/version "3.0.1"} + org.apache.spark/spark-avro_2.12 {:mvn/version "3.1.0"} + org.apache.spark/spark-core_2.12 {:mvn/version "3.1.0"} + org.apache.spark/spark-hive_2.12 {:mvn/version "3.1.0"} + org.apache.spark/spark-mllib_2.12 {:mvn/version "3.1.0"} + org.apache.spark/spark-sql_2.12 {:mvn/version "3.1.0"} + org.apache.spark/spark-streaming_2.12 {:mvn/version "3.1.0"} ;; Databases mysql/mysql-connector-java {:mvn/version "8.0.21"} org.postgresql/postgresql {:mvn/version "42.2.16"} @@ -21,4 +22,4 @@ ml.dmlc/xgboost4j-spark_2.12 {:mvn/version "1.0.0"} ml.dmlc/xgboost4j_2.12 {:mvn/version "1.0.0"} ;; Geni - zero.one/geni {:mvn/version "0.0.37"}}} + zero.one/geni {:mvn/version "0.0.38"}}} diff --git a/lein-template/project.clj b/lein-template/project.clj index 378d3a09..a64d5220 100644 --- a/lein-template/project.clj +++ b/lein-template/project.clj @@ -1,4 +1,4 @@ -(defproject geni/lein-template "0.0.37" +(defproject geni/lein-template "0.0.38" :description "Leiningen template for a Geni application." :url "https://github.com/zero-one-group/geni/tree/develop/lein-template" :license {:name "Apache License" diff --git a/lein-template/resources/leiningen/new/geni/project.clj b/lein-template/resources/leiningen/new/geni/project.clj index 2e5804c2..e3751f6c 100644 --- a/lein-template/resources/leiningen/new/geni/project.clj +++ b/lein-template/resources/leiningen/new/geni/project.clj @@ -4,16 +4,16 @@ :license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0" :url "https://www.eclipse.org/legal/epl-2.0/"} :dependencies [[org.clojure/clojure "1.10.1"] - [zero.one/geni "0.0.37"] + [zero.one/geni "0.0.38"] [metosin/jsonista "0.3.0"] [expound "0.8.7"] ;; Spark - [org.apache.spark/spark-core_2.12 "3.0.1"] - [org.apache.spark/spark-hive_2.12 "3.0.1"] - [org.apache.spark/spark-mllib_2.12 "3.0.1"] - [org.apache.spark/spark-sql_2.12 "3.0.1"] - [org.apache.spark/spark-streaming_2.12 "3.0.1"] - [org.apache.spark/spark-yarn_2.12 "3.0.1"] + [org.apache.spark/spark-core_2.12 "3.1.0"] + [org.apache.spark/spark-hive_2.12 "3.1.0"] + [org.apache.spark/spark-mllib_2.12 "3.1.0"] + [org.apache.spark/spark-sql_2.12 "3.1.0"] + [org.apache.spark/spark-streaming_2.12 "3.1.0"] + [org.apache.spark/spark-yarn_2.12 "3.1.0"] [com.github.fommil.netlib/all "1.1.2" :extension "pom"] ; Arrow [org.apache.arrow/arrow-memory-netty "2.0.0"] diff --git a/project.clj b/project.clj index d8159b57..4ea52224 100644 --- a/project.clj +++ b/project.clj @@ -3,12 +3,12 @@ ; This breaks cljcdoc: https://github.com/cljdoc/cljdoc/issues/407 ; Frozen until issue is resolved. ;[com.github.fommil.netlib/all "1.1.2" :extension "pom"] - [org.apache.spark/spark-avro_2.12 "3.0.1"] - [org.apache.spark/spark-core_2.12 "3.0.1"] - [org.apache.spark/spark-hive_2.12 "3.0.1"] - [org.apache.spark/spark-mllib_2.12 "3.0.1"] - [org.apache.spark/spark-sql_2.12 "3.0.1"] - [org.apache.spark/spark-streaming_2.12 "3.0.1"] + [org.apache.spark/spark-avro_2.12 "3.1.0"] + [org.apache.spark/spark-core_2.12 "3.1.0"] + [org.apache.spark/spark-hive_2.12 "3.1.0"] + [org.apache.spark/spark-mllib_2.12 "3.1.0"] + [org.apache.spark/spark-sql_2.12 "3.1.0"] + [org.apache.spark/spark-streaming_2.12 "3.1.0"] ; Arrow [org.apache.arrow/arrow-memory-netty "2.0.0"] [org.apache.arrow/arrow-memory-core "2.0.0"] @@ -22,7 +22,7 @@ [ml.dmlc/xgboost4j-spark_2.12 "1.2.0"] [ml.dmlc/xgboost4j_2.12 "1.2.0"]]) -(defproject zero.one/geni "0.0.37" +(defproject zero.one/geni "0.0.38" :jvm-opts ["-Duser.country=US" "-Duser.language=en"] :description "A Clojure dataframe library that runs on Spark" :url "https://github.com/zero-one-group/geni" diff --git a/resources/GENI_REPL_RELEASED_VERSION b/resources/GENI_REPL_RELEASED_VERSION index 7e4eee2a..311e80ee 100644 --- a/resources/GENI_REPL_RELEASED_VERSION +++ b/resources/GENI_REPL_RELEASED_VERSION @@ -1 +1 @@ -0.0.37 \ No newline at end of file +0.0.38 \ No newline at end of file diff --git a/src/clojure/zero_one/geni/core/polymorphic.clj b/src/clojure/zero_one/geni/core/polymorphic.clj index de28f6bc..3d1c4222 100644 --- a/src/clojure/zero_one/geni/core/polymorphic.clj +++ b/src/clojure/zero_one/geni/core/polymorphic.clj @@ -221,18 +221,18 @@ (defmethod dissoc Dataset [dataframe & col-names] (apply dataset/drop dataframe col-names)) -(defmulti update +(defmulti update' "Column: `transform-values` with Clojure's `assoc` signature. Dataset: `with-column` with Clojure's `assoc` signature." (fn [head & _] (class head))) -(defmethod update :default [expr k f & args] +(defmethod update' :default [expr k f & args] (sql/transform-values expr (fn [k' v] (sql/when (.equalTo (->column k') (->column k)) (apply f v args) v)))) -(defmethod update Dataset [dataframe k f & args] +(defmethod update' Dataset [dataframe k f & args] (dataset/with-column dataframe k (apply f k args))) ;; Pandas @@ -281,6 +281,7 @@ (import-fn filter where) (import-fn iqr interquartile-range) (import-fn mean avg) +(import-fn update' update) (comment (require '[zero-one.geni.docs :as docs]) diff --git a/test/zero_one/geni/dataset_test.clj b/test/zero_one/geni/dataset_test.clj index 8aae683f..ff54c180 100644 --- a/test/zero_one/geni/dataset_test.clj +++ b/test/zero_one/geni/dataset_test.clj @@ -440,7 +440,7 @@ (-> (df-20) (g/repartition-by-range :Suburb :SellerG) g/partitions - count) => 7) + count) => pos?) (fact "able to repartition by range by number and columns" (-> (df-20) (g/repartition-by-range 3 :Suburb :SellerG) diff --git a/test/zero_one/geni/ml_frequent_pattern_test.clj b/test/zero_one/geni/ml_frequent_pattern_test.clj index a305e3a6..076f8497 100644 --- a/test/zero_one/geni/ml_frequent_pattern_test.clj +++ b/test/zero_one/geni/ml_frequent_pattern_test.clj @@ -36,4 +36,5 @@ (g/column-names (ml/association-rules model)) => ["antecedent" "consequent" "confidence" - "lift"])) + "lift" + "support"])) diff --git a/test/zero_one/geni/ml_test.clj b/test/zero_one/geni/ml_test.clj index 60559008..91196853 100644 --- a/test/zero_one/geni/ml_test.clj +++ b/test/zero_one/geni/ml_test.clj @@ -801,6 +801,7 @@ :threshold 0.5, :fit-intercept true, :label-col "label", + :max-block-size-in-mb 0.0 :standardization true, :probability-col "probability", :prediction-col "prediction", diff --git a/test/zero_one/geni/rdd_test.clj b/test/zero_one/geni/rdd_test.clj index a888f1a6..ccd5e73c 100644 --- a/test/zero_one/geni/rdd_test.clj +++ b/test/zero_one/geni/rdd_test.clj @@ -55,7 +55,7 @@ (rdd/resources) => {} (rdd/spark-home) => (System/getenv "SPARK_HOME") (rdd/sc) => (partial instance? SparkContext) - (rdd/version) => "3.0.1")) + (rdd/version) => "3.1.0")) (facts "On repartitioning" :rdd (fact "partition-by works" diff --git a/test/zero_one/geni/sql_functions_test.clj b/test/zero_one/geni/sql_functions_test.clj index 4611fdda..2d0bba2d 100644 --- a/test/zero_one/geni/sql_functions_test.clj +++ b/test/zero_one/geni/sql_functions_test.clj @@ -24,8 +24,8 @@ :to-2 (g/to-json (g/struct {:time (g/to-timestamp (g/lit "2015-08-26") "yyyy-MM-dd")}) {:timestampFormat "dd/MM/yyyy"})}) g/collect - first) => {:schema-1 "array>" - :schema-2 "array>" + first) => {:schema-1 "ARRAY>" + :schema-2 "ARRAY>" :from-1 {:a 1 :b 0.8} :from-2 {:time (Timestamp. 1440547200000)} :to-1 "{\"a\":1,\"b\":2}" @@ -44,8 +44,8 @@ :to-2 (g/to-csv (g/struct {:time (g/to-timestamp (g/lit "2015-08-26") "yyyy-MM-dd")}) {:timestampFormat "dd/MM/yyyy"})}) g/collect - first) => {:schema-1 "struct<_c0:int,_c1:string>" - :schema-2 "struct<_c0:int,_c1:string>" + first) => {:schema-1 "STRUCT<`_c0`: INT, `_c1`: STRING>" + :schema-2 "STRUCT<`_c0`: INT, `_c1`: STRING>" :from-1 {:a 1 :b 0.8} :from-2 {:time (Timestamp. 1440547200000)} :to-1 "1,2" @@ -214,7 +214,7 @@ (-> (df-20) (g/cube :SellerG :Regionname) (g/agg (g/grouping-id :SellerG :Regionname)) - g/first-vals) => ["Biggin" "Northern Metropolitan" 0] + g/first-vals) => ["Nelson" nil 1] (-> (df-20) (g/group-by :SellerG) (g/agg (-> (g/collect-list :Regionname) (g/as :regions)))