Skip to content

Commit

Permalink
Upgraded Spark to 3.1.0 + fixed tests (#305)
Browse files Browse the repository at this point in the history
* Upgraded Spark to 3.1.0 + fixed tests

* Less stringent check on num partitions

* Version bump to v0.0.38

* Add Clojure version to prevent overfetching
  • Loading branch information
anthony-khong authored Jan 6, 2021
1 parent d0c77f4 commit 42a2a48
Show file tree
Hide file tree
Showing 14 changed files with 61 additions and 56 deletions.
15 changes: 8 additions & 7 deletions docker/deps.edn
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
{:deps
{io.netty/netty-all {:mvn/version "4.1.53.Final"}
{org.clojure/clojure {:mvn/version "1.10.1"}
io.netty/netty-all {:mvn/version "4.1.53.Final"}
;; Spark
org.apache.spark/spark-avro_2.12 {:mvn/version "3.0.1"}
org.apache.spark/spark-core_2.12 {:mvn/version "3.0.1"}
org.apache.spark/spark-hive_2.12 {:mvn/version "3.0.1"}
org.apache.spark/spark-mllib_2.12 {:mvn/version "3.0.1"}
org.apache.spark/spark-sql_2.12 {:mvn/version "3.0.1"}
org.apache.spark/spark-streaming_2.12 {:mvn/version "3.0.1"}
org.apache.spark/spark-avro_2.12 {:mvn/version "3.1.0"}
org.apache.spark/spark-core_2.12 {:mvn/version "3.1.0"}
org.apache.spark/spark-hive_2.12 {:mvn/version "3.1.0"}
org.apache.spark/spark-mllib_2.12 {:mvn/version "3.1.0"}
org.apache.spark/spark-sql_2.12 {:mvn/version "3.1.0"}
org.apache.spark/spark-streaming_2.12 {:mvn/version "3.1.0"}
;; Databases
mysql/mysql-connector-java {:mvn/version "8.0.21"}
org.postgresql/postgresql {:mvn/version "42.2.16"}
Expand Down
14 changes: 7 additions & 7 deletions docker/project.clj
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
; This breaks cljcdoc: https://github.com/cljdoc/cljdoc/issues/407
; Frozen until issue is resolved.
;[com.github.fommil.netlib/all "1.1.2" :extension "pom"]
[org.apache.spark/spark-avro_2.12 "3.0.1"]
[org.apache.spark/spark-core_2.12 "3.0.1"]
[org.apache.spark/spark-hive_2.12 "3.0.1"]
[org.apache.spark/spark-mllib_2.12 "3.0.1"]
[org.apache.spark/spark-sql_2.12 "3.0.1"]
[org.apache.spark/spark-streaming_2.12 "3.0.1"]
[org.apache.spark/spark-avro_2.12 "3.1.0"]
[org.apache.spark/spark-core_2.12 "3.1.0"]
[org.apache.spark/spark-hive_2.12 "3.1.0"]
[org.apache.spark/spark-mllib_2.12 "3.1.0"]
[org.apache.spark/spark-sql_2.12 "3.1.0"]
[org.apache.spark/spark-streaming_2.12 "3.1.0"]
; Arrow
[org.apache.arrow/arrow-memory-netty "2.0.0"]
[org.apache.arrow/arrow-memory-core "2.0.0"]
Expand All @@ -22,7 +22,7 @@
[ml.dmlc/xgboost4j-spark_2.12 "1.2.0"]
[ml.dmlc/xgboost4j_2.12 "1.2.0"]])

(defproject zero.one/geni "0.0.37"
(defproject zero.one/geni "0.0.38"
:jvm-opts ["-Duser.country=US" "-Duser.language=en"]
:description "A Clojure dataframe library that runs on Spark"
:url "https://github.com/zero-one-group/geni"
Expand Down
14 changes: 7 additions & 7 deletions docker/spark-project.clj
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@
[techascent/tech.ml.dataset "5.00-alpha-25"
:exclusions [ch.qos.logback/logback-classic]]
;; Spark
[org.apache.spark/spark-avro_2.12 "3.0.1"]
[org.apache.spark/spark-core_2.12 "3.0.1"]
[org.apache.spark/spark-hive_2.12 "3.0.1"]
[org.apache.spark/spark-mllib_2.12 "3.0.1"]
[org.apache.spark/spark-sql_2.12 "3.0.1"]
[org.apache.spark/spark-streaming_2.12 "3.0.1"]
[org.apache.spark/spark-avro_2.12 "3.1.0"]
[org.apache.spark/spark-core_2.12 "3.1.0"]
[org.apache.spark/spark-hive_2.12 "3.1.0"]
[org.apache.spark/spark-mllib_2.12 "3.1.0"]
[org.apache.spark/spark-sql_2.12 "3.1.0"]
[org.apache.spark/spark-streaming_2.12 "3.1.0"]
[com.github.fommil.netlib/all "1.1.2" :extension "pom"]
; Arrow
[org.apache.arrow/arrow-memory-netty "2.0.0"]
Expand All @@ -39,7 +39,7 @@
; EDN
[metosin/jsonista "0.3.0"]
;; Optional: Dataproc
[org.apache.spark/spark-yarn_2.12 "3.0.1"]
[org.apache.spark/spark-yarn_2.12 "3.1.0"]
;; Optional: Spark XGBoost
[ml.dmlc/xgboost4j-spark_2.12 "1.2.0"]
[ml.dmlc/xgboost4j_2.12 "1.2.0"]])
17 changes: 9 additions & 8 deletions examples/geni-clj-app/deps.edn
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
{:deps
{io.netty/netty-all {:mvn/version "4.1.53.Final"}
{org.clojure/clojure {:mvn/version "1.10.1"}
io.netty/netty-all {:mvn/version "4.1.53.Final"}
;; Spark
org.apache.spark/spark-avro_2.12 {:mvn/version "3.0.1"}
org.apache.spark/spark-core_2.12 {:mvn/version "3.0.1"}
org.apache.spark/spark-hive_2.12 {:mvn/version "3.0.1"}
org.apache.spark/spark-mllib_2.12 {:mvn/version "3.0.1"}
org.apache.spark/spark-sql_2.12 {:mvn/version "3.0.1"}
org.apache.spark/spark-streaming_2.12 {:mvn/version "3.0.1"}
org.apache.spark/spark-avro_2.12 {:mvn/version "3.1.0"}
org.apache.spark/spark-core_2.12 {:mvn/version "3.1.0"}
org.apache.spark/spark-hive_2.12 {:mvn/version "3.1.0"}
org.apache.spark/spark-mllib_2.12 {:mvn/version "3.1.0"}
org.apache.spark/spark-sql_2.12 {:mvn/version "3.1.0"}
org.apache.spark/spark-streaming_2.12 {:mvn/version "3.1.0"}
;; Databases
mysql/mysql-connector-java {:mvn/version "8.0.21"}
org.postgresql/postgresql {:mvn/version "42.2.16"}
Expand All @@ -21,4 +22,4 @@
ml.dmlc/xgboost4j-spark_2.12 {:mvn/version "1.0.0"}
ml.dmlc/xgboost4j_2.12 {:mvn/version "1.0.0"}
;; Geni
zero.one/geni {:mvn/version "0.0.37"}}}
zero.one/geni {:mvn/version "0.0.38"}}}
2 changes: 1 addition & 1 deletion lein-template/project.clj
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
(defproject geni/lein-template "0.0.37"
(defproject geni/lein-template "0.0.38"
:description "Leiningen template for a Geni application."
:url "https://github.com/zero-one-group/geni/tree/develop/lein-template"
:license {:name "Apache License"
Expand Down
14 changes: 7 additions & 7 deletions lein-template/resources/leiningen/new/geni/project.clj
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,16 @@
:license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
:url "https://www.eclipse.org/legal/epl-2.0/"}
:dependencies [[org.clojure/clojure "1.10.1"]
[zero.one/geni "0.0.37"]
[zero.one/geni "0.0.38"]
[metosin/jsonista "0.3.0"]
[expound "0.8.7"]
;; Spark
[org.apache.spark/spark-core_2.12 "3.0.1"]
[org.apache.spark/spark-hive_2.12 "3.0.1"]
[org.apache.spark/spark-mllib_2.12 "3.0.1"]
[org.apache.spark/spark-sql_2.12 "3.0.1"]
[org.apache.spark/spark-streaming_2.12 "3.0.1"]
[org.apache.spark/spark-yarn_2.12 "3.0.1"]
[org.apache.spark/spark-core_2.12 "3.1.0"]
[org.apache.spark/spark-hive_2.12 "3.1.0"]
[org.apache.spark/spark-mllib_2.12 "3.1.0"]
[org.apache.spark/spark-sql_2.12 "3.1.0"]
[org.apache.spark/spark-streaming_2.12 "3.1.0"]
[org.apache.spark/spark-yarn_2.12 "3.1.0"]
[com.github.fommil.netlib/all "1.1.2" :extension "pom"]
; Arrow
[org.apache.arrow/arrow-memory-netty "2.0.0"]
Expand Down
14 changes: 7 additions & 7 deletions project.clj
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
; This breaks cljcdoc: https://github.com/cljdoc/cljdoc/issues/407
; Frozen until issue is resolved.
;[com.github.fommil.netlib/all "1.1.2" :extension "pom"]
[org.apache.spark/spark-avro_2.12 "3.0.1"]
[org.apache.spark/spark-core_2.12 "3.0.1"]
[org.apache.spark/spark-hive_2.12 "3.0.1"]
[org.apache.spark/spark-mllib_2.12 "3.0.1"]
[org.apache.spark/spark-sql_2.12 "3.0.1"]
[org.apache.spark/spark-streaming_2.12 "3.0.1"]
[org.apache.spark/spark-avro_2.12 "3.1.0"]
[org.apache.spark/spark-core_2.12 "3.1.0"]
[org.apache.spark/spark-hive_2.12 "3.1.0"]
[org.apache.spark/spark-mllib_2.12 "3.1.0"]
[org.apache.spark/spark-sql_2.12 "3.1.0"]
[org.apache.spark/spark-streaming_2.12 "3.1.0"]
; Arrow
[org.apache.arrow/arrow-memory-netty "2.0.0"]
[org.apache.arrow/arrow-memory-core "2.0.0"]
Expand All @@ -22,7 +22,7 @@
[ml.dmlc/xgboost4j-spark_2.12 "1.2.0"]
[ml.dmlc/xgboost4j_2.12 "1.2.0"]])

(defproject zero.one/geni "0.0.37"
(defproject zero.one/geni "0.0.38"
:jvm-opts ["-Duser.country=US" "-Duser.language=en"]
:description "A Clojure dataframe library that runs on Spark"
:url "https://github.com/zero-one-group/geni"
Expand Down
2 changes: 1 addition & 1 deletion resources/GENI_REPL_RELEASED_VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.0.37
0.0.38
7 changes: 4 additions & 3 deletions src/clojure/zero_one/geni/core/polymorphic.clj
Original file line number Diff line number Diff line change
Expand Up @@ -221,18 +221,18 @@
(defmethod dissoc Dataset [dataframe & col-names]
(apply dataset/drop dataframe col-names))

(defmulti update
(defmulti update'
"Column: `transform-values` with Clojure's `assoc` signature.
Dataset: `with-column` with Clojure's `assoc` signature."
(fn [head & _] (class head)))
(defmethod update :default [expr k f & args]
(defmethod update' :default [expr k f & args]
(sql/transform-values
expr
(fn [k' v] (sql/when (.equalTo (->column k') (->column k))
(apply f v args)
v))))
(defmethod update Dataset [dataframe k f & args]
(defmethod update' Dataset [dataframe k f & args]
(dataset/with-column dataframe k (apply f k args)))

;; Pandas
Expand Down Expand Up @@ -281,6 +281,7 @@
(import-fn filter where)
(import-fn iqr interquartile-range)
(import-fn mean avg)
(import-fn update' update)

(comment
(require '[zero-one.geni.docs :as docs])
Expand Down
2 changes: 1 addition & 1 deletion test/zero_one/geni/dataset_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,7 @@
(-> (df-20)
(g/repartition-by-range :Suburb :SellerG)
g/partitions
count) => 7)
count) => pos?)
(fact "able to repartition by range by number and columns"
(-> (df-20)
(g/repartition-by-range 3 :Suburb :SellerG)
Expand Down
3 changes: 2 additions & 1 deletion test/zero_one/geni/ml_frequent_pattern_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@
(g/column-names (ml/association-rules model)) => ["antecedent"
"consequent"
"confidence"
"lift"]))
"lift"
"support"]))
1 change: 1 addition & 0 deletions test/zero_one/geni/ml_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -801,6 +801,7 @@
:threshold 0.5,
:fit-intercept true,
:label-col "label",
:max-block-size-in-mb 0.0
:standardization true,
:probability-col "probability",
:prediction-col "prediction",
Expand Down
2 changes: 1 addition & 1 deletion test/zero_one/geni/rdd_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
(rdd/resources) => {}
(rdd/spark-home) => (System/getenv "SPARK_HOME")
(rdd/sc) => (partial instance? SparkContext)
(rdd/version) => "3.0.1"))
(rdd/version) => "3.1.0"))

(facts "On repartitioning" :rdd
(fact "partition-by works"
Expand Down
10 changes: 5 additions & 5 deletions test/zero_one/geni/sql_functions_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
:to-2 (g/to-json (g/struct {:time (g/to-timestamp (g/lit "2015-08-26") "yyyy-MM-dd")})
{:timestampFormat "dd/MM/yyyy"})})
g/collect
first) => {:schema-1 "array<struct<col:bigint>>"
:schema-2 "array<struct<col:bigint>>"
first) => {:schema-1 "ARRAY<STRUCT<`col`: BIGINT>>"
:schema-2 "ARRAY<STRUCT<`col`: BIGINT>>"
:from-1 {:a 1 :b 0.8}
:from-2 {:time (Timestamp. 1440547200000)}
:to-1 "{\"a\":1,\"b\":2}"
Expand All @@ -44,8 +44,8 @@
:to-2 (g/to-csv (g/struct {:time (g/to-timestamp (g/lit "2015-08-26") "yyyy-MM-dd")})
{:timestampFormat "dd/MM/yyyy"})})
g/collect
first) => {:schema-1 "struct<_c0:int,_c1:string>"
:schema-2 "struct<_c0:int,_c1:string>"
first) => {:schema-1 "STRUCT<`_c0`: INT, `_c1`: STRING>"
:schema-2 "STRUCT<`_c0`: INT, `_c1`: STRING>"
:from-1 {:a 1 :b 0.8}
:from-2 {:time (Timestamp. 1440547200000)}
:to-1 "1,2"
Expand Down Expand Up @@ -214,7 +214,7 @@
(-> (df-20)
(g/cube :SellerG :Regionname)
(g/agg (g/grouping-id :SellerG :Regionname))
g/first-vals) => ["Biggin" "Northern Metropolitan" 0]
g/first-vals) => ["Nelson" nil 1]
(-> (df-20)
(g/group-by :SellerG)
(g/agg (-> (g/collect-list :Regionname) (g/as :regions)))
Expand Down

0 comments on commit 42a2a48

Please sign in to comment.