#28 Bump spark commons version
* Bump spark commons version
* Add provided dependencies docs to the readme
Zejnilovic authored Aug 18, 2022
1 parent 53c412e commit 8d0f490
Showing 6 changed files with 67 additions and 22 deletions.
18 changes: 15 additions & 3 deletions README.md
@@ -6,15 +6,27 @@
- Dataframe in
- Standardized Dataframe out

## Dependency
SBT:
## Usage

### Required Provided Dependencies

The library requires the following dependencies to be provided by your project:

```sbt
"org.apache.spark" %% "spark-core" % SPARK_VERSION,
"org.apache.spark" %% "spark-sql" % SPARK_VERSION,
"za.co.absa" %% s"spark-commons-spark${SPARK_MAJOR}.${SPARK_MINOR}" % "0.3.1",
```
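
For example, on Scala 2.12 this commit resolves Spark to 3.2.1 (see `project/Dependencies.scala` below), so the placeholders would expand roughly to:

```sbt
// Illustrative expansion for Scala 2.12; concrete versions taken from project/Dependencies.scala in this commit.
"org.apache.spark" %% "spark-core" % "3.2.1",
"org.apache.spark" %% "spark-sql" % "3.2.1",
"za.co.absa" %% "spark-commons-spark3.2" % "0.3.1",
```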

### Usage in SBT
```sbt
"za.co.absa" %% "spark-data-standardization" % VERSION
```

### Usage in Maven

### Scala 2.11 [![Maven Central](https://maven-badges.herokuapp.com/maven-central/za.co.absa/spark-data-standardization_2.11/badge.svg)](https://maven-badges.herokuapp.com/maven-central/za.co.absa/spark-data-standardization_2.11)

```xml
<dependency>
<groupId>za.co.absa</groupId>
<artifactId>spark-data-standardization_2.11</artifactId>
<version>VERSION</version>
</dependency>
```
17 changes: 4 additions & 13 deletions build.sbt
@@ -13,7 +13,9 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import sys.process._
import Dependencies._

ThisBuild / name := "spark-data-standardization"
ThisBuild / organization := "za.co.absa"
@@ -26,24 +28,13 @@ ThisBuild / scalaVersion := scala211

ThisBuild / versionScheme := Some("early-semver")

def sparkVersion(scalaVersion: String): String = if (scalaVersion==scala212) "3.2.1" else "2.4.7"

def sparkFastTestsVersion(scalaVersion: String): String = if (scalaVersion == scala212) "1.1.0" else "0.23.0"

libraryDependencies ++= List(
"org.apache.spark" %% "spark-core" % sparkVersion(scalaVersion.value) % "provided",
"org.apache.spark" %% "spark-sql" % sparkVersion(scalaVersion.value) % "provided",
"za.co.absa" %% "spark-commons" % "0.2.0",
"com.github.mrpowers" %% "spark-fast-tests" % sparkFastTestsVersion(scalaVersion.value) % Test,
"org.scalatest" %% "scalatest" % "3.2.2" % Test,
"com.typesafe" % "config" % "1.4.1"
)
libraryDependencies ++= dependencyList(scalaVersion.value)

lazy val printSparkScalaVersion = taskKey[Unit]("Print Spark and Scala versions for standardization")
ThisBuild / printSparkScalaVersion := {
val log = streams.value.log
val scalaVers = scalaVersion.value
log.info(s"Building with Spark ${sparkVersion(scalaVers)}, Scala ${scalaVers}")
log.info(s"Building with Spark ${getSparkVersion(scalaVers)}, Scala ${scalaVers}")
}

Test / parallelExecution := false
43 changes: 43 additions & 0 deletions project/Dependencies.scala
@@ -0,0 +1,43 @@
/*
* Copyright 2022 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import sbt._

object Dependencies {
private def getSparkVersionUpToMinor(sparkVersion: String): String = {
val pattern = "([0-9]+)\\.([0-9]+)\\.([0-9]+)".r
val pattern(major, minor, patch) = sparkVersion
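// Note: this extractor throws a scala.MatchError if sparkVersion is not in "major.minor.patch" form.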
s"$major.$minor"
}

private def sparkFastTestsVersion(scalaVersion: String): String = if (scalaVersion.startsWith("2.11")) "0.23.0" else "1.1.0"

def getSparkVersion(scalaVersion: String): String = if (scalaVersion.startsWith("2.11")) "2.4.7" else "3.2.1"

def dependencyList(scalaVersion: String): Seq[ModuleID] = {
val sparkVersion = getSparkVersion(scalaVersion)
val sparkVersionUpToMinor = getSparkVersionUpToMinor(sparkVersion)
List(
"org.apache.spark" %% "spark-core" % sparkVersion % Provided,
"org.apache.spark" %% "spark-sql" % sparkVersion % Provided,
"za.co.absa" %% s"spark-commons-spark$sparkVersionUpToMinor" % "0.3.1" % Provided,
"za.co.absa" %% "spark-commons-test" % "0.3.1" % Test,
"com.typesafe" % "config" % "1.4.1",
"com.github.mrpowers" %% "spark-fast-tests" % sparkFastTestsVersion(scalaVersion) % Test,
"org.scalatest" %% "scalatest" % "3.2.2" % Test
)
}
}
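
The helpers above resolve versions deterministically from the Scala version; a quick sketch of the expected values, derived directly from the definitions in this file:

```scala
// Expected resolutions, following the definitions in Dependencies.scala above.
Dependencies.getSparkVersion("2.11.12") // "2.4.7" -> depends on "spark-commons-spark2.4"
Dependencies.getSparkVersion("2.12.15") // "3.2.1" -> depends on "spark-commons-spark3.2"
```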
TypedStructField.scala
@@ -20,19 +20,18 @@ import java.sql.{Date, Timestamp}
import java.util.Base64

import org.apache.spark.sql.types._
import za.co.absa.spark.commons.implicits.StructFieldImplicits.{StructFieldEnhancements, StructFieldMetadataEnhancements}
import za.co.absa.spark.commons.implicits.StructFieldImplicits.StructFieldMetadataEnhancements
import za.co.absa.standardization.ValidationIssue
import za.co.absa.standardization.numeric.{DecimalSymbols, NumericPattern, Radix}
import za.co.absa.standardization.schema.{MetadataKeys, MetadataValues}
import za.co.absa.standardization.time.DateTimePattern
import za.co.absa.standardization.typeClasses.{DoubleLike, LongLike}
import za.co.absa.standardization.types.parsers._
import za.co.absa.standardization.validation.field._

import scala.util.{Failure, Success, Try}

sealed abstract class TypedStructField(structField: StructField)(implicit defaults: TypeDefaults)
extends StructFieldEnhancements(structField) with Serializable {
sealed abstract class TypedStructField(val structField: StructField)(implicit defaults: TypeDefaults)
extends Serializable {

type BaseType

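The refactor above drops the inheritance from `StructFieldEnhancements` (no longer extended in spark-commons 0.3.x) and exposes the wrapped field as a `val` instead. A minimal sketch of what this means for callers, using only the standard Spark `Metadata` API (the import path and `key` parameter are illustrative):

```scala
// Assumed import path for TypedStructField within this project.
import za.co.absa.standardization.types.TypedStructField

// Sketch: with `structField` now a public val, callers access the wrapped
// StructField directly instead of relying on inherited enrichment methods.
def fieldHasMetadataKey(field: TypedStructField, key: String): Boolean =
  field.structField.metadata.contains(key) // Metadata.contains is standard Spark API
```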
StandardizationInterpreterSuite.scala
@@ -343,7 +343,7 @@ class StandardizationInterpreterSuite extends AnyFunSuite with SparkTestBase wit


val srcString:String = FileReader.readFileAsString("src/test/resources/data/patients.json")
val src = JsonUtils.getDataFrameFromJson(spark, Seq(srcString))
val src = JsonUtils.getDataFrameFromJson(Seq(srcString))

logDataFrameContent(src)

StandardizationInterpreter_ArraySuite.scala
@@ -208,7 +208,7 @@ class StandardizationInterpreter_ArraySuite extends AnyFunSuite with SparkTestBa
val seq = Seq(
s"""{"$fieldName": [["a", "bb", "ccc"],["1", "12"],["Hello", null, "World"]]}"""
)
val src = JsonUtils.getDataFrameFromJson(spark, seq)
val src = JsonUtils.getDataFrameFromJson(seq)

val subArrayJson = """{"type": "array", "elementType": "string", "containsNull": false}"""
val desiredSchema = generateDesiredSchema(subArrayJson, s""""${MetadataKeys.DefaultValue}": "Nope"""")
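Both test updates reflect the same spark-commons 0.3.x API change: `JsonUtils.getDataFrameFromJson` no longer takes the `SparkSession` as an explicit first argument. A hedged sketch of the new call style, assuming the session is now supplied implicitly (the import path and JSON payload are illustrative; verify against spark-commons 0.3.1):

```scala
import org.apache.spark.sql.{DataFrame, SparkSession}
import za.co.absa.spark.commons.utils.JsonUtils // assumed import path

implicit val spark: SparkSession = SparkSession.builder()
  .master("local[*]")
  .appName("json-to-df-example")
  .getOrCreate()

// One DataFrame row per JSON document in the sequence.
val src: DataFrame = JsonUtils.getDataFrameFromJson(Seq("""{"name": "test", "value": 1}"""))
```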
