From 850a13d40001efcb60f547ae0a73c81d1b6f60f5 Mon Sep 17 00:00:00 2001 From: Edward Cho Date: Mon, 7 Aug 2023 19:00:52 -0400 Subject: [PATCH 01/21] Update release version to 2.0.4-spark-3.3 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 39545d252..44aadccaa 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.amazon.deequ deequ - 2.0.3-spark-3.3 + 2.0.4-spark-3.3 1.8 From 50a57905ca802b09c2a4dba87338c6d93b8e12c5 Mon Sep 17 00:00:00 2001 From: Fabio Buso Date: Tue, 15 Sep 2020 22:45:32 +0200 Subject: [PATCH 02/21] Change groupId and publish on archiva (#2) --- pom.xml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 44aadccaa..9798ec271 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - com.amazon.deequ + com.logicalclocks deequ 2.0.4-spark-3.3 @@ -416,4 +416,12 @@ + + + Hops + Hops Repo + https://archiva.hops.works/repository/Hops/ + + + From d91b861e4d60622d005b405fe2c4540545639bdd Mon Sep 17 00:00:00 2001 From: Moritz Meister <8422705+moritzmeister@users.noreply.github.com> Date: Wed, 16 Sep 2020 14:33:08 +0200 Subject: [PATCH 03/21] Adapt profiler for hsfs (#1) * Add correlation * make histograms configurable add uniqueness --- deequ-scalastyle.xml | 2 +- .../amazon/deequ/profiles/ColumnProfile.scala | 29 +++- .../deequ/profiles/ColumnProfiler.scala | 155 +++++++++++++----- .../profiles/ColumnProfilerRunBuilder.scala | 20 +++ .../deequ/profiles/ColumnProfilerRunner.scala | 4 + 5 files changed, 162 insertions(+), 48 deletions(-) diff --git a/deequ-scalastyle.xml b/deequ-scalastyle.xml index b5e9680a3..c726413bc 100644 --- a/deequ-scalastyle.xml +++ b/deequ-scalastyle.xml @@ -35,7 +35,7 @@ - + diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala index 543936824..39b54d508 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala @@ -24,6 +24,9 @@ import com.google.gson.{Gson, GsonBuilder, JsonArray, JsonObject, JsonPrimitive} abstract class ColumnProfile { def column: String def completeness: Double + def distinctness: Double + def entropy: Double + def uniqueness: Double def approximateNumDistinctValues: Long def dataType: DataTypeInstances.Value def isDataTypeInferred: Boolean @@ -34,6 +37,9 @@ abstract class ColumnProfile { case class StandardColumnProfile( column: String, completeness: Double, + distinctness: Double, + entropy: Double, + uniqueness: Double, approximateNumDistinctValues: Long, dataType: DataTypeInstances.Value, isDataTypeInferred: Boolean, @@ -56,6 +62,9 @@ case class StringColumnProfile( case class NumericColumnProfile( column: String, completeness: Double, + distinctness: Double, + entropy: Double, + uniqueness: Double, approximateNumDistinctValues: Long, dataType: DataTypeInstances.Value, isDataTypeInferred: Boolean, @@ -67,7 +76,8 @@ case class NumericColumnProfile( minimum: Option[Double], sum: Option[Double], stdDev: Option[Double], - approxPercentiles: Option[Seq[Double]]) + approxPercentiles: Option[Seq[Double]], + correlation: Option[Map[String, Double]]) extends ColumnProfile case class ColumnProfiles( @@ -98,6 +108,9 @@ object ColumnProfiles { } columnProfileJson.addProperty("completeness", profile.completeness) + columnProfileJson.addProperty("distinctness", profile.distinctness) + 
columnProfileJson.addProperty("entropy", profile.entropy) + columnProfileJson.addProperty("uniqueness", profile.uniqueness) columnProfileJson.addProperty("approximateNumDistinctValues", profile.approximateNumDistinctValues) @@ -134,6 +147,18 @@ object ColumnProfiles { columnProfileJson.addProperty("stdDev", stdDev) } + // correlation + if (numericColumnProfile.correlation.isDefined) { + val correlationsJson = new JsonArray + numericColumnProfile.correlation.get.foreach { correlation => + val correlationJson = new JsonObject() + correlationJson.addProperty("column", correlation._1) + correlationJson.addProperty("correlation", correlation._2) + correlationsJson.add(correlationJson) + } + columnProfileJson.add("correlations", correlationsJson) + } + // KLL Sketch if (numericColumnProfile.kll.isDefined) { val kllSketch = numericColumnProfile.kll.get @@ -182,7 +207,7 @@ object ColumnProfiles { json.add("columns", columns) val gson = new GsonBuilder() - .setPrettyPrinting() + // .setPrettyPrinting() .create() gson.toJson(json) diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala index 57c8c3019..d4c42bd42 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala @@ -49,6 +49,9 @@ private[deequ] case class GenericColumnStatistics( typeDetectionHistograms: Map[String, Map[String, Long]], approximateNumDistincts: Map[String, Long], completenesses: Map[String, Double], + distinctness: Map[String, Double], + entropy: Map[String, Double], + uniqueness: Map[String, Double], predefinedTypes: Map[String, DataTypeInstances.Value]) { def typeOf(column: String): DataTypeInstances.Value = { @@ -69,7 +72,8 @@ private[deequ] case class NumericColumnStatistics( maxima: Map[String, Double], sums: Map[String, Double], kll: Map[String, BucketDistribution], - approxPercentiles: Map[String, Seq[Double]] + approxPercentiles: Map[String, Seq[Double]], + correlation: Map[String, Map[String, Double]] ) private[deequ] case class CategoricalColumnStatistics(histograms: Map[String, Distribution]) @@ -119,6 +123,8 @@ object ColumnProfiler { reuseExistingResultsUsingKey: Option[ResultKey] = None, failIfResultsForReusingMissing: Boolean = false, saveInMetricsRepositoryUsingKey: Option[ResultKey] = None, + correlation: Boolean = true, + histogram: Boolean = true, kllProfiling: Boolean = false, kllParameters: Option[KLLParameters] = None, predefinedTypes: Map[String, DataTypeInstances.Value] = Map.empty) @@ -179,7 +185,7 @@ object ColumnProfiler { // We compute mean, stddev, min, max for all numeric columns val analyzersForSecondPass = getAnalyzersForSecondPass(relevantColumns, - genericStatistics, kllProfiling, kllParameters) + genericStatistics, kllProfiling, kllParameters, correlation) var analysisRunnerSecondPass = AnalysisRunner .onData(castedDataForSecondPass) @@ -196,39 +202,45 @@ object ColumnProfiler { val numericStatistics = extractNumericStatistics(secondPassResults) - // Third pass - if (printStatusUpdates) { - println("### PROFILING: Computing histograms of low-cardinality columns in pass (3/3)...") - } - - // We compute exact histograms for all low-cardinality string columns, find those here - val targetColumnsForHistograms = findTargetColumnsForHistograms(data.schema, genericStatistics, - lowCardinalityHistogramThreshold) - - // Find out, if we have values for those we can reuse - val analyzerContextExistingValues = 
getAnalyzerContextWithHistogramResultsForReusingIfNecessary( - metricsRepository, - reuseExistingResultsUsingKey, - targetColumnsForHistograms - ) - - // The columns we need to calculate the histograms for - val nonExistingHistogramColumns = targetColumnsForHistograms - .filter { column => analyzerContextExistingValues.metricMap.get(Histogram(column)).isEmpty } - - // Calculate and save/append results if necessary - val histograms: Map[String, Distribution] = getHistogramsForThirdPass( - data, - nonExistingHistogramColumns, - analyzerContextExistingValues, - printStatusUpdates, - failIfResultsForReusingMissing, - metricsRepository, - saveInMetricsRepositoryUsingKey) + val thirdPassResults = histogram match { + case true => + // Third pass + if (printStatusUpdates) { + println("### PROFILING: Computing histograms of low-cardinality columns in pass (3/3)...") + } - val thirdPassResults = CategoricalColumnStatistics(histograms) + // We compute exact histograms for all low-cardinality string columns, find those here + val targetColumnsForHistograms = findTargetColumnsForHistograms(data.schema, + genericStatistics, lowCardinalityHistogramThreshold) + + // Find out, if we have values for those we can reuse + val analyzerContextExistingValues = + getAnalyzerContextWithHistogramResultsForReusingIfNecessary( + metricsRepository, + reuseExistingResultsUsingKey, + targetColumnsForHistograms + ) + + // The columns we need to calculate the histograms for + val nonExistingHistogramColumns = targetColumnsForHistograms + .filter { column => + analyzerContextExistingValues.metricMap.get(Histogram(column)).isEmpty } + + // Calculate and save/append results if necessary + val histograms: Map[String, Distribution] = getHistogramsForThirdPass( + data, + nonExistingHistogramColumns, + analyzerContextExistingValues, + printStatusUpdates, + failIfResultsForReusingMissing, + metricsRepository, + saveInMetricsRepositoryUsingKey) + histograms + case _ => Map.empty[String, Distribution] + } - createProfiles(relevantColumns, genericStatistics, stringStatistics, numericStatistics, thirdPassResults) + createProfiles(relevantColumns, genericStatistics, stringStatistics, numericStatistics, + CategoricalColumnStatistics(thirdPassResults)) } private[this] def getRelevantColumns( @@ -261,10 +273,12 @@ object ColumnProfiler { Seq( Completeness(name), ApproxCountDistinct(name), DataType(name), MinLength(name, analyzerOptions = Some(analyzerOptions)), - MaxLength(name, analyzerOptions = Some(analyzerOptions)) + MaxLength(name, analyzerOptions = Some(analyzerOptions)), + Uniqueness(name), Distinctness(name), Entropy(name) ) } else { - Seq(Completeness(name), ApproxCountDistinct(name)) + Seq(Completeness(name), ApproxCountDistinct(name), Uniqueness(name), + Distinctness(name), Entropy(name)) } } } @@ -273,17 +287,23 @@ object ColumnProfiler { relevantColumnNames: Seq[String], genericStatistics: GenericColumnStatistics, kllProfiling: Boolean, - kllParameters: Option[KLLParameters] = None) + kllParameters: Option[KLLParameters] = None, + correlation: Boolean) : Seq[Analyzer[_, Metric[_]]] = { - relevantColumnNames + val numericColumnNames = relevantColumnNames .filter { name => Set(Integral, Fractional).contains(genericStatistics.typeOf(name)) } - .flatMap { name => getNumericColAnalyzers(name, kllProfiling, kllParameters) } + numericColumnNames + .flatMap { name => + getNumericColAnalyzers(name, kllProfiling, kllParameters, correlation, numericColumnNames) + } } private[this] def getNumericColAnalyzers( column: String, 
kllProfiling: Boolean, - kllParameters: Option[KLLParameters]) + kllParameters: Option[KLLParameters], + correlation: Boolean, + numericColumnNames: Seq[String]) : Seq[Analyzer[_, Metric[_]]] = { val mandatoryAnalyzers = Seq(Minimum(column), Maximum(column), Mean(column), StandardDeviation(column), Sum(column)) @@ -294,7 +314,13 @@ object ColumnProfiler { Seq.empty } - mandatoryAnalyzers ++ optionalAnalyzers + val correlationAnalyzers = if (correlation) { + numericColumnNames.map(x => Correlation(column, x)) + } else { + Seq.empty + } + + mandatoryAnalyzers ++ optionalAnalyzers ++ correlationAnalyzers } private[this] def setMetricsRepositoryConfigurationIfNecessary( @@ -446,9 +472,25 @@ object ColumnProfiler { analyzer.column -> metric.value.get } + val entropy = results.metricMap + .collect { case (analyzer: Entropy, metric: DoubleMetric) => + analyzer.column -> metric.value.get + } + + val uniqueness = results.metricMap + .collect { case (analyzer: Uniqueness, metric: DoubleMetric) => + // we only compute uniqueness for single columns + analyzer.columns.head -> metric.value.get + } + + val distinctness = results.metricMap + .collect { case (analyzer: Distinctness, metric: DoubleMetric) => + analyzer.columns.head -> metric.value.get + } + val knownTypes = schema.fields .filter { column => columns.contains(column.name) } - .filterNot { column => predefinedTypes.contains(column.name)} + .filterNot { column => predefinedTypes.contains(column.name) } .filter { _.dataType != StringType } @@ -468,7 +510,7 @@ object ColumnProfiler { .toMap GenericColumnStatistics(numRecords, inferredTypes, knownTypes, typeDetectionHistograms, - approximateNumDistincts, completenesses, predefinedTypes) + approximateNumDistincts, completenesses, distinctness, entropy, uniqueness, predefinedTypes) } @@ -566,7 +608,7 @@ object ColumnProfiler { .toMap val approxPercentiles = results.metricMap - .collect { case (analyzer: KLLSketch, metric: KLLMetric) => + .collect { case (analyzer: KLLSketch, metric: KLLMetric) => metric.value match { case Success(bucketDistribution) => @@ -579,8 +621,20 @@ object ColumnProfiler { .flatten .toMap + val correlation = results.metricMap + .collect { case (analyzer: Correlation, metric: DoubleMetric) => + metric.value match { + case Success(metricValue) => + Some(analyzer.firstColumn -> Map(analyzer.secondColumn -> metricValue)) + case _ => None + } + } + .flatten + .groupBy(_._1) + .map { case (key, value) => value.reduce((x, y) => x._1 -> (x._2.toSeq ++ y._2.toSeq).toMap) } - NumericColumnStatistics(means, stdDevs, minima, maxima, sums, kll, approxPercentiles) + NumericColumnStatistics(means, stdDevs, minima, maxima, sums, kll, + approxPercentiles, correlation) } /* Identifies all columns, which: @@ -723,6 +777,9 @@ object ColumnProfiler { .map { name => val completeness = genericStats.completenesses(name) + val distinctness = genericStats.distinctness(name) + val entropy = genericStats.entropy(name) + val uniqueness = genericStats.uniqueness(name) val approxNumDistinct = genericStats.approximateNumDistincts(name) val dataType = genericStats.typeOf(name) val isDataTypeInferred = genericStats.inferredTypes.contains(name) @@ -736,6 +793,9 @@ object ColumnProfiler { NumericColumnProfile( name, completeness, + distinctness, + entropy, + uniqueness, approxNumDistinct, dataType, isDataTypeInferred, @@ -747,7 +807,9 @@ object ColumnProfiler { numericStats.minima.get(name), numericStats.sums.get(name), numericStats.stdDevs.get(name), - numericStats.approxPercentiles.get(name)) + 
numericStats.approxPercentiles.get(name), + numericStats.correlation.get(name) + ) case String => StringColumnProfile( @@ -766,6 +828,9 @@ object ColumnProfiler { StandardColumnProfile( name, completeness, + distinctness, + entropy, + uniqueness, approxNumDistinct, dataType, isDataTypeInferred, diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunBuilder.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunBuilder.scala index 5ac181951..14e3297ad 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunBuilder.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunBuilder.scala @@ -39,6 +39,8 @@ class ColumnProfilerRunBuilder(val data: DataFrame) { protected var saveColumnProfilesJsonPath: Option[String] = None protected var saveConstraintSuggestionsJsonPath: Option[String] = None protected var saveEvaluationResultsJsonPath: Option[String] = None + protected var correlation = true + protected var histogram = true protected var kllProfiling = false protected var kllParameters: Option[KLLParameters] = None protected var predefinedTypes: Map[String, DataTypeInstances.Value] = Map.empty @@ -110,6 +112,22 @@ class ColumnProfilerRunBuilder(val data: DataFrame) { this } + /** + * Enable correlation profiling on Numerical columns, enabled by default. + */ + def withCorrelation(correlation: Boolean): this.type = { + this.correlation = correlation + this + } + + /** + * Enable histogram profiling on Numerical columns, enabled by default. + */ + def withHistogram(histogram: Boolean): this.type = { + this.histogram = histogram + this + } + /** * Enable KLL Sketches profiling on Numerical columns, disabled by default. */ @@ -180,6 +198,8 @@ class ColumnProfilerRunBuilder(val data: DataFrame) { reuseExistingResultsKey, failIfResultsForReusingMissing, saveOrAppendResultsKey), + correlation, + histogram, kllProfiling, kllParameters, predefinedTypes diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunner.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunner.scala index 768173053..a02a5d4ee 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunner.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunner.scala @@ -48,6 +48,8 @@ class ColumnProfilerRunner { cacheInputs: Boolean, fileOutputOptions: ColumnProfilerRunBuilderFileOutputOptions, metricsRepositoryOptions: ColumnProfilerRunBuilderMetricsRepositoryOptions, + correlation: Boolean, + histogram: Boolean, kllProfiling: Boolean, kllParameters: Option[KLLParameters], predefinedTypes: Map[String, DataTypeInstances.Value]) @@ -67,6 +69,8 @@ class ColumnProfilerRunner { metricsRepositoryOptions.reuseExistingResultsKey, metricsRepositoryOptions.failIfResultsForReusingMissing, metricsRepositoryOptions.saveOrAppendResultsKey, + correlation, + histogram, kllProfiling, kllParameters, predefinedTypes From bfa462e1af343c54cbcc869a8892b12fd837fbb3 Mon Sep 17 00:00:00 2001 From: Moritz Meister <8422705+moritzmeister@users.noreply.github.com> Date: Thu, 17 Sep 2020 14:04:29 +0200 Subject: [PATCH 04/21] make tests compile (#3) --- .../com/amazon/deequ/KLL/KLLProfileTest.scala | 11 ++- .../deequ/profiles/ColumnProfilerTest.scala | 54 +++++++++--- .../rules/ConstraintRulesTest.scala | 82 +++++++++---------- 3 files changed, 95 insertions(+), 52 deletions(-) diff --git a/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala index 6ee81a7d3..119112e56 100644 --- 
a/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala +++ b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala @@ -64,6 +64,9 @@ class KLLProfileTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "att1", 1.0, + 1.0, + 1.0, + 1.0, 6, DataTypeInstances.Fractional, false, @@ -85,7 +88,9 @@ class KLLProfileTest extends WordSpec with Matchers with SparkContextSpec 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, - 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0))) + 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0)), + None + ) assertProfilesEqual(expectedColumnProfile, actualColumnProfile.asInstanceOf[NumericColumnProfile]) @@ -104,6 +109,9 @@ class KLLProfileTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "att1", 1.0, + 1.0, + 1.0, + 1.0, 30, DataTypeInstances.Fractional, false, @@ -120,6 +128,7 @@ class KLLProfileTest extends WordSpec with Matchers with SparkContextSpec Some(1.0), Some(465.0), Some(8.65544144839919), + None, None) assertProfilesEqual(expectedColumnProfile, diff --git a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala index 6eabc8f8a..62b953fdc 100644 --- a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala +++ b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala @@ -60,6 +60,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = StringColumnProfile( "att2", 2.0 / 3.0, + 1.0, + 1.0, + 1.0, 2, DataTypeInstances.String, true, @@ -111,6 +114,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = StringColumnProfile( "item", 1.0, + 1.0, + 1.0, + 1.0, 6, DataTypeInstances.String, false, @@ -134,6 +140,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = StringColumnProfile( "att2", 2.0 / 3.0, + 1.0, + 1.0, + 1.0, 2, DataTypeInstances.String, true, @@ -163,6 +172,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "item", 1.0, + 1.0, + 1.0, + 1.0, 6, DataTypeInstances.Integral, true, @@ -186,7 +198,8 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, - 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0))) + 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0)), + None) assertProfilesEqual(expectedColumnProfile, actualColumnProfile.asInstanceOf[NumericColumnProfile]) @@ -203,6 +216,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "item", 1.0, + 1.0, + 1.0, + 1.0, 6, DataTypeInstances.Integral, true, @@ -226,7 +242,8 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 
5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, - 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0))) + 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0)), + None) assertProfilesEqual(expectedColumnProfile, actualColumnProfile.asInstanceOf[NumericColumnProfile]) @@ -244,6 +261,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "item", 1.0, + 1.0, + 1.0, + 1.0, 6, DataTypeInstances.Integral, true, @@ -303,7 +323,8 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, - 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0))) + 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0)), + None) assertProfilesEqual(expectedColumnProfile, actualColumnProfile.asInstanceOf[NumericColumnProfile]) @@ -320,6 +341,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "att1", 1.0, + 1.0, + 1.0, + 1.0, 6, DataTypeInstances.Fractional, false, @@ -337,7 +361,8 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, - 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0))) + 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0)), + None) assertProfilesEqual(expectedColumnProfile, actualColumnProfile.asInstanceOf[NumericColumnProfile]) @@ -353,6 +378,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = StringColumnProfile( "att2", 2.0 / 3.0, + 1.0, + 1.0, + 1.0, 2, DataTypeInstances.String, isDataTypeInferred = true, @@ -561,6 +589,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec StandardColumnProfile( "PassengerId", 1.0, + 1.0, + 1.0, + 1.0, 891, DataTypeInstances.Integral, false, @@ -569,17 +600,20 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec StandardColumnProfile( "Survived", 1.0, + 1.0, + 1.0, + 1.0, 2, DataTypeInstances.Integral, false, Map.empty, None), - StandardColumnProfile("Pclass", 1.0, 3, DataTypeInstances.Integral, false, Map.empty, None), - StandardColumnProfile("Name", 1.0, 0, DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Sex", 1.0, 2, DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Ticket", 1.0, 681, DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Fare", 1.0, 0, DataTypeInstances.Fractional, false, Map.empty, None), - StandardColumnProfile("Cabin", 0.22, 0, DataTypeInstances.String, true, Map.empty, None) + StandardColumnProfile("Pclass", 1.0, 1.0, 1.0, 1.0, 3, DataTypeInstances.Integral, false, Map.empty, None), + StandardColumnProfile("Name", 1.0, 1.0, 1.0, 1.0, 0, DataTypeInstances.String, true, Map.empty, None), + StandardColumnProfile("Sex", 1.0, 1.0, 1.0, 1.0, 2, DataTypeInstances.String, true, Map.empty, None), + StandardColumnProfile("Ticket", 1.0, 1.0, 1.0, 1.0, 681, DataTypeInstances.String, true, Map.empty, None), + StandardColumnProfile("Fare", 1.0, 1.0, 1.0, 1.0, 0, 
DataTypeInstances.Fractional, false, Map.empty, None), + StandardColumnProfile("Cabin", 0.22, 1.0, 1.0, 1.0, 0, DataTypeInstances.String, true, Map.empty, None) ) assertSameColumnProfiles(columnProfiles.profiles, expectedProfiles) diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala index 075247932..f9cd9dc4d 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala @@ -34,8 +34,8 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "CompleteIfCompleteRule" should { "be applied correctly" in { - val complete = StandardColumnProfile("col1", 1.0, 100, String, false, Map.empty, None) - val incomplete = StandardColumnProfile("col1", .25, 100, String, false, Map.empty, None) + val complete = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, None) + val incomplete = StandardColumnProfile("col1", .25, 1.0, 1.0, 1.0, 100, String, false, Map.empty, None) val completeInteger = getFakeNumericColumnProfileWithMinMaxMeanAndStdDev( @@ -129,8 +129,8 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "RetainCompletenessRule" should { "be applied correctly" in { - val complete = StandardColumnProfile("col1", 1.0, 100, String, false, Map.empty, None) - val incomplete = StandardColumnProfile("col1", .25, 100, String, false, Map.empty, None) + val complete = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, None) + val incomplete = StandardColumnProfile("col1", .25, 1.0, 1.0, 1.0, 100, String, false, Map.empty, None) assert(!RetainCompletenessRule().shouldBeApplied(complete, 1000)) assert(RetainCompletenessRule().shouldBeApplied(incomplete, 1000)) @@ -188,10 +188,10 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "UniqueIfApproximatelyUniqueRule" should { "be applied correctly" in { - val unique = StandardColumnProfile("col1", 1.0, 100, String, false, Map.empty, None) - val maybeUnique = StandardColumnProfile("col1", 1.0, 95, String, false, Map.empty, None) - val maybeNonUnique = StandardColumnProfile("col1", 1.0, 91, String, false, Map.empty, None) - val nonUnique = StandardColumnProfile("col1", 1.0, 20, String, false, Map.empty, None) + val unique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, None) + val maybeUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, String, false, Map.empty, None) + val maybeNonUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 91, String, false, Map.empty, None) + val nonUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, String, false, Map.empty, None) assert(UniqueIfApproximatelyUniqueRule().shouldBeApplied(unique, 100)) assert(UniqueIfApproximatelyUniqueRule().shouldBeApplied(maybeUnique, 100)) @@ -251,19 +251,19 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "RetainTypeRule" should { "be applied correctly" in { - val string = StandardColumnProfile("col1", 1.0, 100, String, true, Map.empty, None) - val boolean = StandardColumnProfile("col1", 1.0, 100, Boolean, true, Map.empty, None) - val fractional = StandardColumnProfile("col1", 1.0, 100, Fractional, true, Map.empty, None) - val integer = StandardColumnProfile("col1", 1.0, 100, Integral, true, Map.empty, None) - val unknown = 
StandardColumnProfile("col1", 1.0, 100, Unknown, true, Map.empty, None) + val string = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, true, Map.empty, None) + val boolean = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Boolean, true, Map.empty, None) + val fractional = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Fractional, true, Map.empty, None) + val integer = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Integral, true, Map.empty, None) + val unknown = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Unknown, true, Map.empty, None) - val stringNonInferred = StandardColumnProfile("col1", 1.0, 100, String, false, Map.empty, + val stringNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, None) - val booleanNonInferred = StandardColumnProfile("col1", 1.0, 100, Boolean, false, Map.empty, + val booleanNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Boolean, false, Map.empty, None) - val fractionalNonInferred = StandardColumnProfile("col1", 1.0, 100, Fractional, false, + val fractionalNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Fractional, false, Map.empty, None) - val integerNonInferred = StandardColumnProfile("col1", 1.0, 100, Integral, false, + val integerNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Integral, false, Map.empty, None) assert(!RetainTypeRule().shouldBeApplied(string, 100)) @@ -381,24 +381,24 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val noDistribution = Distribution(Map.empty, 0) - val stringWithNonSkewedDist = StandardColumnProfile("col1", 1.0, 100, String, false, + val stringWithNonSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, Some(nonSkewedDist)) - val integralWithNonSkewedDist = StandardColumnProfile("col1", 1.0, + val integralWithNonSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, Some(nonSkewedIntegralDist)) - val stringWithFlgDist = StandardColumnProfile("flg", 1.0, + val stringWithFlgDist = StandardColumnProfile("flg", 1.0, 1.0, 1.0, 1.0, 2, String, false, Map.empty, Some(flgDist)) - val integralWithFlgDist = StandardColumnProfile("flg", 1.0, + val integralWithFlgDist = StandardColumnProfile("flg", 1.0, 1.0, 1.0, 1.0, 2, DataTypeInstances.Integral, false, Map.empty, Some(flgDist)) - val stringWithSkewedDist = StandardColumnProfile("col1", 1.0, 100, String, false, + val stringWithSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, Some(skewedDist)) - val stringNoDist = StandardColumnProfile("col1", 1.0, 95, String, false, Map.empty, None) - val boolNoDist = StandardColumnProfile("col1", 1.0, 94, Boolean, false, Map.empty, None) - val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, 20, Boolean, false, Map.empty, + val stringNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, String, false, Map.empty, None) + val boolNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 94, Boolean, false, Map.empty, None) + val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, Boolean, false, Map.empty, Some(noDistribution)) - val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, + val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, Some(skewedDist)) - val integralNoDist = 
StandardColumnProfile("col1", 1.0, + val integralNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, DataTypeInstances.Integral, false, Map.empty, None) assert(CategoricalRangeRule().shouldBeApplied(stringWithNonSkewedDist, 100)) @@ -561,30 +561,30 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val noDistribution = Distribution(Map.empty, 0) - val stringWithNonSkewedDistWithFractionalCategoricalRange = StandardColumnProfile("col1", 1.0, + val stringWithNonSkewedDistWithFractionalCategoricalRange = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, Some(nonSkewedDistWithFractionalCategoricalRange)) - val stringWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile("col1", 1.0, + val stringWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, Some(nonSkewedDistWithActualCategoricalRange)) - val stringWithSomewhatSkewedDist = StandardColumnProfile("col1", 1.0, 100, String, false, + val stringWithSomewhatSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, Some(somewhatSkewedDist)) - val stringWithSkewedDist = StandardColumnProfile("col1", 1.0, 100, String, false, + val stringWithSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, Some(skewedDist)) - val stringNoDist = StandardColumnProfile("col1", 1.0, 95, String, false, Map.empty, None) - val boolNoDist = StandardColumnProfile("col1", 1.0, 94, Boolean, false, Map.empty, None) - val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, 20, Boolean, false, Map.empty, + val stringNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, String, false, Map.empty, None) + val boolNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 94, Boolean, false, Map.empty, None) + val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, Boolean, false, Map.empty, Some(noDistribution)) val integralWithNonSkewedDistWithFractionalCategoricalRange = StandardColumnProfile("col1", - 1.0, 100, DataTypeInstances.Integral, false, Map.empty, + 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, Some(nonSkewedIntegralDistWithFractionalCategoricalRange)) - val integralWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile("col1", 1.0, + val integralWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, Some(nonSkewedIntegralDistWithActualCategoricalRange)) - val integralWithSomewhatSkewedDist = StandardColumnProfile("col1", 1.0, + val integralWithSomewhatSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, Some(somewhatSkewedIntegralDist)) - val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, + val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, Some(skewedIntegralDist)) - val integralNoDist = StandardColumnProfile("col1", 1.0, + val integralNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, DataTypeInstances.Integral, false, Map.empty, None) assert(FractionalCategoricalRangeRule().shouldBeApplied(stringWithSomewhatSkewedDist, 100)) @@ -708,8 +708,8 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "NonNegativeNumbersRule and PositiveNumbersRule" should { "be applied correctly" 
in { def columnProfileWithMinimum(minimum: Double): NumericColumnProfile = { - NumericColumnProfile("col1", 1.0, 100, Fractional, isDataTypeInferred = false, - Map.empty, None, None, Some(10), Some(100), Some(minimum), Some(10000), Some(1.0), None) + NumericColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Fractional, isDataTypeInferred = false, + Map.empty, None, None, Some(10), Some(100), Some(minimum), Some(10000), Some(1.0), None, None) } val nRecords = 100 From d6753dbde08ed3c9fd735d4e79d93b0677b7cc34 Mon Sep 17 00:00:00 2001 From: Moritz Meister <8422705+moritzmeister@users.noreply.github.com> Date: Thu, 17 Sep 2020 15:01:19 +0200 Subject: [PATCH 05/21] Fix tests checkstyle and 4 tests (#4) --- .../deequ/profiles/ColumnProfilerTest.scala | 38 ++++--- .../rules/ConstraintRulesTest.scala | 106 ++++++++++-------- 2 files changed, 84 insertions(+), 60 deletions(-) diff --git a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala index 62b953fdc..5f92df0f8 100644 --- a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala +++ b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala @@ -60,9 +60,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = StringColumnProfile( "att2", 2.0 / 3.0, - 1.0, - 1.0, - 1.0, + 0.5, + 0.5623351446188083, + 0.25, 2, DataTypeInstances.String, true, @@ -115,7 +115,7 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec "item", 1.0, 1.0, - 1.0, + 1.791759469228055, 1.0, 6, DataTypeInstances.String, @@ -140,9 +140,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = StringColumnProfile( "att2", 2.0 / 3.0, - 1.0, - 1.0, - 1.0, + 0.5, + 0.5623351446188083, + 0.25, 2, DataTypeInstances.String, true, @@ -378,9 +378,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = StringColumnProfile( "att2", 2.0 / 3.0, - 1.0, - 1.0, - 1.0, + 0.5, + 0.5623351446188083, + 0.25, 2, DataTypeInstances.String, isDataTypeInferred = true, @@ -608,12 +608,18 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec false, Map.empty, None), - StandardColumnProfile("Pclass", 1.0, 1.0, 1.0, 1.0, 3, DataTypeInstances.Integral, false, Map.empty, None), - StandardColumnProfile("Name", 1.0, 1.0, 1.0, 1.0, 0, DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Sex", 1.0, 1.0, 1.0, 1.0, 2, DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Ticket", 1.0, 1.0, 1.0, 1.0, 681, DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Fare", 1.0, 1.0, 1.0, 1.0, 0, DataTypeInstances.Fractional, false, Map.empty, None), - StandardColumnProfile("Cabin", 0.22, 1.0, 1.0, 1.0, 0, DataTypeInstances.String, true, Map.empty, None) + StandardColumnProfile("Pclass", 1.0, 1.0, 1.0, 1.0, 3, + DataTypeInstances.Integral, false, Map.empty, None), + StandardColumnProfile("Name", 1.0, 1.0, 1.0, 1.0, 0, + DataTypeInstances.String, true, Map.empty, None), + StandardColumnProfile("Sex", 1.0, 1.0, 1.0, 1.0, 2, + DataTypeInstances.String, true, Map.empty, None), + StandardColumnProfile("Ticket", 1.0, 1.0, 1.0, 1.0, 681, + DataTypeInstances.String, true, Map.empty, None), + StandardColumnProfile("Fare", 1.0, 1.0, 1.0, 1.0, 0, + DataTypeInstances.Fractional, false, Map.empty, None), + StandardColumnProfile("Cabin", 0.22, 1.0, 1.0, 1.0, 0, + 
DataTypeInstances.String, true, Map.empty, None) ) assertSameColumnProfiles(columnProfiles.profiles, expectedProfiles) diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala index f9cd9dc4d..72eab3fd3 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala @@ -34,8 +34,10 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "CompleteIfCompleteRule" should { "be applied correctly" in { - val complete = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, None) - val incomplete = StandardColumnProfile("col1", .25, 1.0, 1.0, 1.0, 100, String, false, Map.empty, None) + val complete = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + String, false, Map.empty, None) + val incomplete = StandardColumnProfile("col1", .25, 1.0, 1.0, 1.0, 100, + String, false, Map.empty, None) val completeInteger = getFakeNumericColumnProfileWithMinMaxMeanAndStdDev( @@ -129,8 +131,10 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "RetainCompletenessRule" should { "be applied correctly" in { - val complete = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, None) - val incomplete = StandardColumnProfile("col1", .25, 1.0, 1.0, 1.0, 100, String, false, Map.empty, None) + val complete = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + String, false, Map.empty, None) + val incomplete = StandardColumnProfile("col1", .25, 1.0, 1.0, 1.0, 100, + String, false, Map.empty, None) assert(!RetainCompletenessRule().shouldBeApplied(complete, 1000)) assert(RetainCompletenessRule().shouldBeApplied(incomplete, 1000)) @@ -188,10 +192,14 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "UniqueIfApproximatelyUniqueRule" should { "be applied correctly" in { - val unique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, None) - val maybeUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, String, false, Map.empty, None) - val maybeNonUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 91, String, false, Map.empty, None) - val nonUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, String, false, Map.empty, None) + val unique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + String, false, Map.empty, None) + val maybeUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, + String, false, Map.empty, None) + val maybeNonUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 91, + String, false, Map.empty, None) + val nonUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, + String, false, Map.empty, None) assert(UniqueIfApproximatelyUniqueRule().shouldBeApplied(unique, 100)) assert(UniqueIfApproximatelyUniqueRule().shouldBeApplied(maybeUnique, 100)) @@ -251,20 +259,24 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "RetainTypeRule" should { "be applied correctly" in { - val string = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, true, Map.empty, None) - val boolean = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Boolean, true, Map.empty, None) - val fractional = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Fractional, true, Map.empty, None) - val integer = StandardColumnProfile("col1", 
1.0, 1.0, 1.0, 1.0, 100, Integral, true, Map.empty, None) - val unknown = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Unknown, true, Map.empty, None) - - val stringNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, - None) - val booleanNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Boolean, false, Map.empty, - None) - val fractionalNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Fractional, false, - Map.empty, None) - val integerNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Integral, false, - Map.empty, None) + val string = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + String, true, Map.empty, None) + val boolean = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + Boolean, true, Map.empty, None) + val fractional = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + Fractional, true, Map.empty, None) + val integer = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + Integral, true, Map.empty, None) + val unknown = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + Unknown, true, Map.empty, None) + val stringNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + String, false, Map.empty, None) + val booleanNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + Boolean, false, Map.empty, None) + val fractionalNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + Fractional, false, Map.empty, None) + val integerNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + Integral, false, Map.empty, None) assert(!RetainTypeRule().shouldBeApplied(string, 100)) assert(!RetainTypeRule().shouldBeApplied(unknown, 100)) @@ -381,8 +393,8 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val noDistribution = Distribution(Map.empty, 0) - val stringWithNonSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, - Map.empty, Some(nonSkewedDist)) + val stringWithNonSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + String, false, Map.empty, Some(nonSkewedDist)) val integralWithNonSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, Some(nonSkewedIntegralDist)) val stringWithFlgDist = StandardColumnProfile("flg", 1.0, 1.0, 1.0, 1.0, @@ -390,12 +402,14 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val integralWithFlgDist = StandardColumnProfile("flg", 1.0, 1.0, 1.0, 1.0, 2, DataTypeInstances.Integral, false, Map.empty, Some(flgDist)) - val stringWithSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, - Map.empty, Some(skewedDist)) - val stringNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, String, false, Map.empty, None) - val boolNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 94, Boolean, false, Map.empty, None) - val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, Boolean, false, Map.empty, - Some(noDistribution)) + val stringWithSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + String, false, Map.empty, Some(skewedDist)) + val stringNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, + String, false, Map.empty, None) + val boolNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 94, + Boolean, false, Map.empty, None) + val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, + 
Boolean, false, Map.empty, Some(noDistribution)) val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, Some(skewedDist)) val integralNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, @@ -561,24 +575,28 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val noDistribution = Distribution(Map.empty, 0) - val stringWithNonSkewedDistWithFractionalCategoricalRange = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, - 100, String, false, Map.empty, Some(nonSkewedDistWithFractionalCategoricalRange)) - val stringWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, - 100, String, false, Map.empty, Some(nonSkewedDistWithActualCategoricalRange)) - val stringWithSomewhatSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, - Map.empty, Some(somewhatSkewedDist)) - val stringWithSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, - Map.empty, Some(skewedDist)) - val stringNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, String, false, Map.empty, None) - val boolNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 94, Boolean, false, Map.empty, None) - val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, Boolean, false, Map.empty, - Some(noDistribution)) + val stringWithNonSkewedDistWithFractionalCategoricalRange = StandardColumnProfile( + "col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, + Some(nonSkewedDistWithFractionalCategoricalRange)) + val stringWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile( + "col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, + Some(nonSkewedDistWithActualCategoricalRange)) + val stringWithSomewhatSkewedDist = StandardColumnProfile( + "col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, Some(somewhatSkewedDist)) + val stringWithSkewedDist = StandardColumnProfile( + "col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, Some(skewedDist)) + val stringNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, + String, false, Map.empty, None) + val boolNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 94, + Boolean, false, Map.empty, None) + val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, + Boolean, false, Map.empty, Some(noDistribution)) val integralWithNonSkewedDistWithFractionalCategoricalRange = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, Some(nonSkewedIntegralDistWithFractionalCategoricalRange)) - val integralWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, - 100, DataTypeInstances.Integral, false, Map.empty, + val integralWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile( + "col1", 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, Some(nonSkewedIntegralDistWithActualCategoricalRange)) val integralWithSomewhatSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, Some(somewhatSkewedIntegralDist)) From dac9285b6c5cde2a0e7f1e6f562f5b853be48c95 Mon Sep 17 00:00:00 2001 From: Moritz Meister <8422705+moritzmeister@users.noreply.github.com> Date: Thu, 17 Sep 2020 15:15:05 +0200 Subject: [PATCH 06/21] Fix test checkstyle (#5) --- .../amazon/deequ/suggestions/rules/ConstraintRulesTest.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 
deletions(-) diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala index 72eab3fd3..9a90af7ca 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala @@ -726,8 +726,9 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "NonNegativeNumbersRule and PositiveNumbersRule" should { "be applied correctly" in { def columnProfileWithMinimum(minimum: Double): NumericColumnProfile = { - NumericColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Fractional, isDataTypeInferred = false, - Map.empty, None, None, Some(10), Some(100), Some(minimum), Some(10000), Some(1.0), None, None) + NumericColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Fractional, + isDataTypeInferred = false, Map.empty, None, None, Some(10), Some(100), + Some(minimum), Some(10000), Some(1.0), None, None) } val nRecords = 100 From 0007bf194c395c5511ae68be25ca1793e17e383e Mon Sep 17 00:00:00 2001 From: Fabio Buso Date: Tue, 1 Jun 2021 23:17:22 +0200 Subject: [PATCH 07/21] Hopsify Deequ 1.1.0 --- pom.xml | 136 ++++++++++++++++-- .../analyzers/catalyst/DeequFunctions.scala | 2 +- 2 files changed, 123 insertions(+), 15 deletions(-) diff --git a/pom.xml b/pom.xml index 9798ec271..14be4a912 100644 --- a/pom.xml +++ b/pom.xml @@ -5,21 +5,9 @@ 4.0.0 com.logicalclocks - deequ - 2.0.4-spark-3.3 + deequ_${scala.major.version} + 2.0.4.0 - - 1.8 - 1.8 - UTF-8 - - 2.12 - ${scala.major.version}.10 - ${scala.major.version} - 4.8.1 - - 3.3.0 - deequ Deequ is a library built on top of Apache Spark for defining "unit tests for data", @@ -67,6 +55,47 @@ https://github.com/awslabs/deequ + + + + + + 1.8 + 1.8 + UTF-8 + + + ${scala-212.major.version} + ${scala.major.version}.10 + 2.11 + 2.12 + _scala-${scala.major.version} + _spark-${spark.version} + 4.8.1 + + + ${spark-33.version} + 2.2.2 + 2.3.2 + 2.4.2 + 3.0.0 + 3.1.1.0 + 3.3.0.0 + + provided + + org.scala-lang @@ -86,12 +115,14 @@ org.apache.spark spark-core_${scala.major.version} ${spark.version} + ${spark.scope} org.apache.spark spark-sql_${scala.major.version} ${spark.version} + ${spark.scope} @@ -414,8 +445,85 @@ + + + + + + + Hops + Hops Repo + https://archiva.hops.works/repository/Hops/ + + true + + + true + + + + Hops diff --git a/src/main/scala/com/amazon/deequ/analyzers/catalyst/DeequFunctions.scala b/src/main/scala/com/amazon/deequ/analyzers/catalyst/DeequFunctions.scala index dd973b301..3bef80fe1 100644 --- a/src/main/scala/com/amazon/deequ/analyzers/catalyst/DeequFunctions.scala +++ b/src/main/scala/com/amazon/deequ/analyzers/catalyst/DeequFunctions.scala @@ -47,7 +47,7 @@ object DeequFunctions { /** Standard deviation with state */ def stateful_stddev_pop(column: Column): Column = withAggregateFunction { - StatefulStdDevPop(column.expr) + StatefulStdDevPop(column.expr, true) } /** Approximate number of distinct values with state via HLL's */ From a090645ffb25c99b2d6cd4cf69376fe985dc48c2 Mon Sep 17 00:00:00 2001 From: Fabio Buso Date: Mon, 7 Jun 2021 23:57:42 +0200 Subject: [PATCH 08/21] Bump deequ hops version --- pom.xml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 14be4a912..6c17cabc8 100644 --- a/pom.xml +++ b/pom.xml @@ -6,8 +6,7 @@ com.logicalclocks deequ_${scala.major.version} - 2.0.4.0 - + 2.0.4.1 deequ Deequ is a library built on top of Apache Spark for defining "unit tests for data", 
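[Note on the packaging changes in patches 07-08: the artifact is now published under the com.logicalclocks groupId with a Scala-version-suffixed artifactId, resolved from the Hops Archiva repository added in patch 02. A minimal consumer-side sbt sketch of the resulting coordinates, taken from the pom.xml hunks above; the _2.12 suffix assumes the default Scala 2.12 profile, and 2.0.4.1 is the version as of patch 08:

    resolvers += "Hops" at "https://archiva.hops.works/repository/Hops/"
    libraryDependencies += "com.logicalclocks" % "deequ_2.12" % "2.0.4.1"
]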
From 556d55a7e4e3a7b990c52f61fecc2b669344a6bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20de=20la=20R=C3=BAa=20Mart=C3=ADnez?= Date: Mon, 30 Oct 2023 10:27:31 +0100 Subject: [PATCH 09/21] Prepare for 2.0.4.1-SNAPSHOT development --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 6c17cabc8..40c378eb1 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.logicalclocks deequ_${scala.major.version} - 2.0.4.1 + 2.0.4.1-SNAPSHOT deequ Deequ is a library built on top of Apache Spark for defining "unit tests for data", From 3fe618857c3682145df8ffcd9f83425e761763de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Till=20D=C3=B6hmen?= Date: Tue, 10 Aug 2021 11:27:18 +0200 Subject: [PATCH 10/21] Fix for NaNs and Infinity values in profile JSON (#7) Co-authored-by: doehmen-admin --- .../amazon/deequ/profiles/ColumnProfile.scala | 40 ++++++++++++------- .../deequ/profiles/ColumnProfilerTest.scala | 31 ++++++++++++++ 2 files changed, 57 insertions(+), 14 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala index 39b54d508..58ff9ef0b 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala @@ -107,10 +107,10 @@ object ColumnProfiles { } } - columnProfileJson.addProperty("completeness", profile.completeness) - columnProfileJson.addProperty("distinctness", profile.distinctness) - columnProfileJson.addProperty("entropy", profile.entropy) - columnProfileJson.addProperty("uniqueness", profile.uniqueness) + columnProfileJson.addProperty("completeness", normalizeDouble(profile.completeness)) + columnProfileJson.addProperty("distinctness", normalizeDouble(profile.distinctness)) + columnProfileJson.addProperty("entropy", normalizeDouble(profile.entropy)) + columnProfileJson.addProperty("uniqueness", normalizeDouble(profile.uniqueness)) columnProfileJson.addProperty("approximateNumDistinctValues", profile.approximateNumDistinctValues) @@ -122,7 +122,7 @@ object ColumnProfiles { val histogramEntry = new JsonObject() histogramEntry.addProperty("value", name) histogramEntry.addProperty("count", distributionValue.absolute) - histogramEntry.addProperty("ratio", distributionValue.ratio) + histogramEntry.addProperty("ratio", normalizeDouble(distributionValue.ratio)) histogramJson.add(histogramEntry) } @@ -132,19 +132,19 @@ object ColumnProfiles { profile match { case numericColumnProfile: NumericColumnProfile => numericColumnProfile.mean.foreach { mean => - columnProfileJson.addProperty("mean", mean) + columnProfileJson.addProperty("mean", normalizeDouble(mean)) } numericColumnProfile.maximum.foreach { maximum => - columnProfileJson.addProperty("maximum", maximum) + columnProfileJson.addProperty("maximum", normalizeDouble(maximum)) } numericColumnProfile.minimum.foreach { minimum => - columnProfileJson.addProperty("minimum", minimum) + columnProfileJson.addProperty("minimum", normalizeDouble(minimum)) } numericColumnProfile.sum.foreach { sum => - columnProfileJson.addProperty("sum", sum) + columnProfileJson.addProperty("sum", normalizeDouble(sum)) } numericColumnProfile.stdDev.foreach { stdDev => - columnProfileJson.addProperty("stdDev", stdDev) + columnProfileJson.addProperty("stdDev", normalizeDouble(stdDev)) } // correlation @@ -153,7 +153,7 @@ object ColumnProfiles { numericColumnProfile.correlation.get.foreach { correlation => val correlationJson = new JsonObject() 
correlationJson.addProperty("column", correlation._1) - correlationJson.addProperty("correlation", correlation._2) + correlationJson.addProperty("correlation", normalizeDouble(correlation._2)) correlationsJson.add(correlationJson) } columnProfileJson.add("correlations", correlationsJson) @@ -167,8 +167,8 @@ object ColumnProfiles { val tmp = new JsonArray() kllSketch.buckets.foreach{bucket => val entry = new JsonObject() - entry.addProperty("low_value", bucket.lowValue) - entry.addProperty("high_value", bucket.highValue) + entry.addProperty("low_value", normalizeDouble(bucket.lowValue)) + entry.addProperty("high_value", normalizeDouble(bucket.highValue)) entry.addProperty("count", bucket.count) tmp.add(entry) } @@ -206,10 +206,22 @@ object ColumnProfiles { json.add("columns", columns) - val gson = new GsonBuilder() + val gson = new GsonBuilder().serializeNulls() // .setPrettyPrinting() .create() gson.toJson(json) } + + def normalizeDouble(numeric: Double): java.lang.Double ={ + if (numeric.isNaN) { + null.asInstanceOf[java.lang.Double] + } else if (numeric.isNegInfinity) { + Double.MinValue + } else if(numeric.isPosInfinity) { + Double.MaxValue + } else { + numeric + } + } } diff --git a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala index 5f92df0f8..b63394991 100644 --- a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala +++ b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala @@ -22,6 +22,7 @@ import com.amazon.deequ.analyzers.Histogram.NullFieldReplacement import com.amazon.deequ.metrics.{BucketDistribution, BucketValue, Distribution, DistributionValue} import com.amazon.deequ.utils.FixtureSupport import org.apache.spark.sql.Row +import org.apache.spark.sql.functions.lit import org.apache.spark.sql.types._ import org.scalatest.{Matchers, WordSpec} @@ -625,6 +626,36 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec assertSameColumnProfiles(columnProfiles.profiles, expectedProfiles) } + "return correct JSON for NumericColumnProfiles with NaNs" in + withSparkSession { session => + + val nRows = 100 + + import session.implicits._ + import org.apache.spark.sql.functions + + var data = session.sparkContext.range(0,nRows).toDF().select(functions.col("value")) + data = data.withColumnRenamed("value","att0") + data = data.withColumn("att1",lit(0.0).cast(LongType)) + data = data.withColumn("att2",lit(0.0).cast(LongType)) + + val profile = ColumnProfiler.profile(data, Option(Seq("att1","att2"))) + val profiles = profile.profiles.map{pro => pro._2}.toSeq + val json_profile = ColumnProfiles.toJson(profiles) + val correct_profile = "{\"columns\":[{\"column\":\"att1\",\"dataType\":\"Integral\"," + + "\"isDataTypeInferred\":\"false\",\"completeness\":1.0,\"distinctness\":0.01,\"entropy\":0.0," + + "\"uniqueness\":0.0,\"approximateNumDistinctValues\":1,\"histogram\":[{\"value\":\"0\",\"count\":100," + + "\"ratio\":1.0}],\"mean\":0.0,\"maximum\":0.0,\"minimum\":0.0,\"sum\":0.0,\"stdDev\":0.0," + + "\"correlations\":[{\"column\":\"att2\",\"correlation\":null},{\"column\":\"att1\",\"correlation\":null}]," + + "\"approxPercentiles\":[]},{\"column\":\"att2\",\"dataType\":\"Integral\",\"isDataTypeInferred\":\"false\"," + + "\"completeness\":1.0,\"distinctness\":0.01,\"entropy\":0.0,\"uniqueness\":0.0," + + "\"approximateNumDistinctValues\":1,\"histogram\":[{\"value\":\"0\",\"count\":100,\"ratio\":1.0}]," + + 
"\"mean\":0.0,\"maximum\":0.0,\"minimum\":0.0,\"sum\":0.0,\"stdDev\":0.0," + + "\"correlations\":[{\"column\":\"att2\",\"correlation\":null},{\"column\":\"att1\",\"correlation\":null}]," + + "\"approxPercentiles\":[]}]}" + assert(json_profile == correct_profile) + } + private[this] def assertSameColumnProfiles( actualProfiles: Map[String, ColumnProfile], expectedProfiles: List[ColumnProfile]) From 9e10b6c2cc8711537fb01ea8491ada8e87f733b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Till=20D=C3=B6hmen?= Date: Tue, 10 Aug 2021 12:16:06 +0200 Subject: [PATCH 11/21] Fix for NaNs and Infinity values in profile JSON (stylecheck) (#8) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: doehmen-admin Co-authored-by: Till Döhmen --- .../amazon/deequ/profiles/ColumnProfile.scala | 4 +-- .../deequ/profiles/ColumnProfilerTest.scala | 28 +++++++++++-------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala index 58ff9ef0b..7ba21ee1a 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala @@ -213,12 +213,12 @@ object ColumnProfiles { gson.toJson(json) } - def normalizeDouble(numeric: Double): java.lang.Double ={ + def normalizeDouble(numeric: Double): java.lang.Double = { if (numeric.isNaN) { null.asInstanceOf[java.lang.Double] } else if (numeric.isNegInfinity) { Double.MinValue - } else if(numeric.isPosInfinity) { + } else if (numeric.isPosInfinity) { Double.MaxValue } else { numeric diff --git a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala index b63394991..e0441be2a 100644 --- a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala +++ b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala @@ -634,24 +634,30 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec import session.implicits._ import org.apache.spark.sql.functions - var data = session.sparkContext.range(0,nRows).toDF().select(functions.col("value")) - data = data.withColumnRenamed("value","att0") - data = data.withColumn("att1",lit(0.0).cast(LongType)) - data = data.withColumn("att2",lit(0.0).cast(LongType)) + var data = session.sparkContext.range(0, nRows).toDF().select(functions.col("value")) + data = data.withColumnRenamed("value", "att0") + data = data.withColumn("att1", lit(0.0).cast(LongType)) + data = data.withColumn("att2", lit(0.0).cast(LongType)) - val profile = ColumnProfiler.profile(data, Option(Seq("att1","att2"))) + val profile = ColumnProfiler.profile(data, Option(Seq("att1", "att2"))) val profiles = profile.profiles.map{pro => pro._2}.toSeq val json_profile = ColumnProfiles.toJson(profiles) val correct_profile = "{\"columns\":[{\"column\":\"att1\",\"dataType\":\"Integral\"," + - "\"isDataTypeInferred\":\"false\",\"completeness\":1.0,\"distinctness\":0.01,\"entropy\":0.0," + - "\"uniqueness\":0.0,\"approximateNumDistinctValues\":1,\"histogram\":[{\"value\":\"0\",\"count\":100," + + "\"isDataTypeInferred\":\"false\",\"completeness\":1.0,\"distinctness\":0.01," + + "\"entropy\":0.0," + + "\"uniqueness\":0.0,\"approximateNumDistinctValues\":1,\"histogram\":[{\"value\":\"0\"," + + "\"count\":100," + "\"ratio\":1.0}],\"mean\":0.0,\"maximum\":0.0,\"minimum\":0.0,\"sum\":0.0,\"stdDev\":0.0," + - 
"\"correlations\":[{\"column\":\"att2\",\"correlation\":null},{\"column\":\"att1\",\"correlation\":null}]," + - "\"approxPercentiles\":[]},{\"column\":\"att2\",\"dataType\":\"Integral\",\"isDataTypeInferred\":\"false\"," + + "\"correlations\":[{\"column\":\"att2\",\"correlation\":null},{\"column\":\"att1\"," + + "\"correlation\":null}]," + + "\"approxPercentiles\":[]},{\"column\":\"att2\",\"dataType\":\"Integral\"," + + "\"isDataTypeInferred\":\"false\"," + "\"completeness\":1.0,\"distinctness\":0.01,\"entropy\":0.0,\"uniqueness\":0.0," + - "\"approximateNumDistinctValues\":1,\"histogram\":[{\"value\":\"0\",\"count\":100,\"ratio\":1.0}]," + + "\"approximateNumDistinctValues\":1,\"histogram\":[{\"value\":\"0\",\"count\":100," + + "\"ratio\":1.0}]," + "\"mean\":0.0,\"maximum\":0.0,\"minimum\":0.0,\"sum\":0.0,\"stdDev\":0.0," + - "\"correlations\":[{\"column\":\"att2\",\"correlation\":null},{\"column\":\"att1\",\"correlation\":null}]," + + "\"correlations\":[{\"column\":\"att2\",\"correlation\":null},{\"column\":\"att1\"," + + "\"correlation\":null}]," + "\"approxPercentiles\":[]}]}" assert(json_profile == correct_profile) } From 8791fbb55aab54cb9ee39fbec6374a19c9cea390 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Till=20D=C3=B6hmen?= Date: Thu, 26 Aug 2021 16:51:56 +0200 Subject: [PATCH 12/21] [HOPSWORKS-2681] Profiling optimization (#6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Till Döhmen --- .../amazon/deequ/analyzers/Completeness.scala | 4 +- .../com/amazon/deequ/analyzers/DataType.scala | 1 + .../deequ/examples/DataProfilingExample.scala | 1 + .../amazon/deequ/profiles/ColumnProfile.scala | 50 +- .../deequ/profiles/ColumnProfiler.scala | 224 +++++++- .../profiles/ColumnProfilerRunBuilder.scala | 78 ++- .../deequ/profiles/ColumnProfilerRunner.scala | 59 ++- .../ConstraintSuggestionRunner.scala | 1 + .../FractionalCategoricalRangeRule.scala | 4 +- .../rules/RetainCompletenessRule.scala | 1 + .../com/amazon/deequ/KLL/KLLProfileTest.scala | 12 +- .../deequ/KLL/KLLProfileTestApprox.scala | 481 ++++++++++++++++++ .../amazon/deequ/VerificationResultTest.scala | 1 + .../deequ/analyzers/AnalyzerTests.scala | 7 +- .../amazon/deequ/analyzers/StatesTest.scala | 11 - .../com/amazon/deequ/checks/CheckTest.scala | 7 - .../profiles/ColumnProfilerRunnerTest.scala | 54 +- .../deequ/profiles/ColumnProfilerTest.scala | 72 +-- .../ConstraintSuggestionResultTest.scala | 3 + .../ConstraintSuggestionRunnerTest.scala | 2 +- ...ConstraintSuggestionsIntegrationTest.scala | 28 +- .../rules/ConstraintRulesTest.scala | 122 ++--- 22 files changed, 1023 insertions(+), 200 deletions(-) create mode 100644 src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala diff --git a/src/main/scala/com/amazon/deequ/analyzers/Completeness.scala b/src/main/scala/com/amazon/deequ/analyzers/Completeness.scala index 5e80e2f6e..f4e30739e 100644 --- a/src/main/scala/com/amazon/deequ/analyzers/Completeness.scala +++ b/src/main/scala/com/amazon/deequ/analyzers/Completeness.scala @@ -16,7 +16,7 @@ package com.amazon.deequ.analyzers -import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNotNested} +import com.amazon.deequ.analyzers.Preconditions.{hasColumn} import org.apache.spark.sql.functions.sum import org.apache.spark.sql.types.{IntegerType, StructType} import Analyzers._ @@ -44,7 +44,7 @@ case class Completeness(column: String, where: Option[String] = None) extends } override protected def additionalPreconditions(): Seq[StructType => Unit] = { - hasColumn(column) 
:: isNotNested(column) :: Nil + hasColumn(column) :: Nil } override def filterCondition: Option[String] = where diff --git a/src/main/scala/com/amazon/deequ/analyzers/DataType.scala b/src/main/scala/com/amazon/deequ/analyzers/DataType.scala index fb3c1ca06..d0ec2a7ac 100644 --- a/src/main/scala/com/amazon/deequ/analyzers/DataType.scala +++ b/src/main/scala/com/amazon/deequ/analyzers/DataType.scala @@ -35,6 +35,7 @@ object DataTypeInstances extends Enumeration { val Integral: Value = Value(2) val Boolean: Value = Value(3) val String: Value = Value(4) + val Decimal: Value = Value(5) } case class DataTypeHistogram( diff --git a/src/main/scala/com/amazon/deequ/examples/DataProfilingExample.scala b/src/main/scala/com/amazon/deequ/examples/DataProfilingExample.scala index ecb17dae3..c5e350819 100644 --- a/src/main/scala/com/amazon/deequ/examples/DataProfilingExample.scala +++ b/src/main/scala/com/amazon/deequ/examples/DataProfilingExample.scala @@ -43,6 +43,7 @@ private[examples] object DataProfilingExample extends App { any shuffles. */ val result = ColumnProfilerRunner() .onData(rawData) + .nonOptimized() .run() /* We get a profile for each column which allows to inspect the completeness of the column, diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala index 7ba21ee1a..84df99511 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala @@ -24,9 +24,9 @@ import com.google.gson.{Gson, GsonBuilder, JsonArray, JsonObject, JsonPrimitive} abstract class ColumnProfile { def column: String def completeness: Double - def distinctness: Double - def entropy: Double - def uniqueness: Double + def distinctness: Option[Double] + def entropy: Option[Double] + def uniqueness: Option[Double] def approximateNumDistinctValues: Long def dataType: DataTypeInstances.Value def isDataTypeInferred: Boolean @@ -37,9 +37,9 @@ abstract class ColumnProfile { case class StandardColumnProfile( column: String, completeness: Double, - distinctness: Double, - entropy: Double, - uniqueness: Double, + distinctness: Option[Double], + entropy: Option[Double], + uniqueness: Option[Double], approximateNumDistinctValues: Long, dataType: DataTypeInstances.Value, isDataTypeInferred: Boolean, @@ -62,9 +62,9 @@ case class StringColumnProfile( case class NumericColumnProfile( column: String, completeness: Double, - distinctness: Double, - entropy: Double, - uniqueness: Double, + distinctness: Option[Double], + entropy: Option[Double], + uniqueness: Option[Double], approximateNumDistinctValues: Long, dataType: DataTypeInstances.Value, isDataTypeInferred: Boolean, @@ -108,9 +108,16 @@ object ColumnProfiles { } columnProfileJson.addProperty("completeness", normalizeDouble(profile.completeness)) - columnProfileJson.addProperty("distinctness", normalizeDouble(profile.distinctness)) - columnProfileJson.addProperty("entropy", normalizeDouble(profile.entropy)) - columnProfileJson.addProperty("uniqueness", normalizeDouble(profile.uniqueness)) + if (profile.distinctness.isDefined) { + columnProfileJson.addProperty("distinctness", normalizeDouble(profile.distinctness.get)) + } + if (profile.entropy.isDefined) { + columnProfileJson.addProperty("entropy", normalizeDouble(profile.entropy.get)) + } + if (profile.uniqueness.isDefined) { + columnProfileJson.addProperty("uniqueness", normalizeDouble(profile.uniqueness.get)) + } + columnProfileJson.addProperty("approximateNumDistinctValues", 
profile.approximateNumDistinctValues) @@ -165,14 +172,33 @@ object ColumnProfiles { val kllSketchJson = new JsonObject() val tmp = new JsonArray() + var totalCount = kllSketch.buckets.foldLeft(0.0)(_ + _.count) + if (totalCount == 0) totalCount = 1 + kllSketch.buckets.foreach{bucket => val entry = new JsonObject() entry.addProperty("low_value", normalizeDouble(bucket.lowValue)) entry.addProperty("high_value", normalizeDouble(bucket.highValue)) entry.addProperty("count", bucket.count) + entry.addProperty("ratio", bucket.count/totalCount) tmp.add(entry) } + if (profile.histogram.isEmpty) { + val histogramJson = new JsonArray() + kllSketch.buckets.foreach{bucket => + val histogramEntry = new JsonObject() + histogramEntry.addProperty("value", "%.2f".formatLocal(java.util.Locale.US, + bucket.lowValue) + "-" + "%.2f".formatLocal(java.util.Locale.US, bucket + .highValue)) + histogramEntry.addProperty("count", bucket.count) + histogramEntry.addProperty("ratio", bucket.count/totalCount) + histogramJson.add(histogramEntry) + } + + columnProfileJson.add("histogram", histogramJson) + } + kllSketchJson.add("buckets", tmp) val entry = new JsonObject() entry.addProperty("c", kllSketch.parameters(0)) diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala index d4c42bd42..9de7e3b25 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala @@ -16,6 +16,8 @@ package com.amazon.deequ.profiles +import scala.util.Success +import scala.collection.mutable.ListBuffer import com.amazon.deequ.analyzers.DataTypeInstances._ import com.amazon.deequ.analyzers._ import com.amazon.deequ.analyzers.runners.AnalysisRunBuilder @@ -41,6 +43,9 @@ import org.apache.spark.sql.types.TimestampType import org.apache.spark.sql.types.{DataType => SparkDataType} import scala.util.Success +import com.amazon.deequ.repository.{MetricsRepository, ResultKey} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.{BinaryType, BooleanType, ByteType, DateType, DecimalType, DoubleType, FloatType, IntegerType, LongType, ShortType, StringType, StructType, TimestampType, DataType => SparkDataType} private[deequ] case class GenericColumnStatistics( numRecords: Long, @@ -243,6 +248,188 @@ object ColumnProfiler { CategoricalColumnStatistics(thirdPassResults)) } + + /** + * Profile a (potentially very large) dataset. 
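+   * A minimal calling sketch (the DataFrame `df` and its numeric columns "a" and "b"
+   * are assumed for illustration; the method is private[deequ], so external callers
+   * reach it through ColumnProfilerRunner instead):
+   * {{{
+   *   val profiles = ColumnProfiler.profileOptimized(
+   *     df,
+   *     restrictToColumns = Some(Seq("a", "b")),
+   *     histogram = true,
+   *     kllParameters = Some(KLLParameters(2048, 0.64, 20)))
+   *   val profileOfA = profiles.profiles("a")
+   * }}}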
+   *
+   * @param data the dataset to profile, as a DataFrame
+   * @param restrictToColumns can contain a subset of columns to profile, otherwise
+   *                          all columns will be considered
+   * @param printStatusUpdates whether to print status updates of the profiling run
+   * @param lowCardinalityHistogramThreshold the maximum (estimated) number of distinct values
+   *                                         in a column until which we should compute exact
+   *                                         histograms for it (defaults to 120)
+   * @param metricsRepository the repo to store metrics
+   * @param reuseExistingResultsUsingKey key for reusing existing results
+   * @param failIfResultsForReusingMissing whether to fail if results to reuse are missing
+   * @param saveInMetricsRepositoryUsingKey key for saving results in the metrics repo
+   * @param correlation whether to compute pairwise correlations between numeric columns
+   * @param histogram whether to compute histograms (exact ones for low-cardinality columns,
+   *                  KLL-based ones for numeric columns)
+   * @param exactUniqueness whether to compute exact uniqueness, distinctness and entropy
+   * @param exactUniquenessCols optional subset of columns for exact uniqueness profiling
+   * @param maxCorrelationCols skip correlations if there are more numeric columns than this
+   * @param kllParameters parameters for KLL Sketches
+   *
+   * @return the profiles of the requested columns
+   */
+  // scalastyle:off argcount
+  private[deequ] def profileOptimized(
+      data: DataFrame,
+      restrictToColumns: Option[Seq[String]] = None,
+      printStatusUpdates: Boolean = false,
+      lowCardinalityHistogramThreshold: Int = ColumnProfiler
+        .DEFAULT_CARDINALITY_THRESHOLD,
+      metricsRepository: Option[MetricsRepository] = None,
+      reuseExistingResultsUsingKey: Option[ResultKey] = None,
+      failIfResultsForReusingMissing: Boolean = false,
+      saveInMetricsRepositoryUsingKey: Option[ResultKey] = None,
+      correlation: Boolean = true,
+      histogram: Boolean = false,
+      exactUniqueness: Boolean = false,
+      exactUniquenessCols: Option[Seq[String]] = None,
+      maxCorrelationCols: Option[Int] = None,
+      kllParameters: Option[KLLParameters] = None
+    )
+    : ColumnProfiles = {
+
+    // Ensure that all desired columns exist
+    restrictToColumns.foreach { restrictToColumns =>
+      restrictToColumns.foreach { columnName =>
+        require(data.schema.fieldNames.contains(columnName), s"Unable to find column $columnName")
+      }
+    }
+
+    // Find columns we want to profile
+    val relevantColumns = getRelevantColumns(data.schema, restrictToColumns)
+
+    // We assume that data types are predefined by the schema, and skip the data type detection
+    val predefinedTypes = data.schema.fields
+      .filter { column => relevantColumns.contains(column.name) }
+      .map { field =>
+        val knownType = field.dataType match {
+          case ByteType | ShortType | IntegerType | LongType => Integral
+          case FloatType | DoubleType => Fractional
+          case DecimalType() => Decimal
+          case BooleanType => Boolean
+          case StringType | TimestampType | DateType | BinaryType => String
+          case _ =>
+            println(s"Unable to map type ${field.dataType}")
+            Unknown
+        }
+
+        field.name -> knownType
+      }
+      .toMap
+
+    val numericColumnNames = relevantColumns
+      .filter { name => Set(Integral, Fractional, Decimal).contains(predefinedTypes(name)) }
+
+    // First pass
+    if (printStatusUpdates) {
+      println("### PROFILING: Computing generic column statistics in pass (1/2)...")
+    }
+
+    // We compute completeness and the approximate number of distinct values for all columns,
+    // min, max, mean, stddev, sum, KLL sketches and correlations for numeric columns,
+    // and uniqueness, distinctness and entropy for the columns opted in via exactUniqueness
+    var correlationCalculatedColumnNames = new ListBuffer[String]()
+    val analyzersForGenericStats = relevantColumns.flatMap { name =>
+      val analyzers = ListBuffer[Analyzer[_, Metric[_]]]()
+
+      // Add default analyzers.
+      analyzers ++= Seq(Completeness(name), ApproxCountDistinct(name))
+
+      if (numericColumnNames.contains(name)) {
+        // Add numeric analyzers.
+        analyzers ++= Seq(Minimum(name), Maximum(name), Mean(name),
+          StandardDeviation(name), Sum(name))
+        // Add KLL analyzer.
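+        // (As a worked example of the selection in this block: with histogram,
+        // correlation and exactUniqueness all enabled, a non-Decimal numeric column
+        // ends up with Completeness, ApproxCountDistinct, Minimum, Maximum, Mean,
+        // StandardDeviation, Sum, a KLLSketch, one Correlation analyzer per remaining
+        // numeric column (each pair is computed once), plus Uniqueness, Distinctness
+        // and Entropy.)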
+ if (histogram && predefinedTypes(name) != Decimal) { + analyzers += KLLSketch(name, kllParameters) + } + if (correlation && (maxCorrelationCols.isEmpty || (numericColumnNames.length <= + maxCorrelationCols.get))) { + // Add correlation analyzers. + correlationCalculatedColumnNames += name + analyzers ++= numericColumnNames + .filterNot(x => correlationCalculatedColumnNames.contains(x)) + .map(x => Correlation(name, x)) + } + } + + if (exactUniqueness && (exactUniquenessCols.isEmpty || + (exactUniquenessCols.isDefined && exactUniquenessCols.get.contains(name))) + && predefinedTypes(name) != Unknown) { + // Add grouping analyzers. + analyzers ++= Seq(Uniqueness(name), Distinctness(name), Entropy(name)) + } + + analyzers + } + + var analysisRunnerFirstPass = AnalysisRunner + .onData(data) + .addAnalyzers(analyzersForGenericStats) + .addAnalyzer(Size()) + + analysisRunnerFirstPass = setMetricsRepositoryConfigurationIfNecessary( + analysisRunnerFirstPass, + metricsRepository, + reuseExistingResultsUsingKey, + failIfResultsForReusingMissing, + saveInMetricsRepositoryUsingKey) + + val firstPassResults = analysisRunnerFirstPass.run() + + val genericStatistics = extractGenericStatistics( + relevantColumns, + data.schema, + firstPassResults, + predefinedTypes) + + + val numericStatistics = if (correlation) { + extractNumericStatistics(firstPassResults, correlationCalculatedColumnNames) + } else { + extractNumericStatistics(firstPassResults) + } + + val secondPassResults = histogram match { + case true => + // Second pass + if (printStatusUpdates) { + println("### PROFILING: Computing histograms of low-cardinality columns in pass (2/2)...") + } + + // We compute exact histograms for all low-cardinality string columns, find those here + val targetColumnsForHistograms = findTargetColumnsForHistograms(data.schema, + genericStatistics, lowCardinalityHistogramThreshold) + + // Find out, if we have values for those we can reuse + val analyzerContextExistingValues = + getAnalyzerContextWithHistogramResultsForReusingIfNecessary( + metricsRepository, + reuseExistingResultsUsingKey, + targetColumnsForHistograms + ) + + // The columns we need to calculate the histograms for + val nonExistingHistogramColumns = targetColumnsForHistograms + .filter { column => + analyzerContextExistingValues.metricMap.get(Histogram(column)).isEmpty } + + // Calculate and save/append results if necessary + val histograms: Map[String, Distribution] = getHistogramsForThirdPass( + data, + nonExistingHistogramColumns, + analyzerContextExistingValues, + printStatusUpdates, + failIfResultsForReusingMissing, + metricsRepository, + saveInMetricsRepositoryUsingKey) + histograms + case _ => Map.empty[String, Distribution] + } + + createProfiles(relevantColumns, genericStatistics, numericStatistics, + CategoricalColumnStatistics(secondPassResults)) + } + private[this] def getRelevantColumns( schema: StructType, restrictToColumns: Option[Seq[String]]) @@ -291,7 +478,8 @@ object ColumnProfiler { correlation: Boolean) : Seq[Analyzer[_, Metric[_]]] = { val numericColumnNames = relevantColumnNames - .filter { name => Set(Integral, Fractional).contains(genericStatistics.typeOf(name)) } + .filter { name => Set(Integral, Fractional).contains(genericStatistics.typeOf + (name)) } numericColumnNames .flatMap { name => getNumericColAnalyzers(name, kllProfiling, kllParameters, correlation, numericColumnNames) @@ -543,7 +731,9 @@ object ColumnProfiler { } - private[this] def extractNumericStatistics(results: AnalyzerContext): NumericColumnStatistics = { 
+ private[this] def extractNumericStatistics(results: AnalyzerContext, + correlationCols: Seq[String] = Seq[String]()) + : NumericColumnStatistics = { val means = results.metricMap .collect { case (analyzer: Mean, metric: DoubleMetric) => @@ -621,7 +811,10 @@ object ColumnProfiler { .flatten .toMap - val correlation = results.metricMap + val correlationDiagonal = correlationCols.map { name => + Some((name -> Map(name -> 1.0))) + } + val correlationLower = results.metricMap .collect { case (analyzer: Correlation, metric: DoubleMetric) => metric.value match { case Success(metricValue) => @@ -629,9 +822,19 @@ object ColumnProfiler { case _ => None } } - .flatten + val correlationUpper = results.metricMap + .collect { case (analyzer: Correlation, metric: DoubleMetric) => + metric.value match { + case Success(metricValue) => + Some(analyzer.secondColumn -> Map(analyzer.firstColumn -> metricValue)) + case _ => None + } + } + val correlation = (correlationLower ++ correlationDiagonal ++ correlationUpper).flatten .groupBy(_._1) - .map { case (key, value) => value.reduce((x, y) => x._1 -> (x._2.toSeq ++ y._2.toSeq).toMap) } + .map { case (key, value) => value.reduce((x, y) => x._1 -> (x._2.toSeq ++ y._2.toSeq).toMap + )} + NumericColumnStatistics(means, stdDevs, minima, maxima, sums, kll, approxPercentiles, correlation) @@ -659,7 +862,8 @@ object ColumnProfiler { genericStatistics.approximateNumDistincts .filter { case (column, _) => originalStringNumericOrBooleanColumns.contains(column) && - Set(String, Boolean, Integral, Fractional).contains(genericStatistics.typeOf(column)) + Set(String, Boolean, Integral, Fractional).contains(genericStatistics.typeOf + (column)) } .filter { case (_, count) => count <= lowCardinalityHistogramThreshold } .map { case (column, _) => column } @@ -777,9 +981,9 @@ object ColumnProfiler { .map { name => val completeness = genericStats.completenesses(name) - val distinctness = genericStats.distinctness(name) - val entropy = genericStats.entropy(name) - val uniqueness = genericStats.uniqueness(name) + val distinctness = genericStats.distinctness.get(name) + val entropy = genericStats.entropy.get(name) + val uniqueness = genericStats.uniqueness.get(name) val approxNumDistinct = genericStats.approximateNumDistincts(name) val dataType = genericStats.typeOf(name) val isDataTypeInferred = genericStats.inferredTypes.contains(name) @@ -789,7 +993,7 @@ object ColumnProfiler { val profile = genericStats.typeOf(name) match { - case Integral | Fractional => + case Integral | Fractional | Decimal => NumericColumnProfile( name, completeness, diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunBuilder.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunBuilder.scala index 14e3297ad..ed2bee395 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunBuilder.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunBuilder.scala @@ -17,7 +17,7 @@ package com.amazon.deequ.profiles import com.amazon.deequ.repository._ -import com.amazon.deequ.analyzers.{DataTypeInstances, KLLParameters} +import com.amazon.deequ.analyzers.{DataTypeInstances, KLLParameters, KLLSketch} import org.apache.spark.sql.{DataFrame, SparkSession} /** A class to build a Constraint Suggestion run using a fluent API */ @@ -39,11 +39,15 @@ class ColumnProfilerRunBuilder(val data: DataFrame) { protected var saveColumnProfilesJsonPath: Option[String] = None protected var saveConstraintSuggestionsJsonPath: Option[String] = None protected var 
  saveEvaluationResultsJsonPath: Option[String] = None
-  protected var correlation = true
-  protected var histogram = true
+  protected var correlation = false
+  protected var histogram = false
   protected var kllProfiling = false
   protected var kllParameters: Option[KLLParameters] = None
   protected var predefinedTypes: Map[String, DataTypeInstances.Value] = Map.empty
+  protected var maxCorrelationCols: Option[Int] = None
+  protected var exactUniqueness = false
+  protected var exactUniquenessCols: Option[Seq[String]] = None
+  protected var optimized = true

   protected def this(constraintSuggestionRunBuilder: ColumnProfilerRunBuilder) {

@@ -66,9 +70,18 @@ class ColumnProfilerRunBuilder(val data: DataFrame) {
     saveConstraintSuggestionsJsonPath = constraintSuggestionRunBuilder
       .saveConstraintSuggestionsJsonPath
     saveEvaluationResultsJsonPath = constraintSuggestionRunBuilder.saveEvaluationResultsJsonPath
+
+    restrictToColumns = constraintSuggestionRunBuilder.restrictToColumns
+    correlation = constraintSuggestionRunBuilder.correlation
+    maxCorrelationCols = constraintSuggestionRunBuilder.maxCorrelationCols
+    histogram = constraintSuggestionRunBuilder.histogram
+    kllProfiling = constraintSuggestionRunBuilder.kllProfiling
     kllParameters = constraintSuggestionRunBuilder.kllParameters
     predefinedTypes = constraintSuggestionRunBuilder.predefinedTypes
+    exactUniqueness = constraintSuggestionRunBuilder.exactUniqueness
+    exactUniquenessCols = constraintSuggestionRunBuilder.exactUniquenessCols
+    optimized = constraintSuggestionRunBuilder.optimized
   }

   /**
@@ -93,7 +106,7 @@ class ColumnProfilerRunBuilder(val data: DataFrame) {
   /**
    * Set the thresholds of values until it is considered too expensive to
-   * calculate the histograms
+   * calculate the histograms (for backwards compatibility)
    *
    * @param lowCardinalityHistogramThreshold The threshold
    */
@@ -113,23 +126,64 @@ class ColumnProfilerRunBuilder(val data: DataFrame) {
   }

   /**
-   * Enable correlation profiling on Numerical columns, enabled by default.
+   * Enable correlation profiling on Numerical columns, disabled by default.
+   *
+   * @param correlation Enable or disable correlation profiling
+   * @param maxCorrelationCols The maximum number of columns to calculate correlations on
    */
-  def withCorrelation(correlation: Boolean): this.type = {
+  def withCorrelation(correlation: Boolean, maxCorrelationCols: Int = 100): this.type = {
     this.correlation = correlation
+    this.maxCorrelationCols = Some(maxCorrelationCols)
     this
   }

   /**
-   * Enable histogram profiling on Numerical columns, enabled by default.
+   * Enable histogram profiling on Numerical and Categorical columns, disabled by default.
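+   * A sketch of typical builder usage (the DataFrame `df` is assumed, and the
+   * parameter values are illustrative only):
+   * {{{
+   *   val profiles = ColumnProfilerRunner()
+   *     .onData(df)
+   *     .withHistogram(true, maxBuckets = 20)
+   *     .withCorrelation(true, maxCorrelationCols = 100)
+   *     .run()
+   * }}}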
+   *
+   * @param histogram Enable or disable histogram profiling
+   * @param maxBuckets The maximum number of distinct values to calculate the histogram for
    */
-  def withHistogram(histogram: Boolean): this.type = {
+  def withHistogram(histogram: Boolean, maxBuckets: Int = 20): this.type = {
     this.histogram = histogram
+    this.kllProfiling = histogram
+    this.lowCardinalityHistogramThreshold = maxBuckets
+    this.kllParameters = Some(KLLParameters(KLLSketch.DEFAULT_SKETCH_SIZE, KLLSketch
+      .DEFAULT_SHRINKING_FACTOR, maxBuckets))
+    this
+  }
+
+  /**
+   * Enables exact Uniqueness, Entropy and Distinctness for all columns
+   *
+   * @param exactUniqueness Enable or disable uniqueness, entropy and distinctness profiling
+   */
+  def withExactUniqueness(exactUniqueness: Boolean): this.type = {
+    this.exactUniqueness = exactUniqueness
+    this
+  }
+
+  /**
+   * Enables exact Uniqueness, Entropy and Distinctness for specified columns
+   *
+   * @param exactUniquenessColumns List of columns that should be selected for uniqueness profiling
+   */
+  def restrictExactUniquenessColumns(exactUniquenessColumns: Seq[String]): this.type = {
+    this.exactUniquenessCols = Some(exactUniquenessColumns)
+    this
+  }
+
+  /**
+   * Use the unoptimized version of the profiler (optimizations are on by default)
+   */
+  def nonOptimized(): this.type = {
+    this.optimized = false
     this
   }

   /**
    * Enable KLL Sketches profiling on Numerical columns, disabled by default.
+   * (for backwards compatibility)
    */
   def withKLLProfiling(): this.type = {
     this.kllProfiling = true
@@ -138,6 +192,7 @@
   }

   /**
    * Set KLL parameters.
+   * (for backwards compatibility)
    *
    * @param kllParameters kllParameters(sketchSize, shrinkingFactor, numberOfBuckets)
    */
@@ -148,6 +203,7 @@
   /**
    * Set predefined data types for each column (e.g.
baseline)
+   * (for backwards compatibility)
    *
    * @param dataTypes dataType map for baseline columns
    */
@@ -202,7 +258,11 @@ class ColumnProfilerRunBuilder(val data: DataFrame) {
       histogram,
       kllProfiling,
       kllParameters,
-      predefinedTypes
+      predefinedTypes,
+      optimized,
+      maxCorrelationCols,
+      exactUniqueness,
+      exactUniquenessCols
     )
   }
 }
diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunner.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunner.scala
index a02a5d4ee..49ac84a11 100644
--- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunner.scala
+++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunner.scala
@@ -52,29 +52,54 @@ class ColumnProfilerRunner {
       histogram: Boolean,
       kllProfiling: Boolean,
       kllParameters: Option[KLLParameters],
-      predefinedTypes: Map[String, DataTypeInstances.Value])
+      predefinedTypes: Map[String, DataTypeInstances.Value],
+      optimized: Boolean,
+      maxCorrelationCols: Option[Int],
+      exactUniqueness: Boolean,
+      exactUniquenessCols: Option[Seq[String]])
     : ColumnProfiles = {

     if (cacheInputs) {
       data.cache()
     }

-    val columnProfiles = ColumnProfiler
-      .profile(
-        data,
-        restrictToColumns,
-        printStatusUpdates,
-        lowCardinalityHistogramThreshold,
-        metricsRepositoryOptions.metricsRepository,
-        metricsRepositoryOptions.reuseExistingResultsKey,
-        metricsRepositoryOptions.failIfResultsForReusingMissing,
-        metricsRepositoryOptions.saveOrAppendResultsKey,
-        correlation,
-        histogram,
-        kllProfiling,
-        kllParameters,
-        predefinedTypes
-      )
+    val columnProfiles: ColumnProfiles = {
+      if (!optimized) {
+        ColumnProfiler.profile(
+          data,
+          restrictToColumns,
+          printStatusUpdates,
+          lowCardinalityHistogramThreshold,
+          metricsRepositoryOptions.metricsRepository,
+          metricsRepositoryOptions.reuseExistingResultsKey,
+          metricsRepositoryOptions.failIfResultsForReusingMissing,
+          metricsRepositoryOptions.saveOrAppendResultsKey,
+          correlation,
+          histogram,
+          kllProfiling,
+          kllParameters,
+          predefinedTypes
+        )
+      } else {
+        ColumnProfiler.profileOptimized(
+          data,
+          restrictToColumns,
+          printStatusUpdates,
+          lowCardinalityHistogramThreshold,
+          metricsRepositoryOptions.metricsRepository,
+          metricsRepositoryOptions.reuseExistingResultsKey,
+          metricsRepositoryOptions.failIfResultsForReusingMissing,
+          metricsRepositoryOptions.saveOrAppendResultsKey,
+          correlation,
+          histogram,
+          exactUniqueness,
+          exactUniquenessCols,
+          maxCorrelationCols,
+          kllParameters
+        )
+      }
+    }
+
     saveColumnProfilesJsonToFileSystemIfNecessary(
       fileOutputOptions,
diff --git a/src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunner.scala b/src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunner.scala
index de915956d..9e46e5e81 100644
--- a/src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunner.scala
+++ b/src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunner.scala
@@ -181,6 +181,7 @@ class ConstraintSuggestionRunner {

     var columnProfilerRunner = ColumnProfilerRunner()
       .onData(trainingData)
+      .nonOptimized()
       .printStatusUpdates(printStatusUpdates)
       .withLowCardinalityHistogramThreshold(lowCardinalityHistogramThreshold)

diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/FractionalCategoricalRangeRule.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/FractionalCategoricalRangeRule.scala
index 55e410f33..be2029079 100644
--- a/src/main/scala/com/amazon/deequ/suggestions/rules/FractionalCategoricalRangeRule.scala
+++ b/src/main/scala/com/amazon/deequ/suggestions/rules/FractionalCategoricalRangeRule.scala
@@ -99,7 +99,9 @@ case class FractionalCategoricalRangeRule( description, this, s""".isContainedIn("${profile.column}", Array($categoriesCode), - | _ >= $targetCompliance, Some("$hint"))""".stripMargin.replaceAll("\n", ""), + | _ >= $targetCompliance, Some("$hint"))""" + .stripMargin.replaceAll("\n", "") + .stripMargin.replaceAll("\r", ""), valuesByPopularity.toSeq ) } diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala index 67ae61f92..71382d1b4 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala @@ -58,6 +58,7 @@ case class RetainCompletenessRule() extends ConstraintRule[ColumnProfile] { s""".hasCompleteness("${profile.column}", _ >= $targetCompleteness, | Some("It should be above $targetCompleteness!"))""" .stripMargin.replaceAll("\n", "") + .stripMargin.replaceAll("\r", "") ) } diff --git a/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala index 119112e56..e462b26d9 100644 --- a/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala +++ b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala @@ -64,9 +64,9 @@ class KLLProfileTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "att1", 1.0, - 1.0, - 1.0, - 1.0, + Some(1.0), + Some(1.0), + Some(1.0), 6, DataTypeInstances.Fractional, false, @@ -109,9 +109,9 @@ class KLLProfileTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "att1", 1.0, - 1.0, - 1.0, - 1.0, + Some(1.0), + Some(1.0), + Some(1.0), 30, DataTypeInstances.Fractional, false, diff --git a/src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala new file mode 100644 index 000000000..a64f8071d --- /dev/null +++ b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala @@ -0,0 +1,481 @@ +/** + * Copyright 2021 Logical Clocks AB. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may not + * use this file except in compliance with the License. A copy of the License + * is located at + * + * http://aws.amazon.com/apache2.0/ + * + * or in the "license" file accompanying this file. This file is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ *
+ */
+
+package com.amazon.deequ.KLL
+
+import com.amazon.deequ.SparkContextSpec
+import com.amazon.deequ.analyzers.{DataTypeInstances, KLLParameters, KLLSketch}
+import com.amazon.deequ.metrics.{BucketDistribution, BucketValue, Distribution, DistributionValue}
+import com.amazon.deequ.profiles.{ColumnProfiler, ColumnProfiles, NumericColumnProfile, StandardColumnProfile}
+import com.amazon.deequ.utils.FixtureSupport
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types._
+import org.scalatest.{Matchers, WordSpec}
+
+class KLLProfileTestApprox extends WordSpec with Matchers with SparkContextSpec
+  with FixtureSupport {
+
+  def assertProfilesEqual(expected: NumericColumnProfile, actual: NumericColumnProfile): Unit = {
+
+    assert(expected.column == actual.column)
+    assert(expected.completeness == actual.completeness)
+    assert(math.abs(expected.approximateNumDistinctValues -
+      actual.approximateNumDistinctValues) <= 1)
+    assert(expected.uniqueness == actual.uniqueness)
+    assert(expected.distinctness == actual.distinctness)
+    assert(expected.entropy == actual.entropy)
+    assert(expected.dataType == actual.dataType)
+    assert(expected.isDataTypeInferred == actual.isDataTypeInferred)
+    assert(expected.typeCounts == actual.typeCounts)
+    assert(expected.histogram == actual.histogram)
+    assert(expected.mean == actual.mean)
+    assert(expected.maximum == actual.maximum)
+    assert(expected.minimum == actual.minimum)
+    assert(expected.sum == actual.sum)
+    assert(expected.stdDev == actual.stdDev)
+    assert(expected.kll == actual.kll)
+    assert(expected.approxPercentiles == actual.approxPercentiles)
+    assert(expected.correlation == actual.correlation)
+  }
+
+
+  def assertStandardProfilesEqual(expected: StandardColumnProfile,
+                                  actual: StandardColumnProfile): Unit = {
+
+    assert(expected.column == actual.column)
+    assert(expected.completeness == actual.completeness)
+    assert(expected.uniqueness == actual.uniqueness)
+    assert(expected.distinctness == actual.distinctness)
+    assert(expected.entropy == actual.entropy)
+    assert(math.abs(expected.approximateNumDistinctValues -
+      actual.approximateNumDistinctValues) <= 1)
+    assert(expected.dataType == actual.dataType)
+    assert(expected.isDataTypeInferred == actual.isDataTypeInferred)
+    assert(expected.typeCounts == actual.typeCounts)
+    assert(expected.histogram == actual.histogram)
+  }
+
+  "Column Profiler" should {
+
+    "return correct NumericColumnProfiles for numeric columns with correct DataType" in
+      withSparkSession { session =>
+
+        val data = getDfWithNumericFractionalValues(session)
+
+        val actualColumnProfile = ColumnProfiler.profileOptimized(data, Option(Seq("att1",
+          "att2")), kllParameters = Some(KLLParameters(KLLSketch.DEFAULT_SKETCH_SIZE, KLLSketch
+          .DEFAULT_SHRINKING_FACTOR, 20)), histogram = true)
+          .profiles("att1")
+
+        val expectedColumnProfile = NumericColumnProfile(
+          "att1",
+          1.0,
+          None,
+          None,
+          None,
+          6,
+          DataTypeInstances.Fractional,
+          false,
+          Map.empty,
+          actualColumnProfile.histogram,
+          Some(BucketDistribution(List(BucketValue(1.0, 1.25, 1),
+            BucketValue(1.25, 1.5, 0),
+            BucketValue(1.5, 1.75, 0),
+            BucketValue(1.75, 2.0, 0),
+            BucketValue(2.0, 2.25, 1),
+            BucketValue(2.25, 2.5, 0),
+            BucketValue(2.5, 2.75, 0),
+            BucketValue(2.75, 3.0, 0),
+            BucketValue(3.0, 3.25, 1),
+            BucketValue(3.25, 3.5, 0),
+            BucketValue(3.5, 3.75, 0),
+            BucketValue(3.75, 4.0, 0),
+            BucketValue(4.0, 4.25, 1),
+            BucketValue(4.25, 4.5, 0),
+            BucketValue(4.5, 4.75, 0),
+            BucketValue(4.75, 5.0, 0),
+            BucketValue(5.0, 5.25, 1),
+            BucketValue(5.25,
5.5, 0), + BucketValue(5.5, 5.75, 0), + BucketValue(5.75, 6.0, 1)), + List(0.64, 2048.0), + Array(Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0)))), + Some(3.5), + Some(6.0), + Some(1.0), + Some(21.0), + Some(1.707825127659933), + Some(Seq(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, + 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, + 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, + 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, + 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, + 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0)), + Some(Map[String, Double]("att1" -> 1.0, "att2" -> 0.9263710192499128)) + ) + + assertProfilesEqual(expectedColumnProfile, + actualColumnProfile.asInstanceOf[NumericColumnProfile]) + } + + "return correct JSON for NumericColumnProfiles" in + withSparkSession { session => + + val data = getDfWithNumericFractionalValues(session) + + val profile = ColumnProfiler.profileOptimized(data, Option(Seq("att1", "att2")), + kllParameters = Some(KLLParameters(KLLSketch.DEFAULT_SKETCH_SIZE, KLLSketch + .DEFAULT_SHRINKING_FACTOR, 20)), histogram = true) + val profiles = profile.profiles.map{pro => pro._2}.toSeq + val json_profile = ColumnProfiles.toJson(profiles) + val correct_profile = "{\"columns\":[{\"column\":\"att1\",\"dataType\":\"Fractional\"," + + "\"isDataTypeInferred\":\"false\",\"completeness\":1.0," + + "\"approximateNumDistinctValues\":6,\"histogram\":[{\"value\":\"6.0\",\"count\":1," + + "\"ratio\":0.16666666666666666},{\"value\":\"3.0\",\"count\":1," + + "\"ratio\":0.16666666666666666},{\"value\":\"2.0\",\"count\":1," + + "\"ratio\":0.16666666666666666},{\"value\":\"4.0\",\"count\":1," + + "\"ratio\":0.16666666666666666},{\"value\":\"1.0\",\"count\":1," + + "\"ratio\":0.16666666666666666},{\"value\":\"5.0\",\"count\":1," + + "\"ratio\":0.16666666666666666}],\"mean\":3.5,\"maximum\":6.0,\"minimum\":1.0," + + "\"sum\":21.0,\"stdDev\":1.707825127659933,\"correlations\":[{\"column\":\"att2\"," + + "\"correlation\":0.9263710192499128},{\"column\":\"att1\",\"correlation\":1.0}]," + + "\"kll\":{\"buckets\":[{\"low_value\":1.0,\"high_value\":1.25,\"count\":1," + + "\"ratio\":0.16666666666666666},{\"low_value\":1.25,\"high_value\":1.5,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":1.5,\"high_value\":1.75,\"count\":0,\"ratio\":0.0}," + + "{\"low_value\":1.75,\"high_value\":2.0,\"count\":0,\"ratio\":0.0},{\"low_value\":2.0," + + "\"high_value\":2.25,\"count\":1,\"ratio\":0.16666666666666666},{\"low_value\":2.25," + + "\"high_value\":2.5,\"count\":0,\"ratio\":0.0},{\"low_value\":2.5,\"high_value\":2.75," + + "\"count\":0,\"ratio\":0.0},{\"low_value\":2.75,\"high_value\":3.0,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":3.0,\"high_value\":3.25,\"count\":1," + + "\"ratio\":0.16666666666666666},{\"low_value\":3.25,\"high_value\":3.5,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":3.5,\"high_value\":3.75,\"count\":0,\"ratio\":0.0}," + + "{\"low_value\":3.75,\"high_value\":4.0,\"count\":0,\"ratio\":0.0},{\"low_value\":4.0," + + "\"high_value\":4.25,\"count\":1,\"ratio\":0.16666666666666666},{\"low_value\":4.25," + + "\"high_value\":4.5,\"count\":0,\"ratio\":0.0},{\"low_value\":4.5,\"high_value\":4.75," + + "\"count\":0,\"ratio\":0.0},{\"low_value\":4.75,\"high_value\":5.0,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":5.0,\"high_value\":5.25,\"count\":1," + + 
"\"ratio\":0.16666666666666666},{\"low_value\":5.25,\"high_value\":5.5,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":5.5,\"high_value\":5.75,\"count\":0,\"ratio\":0.0}," + + "{\"low_value\":5.75,\"high_value\":6.0,\"count\":1,\"ratio\":0.16666666666666666}]," + + "\"sketch\":{\"parameters\":{\"c\":0.64,\"k\":2048.0},\"data\":\"[[1.0,2.0,3.0,4.0,5.0," + + "6.0]]\"}},\"approxPercentiles\":[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0," + + "1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0," + + "3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0," + + "4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0," + + "5.0,5.0,5.0,5.0,5.0,5.0,5.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0," + + "6.0,6.0]},{\"column\":\"att2\",\"dataType\":\"Fractional\"," + + "\"isDataTypeInferred\":\"false\",\"completeness\":1.0," + + "\"approximateNumDistinctValues\":4,\"histogram\":[{\"value\":\"0.0\",\"count\":3," + + "\"ratio\":0.5},{\"value\":\"6.0\",\"count\":1,\"ratio\":0.16666666666666666}," + + "{\"value\":\"7.0\",\"count\":1,\"ratio\":0.16666666666666666},{\"value\":\"5.0\"," + + "\"count\":1,\"ratio\":0.16666666666666666}],\"mean\":3.0,\"maximum\":7.0," + + "\"minimum\":0.0,\"sum\":18.0,\"stdDev\":3.0550504633038935," + + "\"correlations\":[{\"column\":\"att2\",\"correlation\":1.0},{\"column\":\"att1\"," + + "\"correlation\":0.9263710192499128}],\"kll\":{\"buckets\":[{\"low_value\":0.0," + + "\"high_value\":0.35,\"count\":3,\"ratio\":0.5},{\"low_value\":0.35,\"high_value\":0.7," + + "\"count\":0,\"ratio\":0.0},{\"low_value\":0.7,\"high_value\":1.05,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":1.05,\"high_value\":1.4,\"count\":0,\"ratio\":0.0}," + + "{\"low_value\":1.4,\"high_value\":1.75,\"count\":0,\"ratio\":0.0},{\"low_value\":1.75," + + "\"high_value\":2.1,\"count\":0,\"ratio\":0.0},{\"low_value\":2.1,\"high_value\":2.45," + + "\"count\":0,\"ratio\":0.0},{\"low_value\":2.45,\"high_value\":2.8,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":2.8,\"high_value\":3.15,\"count\":0,\"ratio\":0.0}," + + "{\"low_value\":3.15,\"high_value\":3.5,\"count\":0,\"ratio\":0.0},{\"low_value\":3.5," + + "\"high_value\":3.85,\"count\":0,\"ratio\":0.0},{\"low_value\":3.85,\"high_value\":4.2," + + "\"count\":0,\"ratio\":0.0},{\"low_value\":4.2,\"high_value\":4.55,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":4.55,\"high_value\":4.9,\"count\":0,\"ratio\":0.0}," + + "{\"low_value\":4.9,\"high_value\":5.25,\"count\":1,\"ratio\":0.16666666666666666}," + + "{\"low_value\":5.25,\"high_value\":5.6,\"count\":0,\"ratio\":0.0},{\"low_value\":5.6," + + "\"high_value\":5.95,\"count\":0,\"ratio\":0.0},{\"low_value\":5.95,\"high_value\":6.3," + + "\"count\":1,\"ratio\":0.16666666666666666},{\"low_value\":6.3,\"high_value\":6.65," + + "\"count\":0,\"ratio\":0.0},{\"low_value\":6.65,\"high_value\":7.0,\"count\":1," + + "\"ratio\":0.16666666666666666}],\"sketch\":{\"parameters\":{\"c\":0.64,\"k\":2048.0}," + + "\"data\":\"[[0.0,0.0,0.0,5.0,6.0,7.0]]\"}},\"approxPercentiles\":[0.0,0.0,0.0,0.0,0.0," + + "0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0," + + "0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0," + + "0.0,0.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,6.0,6.0," + + "6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,7.0,7.0,7.0,7.0,7.0,7.0," + + "7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0]}]}" + 
assert(json_profile == correct_profile) + } + + "return correct NumericColumnProfiles with uniqueness, distinctness and entropy " in + withSparkSession { session => + + val data = getDfWithNumericFractionalValues(session) + + val actualColumnProfile = ColumnProfiler.profileOptimized(data, Option(Seq("att1")), + exactUniqueness = true, exactUniquenessCols = Some(Seq("att1")), kllParameters = Some + (KLLParameters(KLLSketch.DEFAULT_SKETCH_SIZE, KLLSketch + .DEFAULT_SHRINKING_FACTOR, 20)), histogram = true).profiles("att1") + + val expectedColumnProfile = NumericColumnProfile( + "att1", + 1.0, + Some(1.0), + Some(1.791759469228055), + Some(1.0), + 6, + DataTypeInstances.Fractional, + false, + Map.empty, + actualColumnProfile.histogram, + Some(BucketDistribution(List(BucketValue(1.0, 1.25, 1), + BucketValue(1.25, 1.5, 0), + BucketValue(1.5, 1.75, 0), + BucketValue(1.75, 2.0, 0), + BucketValue(2.0, 2.25, 1), + BucketValue(2.25, 2.5, 0), + BucketValue(2.5, 2.75, 0), + BucketValue(2.75, 3.0, 0), + BucketValue(3.0, 3.25, 1), + BucketValue(3.25, 3.5, 0), + BucketValue(3.5, 3.75, 0), + BucketValue(3.75, 4.0, 0), + BucketValue(4.0, 4.25, 1), + BucketValue(4.25, 4.5, 0), + BucketValue(4.5, 4.75, 0), + BucketValue(4.75, 5.0, 0), + BucketValue(5.0, 5.25, 1), + BucketValue(5.25, 5.5, 0), + BucketValue(5.5, 5.75, 0), + BucketValue(5.75, 6.0, 1)), + List(0.64, 2048.0), + Array(Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0)))), + Some(3.5), + Some(6.0), + Some(1.0), + Some(21.0), + Some(1.707825127659933), + Some(Seq(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, + 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, + 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, + 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, + 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, + 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0)), + Some(Map[String, Double]("att1" -> 1.0)) + ) + + assertProfilesEqual(expectedColumnProfile, + actualColumnProfile.asInstanceOf[NumericColumnProfile]) + } + + "return correct StandardColumnProfile plus histogram for String column" in + withSparkSession { session => + + val data = getDfWithNumericFractionalValues(session) + + val actualColumnProfile = ColumnProfiler.profileOptimized(data, Option(Seq("item")), + exactUniqueness = true, exactUniquenessCols = Some(Seq("item")), histogram = true) + .profiles("item") + + val expectedColumnProfile = StandardColumnProfile( + "item", + 1.0, + Some(1.0), + Some(1.791759469228055), + Some(1.0), + 6, + DataTypeInstances.String, + false, + Map.empty, + Some(Distribution(Map("4" -> DistributionValue(1, 0.16666666666666666), + "5" -> DistributionValue(1, 0.16666666666666666), + "6" -> DistributionValue(1, 0.16666666666666666), + "1" -> DistributionValue(1, 0.16666666666666666), + "2" -> DistributionValue(1, 0.16666666666666666), + "3" -> DistributionValue(1, 0.16666666666666666)), 6)) + ) + + assertStandardProfilesEqual(expectedColumnProfile, + actualColumnProfile.asInstanceOf[StandardColumnProfile]) + } + + "return correct StandardColumnProfile plus histogram for Decimal column" in + withSparkSession { session => + + val schema = + StructType(Seq(StructField(name = "num", dataType = DecimalType.SYSTEM_DEFAULT), + StructField(name = "num2", dataType = DecimalType.SYSTEM_DEFAULT))) + + val rows = session.sparkContext.parallelize(Seq( + Row(BigDecimal(1), 
BigDecimal(4)), + Row(BigDecimal(2), BigDecimal(3)), + Row(BigDecimal(3), BigDecimal(2)), + Row(BigDecimal(4), BigDecimal(1)))) + + val data = session.createDataFrame(rows, schema) + + val actualColumnProfile = ColumnProfiler.profileOptimized(data, Option(Seq("num", "num2")), + histogram = true).profiles("num").asInstanceOf[NumericColumnProfile] + + val expectedColumnProfile = NumericColumnProfile( + "num", + 1.0, + None, + None, + None, + 4, + DataTypeInstances.Decimal, + false, + Map.empty, + None, + None, + Some(2.5), + Some(4), + Some(1), + Some(10), + Some(1.118033988749895), + None, + Some(Map("num2" -> -1.0, "num" -> 1.0)) + ) + + assertProfilesEqual(expectedColumnProfile, actualColumnProfile) + } + + "return correct StandardColumnProfile for Decimal column and correlations off" in + withSparkSession { session => + + val schema = + StructType(Seq(StructField(name = "num", dataType = DecimalType.SYSTEM_DEFAULT), + StructField(name = "num2", dataType = DecimalType.SYSTEM_DEFAULT))) + + val rows = session.sparkContext.parallelize(Seq( + Row(BigDecimal(1), BigDecimal(4)), + Row(BigDecimal(2), BigDecimal(3)), + Row(BigDecimal(3), BigDecimal(2)), + Row(BigDecimal(4), BigDecimal(1)))) + + val data = session.createDataFrame(rows, schema) + + val actualColumnProfile = ColumnProfiler.profileOptimized(data, Option(Seq("num", "num2")), + histogram = true, correlation = false).profiles("num").asInstanceOf[NumericColumnProfile] + + val expectedColumnProfile = NumericColumnProfile( + "num", + 1.0, + None, + None, + None, + 4, + DataTypeInstances.Decimal, + false, + Map.empty, + None, + None, + Some(2.5), + Some(4), + Some(1), + Some(10), + Some(1.118033988749895), + None, + None + ) + + assertProfilesEqual(expectedColumnProfile, actualColumnProfile) + } + + "return correct NumericColumnProfiles With KLL for numeric columns with correct DataType" in + withSparkSession { session => + + val data = getDfWithNumericFractionalValuesForKLL(session) + + val actualColumnProfile = ColumnProfiler.profile(data, Option(Seq("att1")), false, 1, + kllProfiling = true, + kllParameters = Option(KLLParameters(2, 0.64, 2))) + .profiles("att1") + + val expectedColumnProfile = NumericColumnProfile( + "att1", + 1.0, + Some(1.0), + Some(3.4011973816621546), + Some(1.0), + 30, + DataTypeInstances.Fractional, + false, + Map.empty, + None, + Some(BucketDistribution(List(BucketValue(1.0, 15.5, 16), + BucketValue(15.5, 30.0, 14)), + List(0.64, 2.0), + Array(Array(27.0, 28.0, 29.0, 30.0), + Array(25.0), + Array(1.0, 6.0, 10.0, 15.0, 19.0, 23.0)))), + Some(15.5), + Some(30.0), + Some(1.0), + Some(465.0), + Some(8.65544144839919), + Some(Seq(1.0, 1.0, 1.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, + 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 10.0, 10.0, 10.0, + 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, + 10.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, + 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 19.0, 19.0, 19.0, + 19.0, 19.0, 19.0, 19.0, 19.0, 19.0, 19.0, 19.0, 19.0, + 19.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, + 23.0, 23.0, 23.0, 23.0, 23.0, 25.0, 25.0, 25.0, 25.0, + 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, + 25.0, 27.0, 27.0, 27.0, 27.0, 27.0, 27.0, 28.0, 28.0, + 28.0, 28.0, 29.0, 29.0, 29.0, 30.0, 30.0, 30.0)), + Some(Map[String, Double]("att1" -> 1.0))) + + assertProfilesEqual(expectedColumnProfile, + actualColumnProfile.asInstanceOf[NumericColumnProfile]) + } + + "return KLL Sketches for ShortType columns" in withSparkSession { session => + val attribute = "attribute" + val data = 
com.amazon.deequ.dataFrameWithColumn( + attribute, + ShortType, + session, + Row(1: Short), + Row(2: Short), + Row(3: Short), + Row(4: Short), + Row(5: Short), + Row(6: Short), + Row(null) + ) + + val actualColumnProfile = ColumnProfiler.profile(data, + kllProfiling = true, + kllParameters = Option(KLLParameters(2, 0.64, 2))) + .profiles(attribute) + val numericalProfile = actualColumnProfile.asInstanceOf[NumericColumnProfile] + assert(numericalProfile.kll.isDefined) + val kll = numericalProfile.kll + assert(kll.get.buckets == List(BucketValue(1.0, 3.5, 4), BucketValue(3.5, 6.0, 2))) + assert(kll.get.parameters == List(0.64, 2.0)) + assert(kll.get.data.length == 2) + val target = Array(Array(5.0, 6.0), Array(1.0, 3.0)) + for (i <- kll.get.data.indices) { + assert(kll.get.data(i).sameElements(target(i))) + } + } + } +} + diff --git a/src/test/scala/com/amazon/deequ/VerificationResultTest.scala b/src/test/scala/com/amazon/deequ/VerificationResultTest.scala index 93aa73201..1f891e68e 100644 --- a/src/test/scala/com/amazon/deequ/VerificationResultTest.scala +++ b/src/test/scala/com/amazon/deequ/VerificationResultTest.scala @@ -168,6 +168,7 @@ class VerificationResultTest extends WordSpec with Matchers with SparkContextSpe |"constraint_message":"Value: 1.0 does not meet the constraint requirement! | Should be smaller than 0.8!"}]""" .stripMargin.replaceAll("\n", "") + .stripMargin.replaceAll("\r", "") assertSameResultsJson(checkResultsAsJson, expectedJson) } diff --git a/src/test/scala/com/amazon/deequ/analyzers/AnalyzerTests.scala b/src/test/scala/com/amazon/deequ/analyzers/AnalyzerTests.scala index 03787b886..7674654b4 100644 --- a/src/test/scala/com/amazon/deequ/analyzers/AnalyzerTests.scala +++ b/src/test/scala/com/amazon/deequ/analyzers/AnalyzerTests.scala @@ -55,7 +55,7 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with "compute correct metrics" in withSparkSession { sparkSession => val dfMissing = getDfMissing(sparkSession) - assert(Completeness("someMissingColumn").preconditions.size == 2, + assert(Completeness("someMissingColumn").preconditions.size == 1, "should check column name availability") val result1 = Completeness("att1").calculate(dfMissing) assert(result1 == DoubleMetric(Entity.Column, @@ -346,7 +346,9 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with val nonZeroValuesWithStringKeys = nonZeroValues.toSeq .map { case (instance, distValue) => instance.toString -> distValue } - val dataTypes = DataTypeInstances.values.map { _.toString } + val dataTypes = DataTypeInstances.values.filterNot(_.equals(DataTypeInstances.Decimal)).map { + _.toString + } val zeros = dataTypes .diff { nonZeroValuesWithStringKeys.map { case (distKey, _) => distKey }.toSet } @@ -572,7 +574,6 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with Row(BigDecimal(678)))) val data = session.createDataFrame(rows, schema) - val result = Minimum("num").calculate(data) assert(result.value.isSuccess) diff --git a/src/test/scala/com/amazon/deequ/analyzers/StatesTest.scala b/src/test/scala/com/amazon/deequ/analyzers/StatesTest.scala index efae77f51..b8861bff5 100644 --- a/src/test/scala/com/amazon/deequ/analyzers/StatesTest.scala +++ b/src/test/scala/com/amazon/deequ/analyzers/StatesTest.scala @@ -36,17 +36,6 @@ class StatesTest extends AnyWordSpec with Matchers with SparkContextSpec with Fi val stateAB = stateA.sum(stateB) - println(stateA.frequencies.schema) - stateA.frequencies.collect().foreach { println } - println() 
- - println(stateB.frequencies.schema) - stateB.frequencies.collect().foreach { println } - println() - - println(stateAB.frequencies.schema) - stateAB.frequencies.collect().foreach { println } - val mergedFrequencies = stateAB.frequencies.collect() .map { row => row.getString(0) -> row.getLong(1) } .toMap diff --git a/src/test/scala/com/amazon/deequ/checks/CheckTest.scala b/src/test/scala/com/amazon/deequ/checks/CheckTest.scala index 70e998ee5..b2e45bc51 100644 --- a/src/test/scala/com/amazon/deequ/checks/CheckTest.scala +++ b/src/test/scala/com/amazon/deequ/checks/CheckTest.scala @@ -55,8 +55,6 @@ class CheckTest extends AnyWordSpec with Matchers with SparkContextSpec with Fix val context = runChecks(getDfCompleteAndInCompleteColumns(sparkSession), check1, check2, check3) - context.metricMap.foreach { println } - assertEvaluatesTo(check1, context, CheckStatus.Success) assertEvaluatesTo(check2, context, CheckStatus.Error) assertEvaluatesTo(check3, context, CheckStatus.Warning) @@ -82,8 +80,6 @@ class CheckTest extends AnyWordSpec with Matchers with SparkContextSpec with Fix val context = runChecks(getDfCompleteAndInCompleteColumns(sparkSession), check1, check2, check3) - context.metricMap.foreach { println } - assertEvaluatesTo(check1, context, CheckStatus.Success) assertEvaluatesTo(check2, context, CheckStatus.Error) assertEvaluatesTo(check3, context, CheckStatus.Warning) @@ -130,8 +126,6 @@ class CheckTest extends AnyWordSpec with Matchers with SparkContextSpec with Fix val context = runChecks(getDfMissing(sparkSession), check1, check2, check3) - context.metricMap.foreach { println } - assertEvaluatesTo(check1, context, CheckStatus.Success) assertEvaluatesTo(check2, context, CheckStatus.Error) assertEvaluatesTo(check3, context, CheckStatus.Warning) @@ -823,7 +817,6 @@ class CheckTest extends AnyWordSpec with Matchers with SparkContextSpec with Fix val check = Check(CheckLevel.Error, "some description") .containsCreditCardNumber(col, _ == 1.0) val context = runChecks(df, check) - context.allMetrics.foreach(println) assertEvaluatesTo(check, context, CheckStatus.Success) } diff --git a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerRunnerTest.scala b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerRunnerTest.scala index c5b3164f8..6ca25b95a 100644 --- a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerRunnerTest.scala +++ b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerRunnerTest.scala @@ -60,7 +60,7 @@ class ColumnProfilerRunnerTest extends WordSpec with Matchers with SparkContextS (results, stat.jobCount) } - assert(jobNumberAllCalculations == 3) + assert(jobNumberAllCalculations == 1) assert(jobNumberReusing == 0) assertConstraintSuggestionResultsEquals(separateResults, resultsReusingMetrics) } @@ -191,6 +191,7 @@ class ColumnProfilerRunnerTest extends WordSpec with Matchers with SparkContextS val results = ColumnProfilerRunner() .onData(df) .withKLLProfiling() + .nonOptimized() .run() assert(results.profiles("att1").asInstanceOf[NumericColumnProfile].kll.isDefined) @@ -198,6 +199,57 @@ class ColumnProfilerRunnerTest extends WordSpec with Matchers with SparkContextS assert(results.profiles("att3").asInstanceOf[NumericColumnProfile].kll.isDefined) } + "should run optimized Profiler with two exact uniqueness columns" in + withMonitorableSparkSession {(sparkSession, sparkMonitor) => + + val df = getDfWithNumericValues(sparkSession) + + val (results: ColumnProfiles, jobNumberAllCalculations) = sparkMonitor + .withMonitoringSession { stat => + val results = 
ColumnProfilerRunner() + .onData(df) + .withExactUniqueness(true) + .restrictExactUniquenessColumns(Seq("att1", "att2")) + .run() + + (results, stat.jobCount) + } + + assert(jobNumberAllCalculations == 5) + assert(results.profiles("att1").asInstanceOf[NumericColumnProfile].uniqueness.isDefined) + assert(results.profiles("att2").asInstanceOf[NumericColumnProfile].uniqueness.isDefined) + assert(results.profiles("att3").asInstanceOf[NumericColumnProfile].uniqueness.isEmpty) + + } + + "should run less jobs with optimized Profiler" in + withMonitorableSparkSession { (sparkSession, sparkMonitor) => + + val df = getDfWithNumericValues(sparkSession) + + val jobNumberUnoptimized = sparkMonitor + .withMonitoringSession { stat => + val results = ColumnProfilerRunner() + .onData(df) + .nonOptimized() + .run() + + stat.jobCount + } + + val jobNumberOptimized = sparkMonitor + .withMonitoringSession { stat => + val results = ColumnProfilerRunner() + .onData(df) + .run() + + stat.jobCount + } + + assert(jobNumberUnoptimized == 10) + assert(jobNumberOptimized == 1) + } + } private[this] def assertConstraintSuggestionResultsEquals( diff --git a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala index e0441be2a..a02382ebe 100644 --- a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala +++ b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala @@ -61,9 +61,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = StringColumnProfile( "att2", 2.0 / 3.0, - 0.5, - 0.5623351446188083, - 0.25, + Some(0.5), + Some(0.5623351446188083), + Some(0.25), 2, DataTypeInstances.String, true, @@ -115,9 +115,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = StringColumnProfile( "item", 1.0, - 1.0, - 1.791759469228055, - 1.0, + Some(1.0), + Some(1.791759469228055), + Some(1.0), 6, DataTypeInstances.String, false, @@ -141,9 +141,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = StringColumnProfile( "att2", 2.0 / 3.0, - 0.5, - 0.5623351446188083, - 0.25, + Some(0.5), + Some(0.5623351446188083), + Some(0.25), 2, DataTypeInstances.String, true, @@ -173,9 +173,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "item", 1.0, - 1.0, - 1.0, - 1.0, + Some(1.0), + Some(1.0), + Some(1.0), 6, DataTypeInstances.Integral, true, @@ -217,9 +217,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "item", 1.0, - 1.0, - 1.0, - 1.0, + Some(1.0), + Some(1.0), + Some(1.0), 6, DataTypeInstances.Integral, true, @@ -262,9 +262,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "item", 1.0, - 1.0, - 1.0, - 1.0, + Some(1.0), + Some(1.0), + Some(1.0), 6, DataTypeInstances.Integral, true, @@ -342,9 +342,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "att1", 1.0, - 1.0, - 1.0, - 1.0, + Some(1.0), + Some(1.0), + Some(1.0), 6, DataTypeInstances.Fractional, false, @@ -379,9 +379,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = StringColumnProfile( "att2", 2.0 / 3.0, - 0.5, - 0.5623351446188083, 
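// A hedged usage sketch, not part of this patch: with distinctness, entropy
// and uniqueness now Option-valued on ColumnProfile (they are only filled in
// when the grouping analyzers run, e.g. under withExactUniqueness), consumers
// have to handle the absent case. A stripped-down stand-in type keeps the
// sketch self-contained.
final case class ProfileView(column: String, uniqueness: Option[Double])

def describeUniqueness(profile: ProfileView): String =
  profile.uniqueness match {
    case Some(value) => f"uniqueness of ${profile.column} is $value%.2f"
    case None        => s"uniqueness of ${profile.column} was not computed"
  }

// describeUniqueness(ProfileView("att1", Some(1.0)))  // "uniqueness of att1 is 1.00"
// describeUniqueness(ProfileView("att3", None))       // "uniqueness of att3 was not computed"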
- 0.25, + Some(0.5), + Some(0.5623351446188083), + Some(0.25), 2, DataTypeInstances.String, isDataTypeInferred = true, @@ -590,9 +590,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec StandardColumnProfile( "PassengerId", 1.0, - 1.0, - 1.0, - 1.0, + Some(1.0), + Some(1.0), + Some(1.0), 891, DataTypeInstances.Integral, false, @@ -601,25 +601,25 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec StandardColumnProfile( "Survived", 1.0, - 1.0, - 1.0, - 1.0, + Some(1.0), + Some(1.0), + Some(1.0), 2, DataTypeInstances.Integral, false, Map.empty, None), - StandardColumnProfile("Pclass", 1.0, 1.0, 1.0, 1.0, 3, + StandardColumnProfile("Pclass", 1.0, Some(1.0), Some(1.0), Some(1.0), 3, DataTypeInstances.Integral, false, Map.empty, None), - StandardColumnProfile("Name", 1.0, 1.0, 1.0, 1.0, 0, + StandardColumnProfile("Name", 1.0, Some(1.0), Some(1.0), Some(1.0), 0, DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Sex", 1.0, 1.0, 1.0, 1.0, 2, + StandardColumnProfile("Sex", 1.0, Some(1.0), Some(1.0), Some(1.0), 2, DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Ticket", 1.0, 1.0, 1.0, 1.0, 681, + StandardColumnProfile("Ticket", 1.0, Some(1.0), Some(1.0), Some(1.0), 681, DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Fare", 1.0, 1.0, 1.0, 1.0, 0, + StandardColumnProfile("Fare", 1.0, Some(1.0), Some(1.0), Some(1.0), 0, DataTypeInstances.Fractional, false, Map.empty, None), - StandardColumnProfile("Cabin", 0.22, 1.0, 1.0, 1.0, 0, + StandardColumnProfile("Cabin", 0.22, Some(1.0), Some(1.0), Some(1.0), 0, DataTypeInstances.String, true, Map.empty, None) ) diff --git a/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionResultTest.scala b/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionResultTest.scala index 6a98bf3c6..5927608af 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionResultTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionResultTest.scala @@ -278,6 +278,7 @@ class ConstraintSuggestionResultTest extends WordSpec with Matchers with SparkCo | ] |}""" .stripMargin.replaceAll("\n", "") + .stripMargin.replaceAll("\r", "") assertJsonStringsAreEqual(constraintSuggestionJson, expectedJson) } @@ -366,6 +367,7 @@ class ConstraintSuggestionResultTest extends WordSpec with Matchers with SparkCo | ] |}""" .stripMargin.replaceAll("\n", "") + .stripMargin.replaceAll("\r", "") assertJsonStringsAreEqual(evaluationResultsJson, expectedJson) } @@ -453,6 +455,7 @@ class ConstraintSuggestionResultTest extends WordSpec with Matchers with SparkCo | ] |}""" .stripMargin.replaceAll("\n", "") + .stripMargin.replaceAll("\r", "") assertJsonStringsAreEqual(evaluationResultsJson, expectedJson) } diff --git a/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunnerTest.scala b/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunnerTest.scala index 9ec88f90b..1cc5883e7 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunnerTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunnerTest.scala @@ -78,7 +78,7 @@ class ConstraintSuggestionRunnerTest extends WordSpec with Matchers with SparkCo (results, stat.jobCount) } - assert(jobNumberAllCalculations == 3) + assert(jobNumberAllCalculations == 10) assert(jobNumberReusing == 0) assertConstraintSuggestionResultsEquals(separateResults, resultsReusingMetrics) } diff --git 
a/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionsIntegrationTest.scala b/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionsIntegrationTest.scala index d90b16ef7..920c5ff27 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionsIntegrationTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionsIntegrationTest.scala @@ -34,9 +34,7 @@ case class Record( propertyA: String, measurement2: String, measurement3: String, - description: String, - allNullColumn: String, - allNullColumn2: java.lang.Double + description: String ) class ConstraintSuggestionsIntegrationTest extends WordSpec with SparkContextSpec { @@ -76,7 +74,7 @@ class ConstraintSuggestionsIntegrationTest extends WordSpec with SparkContextSpe val randomLength = minLength + rng.nextInt(maxLength - minLength + 1) val description = rng.nextString(randomLength) - Record(id, marketplace, measurement, propertyA, measurement2, measurement3, description, null, null) + Record(id, marketplace, measurement, propertyA, measurement2, measurement3, description) } val data = session.createDataFrame(records) @@ -114,28 +112,6 @@ class ConstraintSuggestionsIntegrationTest extends WordSpec with SparkContextSpe analyzer == Completeness("marketplace") && assertionFunc(1.0) } - // Categorical range for "marketplace" - assertConstraintExistsIn(constraintSuggestionResult) { (analyzer, assertionFunc) => - - assertionFunc(1.0) && - analyzer.isInstanceOf[Compliance] && - analyzer.asInstanceOf[Compliance] - .instance.startsWith(s"'marketplace' has value range") - } - - // Categorical range for "marketplace" with values - assert( - constraintSuggestionResult.constraintSuggestions - .getOrElse("marketplace", Seq.empty) - .exists { - case value: ConstraintSuggestionWithValue[Seq[String]] => - val constraintWithValue = value.value - println(constraintWithValue) - constraintWithValue.sorted == categories.toSeq.sorted - case _ => false - } - ) - // IS NOT NULL for "measurement" assertConstraintExistsIn(constraintSuggestionResult) { (analyzer, assertionFunc) => analyzer == Completeness("measurement") && assertionFunc(1.0) diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala index 9a90af7ca..d18282901 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala @@ -34,9 +34,9 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "CompleteIfCompleteRule" should { "be applied correctly" in { - val complete = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + val complete = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, None) - val incomplete = StandardColumnProfile("col1", .25, 1.0, 1.0, 1.0, 100, + val incomplete = StandardColumnProfile("col1", .25, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, None) val completeInteger = @@ -131,9 +131,9 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "RetainCompletenessRule" should { "be applied correctly" in { - val complete = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + val complete = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, None) - val incomplete = StandardColumnProfile("col1", .25, 1.0, 1.0, 1.0, 100, + val incomplete = 
StandardColumnProfile("col1", .25, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, None) assert(!RetainCompletenessRule().shouldBeApplied(complete, 1000)) @@ -170,8 +170,11 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val codeForConstraint = RetainCompletenessRule().candidate(fakeColumnProfile, 100) .codeForConstraint - val expectedCodeForConstraint = """.hasCompleteness("att1", _ >= 0.4, - | Some("It should be above 0.4!"))""".stripMargin.replaceAll("\n", "") + val expectedCodeForConstraint = + """.hasCompleteness("att1", _ >= 0.4, + | Some("It should be above 0.4!"))""" + .stripMargin.replaceAll("\n", "") + .stripMargin.replaceAll("\r", "") assert(expectedCodeForConstraint == codeForConstraint) @@ -192,13 +195,13 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "UniqueIfApproximatelyUniqueRule" should { "be applied correctly" in { - val unique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + val unique = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, None) - val maybeUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, + val maybeUnique = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 95, String, false, Map.empty, None) - val maybeNonUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 91, + val maybeNonUnique = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 91, String, false, Map.empty, None) - val nonUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, + val nonUnique = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 20, String, false, Map.empty, None) assert(UniqueIfApproximatelyUniqueRule().shouldBeApplied(unique, 100)) @@ -259,24 +262,24 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "RetainTypeRule" should { "be applied correctly" in { - val string = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + val string = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, true, Map.empty, None) - val boolean = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + val boolean = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Boolean, true, Map.empty, None) - val fractional = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + val fractional = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Fractional, true, Map.empty, None) - val integer = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + val integer = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Integral, true, Map.empty, None) - val unknown = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + val unknown = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Unknown, true, Map.empty, None) - val stringNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, - String, false, Map.empty, None) - val booleanNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, - Boolean, false, Map.empty, None) - val fractionalNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, - Fractional, false, Map.empty, None) - val integerNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, - Integral, false, Map.empty, None) + val stringNonInferred = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), + 100, String, false, Map.empty, None) + val booleanNonInferred = 
StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), + 100, Boolean, false, Map.empty, None) + val fractionalNonInferred = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), + Some(1.0), 100, Fractional, false, Map.empty, None) + val integerNonInferred = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), + 100, Integral, false, Map.empty, None) assert(!RetainTypeRule().shouldBeApplied(string, 100)) assert(!RetainTypeRule().shouldBeApplied(unknown, 100)) @@ -393,26 +396,26 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val noDistribution = Distribution(Map.empty, 0) - val stringWithNonSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, - String, false, Map.empty, Some(nonSkewedDist)) - val integralWithNonSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, - 100, DataTypeInstances.Integral, false, Map.empty, Some(nonSkewedIntegralDist)) - val stringWithFlgDist = StandardColumnProfile("flg", 1.0, 1.0, 1.0, 1.0, + val stringWithNonSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), + Some(1.0), 100, String, false, Map.empty, Some(nonSkewedDist)) + val integralWithNonSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), + Some(1.0), 100, DataTypeInstances.Integral, false, Map.empty, Some(nonSkewedIntegralDist)) + val stringWithFlgDist = StandardColumnProfile("flg", 1.0, Some(1.0), Some(1.0), Some(1.0), 2, String, false, Map.empty, Some(flgDist)) - val integralWithFlgDist = StandardColumnProfile("flg", 1.0, 1.0, 1.0, 1.0, + val integralWithFlgDist = StandardColumnProfile("flg", 1.0, Some(1.0), Some(1.0), Some(1.0), 2, DataTypeInstances.Integral, false, Map.empty, Some(flgDist)) - val stringWithSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, - String, false, Map.empty, Some(skewedDist)) - val stringNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, + val stringWithSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), + Some(1.0), 100, String, false, Map.empty, Some(skewedDist)) + val stringNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 95, String, false, Map.empty, None) - val boolNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 94, + val boolNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 94, Boolean, false, Map.empty, None) - val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, - Boolean, false, Map.empty, Some(noDistribution)) - val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, - 100, DataTypeInstances.Integral, false, Map.empty, Some(skewedDist)) - val integralNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, + val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), + 20, Boolean, false, Map.empty, Some(noDistribution)) + val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), + Some(1.0), 100, DataTypeInstances.Integral, false, Map.empty, Some(skewedDist)) + val integralNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 95, DataTypeInstances.Integral, false, Map.empty, None) assert(CategoricalRangeRule().shouldBeApplied(stringWithNonSkewedDist, 100)) @@ -576,33 +579,36 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val noDistribution = Distribution(Map.empty, 0) val stringWithNonSkewedDistWithFractionalCategoricalRange = StandardColumnProfile( - "col1", 
1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, + "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, Some(nonSkewedDistWithFractionalCategoricalRange)) val stringWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile( - "col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, + "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, Some(nonSkewedDistWithActualCategoricalRange)) val stringWithSomewhatSkewedDist = StandardColumnProfile( - "col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, Some(somewhatSkewedDist)) + "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, + Some(somewhatSkewedDist)) val stringWithSkewedDist = StandardColumnProfile( - "col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, Some(skewedDist)) - val stringNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, - String, false, Map.empty, None) - val boolNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 94, + "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, + Some(skewedDist)) + val stringNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), + 95, String, false, Map.empty, None) + val boolNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 94, Boolean, false, Map.empty, None) - val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, - Boolean, false, Map.empty, Some(noDistribution)) + val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), + 20, Boolean, false, Map.empty, Some(noDistribution)) val integralWithNonSkewedDistWithFractionalCategoricalRange = StandardColumnProfile("col1", - 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, + 1.0, Some(1.0), Some(1.0), Some(1.0), 100, DataTypeInstances.Integral, false, Map.empty, Some(nonSkewedIntegralDistWithFractionalCategoricalRange)) val integralWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile( - "col1", 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, - Some(nonSkewedIntegralDistWithActualCategoricalRange)) - val integralWithSomewhatSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, - 100, DataTypeInstances.Integral, false, Map.empty, Some(somewhatSkewedIntegralDist)) - val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, - 100, DataTypeInstances.Integral, false, Map.empty, Some(skewedIntegralDist)) - val integralNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, + "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, DataTypeInstances.Integral, false, + Map.empty, Some(nonSkewedIntegralDistWithActualCategoricalRange)) + val integralWithSomewhatSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), + Some(1.0), 100, DataTypeInstances.Integral, false, Map.empty, + Some(somewhatSkewedIntegralDist)) + val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), + Some(1.0), 100, DataTypeInstances.Integral, false, Map.empty, Some(skewedIntegralDist)) + val integralNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 95, DataTypeInstances.Integral, false, Map.empty, None) assert(FractionalCategoricalRangeRule().shouldBeApplied(stringWithSomewhatSkewedDist, 100)) @@ -700,8 +706,8 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val codeForConstraint = FractionalCategoricalRangeRule().candidate(fakeColumnProfile, 100) 
.codeForConstraint - val expectedCodeForConstraint = ".isContainedIn(\"categoricalColumn\", Array(\"_b%%__\"," + - " \"'_[a_[]}!@'\"), _ >= 0.9, Some(\"It should be above 0.9!\"))" + val expectedCodeForConstraint = ".isContainedIn(\"categoricalColumn\", Array(\"_b%%__\", " + + "\"'_[a_[]}!@'\"), _ >= 0.9, Some(\"It should be above 0.9!\"))" assert(expectedCodeForConstraint == codeForConstraint) @@ -726,7 +732,7 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "NonNegativeNumbersRule and PositiveNumbersRule" should { "be applied correctly" in { def columnProfileWithMinimum(minimum: Double): NumericColumnProfile = { - NumericColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Fractional, + NumericColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Fractional, isDataTypeInferred = false, Map.empty, None, None, Some(10), Some(100), Some(minimum), Some(10000), Some(1.0), None, None) } From 07f15ef9ae901423606ed441341d975879b30cfe Mon Sep 17 00:00:00 2001 From: Fabio Buso Date: Thu, 26 Aug 2021 22:07:35 +0200 Subject: [PATCH 13/21] Increase scala-style max method parameters check --- deequ-scalastyle.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deequ-scalastyle.xml b/deequ-scalastyle.xml index c726413bc..f97dfc64e 100644 --- a/deequ-scalastyle.xml +++ b/deequ-scalastyle.xml @@ -35,7 +35,7 @@ - + From 6ce9015727286b8dedbd8371a027320d0944ff9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Till=20D=C3=B6hmen?= Date: Tue, 14 Sep 2021 10:43:23 +0200 Subject: [PATCH 14/21] Support for Decimal-type histograms (#10) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: doehmen-admin Co-authored-by: Till Döhmen --- .../deequ/analyzers/runners/KLLRunner.scala | 10 +- .../deequ/profiles/ColumnProfiler.scala | 9 +- .../deequ/KLL/KLLProfileTestApprox.scala | 160 ++++++++---------- .../amazon/deequ/utils/FixtureSupport.scala | 12 ++ 4 files changed, 101 insertions(+), 90 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/analyzers/runners/KLLRunner.scala b/src/main/scala/com/amazon/deequ/analyzers/runners/KLLRunner.scala index 64a008009..8f3739ec9 100644 --- a/src/main/scala/com/amazon/deequ/analyzers/runners/KLLRunner.scala +++ b/src/main/scala/com/amazon/deequ/analyzers/runners/KLLRunner.scala @@ -18,7 +18,7 @@ package com.amazon.deequ.analyzers.runners import com.amazon.deequ.analyzers.{Analyzer, KLLParameters, KLLSketch, KLLState, QuantileNonSample, State, StateLoader, StatePersister} import com.amazon.deequ.metrics.Metric -import org.apache.spark.sql.types.{ByteType, DoubleType, FloatType, IntegerType, LongType, ShortType, StructType} +import org.apache.spark.sql.types.{ByteType, DecimalType, DoubleType, FloatType, IntegerType, LongType, ShortType, StructType} import org.apache.spark.sql.{DataFrame, Row} @SerialVersionUID(1L) @@ -84,6 +84,13 @@ class FloatQuantileNonSample(sketchSize: Int, shrinkingFactor: Double) override def itemAsDouble(item: Any): Double = item.asInstanceOf[Float].toDouble } +@SerialVersionUID(1L) +class DecimalQuantileNonSample(sketchSize: Int, shrinkingFactor: Double) + extends UntypedQuantileNonSample(sketchSize, shrinkingFactor) with Serializable { + override def itemAsDouble(item: Any): Double = item.asInstanceOf[java.math.BigDecimal] + .doubleValue() +} + object KLLRunner { def computeKLLSketchesInExtraPass( @@ -139,6 +146,7 @@ object KLLRunner { case ShortType => new ShortQuantileNonSample(sketchSize, shrinkingFactor) case IntegerType => new 
IntQuantileNonSample(sketchSize, shrinkingFactor) case LongType => new LongQuantileNonSample(sketchSize, shrinkingFactor) + case DecimalType() => new DecimalQuantileNonSample(sketchSize, shrinkingFactor) // TODO at the moment, we will throw exceptions for Decimals case _ => throw new IllegalArgumentException(s"Cannot handle ${schema(column).dataType}") } diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala index 9de7e3b25..0e1143a4b 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala @@ -339,7 +339,7 @@ object ColumnProfiler { analyzers ++= Seq(Minimum(name), Maximum(name), Mean(name), StandardDeviation(name), Sum(name)) // Add KLL analyzer. - if (histogram && predefinedTypes(name) != Decimal) { + if (histogram) { analyzers += KLLSketch(name, kllParameters) } if (correlation && (maxCorrelationCols.isEmpty || (numericColumnNames.length <= @@ -842,7 +842,7 @@ object ColumnProfiler { /* Identifies all columns, which: * - * (1) have string, boolean, double, float, integer, long, or short data type + * (1) have string, boolean, double, float, integer, long, decimal, or short data type * (2) have less than `lowCardinalityHistogramThreshold` approximate distinct values */ private[this] def findTargetColumnsForHistograms( @@ -855,14 +855,15 @@ object ColumnProfiler { StringType, BooleanType, DoubleType, FloatType, IntegerType, LongType, ShortType ) val originalStringNumericOrBooleanColumns = schema - .filter { field => validSparkDataTypesForHistograms.contains(field.dataType) } + .filter { field => validSparkDataTypesForHistograms.contains(field.dataType) || + genericStatistics.typeOf(field.name) == Decimal } .map { field => field.name } .toSet genericStatistics.approximateNumDistincts .filter { case (column, _) => originalStringNumericOrBooleanColumns.contains(column) && - Set(String, Boolean, Integral, Fractional).contains(genericStatistics.typeOf + Set(String, Boolean, Integral, Fractional, Decimal).contains(genericStatistics.typeOf (column)) } .filter { case (_, count) => count <= lowCardinalityHistogramThreshold } diff --git a/src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala index a64f8071d..572e177b3 100644 --- a/src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala +++ b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala @@ -68,8 +68,81 @@ class KLLProfileTestApprox extends WordSpec with Matchers with SparkContextSpec assert(expected.histogram == actual.histogram) } + "Column Profiler" should { + "return correct NumericColumnProfiles for decimal column" in + withSparkSession { session => + + val data = getDfWithDecimalFractionalValues(session) + + val actualColumnProfile = ColumnProfiler.profileOptimized(data, Option(Seq("att1", + "att2")), kllParameters = Some(KLLParameters(KLLSketch.DEFAULT_SKETCH_SIZE, KLLSketch + .DEFAULT_SHRINKING_FACTOR, 20)), histogram = true) + .profiles("att1") + + val expectedColumnProfile = NumericColumnProfile( + "att1", + 1.0, + None, + None, + None, + 6, + DataTypeInstances.Decimal, + false, + Map.empty, + Some(Distribution(Map[String, DistributionValue]( + "4.000000000000000000" -> DistributionValue(1, 0.16666666666666666), + "1.000000000000000000" -> DistributionValue(1, 0.16666666666666666), + "5.000000000000000000" -> DistributionValue(1, 0.16666666666666666), + "6.000000000000000000" 
-> DistributionValue(1, 0.16666666666666666), + "2.000000000000000000" -> DistributionValue(1, 0.16666666666666666), + "3.000000000000000000" -> DistributionValue(1, 0.16666666666666666)), 6)), + Some(BucketDistribution(List(BucketValue(1.0, 1.25, 1), + BucketValue(1.25, 1.5, 0), + BucketValue(1.5, 1.75, 0), + BucketValue(1.75, 2.0, 0), + BucketValue(2.0, 2.25, 1), + BucketValue(2.25, 2.5, 0), + BucketValue(2.5, 2.75, 0), + BucketValue(2.75, 3.0, 0), + BucketValue(3.0, 3.25, 1), + BucketValue(3.25, 3.5, 0), + BucketValue(3.5, 3.75, 0), + BucketValue(3.75, 4.0, 0), + BucketValue(4.0, 4.25, 1), + BucketValue(4.25, 4.5, 0), + BucketValue(4.5, 4.75, 0), + BucketValue(4.75, 5.0, 0), + BucketValue(5.0, 5.25, 1), + BucketValue(5.25, 5.5, 0), + BucketValue(5.5, 5.75, 0), + BucketValue(5.75, 6.0, 1)), + List(0.64, 2048.0), + Array(Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0)))), + Some(3.5), + Some(6.0), + Some(1.0), + Some(21.0), + Some(1.707825127659933), + Some(Seq(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, + 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, + 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, + 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, + 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, + 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0)), + Some(Map[String, Double]("att1" -> 1.0, "att2" -> 0.9263710192499128)) + ) + + assertProfilesEqual(expectedColumnProfile, + actualColumnProfile.asInstanceOf[NumericColumnProfile]) + } + "return correct NumericColumnProfiles for numeric columns with correct DataType" in withSparkSession { session => @@ -301,7 +374,8 @@ class KLLProfileTestApprox extends WordSpec with Matchers with SparkContextSpec DataTypeInstances.String, false, Map.empty, - Some(Distribution(Map("4" -> DistributionValue(1, 0.16666666666666666), + Some(Distribution(Map[String, DistributionValue]( + "4" -> DistributionValue(1, 0.16666666666666666), "5" -> DistributionValue(1, 0.16666666666666666), "6" -> DistributionValue(1, 0.16666666666666666), "1" -> DistributionValue(1, 0.16666666666666666), @@ -313,90 +387,6 @@ class KLLProfileTestApprox extends WordSpec with Matchers with SparkContextSpec actualColumnProfile.asInstanceOf[StandardColumnProfile]) } - "return correct StandardColumnProfile plus histogram for Decimal column" in - withSparkSession { session => - - val schema = - StructType(Seq(StructField(name = "num", dataType = DecimalType.SYSTEM_DEFAULT), - StructField(name = "num2", dataType = DecimalType.SYSTEM_DEFAULT))) - - val rows = session.sparkContext.parallelize(Seq( - Row(BigDecimal(1), BigDecimal(4)), - Row(BigDecimal(2), BigDecimal(3)), - Row(BigDecimal(3), BigDecimal(2)), - Row(BigDecimal(4), BigDecimal(1)))) - - val data = session.createDataFrame(rows, schema) - - val actualColumnProfile = ColumnProfiler.profileOptimized(data, Option(Seq("num", "num2")), - histogram = true).profiles("num").asInstanceOf[NumericColumnProfile] - - val expectedColumnProfile = NumericColumnProfile( - "num", - 1.0, - None, - None, - None, - 4, - DataTypeInstances.Decimal, - false, - Map.empty, - None, - None, - Some(2.5), - Some(4), - Some(1), - Some(10), - Some(1.118033988749895), - None, - Some(Map("num2" -> -1.0, "num" -> 1.0)) - ) - - assertProfilesEqual(expectedColumnProfile, actualColumnProfile) - } - - "return correct StandardColumnProfile for Decimal column and 
correlations off" in - withSparkSession { session => - - val schema = - StructType(Seq(StructField(name = "num", dataType = DecimalType.SYSTEM_DEFAULT), - StructField(name = "num2", dataType = DecimalType.SYSTEM_DEFAULT))) - - val rows = session.sparkContext.parallelize(Seq( - Row(BigDecimal(1), BigDecimal(4)), - Row(BigDecimal(2), BigDecimal(3)), - Row(BigDecimal(3), BigDecimal(2)), - Row(BigDecimal(4), BigDecimal(1)))) - - val data = session.createDataFrame(rows, schema) - - val actualColumnProfile = ColumnProfiler.profileOptimized(data, Option(Seq("num", "num2")), - histogram = true, correlation = false).profiles("num").asInstanceOf[NumericColumnProfile] - - val expectedColumnProfile = NumericColumnProfile( - "num", - 1.0, - None, - None, - None, - 4, - DataTypeInstances.Decimal, - false, - Map.empty, - None, - None, - Some(2.5), - Some(4), - Some(1), - Some(10), - Some(1.118033988749895), - None, - None - ) - - assertProfilesEqual(expectedColumnProfile, actualColumnProfile) - } - "return correct NumericColumnProfiles With KLL for numeric columns with correct DataType" in withSparkSession { session => diff --git a/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala b/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala index 9b6ad9d4e..7b56c744e 100644 --- a/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala +++ b/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala @@ -274,6 +274,18 @@ trait FixtureSupport { ).toDF("item", "att1", "att2") } + def getDfWithDecimalFractionalValues(sparkSession: SparkSession): DataFrame = { + import sparkSession.implicits._ + Seq( + ("1", BigDecimal(1.0), BigDecimal(0.0)), + ("2", BigDecimal(2.0), BigDecimal(0.0)), + ("3", BigDecimal(3.0), BigDecimal(0.0)), + ("4", BigDecimal(4.0), BigDecimal(5.0)), + ("5", BigDecimal(5.0), BigDecimal(6.0)), + ("6", BigDecimal(6.0), BigDecimal(7.0)) + ).toDF("item", "att1", "att2") + } + def getDfWithNumericFractionalValuesForKLL(sparkSession: SparkSession): DataFrame = { import sparkSession.implicits._ Seq( From 0cd154bfa33fe9088015ca14232e4a0a358498ea Mon Sep 17 00:00:00 2001 From: moritzmeister Date: Wed, 1 Dec 2021 17:10:12 +0100 Subject: [PATCH 15/21] Fix NaN bug for histograms --- .../scala/com/amazon/deequ/profiles/ColumnProfile.scala | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala index 84df99511..324108653 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala @@ -206,8 +206,9 @@ object ColumnProfiles { val store = new JsonObject() store.add("parameters", entry) - val gson = new Gson() - val dataJson = gson.toJson(kllSketch.data) + val gson = new GsonBuilder().serializeNulls().create(); + val dataJson = gson.toJson(kllSketch.data.map( + subarr => subarr.map(value => normalizeDouble(value)))) store.addProperty("data", dataJson) @@ -218,10 +219,10 @@ object ColumnProfiles { val approxPercentilesJson = new JsonArray() numericColumnProfile.approxPercentiles.foreach { _.foreach { percentile => - approxPercentilesJson.add(new JsonPrimitive(percentile)) + approxPercentilesJson.add( + if (percentile.isNaN) null else new JsonPrimitive(normalizeDouble(percentile))) } } - columnProfileJson.add("approxPercentiles", approxPercentilesJson) case _ => From d8a78e30e1e362a68c39b0f48ce8a29c550ddc2d Mon Sep 17 00:00:00 2001 From: moritzmeister Date: Mon, 25 Apr 2022 
16:58:23 +0200 Subject: [PATCH 16/21] columns need to be filtered also when getting results --- .../scala/com/amazon/deequ/profiles/ColumnProfiler.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala index 0e1143a4b..a06395216 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala @@ -216,7 +216,7 @@ object ColumnProfiler { // We compute exact histograms for all low-cardinality string columns, find those here val targetColumnsForHistograms = findTargetColumnsForHistograms(data.schema, - genericStatistics, lowCardinalityHistogramThreshold) + genericStatistics, lowCardinalityHistogramThreshold, restrictToColumns) // Find out, if we have values for those we can reuse val analyzerContextExistingValues = @@ -398,7 +398,7 @@ object ColumnProfiler { // We compute exact histograms for all low-cardinality string columns, find those here val targetColumnsForHistograms = findTargetColumnsForHistograms(data.schema, - genericStatistics, lowCardinalityHistogramThreshold) + genericStatistics, lowCardinalityHistogramThreshold, restrictToColumns) // Find out, if we have values for those we can reuse val analyzerContextExistingValues = @@ -848,13 +848,15 @@ object ColumnProfiler { private[this] def findTargetColumnsForHistograms( schema: StructType, genericStatistics: GenericColumnStatistics, - lowCardinalityHistogramThreshold: Long) + lowCardinalityHistogramThreshold: Long, + restrictToColumns: Option[Seq[String]] = None) : Seq[String] = { val validSparkDataTypesForHistograms: Set[SparkDataType] = Set( StringType, BooleanType, DoubleType, FloatType, IntegerType, LongType, ShortType ) val originalStringNumericOrBooleanColumns = schema + .filter{ field => restrictToColumns.isEmpty || restrictToColumns.get.contains(field.name) } .filter { field => validSparkDataTypesForHistograms.contains(field.dataType) || genericStatistics.typeOf(field.name) == Decimal } .map { field => field.name } From 1674da56bf11e78064c5c8561b652ab25507197f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Till=20D=C3=B6hmen?= Date: Fri, 20 May 2022 16:18:01 +0200 Subject: [PATCH 17/21] fixed NaN issues and improved statistics JSON --- .../amazon/deequ/analyzers/Histogram.scala | 2 +- .../amazon/deequ/profiles/ColumnProfile.scala | 43 ++++++++- .../deequ/profiles/ColumnProfiler.scala | 37 +++++--- .../com/amazon/deequ/KLL/KLLProfileTest.scala | 2 + .../deequ/KLL/KLLProfileTestApprox.scala | 73 +++++++++------- .../profiles/ColumnProfilerNaNTest.scala | 48 ++++++++++ .../deequ/profiles/ColumnProfilerTest.scala | 22 +++-- .../rules/ConstraintRulesTest.scala | 87 ++++++++++--------- .../amazon/deequ/utils/FixtureSupport.scala | 38 +++++++- 9 files changed, 252 insertions(+), 100 deletions(-) create mode 100644 src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala diff --git a/src/main/scala/com/amazon/deequ/analyzers/Histogram.scala b/src/main/scala/com/amazon/deequ/analyzers/Histogram.scala index 42a7e72e5..78707a2b3 100644 --- a/src/main/scala/com/amazon/deequ/analyzers/Histogram.scala +++ b/src/main/scala/com/amazon/deequ/analyzers/Histogram.scala @@ -133,7 +133,7 @@ case class Histogram( } object Histogram { - val NullFieldReplacement = "NullValue" + val NullFieldReplacement = "-null-" val MaximumAllowedDetailBins = 1000 val count_function = "count" val sum_function = "sum" diff --git 
a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala index 324108653..6cfe29e7d 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala @@ -20,6 +20,8 @@ import com.amazon.deequ.analyzers.DataTypeInstances import com.amazon.deequ.metrics.{BucketDistribution, Distribution} import com.google.gson.{Gson, GsonBuilder, JsonArray, JsonObject, JsonPrimitive} +import scala.collection.immutable.ListMap + /* Profiling results for the columns which will be given to the constraint suggestion engine */ abstract class ColumnProfile { def column: String @@ -28,6 +30,7 @@ abstract class ColumnProfile { def entropy: Option[Double] def uniqueness: Option[Double] def approximateNumDistinctValues: Long + def exactNumDistinctValues: Option[Long] def dataType: DataTypeInstances.Value def isDataTypeInferred: Boolean def typeCounts: Map[String, Long] @@ -41,6 +44,7 @@ case class StandardColumnProfile( entropy: Option[Double], uniqueness: Option[Double], approximateNumDistinctValues: Long, + exactNumDistinctValues: Option[Long], dataType: DataTypeInstances.Value, isDataTypeInferred: Boolean, typeCounts: Map[String, Long], @@ -66,6 +70,7 @@ case class NumericColumnProfile( entropy: Option[Double], uniqueness: Option[Double], approximateNumDistinctValues: Long, + exactNumDistinctValues: Option[Long], dataType: DataTypeInstances.Value, isDataTypeInferred: Boolean, typeCounts: Map[String, Long], @@ -87,7 +92,16 @@ case class ColumnProfiles( object ColumnProfiles { + def toJson(columnProfiles: ColumnProfiles): String = { + toJson(columnProfiles.profiles.values.toSeq, columnProfiles.numRecords) + } + def toJson(columnProfiles: Seq[ColumnProfile]): String = { + // for backwards compatability with hsfs API + toJson(columnProfiles, -1) + } + + def toJson(columnProfiles: Seq[ColumnProfile], numRecords: Long): String = { val json = new JsonObject() @@ -108,6 +122,13 @@ object ColumnProfiles { } columnProfileJson.addProperty("completeness", normalizeDouble(profile.completeness)) + + if (numRecords >= 0) { + columnProfileJson.addProperty("numRecordsNonNull", + math.round(normalizeDouble(profile.completeness * numRecords))) + columnProfileJson.addProperty("numRecordsNull", + numRecords - math.round(normalizeDouble(profile.completeness * numRecords))) + } if (profile.distinctness.isDefined) { columnProfileJson.addProperty("distinctness", normalizeDouble(profile.distinctness.get)) } @@ -121,11 +142,19 @@ object ColumnProfiles { columnProfileJson.addProperty("approximateNumDistinctValues", profile.approximateNumDistinctValues) + if (profile.exactNumDistinctValues.isDefined) { + columnProfileJson.addProperty("exactNumDistinctValues", profile.exactNumDistinctValues.get) + } + if (profile.histogram.isDefined) { val histogram = profile.histogram.get val histogramJson = new JsonArray() - histogram.values.foreach { case (name, distributionValue) => + // sort histogram by descending quantity, then by key + val sorted = ListMap(histogram.values.toSeq.sortBy(kv => (kv._2.absolute, kv._1)) + (Ordering.Tuple2(Ordering[Long].reverse, Ordering.String)): _*) + + sorted.foreach { case (name, distributionValue) => val histogramEntry = new JsonObject() histogramEntry.addProperty("value", name) histogramEntry.addProperty("count", distributionValue.absolute) @@ -186,11 +215,16 @@ object ColumnProfiles { if (profile.histogram.isEmpty) { val histogramJson = new JsonArray() + + // increase 
precision for small bucket sizes + val fp = if (kllSketch.buckets.nonEmpty && scala.math.abs(kllSketch.buckets.head + .highValue - kllSketch.buckets.head.lowValue) > 0.05) "%.2f" else "%f" + kllSketch.buckets.foreach{bucket => val histogramEntry = new JsonObject() - histogramEntry.addProperty("value", "%.2f".formatLocal(java.util.Locale.US, - bucket.lowValue) + "-" + "%.2f".formatLocal(java.util.Locale.US, bucket - .highValue)) + histogramEntry.addProperty("value", fp.formatLocal(java.util.Locale.US, + bucket.lowValue) + " to " + fp.formatLocal(java.util.Locale.US, + bucket.highValue)) histogramEntry.addProperty("count", bucket.count) histogramEntry.addProperty("ratio", bucket.count/totalCount) histogramJson.add(histogramEntry) @@ -251,4 +285,5 @@ object ColumnProfiles { numeric } } + } diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala index a06395216..b1293aa91 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala @@ -18,6 +18,7 @@ package com.amazon.deequ.profiles import scala.util.Success import scala.collection.mutable.ListBuffer +import scala.collection.immutable.ListMap import com.amazon.deequ.analyzers.DataTypeInstances._ import com.amazon.deequ.analyzers._ import com.amazon.deequ.analyzers.runners.AnalysisRunBuilder @@ -53,6 +54,7 @@ private[deequ] case class GenericColumnStatistics( knownTypes: Map[String, DataTypeInstances.Value], typeDetectionHistograms: Map[String, Map[String, Long]], approximateNumDistincts: Map[String, Long], + exactNumDistincts: Map[String, Long], completenesses: Map[String, Double], distinctness: Map[String, Double], entropy: Map[String, Double], @@ -252,7 +254,7 @@ object ColumnProfiler { /** * Profile a (potentially very large) dataset. 
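 * A minimal usage sketch, assuming the same imports the KLL tests in this
 * series use (KLLParameters, KLLSketch) and an input DataFrame df:
 * {{{
 *   val profiles = ColumnProfiler.profileOptimized(
 *     df,
 *     restrictToColumns = Option(Seq("att1", "att2")),
 *     kllParameters = Some(KLLParameters(KLLSketch.DEFAULT_SKETCH_SIZE,
 *       KLLSketch.DEFAULT_SHRINKING_FACTOR, 20)),
 *     histogram = true,
 *     exactUniqueness = true)
 * }}}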
* - * @param data data dataset as dataframe + * @param dataInp data dataset as dataframe * @param restrictToColumns an contain a subset of columns to profile, otherwise * all columns will be considered * @param printStatusUpdates @@ -269,7 +271,7 @@ object ColumnProfiler { */ // scalastyle:off argcount private[deequ] def profileOptimized( - data: DataFrame, + dataInp: DataFrame, restrictToColumns: Option[Seq[String]] = None, printStatusUpdates: Boolean = false, lowCardinalityHistogramThreshold: Int = ColumnProfiler @@ -290,15 +292,16 @@ object ColumnProfiler { // Ensure that all desired columns exist restrictToColumns.foreach { restrictToColumns => restrictToColumns.foreach { columnName => - require(data.schema.fieldNames.contains(columnName), s"Unable to find column $columnName") + require(dataInp.schema.fieldNames.contains(columnName), s"Unable to find column " + + s"$columnName") } } // Find columns we want to profile - val relevantColumns = getRelevantColumns(data.schema, restrictToColumns) + val relevantColumns = getRelevantColumns(dataInp.schema, restrictToColumns) // We assume that data types are predefined by the schema, and skip the data type detection - val predefinedTypes = data.schema.fields + val predefinedTypes = dataInp.schema.fields .filter { column => relevantColumns.contains(column.name) } .map { field => val knownType = field.dataType match { @@ -319,6 +322,10 @@ object ColumnProfiler { val numericColumnNames = relevantColumns .filter { name => Set(Integral, Fractional, Decimal).contains(predefinedTypes(name)) } + // replace NaNs with null in numeric columns + val na_replacement = numericColumnNames.map((_, "null")).toMap + val data = dataInp.na.fill(na_replacement) + // First pass if (printStatusUpdates) { println("### PROFILING: Computing generic column statistics in pass (1/2)...") @@ -356,7 +363,8 @@ object ColumnProfiler { (exactUniquenessCols.isDefined && exactUniquenessCols.get.contains(name))) && predefinedTypes(name) != Unknown) { // Add grouping analyzers. 
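// A self-contained sanity sketch, not part of this patch, of the grouped
// metrics wired up below, written in plain Spark. On the att2 fixture
// [0, 0, 0, 5, 6, 7] used by the JSON tests in this series it yields
// distinctness 4/6 and uniqueness 3/6, matching the expected profile output.
import org.apache.spark.sql.DataFrame

def groupedMetricsSketch(df: DataFrame, column: String): (Double, Double, Long) = {
  val numRows = df.count().toDouble
  val frequencies = df.groupBy(column).count()             // count per distinct value
  val exactDistinct = frequencies.count()                  // what CountDistinct reports
  val singletons = frequencies.filter(frequencies("count") === 1).count()
  // (distinctness, uniqueness, exact distinct count)
  (exactDistinct / numRows, singletons / numRows, exactDistinct)
}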
- analyzers ++= Seq(Uniqueness(name), Distinctness(name), Entropy(name)) + analyzers ++= Seq(Uniqueness(name), Distinctness(name), Entropy(name), + CountDistinct(name)) } analyzers @@ -655,24 +663,29 @@ object ColumnProfiler { analyzer.column -> metric.value.get.toLong } + val exactNumDistincts = results.metricMap + .collect { case (analyzer: CountDistinct, metric: DoubleMetric) => + analyzer.columns.head -> metric.value.get.toLong + } + val completenesses = results.metricMap .collect { case (analyzer: Completeness, metric: DoubleMetric) => analyzer.column -> metric.value.get } val entropy = results.metricMap - .collect { case (analyzer: Entropy, metric: DoubleMetric) => + .collect { case (analyzer: Entropy, metric: DoubleMetric) if metric.value.isSuccess => analyzer.column -> metric.value.get } val uniqueness = results.metricMap - .collect { case (analyzer: Uniqueness, metric: DoubleMetric) => + .collect { case (analyzer: Uniqueness, metric: DoubleMetric) if metric.value.isSuccess => // we only compute uniqueness for single columns analyzer.columns.head -> metric.value.get } val distinctness = results.metricMap - .collect { case (analyzer: Distinctness, metric: DoubleMetric) => + .collect { case (analyzer: Distinctness, metric: DoubleMetric) if metric.value.isSuccess => analyzer.columns.head -> metric.value.get } @@ -698,7 +711,8 @@ object ColumnProfiler { .toMap GenericColumnStatistics(numRecords, inferredTypes, knownTypes, typeDetectionHistograms, - approximateNumDistincts, completenesses, distinctness, entropy, uniqueness, predefinedTypes) + approximateNumDistincts, exactNumDistincts, completenesses, distinctness, entropy, + uniqueness, predefinedTypes) } @@ -988,6 +1002,7 @@ object ColumnProfiler { val entropy = genericStats.entropy.get(name) val uniqueness = genericStats.uniqueness.get(name) val approxNumDistinct = genericStats.approximateNumDistincts(name) + val exactNumDistinct = genericStats.exactNumDistincts.get(name) val dataType = genericStats.typeOf(name) val isDataTypeInferred = genericStats.inferredTypes.contains(name) val histogram = categoricalStats.histograms.get(name) @@ -1004,6 +1019,7 @@ object ColumnProfiler { entropy, uniqueness, approxNumDistinct, + exactNumDistinct, dataType, isDataTypeInferred, typeCounts, @@ -1039,6 +1055,7 @@ object ColumnProfiler { entropy, uniqueness, approxNumDistinct, + exactNumDistinct, dataType, isDataTypeInferred, typeCounts, diff --git a/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala index e462b26d9..81cedeb3c 100644 --- a/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala +++ b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala @@ -68,6 +68,7 @@ class KLLProfileTest extends WordSpec with Matchers with SparkContextSpec Some(1.0), Some(1.0), 6, + Some(6), DataTypeInstances.Fractional, false, Map.empty, @@ -113,6 +114,7 @@ class KLLProfileTest extends WordSpec with Matchers with SparkContextSpec Some(1.0), Some(1.0), 30, + Some(30), DataTypeInstances.Fractional, false, Map.empty, diff --git a/src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala index 572e177b3..a47b14497 100644 --- a/src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala +++ b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala @@ -88,6 +88,7 @@ class KLLProfileTestApprox extends WordSpec with Matchers with SparkContextSpec None, None, 6, + None, DataTypeInstances.Decimal, false, Map.empty, @@ -160,6 
+161,7 @@ class KLLProfileTestApprox extends WordSpec with Matchers with SparkContextSpec None, None, 6, + None, DataTypeInstances.Fractional, false, Map.empty, @@ -216,17 +218,17 @@ class KLLProfileTestApprox extends WordSpec with Matchers with SparkContextSpec val profile = ColumnProfiler.profileOptimized(data, Option(Seq("att1", "att2")), kllParameters = Some(KLLParameters(KLLSketch.DEFAULT_SKETCH_SIZE, KLLSketch - .DEFAULT_SHRINKING_FACTOR, 20)), histogram = true) - val profiles = profile.profiles.map{pro => pro._2}.toSeq - val json_profile = ColumnProfiles.toJson(profiles) + .DEFAULT_SHRINKING_FACTOR, 20)), histogram = true, exactUniqueness = true) + val json_profile = ColumnProfiles.toJson(profile) val correct_profile = "{\"columns\":[{\"column\":\"att1\",\"dataType\":\"Fractional\"," + - "\"isDataTypeInferred\":\"false\",\"completeness\":1.0," + - "\"approximateNumDistinctValues\":6,\"histogram\":[{\"value\":\"6.0\",\"count\":1," + - "\"ratio\":0.16666666666666666},{\"value\":\"3.0\",\"count\":1," + - "\"ratio\":0.16666666666666666},{\"value\":\"2.0\",\"count\":1," + - "\"ratio\":0.16666666666666666},{\"value\":\"4.0\",\"count\":1," + - "\"ratio\":0.16666666666666666},{\"value\":\"1.0\",\"count\":1," + + "\"isDataTypeInferred\":\"false\",\"completeness\":1.0,\"numRecordsNonNull\":6," + + "\"numRecordsNull\":0,\"distinctness\":1.0,\"entropy\":1.791759469228055," + + "\"uniqueness\":1.0,\"approximateNumDistinctValues\":6,\"exactNumDistinctValues\":6," + + "\"histogram\":[{\"value\":\"1.0\",\"count\":1,\"ratio\":0.16666666666666666}," + + "{\"value\":\"2.0\",\"count\":1,\"ratio\":0.16666666666666666},{\"value\":\"3.0\"," + + "\"count\":1,\"ratio\":0.16666666666666666},{\"value\":\"4.0\",\"count\":1," + "\"ratio\":0.16666666666666666},{\"value\":\"5.0\",\"count\":1," + + "\"ratio\":0.16666666666666666},{\"value\":\"6.0\",\"count\":1," + "\"ratio\":0.16666666666666666}],\"mean\":3.5,\"maximum\":6.0,\"minimum\":1.0," + "\"sum\":21.0,\"stdDev\":1.707825127659933,\"correlations\":[{\"column\":\"att2\"," + "\"correlation\":0.9263710192499128},{\"column\":\"att1\",\"correlation\":1.0}]," + @@ -255,30 +257,32 @@ class KLLProfileTestApprox extends WordSpec with Matchers with SparkContextSpec "4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0," + "5.0,5.0,5.0,5.0,5.0,5.0,5.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0," + "6.0,6.0]},{\"column\":\"att2\",\"dataType\":\"Fractional\"," + - "\"isDataTypeInferred\":\"false\",\"completeness\":1.0," + - "\"approximateNumDistinctValues\":4,\"histogram\":[{\"value\":\"0.0\",\"count\":3," + - "\"ratio\":0.5},{\"value\":\"6.0\",\"count\":1,\"ratio\":0.16666666666666666}," + - "{\"value\":\"7.0\",\"count\":1,\"ratio\":0.16666666666666666},{\"value\":\"5.0\"," + - "\"count\":1,\"ratio\":0.16666666666666666}],\"mean\":3.0,\"maximum\":7.0," + - "\"minimum\":0.0,\"sum\":18.0,\"stdDev\":3.0550504633038935," + - "\"correlations\":[{\"column\":\"att2\",\"correlation\":1.0},{\"column\":\"att1\"," + - "\"correlation\":0.9263710192499128}],\"kll\":{\"buckets\":[{\"low_value\":0.0," + - "\"high_value\":0.35,\"count\":3,\"ratio\":0.5},{\"low_value\":0.35,\"high_value\":0.7," + - "\"count\":0,\"ratio\":0.0},{\"low_value\":0.7,\"high_value\":1.05,\"count\":0," + - "\"ratio\":0.0},{\"low_value\":1.05,\"high_value\":1.4,\"count\":0,\"ratio\":0.0}," + - "{\"low_value\":1.4,\"high_value\":1.75,\"count\":0,\"ratio\":0.0},{\"low_value\":1.75," + - 
"\"high_value\":2.1,\"count\":0,\"ratio\":0.0},{\"low_value\":2.1,\"high_value\":2.45," + - "\"count\":0,\"ratio\":0.0},{\"low_value\":2.45,\"high_value\":2.8,\"count\":0," + - "\"ratio\":0.0},{\"low_value\":2.8,\"high_value\":3.15,\"count\":0,\"ratio\":0.0}," + - "{\"low_value\":3.15,\"high_value\":3.5,\"count\":0,\"ratio\":0.0},{\"low_value\":3.5," + - "\"high_value\":3.85,\"count\":0,\"ratio\":0.0},{\"low_value\":3.85,\"high_value\":4.2," + - "\"count\":0,\"ratio\":0.0},{\"low_value\":4.2,\"high_value\":4.55,\"count\":0," + - "\"ratio\":0.0},{\"low_value\":4.55,\"high_value\":4.9,\"count\":0,\"ratio\":0.0}," + - "{\"low_value\":4.9,\"high_value\":5.25,\"count\":1,\"ratio\":0.16666666666666666}," + - "{\"low_value\":5.25,\"high_value\":5.6,\"count\":0,\"ratio\":0.0},{\"low_value\":5.6," + - "\"high_value\":5.95,\"count\":0,\"ratio\":0.0},{\"low_value\":5.95,\"high_value\":6.3," + - "\"count\":1,\"ratio\":0.16666666666666666},{\"low_value\":6.3,\"high_value\":6.65," + - "\"count\":0,\"ratio\":0.0},{\"low_value\":6.65,\"high_value\":7.0,\"count\":1," + + "\"isDataTypeInferred\":\"false\",\"completeness\":1.0,\"numRecordsNonNull\":6," + + "\"numRecordsNull\":0,\"distinctness\":0.6666666666666666,\"entropy\":1.242453324894," + + "\"uniqueness\":0.5,\"approximateNumDistinctValues\":4,\"exactNumDistinctValues\":4," + + "\"histogram\":[{\"value\":\"0.0\",\"count\":3,\"ratio\":0.5},{\"value\":\"5.0\"," + + "\"count\":1,\"ratio\":0.16666666666666666},{\"value\":\"6.0\",\"count\":1," + + "\"ratio\":0.16666666666666666},{\"value\":\"7.0\",\"count\":1," + + "\"ratio\":0.16666666666666666}],\"mean\":3.0,\"maximum\":7.0,\"minimum\":0.0," + + "\"sum\":18.0,\"stdDev\":3.0550504633038935,\"correlations\":[{\"column\":\"att2\"," + + "\"correlation\":1.0},{\"column\":\"att1\",\"correlation\":0.9263710192499128}]," + + "\"kll\":{\"buckets\":[{\"low_value\":0.0,\"high_value\":0.35,\"count\":3," + + "\"ratio\":0.5},{\"low_value\":0.35,\"high_value\":0.7,\"count\":0,\"ratio\":0.0}," + + "{\"low_value\":0.7,\"high_value\":1.05,\"count\":0,\"ratio\":0.0},{\"low_value\":1.05," + + "\"high_value\":1.4,\"count\":0,\"ratio\":0.0},{\"low_value\":1.4,\"high_value\":1.75," + + "\"count\":0,\"ratio\":0.0},{\"low_value\":1.75,\"high_value\":2.1,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":2.1,\"high_value\":2.45,\"count\":0,\"ratio\":0.0}," + + "{\"low_value\":2.45,\"high_value\":2.8,\"count\":0,\"ratio\":0.0},{\"low_value\":2.8," + + "\"high_value\":3.15,\"count\":0,\"ratio\":0.0},{\"low_value\":3.15,\"high_value\":3.5," + + "\"count\":0,\"ratio\":0.0},{\"low_value\":3.5,\"high_value\":3.85,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":3.85,\"high_value\":4.2,\"count\":0,\"ratio\":0.0}," + + "{\"low_value\":4.2,\"high_value\":4.55,\"count\":0,\"ratio\":0.0},{\"low_value\":4.55," + + "\"high_value\":4.9,\"count\":0,\"ratio\":0.0},{\"low_value\":4.9,\"high_value\":5.25," + + "\"count\":1,\"ratio\":0.16666666666666666},{\"low_value\":5.25,\"high_value\":5.6," + + "\"count\":0,\"ratio\":0.0},{\"low_value\":5.6,\"high_value\":5.95,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":5.95,\"high_value\":6.3,\"count\":1," + + "\"ratio\":0.16666666666666666},{\"low_value\":6.3,\"high_value\":6.65,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":6.65,\"high_value\":7.0,\"count\":1," + "\"ratio\":0.16666666666666666}],\"sketch\":{\"parameters\":{\"c\":0.64,\"k\":2048.0}," + "\"data\":\"[[0.0,0.0,0.0,5.0,6.0,7.0]]\"}},\"approxPercentiles\":[0.0,0.0,0.0,0.0,0.0," + 
"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0," + @@ -306,6 +310,7 @@ class KLLProfileTestApprox extends WordSpec with Matchers with SparkContextSpec Some(1.791759469228055), Some(1.0), 6, + Some(6), DataTypeInstances.Fractional, false, Map.empty, @@ -371,6 +376,7 @@ class KLLProfileTestApprox extends WordSpec with Matchers with SparkContextSpec Some(1.791759469228055), Some(1.0), 6, + Some(6), DataTypeInstances.String, false, Map.empty, @@ -404,6 +410,7 @@ class KLLProfileTestApprox extends WordSpec with Matchers with SparkContextSpec Some(3.4011973816621546), Some(1.0), 30, + Some(30), DataTypeInstances.Fractional, false, Map.empty, diff --git a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala new file mode 100644 index 000000000..c30c83399 --- /dev/null +++ b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala @@ -0,0 +1,48 @@ +/** + * Copyright 2021 Logical Clocks AB. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may not + * use this file except in compliance with the License. A copy of the License + * is located at + * + * http://aws.amazon.com/apache2.0/ + * + * or in the "license" file accompanying this file. This file is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + * + */ + +package com.amazon.deequ.profiles + +import com.amazon.deequ.SparkContextSpec +import com.amazon.deequ.utils.FixtureSupport +import org.scalatest.{Matchers, color} +import org.scalatest.wordspec.AnyWordSpec +class ColumnProfilerNaNTest extends AnyWordSpec with Matchers with SparkContextSpec with FixtureSupport { + + "Column Profiler NaN Test" should { + "return results for data frame with NaN and null values without failure" in withSparkSession { + sparkSession => + val df = getDfWithNas(sparkSession) + + val runner: ColumnProfilerRunBuilder = new ColumnProfilerRunner() + .onData(df) + .withCorrelation(true, 50) + .withHistogram(true, 20) + .withExactUniqueness (true) + + val result = runner.run() + + val matches = result.profiles.map { case (colname: String, profile: ColumnProfile) => + val nacount = df.filter(df(colname).isNull || df(colname).isNaN).count() + val nacount_profile = result.numRecords - scala.math.round(profile.completeness * + result.numRecords) + nacount == nacount_profile + }.toSeq + + assert(matches.forall(_ == true)) + } + } +} \ No newline at end of file diff --git a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala index a02382ebe..4bbf5abb0 100644 --- a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala +++ b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala @@ -65,6 +65,7 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec Some(0.5623351446188083), Some(0.25), 2, + None, DataTypeInstances.String, true, Map( @@ -119,6 +120,7 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec Some(1.791759469228055), Some(1.0), 6, + None, DataTypeInstances.String, false, Map(), @@ -145,6 +147,7 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec Some(0.5623351446188083), Some(0.25), 2, + None, DataTypeInstances.String, true, Map( @@ -177,6 
+180,7 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec Some(1.0), Some(1.0), 6, + Some(6), DataTypeInstances.Integral, true, Map( @@ -221,6 +225,7 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec Some(1.0), Some(1.0), 6, + Some(6), DataTypeInstances.Integral, true, Map( @@ -266,6 +271,7 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec Some(1.0), Some(1.0), 6, + Some(6), DataTypeInstances.Integral, true, Map( @@ -346,6 +352,7 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec Some(1.0), Some(1.0), 6, + Some(6), DataTypeInstances.Fractional, false, Map.empty, @@ -383,6 +390,7 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec Some(0.5623351446188083), Some(0.25), 2, + None, DataTypeInstances.String, isDataTypeInferred = true, Map( @@ -594,6 +602,7 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec Some(1.0), Some(1.0), 891, + Some(891), DataTypeInstances.Integral, false, Map.empty, @@ -605,21 +614,22 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec Some(1.0), Some(1.0), 2, + Some(2), DataTypeInstances.Integral, false, Map.empty, None), - StandardColumnProfile("Pclass", 1.0, Some(1.0), Some(1.0), Some(1.0), 3, + StandardColumnProfile("Pclass", 1.0, Some(1.0), Some(1.0), Some(1.0), 3, Some(3), DataTypeInstances.Integral, false, Map.empty, None), - StandardColumnProfile("Name", 1.0, Some(1.0), Some(1.0), Some(1.0), 0, + StandardColumnProfile("Name", 1.0, Some(1.0), Some(1.0), Some(1.0), 0, Some(0), DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Sex", 1.0, Some(1.0), Some(1.0), Some(1.0), 2, + StandardColumnProfile("Sex", 1.0, Some(1.0), Some(1.0), Some(1.0), 2, Some(2), DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Ticket", 1.0, Some(1.0), Some(1.0), Some(1.0), 681, + StandardColumnProfile("Ticket", 1.0, Some(1.0), Some(1.0), Some(1.0), 681, Some(681), DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Fare", 1.0, Some(1.0), Some(1.0), Some(1.0), 0, + StandardColumnProfile("Fare", 1.0, Some(1.0), Some(1.0), Some(1.0), 0, Some(0), DataTypeInstances.Fractional, false, Map.empty, None), - StandardColumnProfile("Cabin", 0.22, Some(1.0), Some(1.0), Some(1.0), 0, + StandardColumnProfile("Cabin", 0.22, Some(1.0), Some(1.0), Some(1.0), 0, Some(0), DataTypeInstances.String, true, Map.empty, None) ) diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala index d18282901..be29496dc 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala @@ -35,9 +35,9 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "be applied correctly" in { val complete = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, - String, false, Map.empty, None) + Some(100), String, false, Map.empty, None) val incomplete = StandardColumnProfile("col1", .25, Some(1.0), Some(1.0), Some(1.0), 100, - String, false, Map.empty, None) + Some(100), String, false, Map.empty, None) val completeInteger = getFakeNumericColumnProfileWithMinMaxMeanAndStdDev( @@ -132,9 +132,9 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "be applied correctly" in { 
val complete = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, - String, false, Map.empty, None) + Some(100), String, false, Map.empty, None) val incomplete = StandardColumnProfile("col1", .25, Some(1.0), Some(1.0), Some(1.0), 100, - String, false, Map.empty, None) + Some(100), String, false, Map.empty, None) assert(!RetainCompletenessRule().shouldBeApplied(complete, 1000)) assert(RetainCompletenessRule().shouldBeApplied(incomplete, 1000)) @@ -196,13 +196,13 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "be applied correctly" in { val unique = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, - String, false, Map.empty, None) + Some(100), String, false, Map.empty, None) val maybeUnique = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 95, - String, false, Map.empty, None) + Some(95), String, false, Map.empty, None) val maybeNonUnique = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 91, - String, false, Map.empty, None) + Some(91), String, false, Map.empty, None) val nonUnique = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 20, - String, false, Map.empty, None) + Some(20), String, false, Map.empty, None) assert(UniqueIfApproximatelyUniqueRule().shouldBeApplied(unique, 100)) assert(UniqueIfApproximatelyUniqueRule().shouldBeApplied(maybeUnique, 100)) @@ -263,23 +263,23 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "be applied correctly" in { val string = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, - String, true, Map.empty, None) + Some(100), String, true, Map.empty, None) val boolean = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, - Boolean, true, Map.empty, None) + Some(100), Boolean, true, Map.empty, None) val fractional = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, - Fractional, true, Map.empty, None) + Some(100), Fractional, true, Map.empty, None) val integer = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, - Integral, true, Map.empty, None) + Some(100), Integral, true, Map.empty, None) val unknown = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, - Unknown, true, Map.empty, None) + Some(100), Unknown, true, Map.empty, None) val stringNonInferred = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), - 100, String, false, Map.empty, None) + 100, Some(100), String, false, Map.empty, None) val booleanNonInferred = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), - 100, Boolean, false, Map.empty, None) + 100, Some(100), Boolean, false, Map.empty, None) val fractionalNonInferred = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), - Some(1.0), 100, Fractional, false, Map.empty, None) + Some(1.0), 100, Some(100), Fractional, false, Map.empty, None) val integerNonInferred = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), - 100, Integral, false, Map.empty, None) + 100, Some(100), Integral, false, Map.empty, None) assert(!RetainTypeRule().shouldBeApplied(string, 100)) assert(!RetainTypeRule().shouldBeApplied(unknown, 100)) @@ -397,26 +397,27 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val noDistribution = Distribution(Map.empty, 0) val stringWithNonSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), - Some(1.0), 100, String, false, Map.empty, Some(nonSkewedDist)) 
+ Some(1.0), 100, Some(100), String, false, Map.empty, Some(nonSkewedDist)) val integralWithNonSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), - Some(1.0), 100, DataTypeInstances.Integral, false, Map.empty, Some(nonSkewedIntegralDist)) + Some(1.0), 100, Some(100), DataTypeInstances.Integral, false, Map.empty, + Some(nonSkewedIntegralDist)) val stringWithFlgDist = StandardColumnProfile("flg", 1.0, Some(1.0), Some(1.0), Some(1.0), - 2, String, false, Map.empty, Some(flgDist)) + 2, Some(2), String, false, Map.empty, Some(flgDist)) val integralWithFlgDist = StandardColumnProfile("flg", 1.0, Some(1.0), Some(1.0), Some(1.0), - 2, DataTypeInstances.Integral, false, Map.empty, Some(flgDist)) + 2, Some(2), DataTypeInstances.Integral, false, Map.empty, Some(flgDist)) val stringWithSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), - Some(1.0), 100, String, false, Map.empty, Some(skewedDist)) + Some(1.0), 100, Some(100), String, false, Map.empty, Some(skewedDist)) val stringNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 95, - String, false, Map.empty, None) + Some(95), String, false, Map.empty, None) val boolNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 94, - Boolean, false, Map.empty, None) + Some(94), Boolean, false, Map.empty, None) val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), - 20, Boolean, false, Map.empty, Some(noDistribution)) + 20, Some(20), Boolean, false, Map.empty, Some(noDistribution)) val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), - Some(1.0), 100, DataTypeInstances.Integral, false, Map.empty, Some(skewedDist)) + Some(1.0), 100, Some(100), DataTypeInstances.Integral, false, Map.empty, Some(skewedDist)) val integralNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), - 95, DataTypeInstances.Integral, false, Map.empty, None) + 95, Some(95), DataTypeInstances.Integral, false, Map.empty, None) assert(CategoricalRangeRule().shouldBeApplied(stringWithNonSkewedDist, 100)) assert(CategoricalRangeRule().shouldBeApplied(integralWithNonSkewedDist, 100)) @@ -579,37 +580,37 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val noDistribution = Distribution(Map.empty, 0) val stringWithNonSkewedDistWithFractionalCategoricalRange = StandardColumnProfile( - "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, + "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Some(100), String, false, Map.empty, Some(nonSkewedDistWithFractionalCategoricalRange)) val stringWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile( - "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, + "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Some(100), String, false, Map.empty, Some(nonSkewedDistWithActualCategoricalRange)) val stringWithSomewhatSkewedDist = StandardColumnProfile( - "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, + "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Some(100), String, false, Map.empty, Some(somewhatSkewedDist)) val stringWithSkewedDist = StandardColumnProfile( - "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, + "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Some(100), String, false, Map.empty, Some(skewedDist)) val stringNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), - 95, String, false, Map.empty, None) 
+ 95, Some(95), String, false, Map.empty, None) val boolNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 94, - Boolean, false, Map.empty, None) + Some(94), Boolean, false, Map.empty, None) val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), - 20, Boolean, false, Map.empty, Some(noDistribution)) + 20, Some(20), Boolean, false, Map.empty, Some(noDistribution)) val integralWithNonSkewedDistWithFractionalCategoricalRange = StandardColumnProfile("col1", - 1.0, Some(1.0), Some(1.0), Some(1.0), 100, DataTypeInstances.Integral, false, Map.empty, - Some(nonSkewedIntegralDistWithFractionalCategoricalRange)) + 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Some(100), DataTypeInstances.Integral, false, + Map.empty, Some(nonSkewedIntegralDistWithFractionalCategoricalRange)) val integralWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile( - "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, DataTypeInstances.Integral, false, - Map.empty, Some(nonSkewedIntegralDistWithActualCategoricalRange)) + "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Some(100), DataTypeInstances.Integral, + false, Map.empty, Some(nonSkewedIntegralDistWithActualCategoricalRange)) val integralWithSomewhatSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), - Some(1.0), 100, DataTypeInstances.Integral, false, Map.empty, + Some(1.0), 100, Some(100), DataTypeInstances.Integral, false, Map.empty, Some(somewhatSkewedIntegralDist)) val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), - Some(1.0), 100, DataTypeInstances.Integral, false, Map.empty, Some(skewedIntegralDist)) + Some(1.0), 100, Some(100), DataTypeInstances.Integral, false, Map.empty, Some(skewedIntegralDist)) val integralNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), - 95, DataTypeInstances.Integral, false, Map.empty, None) + 95, Some(95), DataTypeInstances.Integral, false, Map.empty, None) assert(FractionalCategoricalRangeRule().shouldBeApplied(stringWithSomewhatSkewedDist, 100)) assert(FractionalCategoricalRangeRule().shouldBeApplied( @@ -732,8 +733,8 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "NonNegativeNumbersRule and PositiveNumbersRule" should { "be applied correctly" in { def columnProfileWithMinimum(minimum: Double): NumericColumnProfile = { - NumericColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Fractional, - isDataTypeInferred = false, Map.empty, None, None, Some(10), Some(100), + NumericColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Some(100), + Fractional, isDataTypeInferred = false, Map.empty, None, None, Some(10), Some(100), Some(minimum), Some(10000), Some(1.0), None, None) } diff --git a/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala b/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala index 7b56c744e..41a30c420 100644 --- a/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala +++ b/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala @@ -19,9 +19,8 @@ package com.amazon.deequ.utils import com.amazon.deequ.analyzers.DataTypeInstances import com.amazon.deequ.profiles.NumericColumnProfile import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.Row -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, MapType, StringType, StructField, StructType} +import 
org.apache.spark.sql.{DataFrame, Row, SparkSession} import scala.util.Random @@ -337,6 +336,39 @@ trait FixtureSupport { "onlyUniqueWithOtherNonUnique", "halfUniqueCombinedWithNonUnique") } + def getDfWithNas(sparkSession: SparkSession): DataFrame = { + import org.apache.spark.sql.functions._ + + val schema = StructType( Array( + StructField("nullstr", StringType, true), + StructField("nullstrmixed", StringType, true), + StructField("nullint", IntegerType, true), + StructField("nullintmixed", IntegerType, true), + StructField("nulldbl", DoubleType, true), + StructField("nulldblna", DoubleType, true), + StructField("nulldblnamixed", DoubleType, true), + StructField("nullna", DoubleType, true) + )) + + val data = Seq( + Row(null, "b", null, 2, null, java.lang.Double.NaN, 2.0, java.lang.Double.NaN), + Row(null, null, null, null, null, null, null, java.lang.Double.NaN), + Row(null, "c", null, 1, null, java.lang.Double.NaN, 1.0, java.lang.Double.NaN), + Row(null, null, null, null, null, null, java.lang.Double.NaN, java.lang.Double.NaN), + Row(null, "a", null, 0, null, null, 1.0, java.lang.Double.NaN), + Row(null, "a", null, 0, null, null, 1.0, java.lang.Double.NaN) + ) + + val nulldf = sparkSession.createDataFrame( + sparkSession.sparkContext.parallelize(data), + schema + ) + + nulldf.withColumn("nullstrmixed2", + when(col("nullstrmixed").equalTo("null"), null) + .otherwise(col("nullstrmixed"))) + } + def getDfWithDistinctValues(sparkSession: SparkSession): DataFrame = { import sparkSession.implicits._ From b21d74ed530a9dc564a628bb59566c99c5aa1ea8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Till=20D=C3=B6hmen?= Date: Fri, 27 May 2022 15:10:03 +0200 Subject: [PATCH 18/21] fixed stylecheck --- .../scala/com/amazon/deequ/profiles/ColumnProfile.scala | 4 ++-- .../com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala | 6 ++++-- .../deequ/suggestions/rules/ConstraintRulesTest.scala | 3 ++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala index 6cfe29e7d..d57cc7b2c 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala @@ -98,7 +98,7 @@ object ColumnProfiles { def toJson(columnProfiles: Seq[ColumnProfile]): String = { // for backwards compatability with hsfs API - toJson(columnProfiles, -1) + toJson(columnProfiles, -1) } def toJson(columnProfiles: Seq[ColumnProfile], numRecords: Long): String = { @@ -218,7 +218,7 @@ object ColumnProfiles { // increase precision for small bucket sizes val fp = if (kllSketch.buckets.nonEmpty && scala.math.abs(kllSketch.buckets.head - .highValue - kllSketch.buckets.head.lowValue) > 0.05) "%.2f" else "%f" + .highValue - kllSketch.buckets.head.lowValue) > 0.05) { "%.2f" } else { "%f" } kllSketch.buckets.foreach{bucket => val histogramEntry = new JsonObject() diff --git a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala index c30c83399..cd936a4a3 100644 --- a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala +++ b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala @@ -20,7 +20,9 @@ import com.amazon.deequ.SparkContextSpec import com.amazon.deequ.utils.FixtureSupport import org.scalatest.{Matchers, color} import org.scalatest.wordspec.AnyWordSpec -class ColumnProfilerNaNTest extends AnyWordSpec with Matchers with SparkContextSpec 
with FixtureSupport { + +class ColumnProfilerNaNTest extends AnyWordSpec with Matchers with SparkContextSpec with + FixtureSupport { "Column Profiler NaN Test" should { "return results for data frame with NaN and null values without failure" in withSparkSession { @@ -45,4 +47,4 @@ class ColumnProfilerNaNTest extends AnyWordSpec with Matchers with SparkContextS assert(matches.forall(_ == true)) } } -} \ No newline at end of file +} diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala index be29496dc..a9655d248 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala @@ -608,7 +608,8 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext Some(1.0), 100, Some(100), DataTypeInstances.Integral, false, Map.empty, Some(somewhatSkewedIntegralDist)) val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), - Some(1.0), 100, Some(100), DataTypeInstances.Integral, false, Map.empty, Some(skewedIntegralDist)) + Some(1.0), 100, Some(100), DataTypeInstances.Integral, false, Map.empty, Some + (skewedIntegralDist)) val integralNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 95, Some(95), DataTypeInstances.Integral, false, Map.empty, None) From 8edc0e998ed995300abd1594e463c3398d9d5894 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Till=20D=C3=B6hmen?= Date: Fri, 20 May 2022 16:18:01 +0200 Subject: [PATCH 19/21] fixed NaN issues and improved statistics JSON --- src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala | 2 +- .../com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala | 4 +--- .../amazon/deequ/suggestions/rules/ConstraintRulesTest.scala | 3 +-- src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala | 1 - 4 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala index d57cc7b2c..9c2905a0c 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala @@ -218,7 +218,7 @@ object ColumnProfiles { // increase precision for small bucket sizes val fp = if (kllSketch.buckets.nonEmpty && scala.math.abs(kllSketch.buckets.head - .highValue - kllSketch.buckets.head.lowValue) > 0.05) { "%.2f" } else { "%f" } + .highValue - kllSketch.buckets.head.lowValue) > 0.05) "%.2f" else "%f" kllSketch.buckets.foreach{bucket => val histogramEntry = new JsonObject() diff --git a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala index cd936a4a3..ac596bc9a 100644 --- a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala +++ b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala @@ -20,9 +20,7 @@ import com.amazon.deequ.SparkContextSpec import com.amazon.deequ.utils.FixtureSupport import org.scalatest.{Matchers, color} import org.scalatest.wordspec.AnyWordSpec - -class ColumnProfilerNaNTest extends AnyWordSpec with Matchers with SparkContextSpec with - FixtureSupport { +class ColumnProfilerNaNTest extends AnyWordSpec with Matchers with SparkContextSpec with FixtureSupport { "Column Profiler NaN Test" should { "return results for data frame with NaN and null values without failure" in 
withSparkSession { diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala index a9655d248..be29496dc 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala @@ -608,8 +608,7 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext Some(1.0), 100, Some(100), DataTypeInstances.Integral, false, Map.empty, Some(somewhatSkewedIntegralDist)) val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), - Some(1.0), 100, Some(100), DataTypeInstances.Integral, false, Map.empty, Some - (skewedIntegralDist)) + Some(1.0), 100, Some(100), DataTypeInstances.Integral, false, Map.empty, Some(skewedIntegralDist)) val integralNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 95, Some(95), DataTypeInstances.Integral, false, Map.empty, None) diff --git a/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala b/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala index 41a30c420..c80497eb7 100644 --- a/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala +++ b/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala @@ -18,7 +18,6 @@ package com.amazon.deequ.utils import com.amazon.deequ.analyzers.DataTypeInstances import com.amazon.deequ.profiles.NumericColumnProfile -import org.apache.spark.sql.types.StructType import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, MapType, StringType, StructField, StructType} import org.apache.spark.sql.{DataFrame, Row, SparkSession} From 05b4e1af810e2506a8b253dc9684114e32fe1b21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20de=20la=20R=C3=BAa=20Mart=C3=ADnez?= Date: Mon, 30 Oct 2023 14:40:39 +0100 Subject: [PATCH 20/21] Resolve conflicts for 2.0.4 - spark3.3 --- pom.xml | 4 +++- .../scala/com/amazon/deequ/profiles/ColumnProfile.scala | 6 +++++- .../scala/com/amazon/deequ/profiles/ColumnProfiler.scala | 7 ++++++- src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala | 7 ++++++- 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index 40c378eb1..2a775b6d4 100644 --- a/pom.xml +++ b/pom.xml @@ -6,6 +6,8 @@ com.logicalclocks deequ_${scala.major.version} + + 2.0.4.1-SNAPSHOT deequ @@ -90,7 +92,7 @@ 2.4.2 3.0.0 3.1.1.0 - 3.3.0.0 + 3.3.0 provided diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala index 9c2905a0c..0a88a5a9c 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala @@ -54,7 +54,11 @@ case class StandardColumnProfile( case class StringColumnProfile( column: String, completeness: Double, + distinctness: Option[Double], + entropy: Option[Double], + uniqueness: Option[Double], approximateNumDistinctValues: Long, + exactNumDistinctValues: Option[Long], dataType: DataTypeInstances.Value, isDataTypeInferred: Boolean, typeCounts: Map[String, Long], @@ -218,7 +222,7 @@ object ColumnProfiles { // increase precision for small bucket sizes val fp = if (kllSketch.buckets.nonEmpty && scala.math.abs(kllSketch.buckets.head - .highValue - kllSketch.buckets.head.lowValue) > 0.05) "%.2f" else "%f" + .highValue - kllSketch.buckets.head.lowValue) > 0.05) { "%.2f" } else { "%f" } kllSketch.buckets.foreach{bucket => val histogramEntry = new 
JsonObject() diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala index b1293aa91..79fa9adc9 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala @@ -390,6 +390,7 @@ object ColumnProfiler { firstPassResults, predefinedTypes) + val stringStatistics = extractStringStatistics(firstPassResults) val numericStatistics = if (correlation) { extractNumericStatistics(firstPassResults, correlationCalculatedColumnNames) @@ -434,7 +435,7 @@ object ColumnProfiler { case _ => Map.empty[String, Distribution] } - createProfiles(relevantColumns, genericStatistics, numericStatistics, + createProfiles(relevantColumns, genericStatistics, stringStatistics, numericStatistics, CategoricalColumnStatistics(secondPassResults)) } @@ -1038,7 +1039,11 @@ object ColumnProfiler { StringColumnProfile( name, completeness, + distinctness, + entropy, + uniqueness, approxNumDistinct, + exactNumDistinct, dataType, isDataTypeInferred, typeCounts, diff --git a/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala b/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala index c80497eb7..75e696dc0 100644 --- a/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala +++ b/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala @@ -486,7 +486,11 @@ trait FixtureSupport { NumericColumnProfile( column = columnName, completeness = completeness, + distinctness = Some(1.0), + entropy = Some(1.0), + uniqueness = Some(1.0), approximateNumDistinctValues = 1000, + exactNumDistinctValues = Some(1000L), dataType = dataType, isDataTypeInferred = false, typeCounts = Map[String, Long](), @@ -497,7 +501,8 @@ trait FixtureSupport { minimum = Some(minimum), sum = Some(1000.879), stdDev = Some(1.023), - approxPercentiles = None + approxPercentiles = None, + correlation = None ) } } From 766d412e1d6bb3b66cb3e4238298bd5b82e54cb1 Mon Sep 17 00:00:00 2001 From: Fabio Buso Date: Sat, 18 Nov 2023 17:18:38 +0100 Subject: [PATCH 21/21] Set better artifact name --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2a775b6d4..82a3b75fb 100644 --- a/pom.xml +++ b/pom.xml @@ -8,7 +8,7 @@ deequ_${scala.major.version} - 2.0.4.1-SNAPSHOT + 2.0.4.0-spark-3.3 deequ Deequ is a library built on top of Apache Spark for defining "unit tests for data",
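
The patches above leave the fork's profiler with three opt-in statistics groups: column correlations, value histograms, and exact uniqueness/distinctness/entropy. A minimal end-to-end sketch of the resulting API follows, pieced together from the builder calls in ColumnProfilerNaNTest and the toJson(Seq[ColumnProfile], Long) overload visible in patch 18; the input DataFrame and the thresholds 50 and 20 are illustrative assumptions copied from the tests, not documented limits.

import org.apache.spark.sql.DataFrame
import com.amazon.deequ.profiles.{ColumnProfilerRunner, ColumnProfiles}

object ProfilerSketch {
  // Profiles a DataFrame with the fork's extended statistics enabled and
  // serializes the result to the JSON layout asserted in KLLProfileTestApprox
  // (distinctness, entropy, uniqueness, exactNumDistinctValues, correlations, ...).
  def profileToJson(df: DataFrame): String = {
    val result = new ColumnProfilerRunner()
      .onData(df)
      .withCorrelation(true, 50)   // 50: value used in the tests; assumed cap on columns considered
      .withHistogram(true, 20)     // 20: value used in the tests; assumed histogram cardinality limit
      .withExactUniqueness(true)
      .run()

    // result.profiles is a Map[String, ColumnProfile]; result.numRecords is the row count.
    ColumnProfiles.toJson(result.profiles.values.toSeq, result.numRecords)
  }
}

The same run result also backs the NaN handling check: for every column, the number of null/NaN values equals numRecords minus round(completeness * numRecords), which is exactly the invariant ColumnProfilerNaNTest asserts.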