From 850a13d40001efcb60f547ae0a73c81d1b6f60f5 Mon Sep 17 00:00:00 2001 From: Edward Cho Date: Mon, 7 Aug 2023 19:00:52 -0400 Subject: [PATCH 01/21] Update release version to 2.0.4-spark-3.3 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 39545d252..44aadccaa 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.amazon.deequ deequ - 2.0.3-spark-3.3 + 2.0.4-spark-3.3 1.8 From 50a57905ca802b09c2a4dba87338c6d93b8e12c5 Mon Sep 17 00:00:00 2001 From: Fabio Buso Date: Tue, 15 Sep 2020 22:45:32 +0200 Subject: [PATCH 02/21] Change groupId and publish on archiva (#2) --- pom.xml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 44aadccaa..9798ec271 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - com.amazon.deequ + com.logicalclocks deequ 2.0.4-spark-3.3 @@ -416,4 +416,12 @@ + + + Hops + Hops Repo + https://archiva.hops.works/repository/Hops/ + + + From d91b861e4d60622d005b405fe2c4540545639bdd Mon Sep 17 00:00:00 2001 From: Moritz Meister <8422705+moritzmeister@users.noreply.github.com> Date: Wed, 16 Sep 2020 14:33:08 +0200 Subject: [PATCH 03/21] Adapt profiler for hsfs (#1) * Add correlation * make histograms configurable add uniqueness --- deequ-scalastyle.xml | 2 +- .../amazon/deequ/profiles/ColumnProfile.scala | 29 +++- .../deequ/profiles/ColumnProfiler.scala | 155 +++++++++++++----- .../profiles/ColumnProfilerRunBuilder.scala | 20 +++ .../deequ/profiles/ColumnProfilerRunner.scala | 4 + 5 files changed, 162 insertions(+), 48 deletions(-) diff --git a/deequ-scalastyle.xml b/deequ-scalastyle.xml index b5e9680a3..c726413bc 100644 --- a/deequ-scalastyle.xml +++ b/deequ-scalastyle.xml @@ -35,7 +35,7 @@ - + diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala index 543936824..39b54d508 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala @@ -24,6 +24,9 @@ import com.google.gson.{Gson, GsonBuilder, JsonArray, JsonObject, JsonPrimitive} abstract class ColumnProfile { def column: String def completeness: Double + def distinctness: Double + def entropy: Double + def uniqueness: Double def approximateNumDistinctValues: Long def dataType: DataTypeInstances.Value def isDataTypeInferred: Boolean @@ -34,6 +37,9 @@ abstract class ColumnProfile { case class StandardColumnProfile( column: String, completeness: Double, + distinctness: Double, + entropy: Double, + uniqueness: Double, approximateNumDistinctValues: Long, dataType: DataTypeInstances.Value, isDataTypeInferred: Boolean, @@ -56,6 +62,9 @@ case class StringColumnProfile( case class NumericColumnProfile( column: String, completeness: Double, + distinctness: Double, + entropy: Double, + uniqueness: Double, approximateNumDistinctValues: Long, dataType: DataTypeInstances.Value, isDataTypeInferred: Boolean, @@ -67,7 +76,8 @@ case class NumericColumnProfile( minimum: Option[Double], sum: Option[Double], stdDev: Option[Double], - approxPercentiles: Option[Seq[Double]]) + approxPercentiles: Option[Seq[Double]], + correlation: Option[Map[String, Double]]) extends ColumnProfile case class ColumnProfiles( @@ -98,6 +108,9 @@ object ColumnProfiles { } columnProfileJson.addProperty("completeness", profile.completeness) + columnProfileJson.addProperty("distinctness", profile.distinctness) + 
columnProfileJson.addProperty("entropy", profile.entropy) + columnProfileJson.addProperty("uniqueness", profile.uniqueness) columnProfileJson.addProperty("approximateNumDistinctValues", profile.approximateNumDistinctValues) @@ -134,6 +147,18 @@ object ColumnProfiles { columnProfileJson.addProperty("stdDev", stdDev) } + // correlation + if (numericColumnProfile.correlation.isDefined) { + val correlationsJson = new JsonArray + numericColumnProfile.correlation.get.foreach { correlation => + val correlationJson = new JsonObject() + correlationJson.addProperty("column", correlation._1) + correlationJson.addProperty("correlation", correlation._2) + correlationsJson.add(correlationJson) + } + columnProfileJson.add("correlations", correlationsJson) + } + // KLL Sketch if (numericColumnProfile.kll.isDefined) { val kllSketch = numericColumnProfile.kll.get @@ -182,7 +207,7 @@ object ColumnProfiles { json.add("columns", columns) val gson = new GsonBuilder() - .setPrettyPrinting() + // .setPrettyPrinting() .create() gson.toJson(json) diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala index 57c8c3019..d4c42bd42 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala @@ -49,6 +49,9 @@ private[deequ] case class GenericColumnStatistics( typeDetectionHistograms: Map[String, Map[String, Long]], approximateNumDistincts: Map[String, Long], completenesses: Map[String, Double], + distinctness: Map[String, Double], + entropy: Map[String, Double], + uniqueness: Map[String, Double], predefinedTypes: Map[String, DataTypeInstances.Value]) { def typeOf(column: String): DataTypeInstances.Value = { @@ -69,7 +72,8 @@ private[deequ] case class NumericColumnStatistics( maxima: Map[String, Double], sums: Map[String, Double], kll: Map[String, BucketDistribution], - approxPercentiles: Map[String, Seq[Double]] + approxPercentiles: Map[String, Seq[Double]], + correlation: Map[String, Map[String, Double]] ) private[deequ] case class CategoricalColumnStatistics(histograms: Map[String, Distribution]) @@ -119,6 +123,8 @@ object ColumnProfiler { reuseExistingResultsUsingKey: Option[ResultKey] = None, failIfResultsForReusingMissing: Boolean = false, saveInMetricsRepositoryUsingKey: Option[ResultKey] = None, + correlation: Boolean = true, + histogram: Boolean = true, kllProfiling: Boolean = false, kllParameters: Option[KLLParameters] = None, predefinedTypes: Map[String, DataTypeInstances.Value] = Map.empty) @@ -179,7 +185,7 @@ object ColumnProfiler { // We compute mean, stddev, min, max for all numeric columns val analyzersForSecondPass = getAnalyzersForSecondPass(relevantColumns, - genericStatistics, kllProfiling, kllParameters) + genericStatistics, kllProfiling, kllParameters, correlation) var analysisRunnerSecondPass = AnalysisRunner .onData(castedDataForSecondPass) @@ -196,39 +202,45 @@ object ColumnProfiler { val numericStatistics = extractNumericStatistics(secondPassResults) - // Third pass - if (printStatusUpdates) { - println("### PROFILING: Computing histograms of low-cardinality columns in pass (3/3)...") - } - - // We compute exact histograms for all low-cardinality string columns, find those here - val targetColumnsForHistograms = findTargetColumnsForHistograms(data.schema, genericStatistics, - lowCardinalityHistogramThreshold) - - // Find out, if we have values for those we can reuse - val analyzerContextExistingValues = 
getAnalyzerContextWithHistogramResultsForReusingIfNecessary( - metricsRepository, - reuseExistingResultsUsingKey, - targetColumnsForHistograms - ) - - // The columns we need to calculate the histograms for - val nonExistingHistogramColumns = targetColumnsForHistograms - .filter { column => analyzerContextExistingValues.metricMap.get(Histogram(column)).isEmpty } - - // Calculate and save/append results if necessary - val histograms: Map[String, Distribution] = getHistogramsForThirdPass( - data, - nonExistingHistogramColumns, - analyzerContextExistingValues, - printStatusUpdates, - failIfResultsForReusingMissing, - metricsRepository, - saveInMetricsRepositoryUsingKey) + val thirdPassResults = histogram match { + case true => + // Third pass + if (printStatusUpdates) { + println("### PROFILING: Computing histograms of low-cardinality columns in pass (3/3)...") + } - val thirdPassResults = CategoricalColumnStatistics(histograms) + // We compute exact histograms for all low-cardinality string columns, find those here + val targetColumnsForHistograms = findTargetColumnsForHistograms(data.schema, + genericStatistics, lowCardinalityHistogramThreshold) + + // Find out, if we have values for those we can reuse + val analyzerContextExistingValues = + getAnalyzerContextWithHistogramResultsForReusingIfNecessary( + metricsRepository, + reuseExistingResultsUsingKey, + targetColumnsForHistograms + ) + + // The columns we need to calculate the histograms for + val nonExistingHistogramColumns = targetColumnsForHistograms + .filter { column => + analyzerContextExistingValues.metricMap.get(Histogram(column)).isEmpty } + + // Calculate and save/append results if necessary + val histograms: Map[String, Distribution] = getHistogramsForThirdPass( + data, + nonExistingHistogramColumns, + analyzerContextExistingValues, + printStatusUpdates, + failIfResultsForReusingMissing, + metricsRepository, + saveInMetricsRepositoryUsingKey) + histograms + case _ => Map.empty[String, Distribution] + } - createProfiles(relevantColumns, genericStatistics, stringStatistics, numericStatistics, thirdPassResults) + createProfiles(relevantColumns, genericStatistics, stringStatistics, numericStatistics, + CategoricalColumnStatistics(thirdPassResults)) } private[this] def getRelevantColumns( @@ -261,10 +273,12 @@ object ColumnProfiler { Seq( Completeness(name), ApproxCountDistinct(name), DataType(name), MinLength(name, analyzerOptions = Some(analyzerOptions)), - MaxLength(name, analyzerOptions = Some(analyzerOptions)) + MaxLength(name, analyzerOptions = Some(analyzerOptions)), + Uniqueness(name), Distinctness(name), Entropy(name) ) } else { - Seq(Completeness(name), ApproxCountDistinct(name)) + Seq(Completeness(name), ApproxCountDistinct(name), Uniqueness(name), + Distinctness(name), Entropy(name)) } } } @@ -273,17 +287,23 @@ object ColumnProfiler { relevantColumnNames: Seq[String], genericStatistics: GenericColumnStatistics, kllProfiling: Boolean, - kllParameters: Option[KLLParameters] = None) + kllParameters: Option[KLLParameters] = None, + correlation: Boolean) : Seq[Analyzer[_, Metric[_]]] = { - relevantColumnNames + val numericColumnNames = relevantColumnNames .filter { name => Set(Integral, Fractional).contains(genericStatistics.typeOf(name)) } - .flatMap { name => getNumericColAnalyzers(name, kllProfiling, kllParameters) } + numericColumnNames + .flatMap { name => + getNumericColAnalyzers(name, kllProfiling, kllParameters, correlation, numericColumnNames) + } } private[this] def getNumericColAnalyzers( column: String, 
kllProfiling: Boolean, - kllParameters: Option[KLLParameters]) + kllParameters: Option[KLLParameters], + correlation: Boolean, + numericColumnNames: Seq[String]) : Seq[Analyzer[_, Metric[_]]] = { val mandatoryAnalyzers = Seq(Minimum(column), Maximum(column), Mean(column), StandardDeviation(column), Sum(column)) @@ -294,7 +314,13 @@ object ColumnProfiler { Seq.empty } - mandatoryAnalyzers ++ optionalAnalyzers + val correlationAnalyzers = if (correlation) { + numericColumnNames.map(x => Correlation(column, x)) + } else { + Seq.empty + } + + mandatoryAnalyzers ++ optionalAnalyzers ++ correlationAnalyzers } private[this] def setMetricsRepositoryConfigurationIfNecessary( @@ -446,9 +472,25 @@ object ColumnProfiler { analyzer.column -> metric.value.get } + val entropy = results.metricMap + .collect { case (analyzer: Entropy, metric: DoubleMetric) => + analyzer.column -> metric.value.get + } + + val uniqueness = results.metricMap + .collect { case (analyzer: Uniqueness, metric: DoubleMetric) => + // we only compute uniqueness for single columns + analyzer.columns.head -> metric.value.get + } + + val distinctness = results.metricMap + .collect { case (analyzer: Distinctness, metric: DoubleMetric) => + analyzer.columns.head -> metric.value.get + } + val knownTypes = schema.fields .filter { column => columns.contains(column.name) } - .filterNot { column => predefinedTypes.contains(column.name)} + .filterNot { column => predefinedTypes.contains(column.name) } .filter { _.dataType != StringType } @@ -468,7 +510,7 @@ object ColumnProfiler { .toMap GenericColumnStatistics(numRecords, inferredTypes, knownTypes, typeDetectionHistograms, - approximateNumDistincts, completenesses, predefinedTypes) + approximateNumDistincts, completenesses, distinctness, entropy, uniqueness, predefinedTypes) } @@ -566,7 +608,7 @@ object ColumnProfiler { .toMap val approxPercentiles = results.metricMap - .collect { case (analyzer: KLLSketch, metric: KLLMetric) => + .collect { case (analyzer: KLLSketch, metric: KLLMetric) => metric.value match { case Success(bucketDistribution) => @@ -579,8 +621,20 @@ object ColumnProfiler { .flatten .toMap + val correlation = results.metricMap + .collect { case (analyzer: Correlation, metric: DoubleMetric) => + metric.value match { + case Success(metricValue) => + Some(analyzer.firstColumn -> Map(analyzer.secondColumn -> metricValue)) + case _ => None + } + } + .flatten + .groupBy(_._1) + .map { case (key, value) => value.reduce((x, y) => x._1 -> (x._2.toSeq ++ y._2.toSeq).toMap) } - NumericColumnStatistics(means, stdDevs, minima, maxima, sums, kll, approxPercentiles) + NumericColumnStatistics(means, stdDevs, minima, maxima, sums, kll, + approxPercentiles, correlation) } /* Identifies all columns, which: @@ -723,6 +777,9 @@ object ColumnProfiler { .map { name => val completeness = genericStats.completenesses(name) + val distinctness = genericStats.distinctness(name) + val entropy = genericStats.entropy(name) + val uniqueness = genericStats.uniqueness(name) val approxNumDistinct = genericStats.approximateNumDistincts(name) val dataType = genericStats.typeOf(name) val isDataTypeInferred = genericStats.inferredTypes.contains(name) @@ -736,6 +793,9 @@ object ColumnProfiler { NumericColumnProfile( name, completeness, + distinctness, + entropy, + uniqueness, approxNumDistinct, dataType, isDataTypeInferred, @@ -747,7 +807,9 @@ object ColumnProfiler { numericStats.minima.get(name), numericStats.sums.get(name), numericStats.stdDevs.get(name), - numericStats.approxPercentiles.get(name)) + 
numericStats.approxPercentiles.get(name), + numericStats.correlation.get(name) + ) case String => StringColumnProfile( @@ -766,6 +828,9 @@ object ColumnProfiler { StandardColumnProfile( name, completeness, + distinctness, + entropy, + uniqueness, approxNumDistinct, dataType, isDataTypeInferred, diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunBuilder.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunBuilder.scala index 5ac181951..14e3297ad 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunBuilder.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunBuilder.scala @@ -39,6 +39,8 @@ class ColumnProfilerRunBuilder(val data: DataFrame) { protected var saveColumnProfilesJsonPath: Option[String] = None protected var saveConstraintSuggestionsJsonPath: Option[String] = None protected var saveEvaluationResultsJsonPath: Option[String] = None + protected var correlation = true + protected var histogram = true protected var kllProfiling = false protected var kllParameters: Option[KLLParameters] = None protected var predefinedTypes: Map[String, DataTypeInstances.Value] = Map.empty @@ -110,6 +112,22 @@ class ColumnProfilerRunBuilder(val data: DataFrame) { this } + /** + * Enable correlation profiling on Numerical columns, enabled by default. + */ + def withCorrelation(correlation: Boolean): this.type = { + this.correlation = correlation + this + } + + /** + * Enable histogram profiling on Numerical columns, enabled by default. + */ + def withHistogram(histogram: Boolean): this.type = { + this.histogram = histogram + this + } + /** * Enable KLL Sketches profiling on Numerical columns, disabled by default. */ @@ -180,6 +198,8 @@ class ColumnProfilerRunBuilder(val data: DataFrame) { reuseExistingResultsKey, failIfResultsForReusingMissing, saveOrAppendResultsKey), + correlation, + histogram, kllProfiling, kllParameters, predefinedTypes diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunner.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunner.scala index 768173053..a02a5d4ee 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunner.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunner.scala @@ -48,6 +48,8 @@ class ColumnProfilerRunner { cacheInputs: Boolean, fileOutputOptions: ColumnProfilerRunBuilderFileOutputOptions, metricsRepositoryOptions: ColumnProfilerRunBuilderMetricsRepositoryOptions, + correlation: Boolean, + histogram: Boolean, kllProfiling: Boolean, kllParameters: Option[KLLParameters], predefinedTypes: Map[String, DataTypeInstances.Value]) @@ -67,6 +69,8 @@ class ColumnProfilerRunner { metricsRepositoryOptions.reuseExistingResultsKey, metricsRepositoryOptions.failIfResultsForReusingMissing, metricsRepositoryOptions.saveOrAppendResultsKey, + correlation, + histogram, kllProfiling, kllParameters, predefinedTypes From bfa462e1af343c54cbcc869a8892b12fd837fbb3 Mon Sep 17 00:00:00 2001 From: Moritz Meister <8422705+moritzmeister@users.noreply.github.com> Date: Thu, 17 Sep 2020 14:04:29 +0200 Subject: [PATCH 04/21] make tests compile (#3) --- .../com/amazon/deequ/KLL/KLLProfileTest.scala | 11 ++- .../deequ/profiles/ColumnProfilerTest.scala | 54 +++++++++--- .../rules/ConstraintRulesTest.scala | 82 +++++++++---------- 3 files changed, 95 insertions(+), 52 deletions(-) diff --git a/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala index 6ee81a7d3..119112e56 100644 --- 
a/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala +++ b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala @@ -64,6 +64,9 @@ class KLLProfileTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "att1", 1.0, + 1.0, + 1.0, + 1.0, 6, DataTypeInstances.Fractional, false, @@ -85,7 +88,9 @@ class KLLProfileTest extends WordSpec with Matchers with SparkContextSpec 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, - 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0))) + 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0)), + None + ) assertProfilesEqual(expectedColumnProfile, actualColumnProfile.asInstanceOf[NumericColumnProfile]) @@ -104,6 +109,9 @@ class KLLProfileTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "att1", 1.0, + 1.0, + 1.0, + 1.0, 30, DataTypeInstances.Fractional, false, @@ -120,6 +128,7 @@ class KLLProfileTest extends WordSpec with Matchers with SparkContextSpec Some(1.0), Some(465.0), Some(8.65544144839919), + None, None) assertProfilesEqual(expectedColumnProfile, diff --git a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala index 6eabc8f8a..62b953fdc 100644 --- a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala +++ b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala @@ -60,6 +60,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = StringColumnProfile( "att2", 2.0 / 3.0, + 1.0, + 1.0, + 1.0, 2, DataTypeInstances.String, true, @@ -111,6 +114,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = StringColumnProfile( "item", 1.0, + 1.0, + 1.0, + 1.0, 6, DataTypeInstances.String, false, @@ -134,6 +140,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = StringColumnProfile( "att2", 2.0 / 3.0, + 1.0, + 1.0, + 1.0, 2, DataTypeInstances.String, true, @@ -163,6 +172,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "item", 1.0, + 1.0, + 1.0, + 1.0, 6, DataTypeInstances.Integral, true, @@ -186,7 +198,8 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, - 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0))) + 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0)), + None) assertProfilesEqual(expectedColumnProfile, actualColumnProfile.asInstanceOf[NumericColumnProfile]) @@ -203,6 +216,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "item", 1.0, + 1.0, + 1.0, + 1.0, 6, DataTypeInstances.Integral, true, @@ -226,7 +242,8 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 
5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, - 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0))) + 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0)), + None) assertProfilesEqual(expectedColumnProfile, actualColumnProfile.asInstanceOf[NumericColumnProfile]) @@ -244,6 +261,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "item", 1.0, + 1.0, + 1.0, + 1.0, 6, DataTypeInstances.Integral, true, @@ -303,7 +323,8 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, - 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0))) + 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0)), + None) assertProfilesEqual(expectedColumnProfile, actualColumnProfile.asInstanceOf[NumericColumnProfile]) @@ -320,6 +341,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "att1", 1.0, + 1.0, + 1.0, + 1.0, 6, DataTypeInstances.Fractional, false, @@ -337,7 +361,8 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, - 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0))) + 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0)), + None) assertProfilesEqual(expectedColumnProfile, actualColumnProfile.asInstanceOf[NumericColumnProfile]) @@ -353,6 +378,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = StringColumnProfile( "att2", 2.0 / 3.0, + 1.0, + 1.0, + 1.0, 2, DataTypeInstances.String, isDataTypeInferred = true, @@ -561,6 +589,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec StandardColumnProfile( "PassengerId", 1.0, + 1.0, + 1.0, + 1.0, 891, DataTypeInstances.Integral, false, @@ -569,17 +600,20 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec StandardColumnProfile( "Survived", 1.0, + 1.0, + 1.0, + 1.0, 2, DataTypeInstances.Integral, false, Map.empty, None), - StandardColumnProfile("Pclass", 1.0, 3, DataTypeInstances.Integral, false, Map.empty, None), - StandardColumnProfile("Name", 1.0, 0, DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Sex", 1.0, 2, DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Ticket", 1.0, 681, DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Fare", 1.0, 0, DataTypeInstances.Fractional, false, Map.empty, None), - StandardColumnProfile("Cabin", 0.22, 0, DataTypeInstances.String, true, Map.empty, None) + StandardColumnProfile("Pclass", 1.0, 1.0, 1.0, 1.0, 3, DataTypeInstances.Integral, false, Map.empty, None), + StandardColumnProfile("Name", 1.0, 1.0, 1.0, 1.0, 0, DataTypeInstances.String, true, Map.empty, None), + StandardColumnProfile("Sex", 1.0, 1.0, 1.0, 1.0, 2, DataTypeInstances.String, true, Map.empty, None), + StandardColumnProfile("Ticket", 1.0, 1.0, 1.0, 1.0, 681, DataTypeInstances.String, true, Map.empty, None), + StandardColumnProfile("Fare", 1.0, 1.0, 1.0, 1.0, 0, 
DataTypeInstances.Fractional, false, Map.empty, None), + StandardColumnProfile("Cabin", 0.22, 1.0, 1.0, 1.0, 0, DataTypeInstances.String, true, Map.empty, None) ) assertSameColumnProfiles(columnProfiles.profiles, expectedProfiles) diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala index 075247932..f9cd9dc4d 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala @@ -34,8 +34,8 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "CompleteIfCompleteRule" should { "be applied correctly" in { - val complete = StandardColumnProfile("col1", 1.0, 100, String, false, Map.empty, None) - val incomplete = StandardColumnProfile("col1", .25, 100, String, false, Map.empty, None) + val complete = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, None) + val incomplete = StandardColumnProfile("col1", .25, 1.0, 1.0, 1.0, 100, String, false, Map.empty, None) val completeInteger = getFakeNumericColumnProfileWithMinMaxMeanAndStdDev( @@ -129,8 +129,8 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "RetainCompletenessRule" should { "be applied correctly" in { - val complete = StandardColumnProfile("col1", 1.0, 100, String, false, Map.empty, None) - val incomplete = StandardColumnProfile("col1", .25, 100, String, false, Map.empty, None) + val complete = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, None) + val incomplete = StandardColumnProfile("col1", .25, 1.0, 1.0, 1.0, 100, String, false, Map.empty, None) assert(!RetainCompletenessRule().shouldBeApplied(complete, 1000)) assert(RetainCompletenessRule().shouldBeApplied(incomplete, 1000)) @@ -188,10 +188,10 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "UniqueIfApproximatelyUniqueRule" should { "be applied correctly" in { - val unique = StandardColumnProfile("col1", 1.0, 100, String, false, Map.empty, None) - val maybeUnique = StandardColumnProfile("col1", 1.0, 95, String, false, Map.empty, None) - val maybeNonUnique = StandardColumnProfile("col1", 1.0, 91, String, false, Map.empty, None) - val nonUnique = StandardColumnProfile("col1", 1.0, 20, String, false, Map.empty, None) + val unique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, None) + val maybeUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, String, false, Map.empty, None) + val maybeNonUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 91, String, false, Map.empty, None) + val nonUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, String, false, Map.empty, None) assert(UniqueIfApproximatelyUniqueRule().shouldBeApplied(unique, 100)) assert(UniqueIfApproximatelyUniqueRule().shouldBeApplied(maybeUnique, 100)) @@ -251,19 +251,19 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "RetainTypeRule" should { "be applied correctly" in { - val string = StandardColumnProfile("col1", 1.0, 100, String, true, Map.empty, None) - val boolean = StandardColumnProfile("col1", 1.0, 100, Boolean, true, Map.empty, None) - val fractional = StandardColumnProfile("col1", 1.0, 100, Fractional, true, Map.empty, None) - val integer = StandardColumnProfile("col1", 1.0, 100, Integral, true, Map.empty, None) - val unknown = 
StandardColumnProfile("col1", 1.0, 100, Unknown, true, Map.empty, None) + val string = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, true, Map.empty, None) + val boolean = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Boolean, true, Map.empty, None) + val fractional = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Fractional, true, Map.empty, None) + val integer = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Integral, true, Map.empty, None) + val unknown = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Unknown, true, Map.empty, None) - val stringNonInferred = StandardColumnProfile("col1", 1.0, 100, String, false, Map.empty, + val stringNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, None) - val booleanNonInferred = StandardColumnProfile("col1", 1.0, 100, Boolean, false, Map.empty, + val booleanNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Boolean, false, Map.empty, None) - val fractionalNonInferred = StandardColumnProfile("col1", 1.0, 100, Fractional, false, + val fractionalNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Fractional, false, Map.empty, None) - val integerNonInferred = StandardColumnProfile("col1", 1.0, 100, Integral, false, + val integerNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Integral, false, Map.empty, None) assert(!RetainTypeRule().shouldBeApplied(string, 100)) @@ -381,24 +381,24 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val noDistribution = Distribution(Map.empty, 0) - val stringWithNonSkewedDist = StandardColumnProfile("col1", 1.0, 100, String, false, + val stringWithNonSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, Some(nonSkewedDist)) - val integralWithNonSkewedDist = StandardColumnProfile("col1", 1.0, + val integralWithNonSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, Some(nonSkewedIntegralDist)) - val stringWithFlgDist = StandardColumnProfile("flg", 1.0, + val stringWithFlgDist = StandardColumnProfile("flg", 1.0, 1.0, 1.0, 1.0, 2, String, false, Map.empty, Some(flgDist)) - val integralWithFlgDist = StandardColumnProfile("flg", 1.0, + val integralWithFlgDist = StandardColumnProfile("flg", 1.0, 1.0, 1.0, 1.0, 2, DataTypeInstances.Integral, false, Map.empty, Some(flgDist)) - val stringWithSkewedDist = StandardColumnProfile("col1", 1.0, 100, String, false, + val stringWithSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, Some(skewedDist)) - val stringNoDist = StandardColumnProfile("col1", 1.0, 95, String, false, Map.empty, None) - val boolNoDist = StandardColumnProfile("col1", 1.0, 94, Boolean, false, Map.empty, None) - val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, 20, Boolean, false, Map.empty, + val stringNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, String, false, Map.empty, None) + val boolNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 94, Boolean, false, Map.empty, None) + val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, Boolean, false, Map.empty, Some(noDistribution)) - val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, + val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, Some(skewedDist)) - val integralNoDist = 
StandardColumnProfile("col1", 1.0, + val integralNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, DataTypeInstances.Integral, false, Map.empty, None) assert(CategoricalRangeRule().shouldBeApplied(stringWithNonSkewedDist, 100)) @@ -561,30 +561,30 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val noDistribution = Distribution(Map.empty, 0) - val stringWithNonSkewedDistWithFractionalCategoricalRange = StandardColumnProfile("col1", 1.0, + val stringWithNonSkewedDistWithFractionalCategoricalRange = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, Some(nonSkewedDistWithFractionalCategoricalRange)) - val stringWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile("col1", 1.0, + val stringWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, Some(nonSkewedDistWithActualCategoricalRange)) - val stringWithSomewhatSkewedDist = StandardColumnProfile("col1", 1.0, 100, String, false, + val stringWithSomewhatSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, Some(somewhatSkewedDist)) - val stringWithSkewedDist = StandardColumnProfile("col1", 1.0, 100, String, false, + val stringWithSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, Some(skewedDist)) - val stringNoDist = StandardColumnProfile("col1", 1.0, 95, String, false, Map.empty, None) - val boolNoDist = StandardColumnProfile("col1", 1.0, 94, Boolean, false, Map.empty, None) - val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, 20, Boolean, false, Map.empty, + val stringNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, String, false, Map.empty, None) + val boolNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 94, Boolean, false, Map.empty, None) + val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, Boolean, false, Map.empty, Some(noDistribution)) val integralWithNonSkewedDistWithFractionalCategoricalRange = StandardColumnProfile("col1", - 1.0, 100, DataTypeInstances.Integral, false, Map.empty, + 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, Some(nonSkewedIntegralDistWithFractionalCategoricalRange)) - val integralWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile("col1", 1.0, + val integralWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, Some(nonSkewedIntegralDistWithActualCategoricalRange)) - val integralWithSomewhatSkewedDist = StandardColumnProfile("col1", 1.0, + val integralWithSomewhatSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, Some(somewhatSkewedIntegralDist)) - val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, + val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, Some(skewedIntegralDist)) - val integralNoDist = StandardColumnProfile("col1", 1.0, + val integralNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, DataTypeInstances.Integral, false, Map.empty, None) assert(FractionalCategoricalRangeRule().shouldBeApplied(stringWithSomewhatSkewedDist, 100)) @@ -708,8 +708,8 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "NonNegativeNumbersRule and PositiveNumbersRule" should { "be applied correctly" 
in { def columnProfileWithMinimum(minimum: Double): NumericColumnProfile = { - NumericColumnProfile("col1", 1.0, 100, Fractional, isDataTypeInferred = false, - Map.empty, None, None, Some(10), Some(100), Some(minimum), Some(10000), Some(1.0), None) + NumericColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Fractional, isDataTypeInferred = false, + Map.empty, None, None, Some(10), Some(100), Some(minimum), Some(10000), Some(1.0), None, None) } val nRecords = 100 From d6753dbde08ed3c9fd735d4e79d93b0677b7cc34 Mon Sep 17 00:00:00 2001 From: Moritz Meister <8422705+moritzmeister@users.noreply.github.com> Date: Thu, 17 Sep 2020 15:01:19 +0200 Subject: [PATCH 05/21] Fix tests checkstyle and 4 tests (#4) --- .../deequ/profiles/ColumnProfilerTest.scala | 38 ++++--- .../rules/ConstraintRulesTest.scala | 106 ++++++++++-------- 2 files changed, 84 insertions(+), 60 deletions(-) diff --git a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala index 62b953fdc..5f92df0f8 100644 --- a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala +++ b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala @@ -60,9 +60,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = StringColumnProfile( "att2", 2.0 / 3.0, - 1.0, - 1.0, - 1.0, + 0.5, + 0.5623351446188083, + 0.25, 2, DataTypeInstances.String, true, @@ -115,7 +115,7 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec "item", 1.0, 1.0, - 1.0, + 1.791759469228055, 1.0, 6, DataTypeInstances.String, @@ -140,9 +140,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = StringColumnProfile( "att2", 2.0 / 3.0, - 1.0, - 1.0, - 1.0, + 0.5, + 0.5623351446188083, + 0.25, 2, DataTypeInstances.String, true, @@ -378,9 +378,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = StringColumnProfile( "att2", 2.0 / 3.0, - 1.0, - 1.0, - 1.0, + 0.5, + 0.5623351446188083, + 0.25, 2, DataTypeInstances.String, isDataTypeInferred = true, @@ -608,12 +608,18 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec false, Map.empty, None), - StandardColumnProfile("Pclass", 1.0, 1.0, 1.0, 1.0, 3, DataTypeInstances.Integral, false, Map.empty, None), - StandardColumnProfile("Name", 1.0, 1.0, 1.0, 1.0, 0, DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Sex", 1.0, 1.0, 1.0, 1.0, 2, DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Ticket", 1.0, 1.0, 1.0, 1.0, 681, DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Fare", 1.0, 1.0, 1.0, 1.0, 0, DataTypeInstances.Fractional, false, Map.empty, None), - StandardColumnProfile("Cabin", 0.22, 1.0, 1.0, 1.0, 0, DataTypeInstances.String, true, Map.empty, None) + StandardColumnProfile("Pclass", 1.0, 1.0, 1.0, 1.0, 3, + DataTypeInstances.Integral, false, Map.empty, None), + StandardColumnProfile("Name", 1.0, 1.0, 1.0, 1.0, 0, + DataTypeInstances.String, true, Map.empty, None), + StandardColumnProfile("Sex", 1.0, 1.0, 1.0, 1.0, 2, + DataTypeInstances.String, true, Map.empty, None), + StandardColumnProfile("Ticket", 1.0, 1.0, 1.0, 1.0, 681, + DataTypeInstances.String, true, Map.empty, None), + StandardColumnProfile("Fare", 1.0, 1.0, 1.0, 1.0, 0, + DataTypeInstances.Fractional, false, Map.empty, None), + StandardColumnProfile("Cabin", 0.22, 1.0, 1.0, 1.0, 0, + 
DataTypeInstances.String, true, Map.empty, None) ) assertSameColumnProfiles(columnProfiles.profiles, expectedProfiles) diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala index f9cd9dc4d..72eab3fd3 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala @@ -34,8 +34,10 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "CompleteIfCompleteRule" should { "be applied correctly" in { - val complete = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, None) - val incomplete = StandardColumnProfile("col1", .25, 1.0, 1.0, 1.0, 100, String, false, Map.empty, None) + val complete = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + String, false, Map.empty, None) + val incomplete = StandardColumnProfile("col1", .25, 1.0, 1.0, 1.0, 100, + String, false, Map.empty, None) val completeInteger = getFakeNumericColumnProfileWithMinMaxMeanAndStdDev( @@ -129,8 +131,10 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "RetainCompletenessRule" should { "be applied correctly" in { - val complete = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, None) - val incomplete = StandardColumnProfile("col1", .25, 1.0, 1.0, 1.0, 100, String, false, Map.empty, None) + val complete = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + String, false, Map.empty, None) + val incomplete = StandardColumnProfile("col1", .25, 1.0, 1.0, 1.0, 100, + String, false, Map.empty, None) assert(!RetainCompletenessRule().shouldBeApplied(complete, 1000)) assert(RetainCompletenessRule().shouldBeApplied(incomplete, 1000)) @@ -188,10 +192,14 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "UniqueIfApproximatelyUniqueRule" should { "be applied correctly" in { - val unique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, None) - val maybeUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, String, false, Map.empty, None) - val maybeNonUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 91, String, false, Map.empty, None) - val nonUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, String, false, Map.empty, None) + val unique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + String, false, Map.empty, None) + val maybeUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, + String, false, Map.empty, None) + val maybeNonUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 91, + String, false, Map.empty, None) + val nonUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, + String, false, Map.empty, None) assert(UniqueIfApproximatelyUniqueRule().shouldBeApplied(unique, 100)) assert(UniqueIfApproximatelyUniqueRule().shouldBeApplied(maybeUnique, 100)) @@ -251,20 +259,24 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "RetainTypeRule" should { "be applied correctly" in { - val string = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, true, Map.empty, None) - val boolean = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Boolean, true, Map.empty, None) - val fractional = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Fractional, true, Map.empty, None) - val integer = StandardColumnProfile("col1", 
1.0, 1.0, 1.0, 1.0, 100, Integral, true, Map.empty, None) - val unknown = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Unknown, true, Map.empty, None) - - val stringNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, - None) - val booleanNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Boolean, false, Map.empty, - None) - val fractionalNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Fractional, false, - Map.empty, None) - val integerNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Integral, false, - Map.empty, None) + val string = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + String, true, Map.empty, None) + val boolean = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + Boolean, true, Map.empty, None) + val fractional = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + Fractional, true, Map.empty, None) + val integer = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + Integral, true, Map.empty, None) + val unknown = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + Unknown, true, Map.empty, None) + val stringNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + String, false, Map.empty, None) + val booleanNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + Boolean, false, Map.empty, None) + val fractionalNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + Fractional, false, Map.empty, None) + val integerNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + Integral, false, Map.empty, None) assert(!RetainTypeRule().shouldBeApplied(string, 100)) assert(!RetainTypeRule().shouldBeApplied(unknown, 100)) @@ -381,8 +393,8 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val noDistribution = Distribution(Map.empty, 0) - val stringWithNonSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, - Map.empty, Some(nonSkewedDist)) + val stringWithNonSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + String, false, Map.empty, Some(nonSkewedDist)) val integralWithNonSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, Some(nonSkewedIntegralDist)) val stringWithFlgDist = StandardColumnProfile("flg", 1.0, 1.0, 1.0, 1.0, @@ -390,12 +402,14 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val integralWithFlgDist = StandardColumnProfile("flg", 1.0, 1.0, 1.0, 1.0, 2, DataTypeInstances.Integral, false, Map.empty, Some(flgDist)) - val stringWithSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, - Map.empty, Some(skewedDist)) - val stringNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, String, false, Map.empty, None) - val boolNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 94, Boolean, false, Map.empty, None) - val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, Boolean, false, Map.empty, - Some(noDistribution)) + val stringWithSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + String, false, Map.empty, Some(skewedDist)) + val stringNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, + String, false, Map.empty, None) + val boolNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 94, + Boolean, false, Map.empty, None) + val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, + 
Boolean, false, Map.empty, Some(noDistribution)) val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, Some(skewedDist)) val integralNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, @@ -561,24 +575,28 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val noDistribution = Distribution(Map.empty, 0) - val stringWithNonSkewedDistWithFractionalCategoricalRange = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, - 100, String, false, Map.empty, Some(nonSkewedDistWithFractionalCategoricalRange)) - val stringWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, - 100, String, false, Map.empty, Some(nonSkewedDistWithActualCategoricalRange)) - val stringWithSomewhatSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, - Map.empty, Some(somewhatSkewedDist)) - val stringWithSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, - Map.empty, Some(skewedDist)) - val stringNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, String, false, Map.empty, None) - val boolNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 94, Boolean, false, Map.empty, None) - val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, Boolean, false, Map.empty, - Some(noDistribution)) + val stringWithNonSkewedDistWithFractionalCategoricalRange = StandardColumnProfile( + "col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, + Some(nonSkewedDistWithFractionalCategoricalRange)) + val stringWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile( + "col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, + Some(nonSkewedDistWithActualCategoricalRange)) + val stringWithSomewhatSkewedDist = StandardColumnProfile( + "col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, Some(somewhatSkewedDist)) + val stringWithSkewedDist = StandardColumnProfile( + "col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, Some(skewedDist)) + val stringNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, + String, false, Map.empty, None) + val boolNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 94, + Boolean, false, Map.empty, None) + val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, + Boolean, false, Map.empty, Some(noDistribution)) val integralWithNonSkewedDistWithFractionalCategoricalRange = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, Some(nonSkewedIntegralDistWithFractionalCategoricalRange)) - val integralWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, - 100, DataTypeInstances.Integral, false, Map.empty, + val integralWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile( + "col1", 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, Some(nonSkewedIntegralDistWithActualCategoricalRange)) val integralWithSomewhatSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, Some(somewhatSkewedIntegralDist)) From dac9285b6c5cde2a0e7f1e6f562f5b853be48c95 Mon Sep 17 00:00:00 2001 From: Moritz Meister <8422705+moritzmeister@users.noreply.github.com> Date: Thu, 17 Sep 2020 15:15:05 +0200 Subject: [PATCH 06/21] Fix test checkstyle (#5) --- .../amazon/deequ/suggestions/rules/ConstraintRulesTest.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 
deletions(-) diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala index 72eab3fd3..9a90af7ca 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala @@ -726,8 +726,9 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "NonNegativeNumbersRule and PositiveNumbersRule" should { "be applied correctly" in { def columnProfileWithMinimum(minimum: Double): NumericColumnProfile = { - NumericColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Fractional, isDataTypeInferred = false, - Map.empty, None, None, Some(10), Some(100), Some(minimum), Some(10000), Some(1.0), None, None) + NumericColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Fractional, + isDataTypeInferred = false, Map.empty, None, None, Some(10), Some(100), + Some(minimum), Some(10000), Some(1.0), None, None) } val nRecords = 100 From 0007bf194c395c5511ae68be25ca1793e17e383e Mon Sep 17 00:00:00 2001 From: Fabio Buso Date: Tue, 1 Jun 2021 23:17:22 +0200 Subject: [PATCH 07/21] Hopsify Deequ 1.1.0 --- pom.xml | 136 ++++++++++++++++-- .../analyzers/catalyst/DeequFunctions.scala | 2 +- 2 files changed, 123 insertions(+), 15 deletions(-) diff --git a/pom.xml b/pom.xml index 9798ec271..14be4a912 100644 --- a/pom.xml +++ b/pom.xml @@ -5,21 +5,9 @@ 4.0.0 com.logicalclocks - deequ - 2.0.4-spark-3.3 + deequ_${scala.major.version} + 2.0.4.0 - - 1.8 - 1.8 - UTF-8 - - 2.12 - ${scala.major.version}.10 - ${scala.major.version} - 4.8.1 - - 3.3.0 - deequ Deequ is a library built on top of Apache Spark for defining "unit tests for data", @@ -67,6 +55,47 @@ https://github.com/awslabs/deequ + + + + + + 1.8 + 1.8 + UTF-8 + + + ${scala-212.major.version} + ${scala.major.version}.10 + 2.11 + 2.12 + _scala-${scala.major.version} + _spark-${spark.version} + 4.8.1 + + + ${spark-33.version} + 2.2.2 + 2.3.2 + 2.4.2 + 3.0.0 + 3.1.1.0 + 3.3.0.0 + + provided + + org.scala-lang @@ -86,12 +115,14 @@ org.apache.spark spark-core_${scala.major.version} ${spark.version} + ${spark.scope} org.apache.spark spark-sql_${scala.major.version} ${spark.version} + ${spark.scope} @@ -414,8 +445,85 @@ + + + + + + + Hops + Hops Repo + https://archiva.hops.works/repository/Hops/ + + true + + + true + + + + Hops diff --git a/src/main/scala/com/amazon/deequ/analyzers/catalyst/DeequFunctions.scala b/src/main/scala/com/amazon/deequ/analyzers/catalyst/DeequFunctions.scala index dd973b301..3bef80fe1 100644 --- a/src/main/scala/com/amazon/deequ/analyzers/catalyst/DeequFunctions.scala +++ b/src/main/scala/com/amazon/deequ/analyzers/catalyst/DeequFunctions.scala @@ -47,7 +47,7 @@ object DeequFunctions { /** Standard deviation with state */ def stateful_stddev_pop(column: Column): Column = withAggregateFunction { - StatefulStdDevPop(column.expr) + StatefulStdDevPop(column.expr, true) } /** Approximate number of distinct values with state via HLL's */ From a090645ffb25c99b2d6cd4cf69376fe985dc48c2 Mon Sep 17 00:00:00 2001 From: Fabio Buso Date: Mon, 7 Jun 2021 23:57:42 +0200 Subject: [PATCH 08/21] Bump deequ hops version --- pom.xml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 14be4a912..6c17cabc8 100644 --- a/pom.xml +++ b/pom.xml @@ -6,8 +6,7 @@ com.logicalclocks deequ_${scala.major.version} - 2.0.4.0 - + 2.0.4.1 deequ Deequ is a library built on top of Apache Spark for defining "unit tests for data", 
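[Note on the packaging changes in patches 07-08: the artifact is now published under the com.logicalclocks groupId with a Scala-version-suffixed artifactId, resolved from the Hops Archiva repository added in patch 02. A minimal consumer-side sbt sketch of the resulting coordinates, taken from the pom.xml hunks above; the _2.12 suffix assumes the default Scala 2.12 profile, and 2.0.4.1 is the version as of patch 08:

    resolvers += "Hops" at "https://archiva.hops.works/repository/Hops/"
    libraryDependencies += "com.logicalclocks" % "deequ_2.12" % "2.0.4.1"
]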
From 556d55a7e4e3a7b990c52f61fecc2b669344a6bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20de=20la=20R=C3=BAa=20Mart=C3=ADnez?= Date: Mon, 30 Oct 2023 10:27:31 +0100 Subject: [PATCH 09/21] Prepare for 2.0.4.1-SNAPSHOT development --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 6c17cabc8..40c378eb1 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.logicalclocks deequ_${scala.major.version} - 2.0.4.1 + 2.0.4.1-SNAPSHOT deequ Deequ is a library built on top of Apache Spark for defining "unit tests for data", From 3fe618857c3682145df8ffcd9f83425e761763de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Till=20D=C3=B6hmen?= Date: Tue, 10 Aug 2021 11:27:18 +0200 Subject: [PATCH 10/21] Fix for NaNs and Infinity values in profile JSON (#7) Co-authored-by: doehmen-admin --- .../amazon/deequ/profiles/ColumnProfile.scala | 40 ++++++++++++------- .../deequ/profiles/ColumnProfilerTest.scala | 31 ++++++++++++++ 2 files changed, 57 insertions(+), 14 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala index 39b54d508..58ff9ef0b 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala @@ -107,10 +107,10 @@ object ColumnProfiles { } } - columnProfileJson.addProperty("completeness", profile.completeness) - columnProfileJson.addProperty("distinctness", profile.distinctness) - columnProfileJson.addProperty("entropy", profile.entropy) - columnProfileJson.addProperty("uniqueness", profile.uniqueness) + columnProfileJson.addProperty("completeness", normalizeDouble(profile.completeness)) + columnProfileJson.addProperty("distinctness", normalizeDouble(profile.distinctness)) + columnProfileJson.addProperty("entropy", normalizeDouble(profile.entropy)) + columnProfileJson.addProperty("uniqueness", normalizeDouble(profile.uniqueness)) columnProfileJson.addProperty("approximateNumDistinctValues", profile.approximateNumDistinctValues) @@ -122,7 +122,7 @@ object ColumnProfiles { val histogramEntry = new JsonObject() histogramEntry.addProperty("value", name) histogramEntry.addProperty("count", distributionValue.absolute) - histogramEntry.addProperty("ratio", distributionValue.ratio) + histogramEntry.addProperty("ratio", normalizeDouble(distributionValue.ratio)) histogramJson.add(histogramEntry) } @@ -132,19 +132,19 @@ object ColumnProfiles { profile match { case numericColumnProfile: NumericColumnProfile => numericColumnProfile.mean.foreach { mean => - columnProfileJson.addProperty("mean", mean) + columnProfileJson.addProperty("mean", normalizeDouble(mean)) } numericColumnProfile.maximum.foreach { maximum => - columnProfileJson.addProperty("maximum", maximum) + columnProfileJson.addProperty("maximum", normalizeDouble(maximum)) } numericColumnProfile.minimum.foreach { minimum => - columnProfileJson.addProperty("minimum", minimum) + columnProfileJson.addProperty("minimum", normalizeDouble(minimum)) } numericColumnProfile.sum.foreach { sum => - columnProfileJson.addProperty("sum", sum) + columnProfileJson.addProperty("sum", normalizeDouble(sum)) } numericColumnProfile.stdDev.foreach { stdDev => - columnProfileJson.addProperty("stdDev", stdDev) + columnProfileJson.addProperty("stdDev", normalizeDouble(stdDev)) } // correlation @@ -153,7 +153,7 @@ object ColumnProfiles { numericColumnProfile.correlation.get.foreach { correlation => val correlationJson = new JsonObject() 
correlationJson.addProperty("column", correlation._1) - correlationJson.addProperty("correlation", correlation._2) + correlationJson.addProperty("correlation", normalizeDouble(correlation._2)) correlationsJson.add(correlationJson) } columnProfileJson.add("correlations", correlationsJson) @@ -167,8 +167,8 @@ object ColumnProfiles { val tmp = new JsonArray() kllSketch.buckets.foreach{bucket => val entry = new JsonObject() - entry.addProperty("low_value", bucket.lowValue) - entry.addProperty("high_value", bucket.highValue) + entry.addProperty("low_value", normalizeDouble(bucket.lowValue)) + entry.addProperty("high_value", normalizeDouble(bucket.highValue)) entry.addProperty("count", bucket.count) tmp.add(entry) } @@ -206,10 +206,22 @@ object ColumnProfiles { json.add("columns", columns) - val gson = new GsonBuilder() + val gson = new GsonBuilder().serializeNulls() // .setPrettyPrinting() .create() gson.toJson(json) } + + def normalizeDouble(numeric: Double): java.lang.Double ={ + if (numeric.isNaN) { + null.asInstanceOf[java.lang.Double] + } else if (numeric.isNegInfinity) { + Double.MinValue + } else if(numeric.isPosInfinity) { + Double.MaxValue + } else { + numeric + } + } } diff --git a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala index 5f92df0f8..b63394991 100644 --- a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala +++ b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala @@ -22,6 +22,7 @@ import com.amazon.deequ.analyzers.Histogram.NullFieldReplacement import com.amazon.deequ.metrics.{BucketDistribution, BucketValue, Distribution, DistributionValue} import com.amazon.deequ.utils.FixtureSupport import org.apache.spark.sql.Row +import org.apache.spark.sql.functions.lit import org.apache.spark.sql.types._ import org.scalatest.{Matchers, WordSpec} @@ -625,6 +626,36 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec assertSameColumnProfiles(columnProfiles.profiles, expectedProfiles) } + "return correct JSON for NumericColumnProfiles with NaNs" in + withSparkSession { session => + + val nRows = 100 + + import session.implicits._ + import org.apache.spark.sql.functions + + var data = session.sparkContext.range(0,nRows).toDF().select(functions.col("value")) + data = data.withColumnRenamed("value","att0") + data = data.withColumn("att1",lit(0.0).cast(LongType)) + data = data.withColumn("att2",lit(0.0).cast(LongType)) + + val profile = ColumnProfiler.profile(data, Option(Seq("att1","att2"))) + val profiles = profile.profiles.map{pro => pro._2}.toSeq + val json_profile = ColumnProfiles.toJson(profiles) + val correct_profile = "{\"columns\":[{\"column\":\"att1\",\"dataType\":\"Integral\"," + + "\"isDataTypeInferred\":\"false\",\"completeness\":1.0,\"distinctness\":0.01,\"entropy\":0.0," + + "\"uniqueness\":0.0,\"approximateNumDistinctValues\":1,\"histogram\":[{\"value\":\"0\",\"count\":100," + + "\"ratio\":1.0}],\"mean\":0.0,\"maximum\":0.0,\"minimum\":0.0,\"sum\":0.0,\"stdDev\":0.0," + + "\"correlations\":[{\"column\":\"att2\",\"correlation\":null},{\"column\":\"att1\",\"correlation\":null}]," + + "\"approxPercentiles\":[]},{\"column\":\"att2\",\"dataType\":\"Integral\",\"isDataTypeInferred\":\"false\"," + + "\"completeness\":1.0,\"distinctness\":0.01,\"entropy\":0.0,\"uniqueness\":0.0," + + "\"approximateNumDistinctValues\":1,\"histogram\":[{\"value\":\"0\",\"count\":100,\"ratio\":1.0}]," + + 
"\"mean\":0.0,\"maximum\":0.0,\"minimum\":0.0,\"sum\":0.0,\"stdDev\":0.0," + + "\"correlations\":[{\"column\":\"att2\",\"correlation\":null},{\"column\":\"att1\",\"correlation\":null}]," + + "\"approxPercentiles\":[]}]}" + assert(json_profile == correct_profile) + } + private[this] def assertSameColumnProfiles( actualProfiles: Map[String, ColumnProfile], expectedProfiles: List[ColumnProfile]) From 9e10b6c2cc8711537fb01ea8491ada8e87f733b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Till=20D=C3=B6hmen?= Date: Tue, 10 Aug 2021 12:16:06 +0200 Subject: [PATCH 11/21] Fix for NaNs and Infinity values in profile JSON (stylecheck) (#8) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: doehmen-admin Co-authored-by: Till Döhmen --- .../amazon/deequ/profiles/ColumnProfile.scala | 4 +-- .../deequ/profiles/ColumnProfilerTest.scala | 28 +++++++++++-------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala index 58ff9ef0b..7ba21ee1a 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala @@ -213,12 +213,12 @@ object ColumnProfiles { gson.toJson(json) } - def normalizeDouble(numeric: Double): java.lang.Double ={ + def normalizeDouble(numeric: Double): java.lang.Double = { if (numeric.isNaN) { null.asInstanceOf[java.lang.Double] } else if (numeric.isNegInfinity) { Double.MinValue - } else if(numeric.isPosInfinity) { + } else if (numeric.isPosInfinity) { Double.MaxValue } else { numeric diff --git a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala index b63394991..e0441be2a 100644 --- a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala +++ b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala @@ -634,24 +634,30 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec import session.implicits._ import org.apache.spark.sql.functions - var data = session.sparkContext.range(0,nRows).toDF().select(functions.col("value")) - data = data.withColumnRenamed("value","att0") - data = data.withColumn("att1",lit(0.0).cast(LongType)) - data = data.withColumn("att2",lit(0.0).cast(LongType)) + var data = session.sparkContext.range(0, nRows).toDF().select(functions.col("value")) + data = data.withColumnRenamed("value", "att0") + data = data.withColumn("att1", lit(0.0).cast(LongType)) + data = data.withColumn("att2", lit(0.0).cast(LongType)) - val profile = ColumnProfiler.profile(data, Option(Seq("att1","att2"))) + val profile = ColumnProfiler.profile(data, Option(Seq("att1", "att2"))) val profiles = profile.profiles.map{pro => pro._2}.toSeq val json_profile = ColumnProfiles.toJson(profiles) val correct_profile = "{\"columns\":[{\"column\":\"att1\",\"dataType\":\"Integral\"," + - "\"isDataTypeInferred\":\"false\",\"completeness\":1.0,\"distinctness\":0.01,\"entropy\":0.0," + - "\"uniqueness\":0.0,\"approximateNumDistinctValues\":1,\"histogram\":[{\"value\":\"0\",\"count\":100," + + "\"isDataTypeInferred\":\"false\",\"completeness\":1.0,\"distinctness\":0.01," + + "\"entropy\":0.0," + + "\"uniqueness\":0.0,\"approximateNumDistinctValues\":1,\"histogram\":[{\"value\":\"0\"," + + "\"count\":100," + "\"ratio\":1.0}],\"mean\":0.0,\"maximum\":0.0,\"minimum\":0.0,\"sum\":0.0,\"stdDev\":0.0," + - 
"\"correlations\":[{\"column\":\"att2\",\"correlation\":null},{\"column\":\"att1\",\"correlation\":null}]," + - "\"approxPercentiles\":[]},{\"column\":\"att2\",\"dataType\":\"Integral\",\"isDataTypeInferred\":\"false\"," + + "\"correlations\":[{\"column\":\"att2\",\"correlation\":null},{\"column\":\"att1\"," + + "\"correlation\":null}]," + + "\"approxPercentiles\":[]},{\"column\":\"att2\",\"dataType\":\"Integral\"," + + "\"isDataTypeInferred\":\"false\"," + "\"completeness\":1.0,\"distinctness\":0.01,\"entropy\":0.0,\"uniqueness\":0.0," + - "\"approximateNumDistinctValues\":1,\"histogram\":[{\"value\":\"0\",\"count\":100,\"ratio\":1.0}]," + + "\"approximateNumDistinctValues\":1,\"histogram\":[{\"value\":\"0\",\"count\":100," + + "\"ratio\":1.0}]," + "\"mean\":0.0,\"maximum\":0.0,\"minimum\":0.0,\"sum\":0.0,\"stdDev\":0.0," + - "\"correlations\":[{\"column\":\"att2\",\"correlation\":null},{\"column\":\"att1\",\"correlation\":null}]," + + "\"correlations\":[{\"column\":\"att2\",\"correlation\":null},{\"column\":\"att1\"," + + "\"correlation\":null}]," + "\"approxPercentiles\":[]}]}" assert(json_profile == correct_profile) } From 8791fbb55aab54cb9ee39fbec6374a19c9cea390 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Till=20D=C3=B6hmen?= Date: Thu, 26 Aug 2021 16:51:56 +0200 Subject: [PATCH 12/21] [HOPSWORKS-2681] Profiling optimization (#6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Till Döhmen --- .../amazon/deequ/analyzers/Completeness.scala | 4 +- .../com/amazon/deequ/analyzers/DataType.scala | 1 + .../deequ/examples/DataProfilingExample.scala | 1 + .../amazon/deequ/profiles/ColumnProfile.scala | 50 +- .../deequ/profiles/ColumnProfiler.scala | 224 +++++++- .../profiles/ColumnProfilerRunBuilder.scala | 78 ++- .../deequ/profiles/ColumnProfilerRunner.scala | 59 ++- .../ConstraintSuggestionRunner.scala | 1 + .../FractionalCategoricalRangeRule.scala | 4 +- .../rules/RetainCompletenessRule.scala | 1 + .../com/amazon/deequ/KLL/KLLProfileTest.scala | 12 +- .../deequ/KLL/KLLProfileTestApprox.scala | 481 ++++++++++++++++++ .../amazon/deequ/VerificationResultTest.scala | 1 + .../deequ/analyzers/AnalyzerTests.scala | 7 +- .../amazon/deequ/analyzers/StatesTest.scala | 11 - .../com/amazon/deequ/checks/CheckTest.scala | 7 - .../profiles/ColumnProfilerRunnerTest.scala | 54 +- .../deequ/profiles/ColumnProfilerTest.scala | 72 +-- .../ConstraintSuggestionResultTest.scala | 3 + .../ConstraintSuggestionRunnerTest.scala | 2 +- ...ConstraintSuggestionsIntegrationTest.scala | 28 +- .../rules/ConstraintRulesTest.scala | 122 ++--- 22 files changed, 1023 insertions(+), 200 deletions(-) create mode 100644 src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala diff --git a/src/main/scala/com/amazon/deequ/analyzers/Completeness.scala b/src/main/scala/com/amazon/deequ/analyzers/Completeness.scala index 5e80e2f6e..f4e30739e 100644 --- a/src/main/scala/com/amazon/deequ/analyzers/Completeness.scala +++ b/src/main/scala/com/amazon/deequ/analyzers/Completeness.scala @@ -16,7 +16,7 @@ package com.amazon.deequ.analyzers -import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNotNested} +import com.amazon.deequ.analyzers.Preconditions.{hasColumn} import org.apache.spark.sql.functions.sum import org.apache.spark.sql.types.{IntegerType, StructType} import Analyzers._ @@ -44,7 +44,7 @@ case class Completeness(column: String, where: Option[String] = None) extends } override protected def additionalPreconditions(): Seq[StructType => Unit] = { - hasColumn(column) 
:: isNotNested(column) :: Nil + hasColumn(column) :: Nil } override def filterCondition: Option[String] = where diff --git a/src/main/scala/com/amazon/deequ/analyzers/DataType.scala b/src/main/scala/com/amazon/deequ/analyzers/DataType.scala index fb3c1ca06..d0ec2a7ac 100644 --- a/src/main/scala/com/amazon/deequ/analyzers/DataType.scala +++ b/src/main/scala/com/amazon/deequ/analyzers/DataType.scala @@ -35,6 +35,7 @@ object DataTypeInstances extends Enumeration { val Integral: Value = Value(2) val Boolean: Value = Value(3) val String: Value = Value(4) + val Decimal: Value = Value(5) } case class DataTypeHistogram( diff --git a/src/main/scala/com/amazon/deequ/examples/DataProfilingExample.scala b/src/main/scala/com/amazon/deequ/examples/DataProfilingExample.scala index ecb17dae3..c5e350819 100644 --- a/src/main/scala/com/amazon/deequ/examples/DataProfilingExample.scala +++ b/src/main/scala/com/amazon/deequ/examples/DataProfilingExample.scala @@ -43,6 +43,7 @@ private[examples] object DataProfilingExample extends App { any shuffles. */ val result = ColumnProfilerRunner() .onData(rawData) + .nonOptimized() .run() /* We get a profile for each column which allows to inspect the completeness of the column, diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala index 7ba21ee1a..84df99511 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala @@ -24,9 +24,9 @@ import com.google.gson.{Gson, GsonBuilder, JsonArray, JsonObject, JsonPrimitive} abstract class ColumnProfile { def column: String def completeness: Double - def distinctness: Double - def entropy: Double - def uniqueness: Double + def distinctness: Option[Double] + def entropy: Option[Double] + def uniqueness: Option[Double] def approximateNumDistinctValues: Long def dataType: DataTypeInstances.Value def isDataTypeInferred: Boolean @@ -37,9 +37,9 @@ abstract class ColumnProfile { case class StandardColumnProfile( column: String, completeness: Double, - distinctness: Double, - entropy: Double, - uniqueness: Double, + distinctness: Option[Double], + entropy: Option[Double], + uniqueness: Option[Double], approximateNumDistinctValues: Long, dataType: DataTypeInstances.Value, isDataTypeInferred: Boolean, @@ -62,9 +62,9 @@ case class StringColumnProfile( case class NumericColumnProfile( column: String, completeness: Double, - distinctness: Double, - entropy: Double, - uniqueness: Double, + distinctness: Option[Double], + entropy: Option[Double], + uniqueness: Option[Double], approximateNumDistinctValues: Long, dataType: DataTypeInstances.Value, isDataTypeInferred: Boolean, @@ -108,9 +108,16 @@ object ColumnProfiles { } columnProfileJson.addProperty("completeness", normalizeDouble(profile.completeness)) - columnProfileJson.addProperty("distinctness", normalizeDouble(profile.distinctness)) - columnProfileJson.addProperty("entropy", normalizeDouble(profile.entropy)) - columnProfileJson.addProperty("uniqueness", normalizeDouble(profile.uniqueness)) + if (profile.distinctness.isDefined) { + columnProfileJson.addProperty("distinctness", normalizeDouble(profile.distinctness.get)) + } + if (profile.entropy.isDefined) { + columnProfileJson.addProperty("entropy", normalizeDouble(profile.entropy.get)) + } + if (profile.uniqueness.isDefined) { + columnProfileJson.addProperty("uniqueness", normalizeDouble(profile.uniqueness.get)) + } + columnProfileJson.addProperty("approximateNumDistinctValues", 
profile.approximateNumDistinctValues) @@ -165,14 +172,33 @@ object ColumnProfiles { val kllSketchJson = new JsonObject() val tmp = new JsonArray() + var totalCount = kllSketch.buckets.foldLeft(0.0)(_ + _.count) + if (totalCount == 0) totalCount = 1 + kllSketch.buckets.foreach{bucket => val entry = new JsonObject() entry.addProperty("low_value", normalizeDouble(bucket.lowValue)) entry.addProperty("high_value", normalizeDouble(bucket.highValue)) entry.addProperty("count", bucket.count) + entry.addProperty("ratio", bucket.count/totalCount) tmp.add(entry) } + if (profile.histogram.isEmpty) { + val histogramJson = new JsonArray() + kllSketch.buckets.foreach{bucket => + val histogramEntry = new JsonObject() + histogramEntry.addProperty("value", "%.2f".formatLocal(java.util.Locale.US, + bucket.lowValue) + "-" + "%.2f".formatLocal(java.util.Locale.US, bucket + .highValue)) + histogramEntry.addProperty("count", bucket.count) + histogramEntry.addProperty("ratio", bucket.count/totalCount) + histogramJson.add(histogramEntry) + } + + columnProfileJson.add("histogram", histogramJson) + } + kllSketchJson.add("buckets", tmp) val entry = new JsonObject() entry.addProperty("c", kllSketch.parameters(0)) diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala index d4c42bd42..9de7e3b25 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala @@ -16,6 +16,8 @@ package com.amazon.deequ.profiles +import scala.util.Success +import scala.collection.mutable.ListBuffer import com.amazon.deequ.analyzers.DataTypeInstances._ import com.amazon.deequ.analyzers._ import com.amazon.deequ.analyzers.runners.AnalysisRunBuilder @@ -41,6 +43,9 @@ import org.apache.spark.sql.types.TimestampType import org.apache.spark.sql.types.{DataType => SparkDataType} import scala.util.Success +import com.amazon.deequ.repository.{MetricsRepository, ResultKey} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.{BinaryType, BooleanType, ByteType, DateType, DecimalType, DoubleType, FloatType, IntegerType, LongType, ShortType, StringType, StructType, TimestampType, DataType => SparkDataType} private[deequ] case class GenericColumnStatistics( numRecords: Long, @@ -243,6 +248,188 @@ object ColumnProfiler { CategoricalColumnStatistics(thirdPassResults)) } + + /** + * Profile a (potentially very large) dataset. 
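+   * A minimal calling sketch (the DataFrame `df` and its numeric columns "a" and "b"
+   * are assumed for illustration; the method is private[deequ], so external callers
+   * reach it through ColumnProfilerRunner instead):
+   * {{{
+   *   val profiles = ColumnProfiler.profileOptimized(
+   *     df,
+   *     restrictToColumns = Some(Seq("a", "b")),
+   *     histogram = true,
+   *     kllParameters = Some(KLLParameters(2048, 0.64, 20)))
+   *   val profileOfA = profiles.profiles("a")
+   * }}}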
+   *
+   * @param data the dataset to profile, as a DataFrame
+   * @param restrictToColumns can contain a subset of columns to profile, otherwise
+   *                          all columns will be considered
+   * @param printStatusUpdates whether to print status updates of the profiling run
+   * @param lowCardinalityHistogramThreshold the maximum (estimated) number of distinct values
+   *                                         in a column until which we should compute exact
+   *                                         histograms for it (defaults to 120)
+   * @param metricsRepository the repo to store metrics
+   * @param reuseExistingResultsUsingKey key for reusing existing results
+   * @param failIfResultsForReusingMissing whether to fail if results to reuse are missing
+   * @param saveInMetricsRepositoryUsingKey key for saving results in the metrics repo
+   * @param correlation whether to compute pairwise correlations between numeric columns
+   * @param histogram whether to compute histograms (exact ones for low-cardinality columns,
+   *                  KLL-based ones for numeric columns)
+   * @param exactUniqueness whether to compute exact uniqueness, distinctness and entropy
+   * @param exactUniquenessCols optional subset of columns for exact uniqueness profiling
+   * @param maxCorrelationCols skip correlations if there are more numeric columns than this
+   * @param kllParameters parameters for KLL Sketches
+   *
+   * @return the profiles of the requested columns
+   */
+  // scalastyle:off argcount
+  private[deequ] def profileOptimized(
+      data: DataFrame,
+      restrictToColumns: Option[Seq[String]] = None,
+      printStatusUpdates: Boolean = false,
+      lowCardinalityHistogramThreshold: Int = ColumnProfiler
+        .DEFAULT_CARDINALITY_THRESHOLD,
+      metricsRepository: Option[MetricsRepository] = None,
+      reuseExistingResultsUsingKey: Option[ResultKey] = None,
+      failIfResultsForReusingMissing: Boolean = false,
+      saveInMetricsRepositoryUsingKey: Option[ResultKey] = None,
+      correlation: Boolean = true,
+      histogram: Boolean = false,
+      exactUniqueness: Boolean = false,
+      exactUniquenessCols: Option[Seq[String]] = None,
+      maxCorrelationCols: Option[Int] = None,
+      kllParameters: Option[KLLParameters] = None
+    )
+    : ColumnProfiles = {
+
+    // Ensure that all desired columns exist
+    restrictToColumns.foreach { restrictToColumns =>
+      restrictToColumns.foreach { columnName =>
+        require(data.schema.fieldNames.contains(columnName), s"Unable to find column $columnName")
+      }
+    }
+
+    // Find columns we want to profile
+    val relevantColumns = getRelevantColumns(data.schema, restrictToColumns)
+
+    // We assume that data types are predefined by the schema, and skip the data type detection
+    val predefinedTypes = data.schema.fields
+      .filter { column => relevantColumns.contains(column.name) }
+      .map { field =>
+        val knownType = field.dataType match {
+          case ByteType | ShortType | IntegerType | LongType => Integral
+          case FloatType | DoubleType => Fractional
+          case DecimalType() => Decimal
+          case BooleanType => Boolean
+          case StringType | TimestampType | DateType | BinaryType => String
+          case _ =>
+            println(s"Unable to map type ${field.dataType}")
+            Unknown
+        }
+
+        field.name -> knownType
+      }
+      .toMap
+
+    val numericColumnNames = relevantColumns
+      .filter { name => Set(Integral, Fractional, Decimal).contains(predefinedTypes(name)) }
+
+    // First pass
+    if (printStatusUpdates) {
+      println("### PROFILING: Computing generic column statistics in pass (1/2)...")
+    }
+
+    // We compute completeness and the approximate number of distinct values for all columns,
+    // min, max, mean, stddev, sum, KLL sketches and correlations for numeric columns,
+    // and uniqueness, distinctness and entropy for the columns opted in via exactUniqueness
+    var correlationCalculatedColumnNames = new ListBuffer[String]()
+    val analyzersForGenericStats = relevantColumns.flatMap { name =>
+      val analyzers = ListBuffer[Analyzer[_, Metric[_]]]()
+
+      // Add default analyzers.
+      analyzers ++= Seq(Completeness(name), ApproxCountDistinct(name))
+
+      if (numericColumnNames.contains(name)) {
+        // Add numeric analyzers.
+        analyzers ++= Seq(Minimum(name), Maximum(name), Mean(name),
+          StandardDeviation(name), Sum(name))
+        // Add KLL analyzer.
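+        // (As a worked example of the selection in this block: with histogram,
+        // correlation and exactUniqueness all enabled, a non-Decimal numeric column
+        // ends up with Completeness, ApproxCountDistinct, Minimum, Maximum, Mean,
+        // StandardDeviation, Sum, a KLLSketch, one Correlation analyzer per remaining
+        // numeric column (each pair is computed once), plus Uniqueness, Distinctness
+        // and Entropy.)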
+ if (histogram && predefinedTypes(name) != Decimal) { + analyzers += KLLSketch(name, kllParameters) + } + if (correlation && (maxCorrelationCols.isEmpty || (numericColumnNames.length <= + maxCorrelationCols.get))) { + // Add correlation analyzers. + correlationCalculatedColumnNames += name + analyzers ++= numericColumnNames + .filterNot(x => correlationCalculatedColumnNames.contains(x)) + .map(x => Correlation(name, x)) + } + } + + if (exactUniqueness && (exactUniquenessCols.isEmpty || + (exactUniquenessCols.isDefined && exactUniquenessCols.get.contains(name))) + && predefinedTypes(name) != Unknown) { + // Add grouping analyzers. + analyzers ++= Seq(Uniqueness(name), Distinctness(name), Entropy(name)) + } + + analyzers + } + + var analysisRunnerFirstPass = AnalysisRunner + .onData(data) + .addAnalyzers(analyzersForGenericStats) + .addAnalyzer(Size()) + + analysisRunnerFirstPass = setMetricsRepositoryConfigurationIfNecessary( + analysisRunnerFirstPass, + metricsRepository, + reuseExistingResultsUsingKey, + failIfResultsForReusingMissing, + saveInMetricsRepositoryUsingKey) + + val firstPassResults = analysisRunnerFirstPass.run() + + val genericStatistics = extractGenericStatistics( + relevantColumns, + data.schema, + firstPassResults, + predefinedTypes) + + + val numericStatistics = if (correlation) { + extractNumericStatistics(firstPassResults, correlationCalculatedColumnNames) + } else { + extractNumericStatistics(firstPassResults) + } + + val secondPassResults = histogram match { + case true => + // Second pass + if (printStatusUpdates) { + println("### PROFILING: Computing histograms of low-cardinality columns in pass (2/2)...") + } + + // We compute exact histograms for all low-cardinality string columns, find those here + val targetColumnsForHistograms = findTargetColumnsForHistograms(data.schema, + genericStatistics, lowCardinalityHistogramThreshold) + + // Find out, if we have values for those we can reuse + val analyzerContextExistingValues = + getAnalyzerContextWithHistogramResultsForReusingIfNecessary( + metricsRepository, + reuseExistingResultsUsingKey, + targetColumnsForHistograms + ) + + // The columns we need to calculate the histograms for + val nonExistingHistogramColumns = targetColumnsForHistograms + .filter { column => + analyzerContextExistingValues.metricMap.get(Histogram(column)).isEmpty } + + // Calculate and save/append results if necessary + val histograms: Map[String, Distribution] = getHistogramsForThirdPass( + data, + nonExistingHistogramColumns, + analyzerContextExistingValues, + printStatusUpdates, + failIfResultsForReusingMissing, + metricsRepository, + saveInMetricsRepositoryUsingKey) + histograms + case _ => Map.empty[String, Distribution] + } + + createProfiles(relevantColumns, genericStatistics, numericStatistics, + CategoricalColumnStatistics(secondPassResults)) + } + private[this] def getRelevantColumns( schema: StructType, restrictToColumns: Option[Seq[String]]) @@ -291,7 +478,8 @@ object ColumnProfiler { correlation: Boolean) : Seq[Analyzer[_, Metric[_]]] = { val numericColumnNames = relevantColumnNames - .filter { name => Set(Integral, Fractional).contains(genericStatistics.typeOf(name)) } + .filter { name => Set(Integral, Fractional).contains(genericStatistics.typeOf + (name)) } numericColumnNames .flatMap { name => getNumericColAnalyzers(name, kllProfiling, kllParameters, correlation, numericColumnNames) @@ -543,7 +731,9 @@ object ColumnProfiler { } - private[this] def extractNumericStatistics(results: AnalyzerContext): NumericColumnStatistics = { 
+ private[this] def extractNumericStatistics(results: AnalyzerContext, + correlationCols: Seq[String] = Seq[String]()) + : NumericColumnStatistics = { val means = results.metricMap .collect { case (analyzer: Mean, metric: DoubleMetric) => @@ -621,7 +811,10 @@ object ColumnProfiler { .flatten .toMap - val correlation = results.metricMap + val correlationDiagonal = correlationCols.map { name => + Some((name -> Map(name -> 1.0))) + } + val correlationLower = results.metricMap .collect { case (analyzer: Correlation, metric: DoubleMetric) => metric.value match { case Success(metricValue) => @@ -629,9 +822,19 @@ object ColumnProfiler { case _ => None } } - .flatten + val correlationUpper = results.metricMap + .collect { case (analyzer: Correlation, metric: DoubleMetric) => + metric.value match { + case Success(metricValue) => + Some(analyzer.secondColumn -> Map(analyzer.firstColumn -> metricValue)) + case _ => None + } + } + val correlation = (correlationLower ++ correlationDiagonal ++ correlationUpper).flatten .groupBy(_._1) - .map { case (key, value) => value.reduce((x, y) => x._1 -> (x._2.toSeq ++ y._2.toSeq).toMap) } + .map { case (key, value) => value.reduce((x, y) => x._1 -> (x._2.toSeq ++ y._2.toSeq).toMap + )} + NumericColumnStatistics(means, stdDevs, minima, maxima, sums, kll, approxPercentiles, correlation) @@ -659,7 +862,8 @@ object ColumnProfiler { genericStatistics.approximateNumDistincts .filter { case (column, _) => originalStringNumericOrBooleanColumns.contains(column) && - Set(String, Boolean, Integral, Fractional).contains(genericStatistics.typeOf(column)) + Set(String, Boolean, Integral, Fractional).contains(genericStatistics.typeOf + (column)) } .filter { case (_, count) => count <= lowCardinalityHistogramThreshold } .map { case (column, _) => column } @@ -777,9 +981,9 @@ object ColumnProfiler { .map { name => val completeness = genericStats.completenesses(name) - val distinctness = genericStats.distinctness(name) - val entropy = genericStats.entropy(name) - val uniqueness = genericStats.uniqueness(name) + val distinctness = genericStats.distinctness.get(name) + val entropy = genericStats.entropy.get(name) + val uniqueness = genericStats.uniqueness.get(name) val approxNumDistinct = genericStats.approximateNumDistincts(name) val dataType = genericStats.typeOf(name) val isDataTypeInferred = genericStats.inferredTypes.contains(name) @@ -789,7 +993,7 @@ object ColumnProfiler { val profile = genericStats.typeOf(name) match { - case Integral | Fractional => + case Integral | Fractional | Decimal => NumericColumnProfile( name, completeness, diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunBuilder.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunBuilder.scala index 14e3297ad..ed2bee395 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunBuilder.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunBuilder.scala @@ -17,7 +17,7 @@ package com.amazon.deequ.profiles import com.amazon.deequ.repository._ -import com.amazon.deequ.analyzers.{DataTypeInstances, KLLParameters} +import com.amazon.deequ.analyzers.{DataTypeInstances, KLLParameters, KLLSketch} import org.apache.spark.sql.{DataFrame, SparkSession} /** A class to build a Constraint Suggestion run using a fluent API */ @@ -39,11 +39,15 @@ class ColumnProfilerRunBuilder(val data: DataFrame) { protected var saveColumnProfilesJsonPath: Option[String] = None protected var saveConstraintSuggestionsJsonPath: Option[String] = None protected var 
  saveEvaluationResultsJsonPath: Option[String] = None
-  protected var correlation = true
-  protected var histogram = true
+  protected var correlation = false
+  protected var histogram = false
   protected var kllProfiling = false
   protected var kllParameters: Option[KLLParameters] = None
   protected var predefinedTypes: Map[String, DataTypeInstances.Value] = Map.empty
+  protected var maxCorrelationCols: Option[Int] = None
+  protected var exactUniqueness = false
+  protected var exactUniquenessCols: Option[Seq[String]] = None
+  protected var optimized = true

   protected def this(constraintSuggestionRunBuilder: ColumnProfilerRunBuilder) {

@@ -66,9 +70,18 @@ class ColumnProfilerRunBuilder(val data: DataFrame) {
     saveConstraintSuggestionsJsonPath = constraintSuggestionRunBuilder
       .saveConstraintSuggestionsJsonPath
     saveEvaluationResultsJsonPath = constraintSuggestionRunBuilder.saveEvaluationResultsJsonPath
+
+    restrictToColumns = constraintSuggestionRunBuilder.restrictToColumns
+    correlation = constraintSuggestionRunBuilder.correlation
+    maxCorrelationCols = constraintSuggestionRunBuilder.maxCorrelationCols
+    histogram = constraintSuggestionRunBuilder.histogram
+    kllProfiling = constraintSuggestionRunBuilder.kllProfiling
     kllParameters = constraintSuggestionRunBuilder.kllParameters
     predefinedTypes = constraintSuggestionRunBuilder.predefinedTypes
+    exactUniqueness = constraintSuggestionRunBuilder.exactUniqueness
+    exactUniquenessCols = constraintSuggestionRunBuilder.exactUniquenessCols
+    optimized = constraintSuggestionRunBuilder.optimized
   }

   /**
@@ -93,7 +106,7 @@ class ColumnProfilerRunBuilder(val data: DataFrame) {
   /**
    * Set the thresholds of values until it is considered too expensive to
-   * calculate the histograms
+   * calculate the histograms (for backwards compatibility)
    *
    * @param lowCardinalityHistogramThreshold The threshold
    */
@@ -113,23 +126,64 @@ class ColumnProfilerRunBuilder(val data: DataFrame) {
   }

   /**
-   * Enable correlation profiling on Numerical columns, enabled by default.
+   * Enable correlation profiling on Numerical columns, disabled by default.
+   *
+   * @param correlation Enable or disable correlation profiling
+   * @param maxCorrelationCols The maximum number of columns to calculate correlations on
    */
-  def withCorrelation(correlation: Boolean): this.type = {
+  def withCorrelation(correlation: Boolean, maxCorrelationCols: Int = 100): this.type = {
     this.correlation = correlation
+    this.maxCorrelationCols = Some(maxCorrelationCols)
     this
   }

   /**
-   * Enable histogram profiling on Numerical columns, enabled by default.
+   * Enable histogram profiling on Numerical and Categorical columns, disabled by default.
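+   * A sketch of typical builder usage (the DataFrame `df` is assumed, and the
+   * parameter values are illustrative only):
+   * {{{
+   *   val profiles = ColumnProfilerRunner()
+   *     .onData(df)
+   *     .withHistogram(true, maxBuckets = 20)
+   *     .withCorrelation(true, maxCorrelationCols = 100)
+   *     .run()
+   * }}}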
+   *
+   * @param histogram Enable or disable histogram profiling
+   * @param maxBuckets The maximum number of distinct values to calculate the histogram for
    */
-  def withHistogram(histogram: Boolean): this.type = {
+  def withHistogram(histogram: Boolean, maxBuckets: Int = 20): this.type = {
     this.histogram = histogram
+    this.kllProfiling = histogram
+    this.lowCardinalityHistogramThreshold = maxBuckets
+    this.kllParameters = Some(KLLParameters(KLLSketch.DEFAULT_SKETCH_SIZE, KLLSketch
+      .DEFAULT_SHRINKING_FACTOR, maxBuckets))
+    this
+  }
+
+  /**
+   * Enables exact Uniqueness, Entropy and Distinctness for all columns
+   *
+   * @param exactUniqueness Enable or disable uniqueness, entropy and distinctness profiling
+   */
+  def withExactUniqueness(exactUniqueness: Boolean): this.type = {
+    this.exactUniqueness = exactUniqueness
+    this
+  }
+
+  /**
+   * Enables exact Uniqueness, Entropy and Distinctness for specified columns
+   *
+   * @param exactUniquenessColumns List of columns that should be selected for uniqueness profiling
+   */
+  def restrictExactUniquenessColumns(exactUniquenessColumns: Seq[String]): this.type = {
+    this.exactUniquenessCols = Some(exactUniquenessColumns)
+    this
+  }
+
+  /**
+   * Use the unoptimized version of the profiler (optimizations are on by default)
+   */
+  def nonOptimized(): this.type = {
+    this.optimized = false
     this
   }

   /**
    * Enable KLL Sketches profiling on Numerical columns, disabled by default.
+   * (for backwards compatibility)
    */
   def withKLLProfiling(): this.type = {
     this.kllProfiling = true
@@ -138,6 +192,7 @@
   }

   /**
    * Set KLL parameters.
+   * (for backwards compatibility)
    *
    * @param kllParameters kllParameters(sketchSize, shrinkingFactor, numberOfBuckets)
    */
@@ -148,6 +203,7 @@
   /**
    * Set predefined data types for each column (e.g.
baseline)
+   * (for backwards compatibility)
    *
    * @param dataTypes dataType map for baseline columns
    */
@@ -202,7 +258,11 @@ class ColumnProfilerRunBuilder(val data: DataFrame) {
       histogram,
       kllProfiling,
       kllParameters,
-      predefinedTypes
+      predefinedTypes,
+      optimized,
+      maxCorrelationCols,
+      exactUniqueness,
+      exactUniquenessCols
     )
   }
 }
diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunner.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunner.scala
index a02a5d4ee..49ac84a11 100644
--- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunner.scala
+++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunner.scala
@@ -52,29 +52,54 @@ class ColumnProfilerRunner {
       histogram: Boolean,
       kllProfiling: Boolean,
       kllParameters: Option[KLLParameters],
-      predefinedTypes: Map[String, DataTypeInstances.Value])
+      predefinedTypes: Map[String, DataTypeInstances.Value],
+      optimized: Boolean,
+      maxCorrelationCols: Option[Int],
+      exactUniqueness: Boolean,
+      exactUniquenessCols: Option[Seq[String]])
     : ColumnProfiles = {

     if (cacheInputs) {
       data.cache()
     }

-    val columnProfiles = ColumnProfiler
-      .profile(
-        data,
-        restrictToColumns,
-        printStatusUpdates,
-        lowCardinalityHistogramThreshold,
-        metricsRepositoryOptions.metricsRepository,
-        metricsRepositoryOptions.reuseExistingResultsKey,
-        metricsRepositoryOptions.failIfResultsForReusingMissing,
-        metricsRepositoryOptions.saveOrAppendResultsKey,
-        correlation,
-        histogram,
-        kllProfiling,
-        kllParameters,
-        predefinedTypes
-      )
+    val columnProfiles: ColumnProfiles = {
+      if (!optimized) {
+        ColumnProfiler.profile(
+          data,
+          restrictToColumns,
+          printStatusUpdates,
+          lowCardinalityHistogramThreshold,
+          metricsRepositoryOptions.metricsRepository,
+          metricsRepositoryOptions.reuseExistingResultsKey,
+          metricsRepositoryOptions.failIfResultsForReusingMissing,
+          metricsRepositoryOptions.saveOrAppendResultsKey,
+          correlation,
+          histogram,
+          kllProfiling,
+          kllParameters,
+          predefinedTypes
+        )
+      } else {
+        ColumnProfiler.profileOptimized(
+          data,
+          restrictToColumns,
+          printStatusUpdates,
+          lowCardinalityHistogramThreshold,
+          metricsRepositoryOptions.metricsRepository,
+          metricsRepositoryOptions.reuseExistingResultsKey,
+          metricsRepositoryOptions.failIfResultsForReusingMissing,
+          metricsRepositoryOptions.saveOrAppendResultsKey,
+          correlation,
+          histogram,
+          exactUniqueness,
+          exactUniquenessCols,
+          maxCorrelationCols,
+          kllParameters
+        )
+      }
+    }
+
     saveColumnProfilesJsonToFileSystemIfNecessary(
       fileOutputOptions,
diff --git a/src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunner.scala b/src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunner.scala
index de915956d..9e46e5e81 100644
--- a/src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunner.scala
+++ b/src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunner.scala
@@ -181,6 +181,7 @@ class ConstraintSuggestionRunner {

     var columnProfilerRunner = ColumnProfilerRunner()
       .onData(trainingData)
+      .nonOptimized()
       .printStatusUpdates(printStatusUpdates)
       .withLowCardinalityHistogramThreshold(lowCardinalityHistogramThreshold)

diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/FractionalCategoricalRangeRule.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/FractionalCategoricalRangeRule.scala
index 55e410f33..be2029079 100644
--- a/src/main/scala/com/amazon/deequ/suggestions/rules/FractionalCategoricalRangeRule.scala
+++ b/src/main/scala/com/amazon/deequ/suggestions/rules/FractionalCategoricalRangeRule.scala
@@ -99,7 +99,9 @@ case class FractionalCategoricalRangeRule( description, this, s""".isContainedIn("${profile.column}", Array($categoriesCode), - | _ >= $targetCompliance, Some("$hint"))""".stripMargin.replaceAll("\n", ""), + | _ >= $targetCompliance, Some("$hint"))""" + .stripMargin.replaceAll("\n", "") + .stripMargin.replaceAll("\r", ""), valuesByPopularity.toSeq ) } diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala index 67ae61f92..71382d1b4 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala @@ -58,6 +58,7 @@ case class RetainCompletenessRule() extends ConstraintRule[ColumnProfile] { s""".hasCompleteness("${profile.column}", _ >= $targetCompleteness, | Some("It should be above $targetCompleteness!"))""" .stripMargin.replaceAll("\n", "") + .stripMargin.replaceAll("\r", "") ) } diff --git a/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala index 119112e56..e462b26d9 100644 --- a/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala +++ b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala @@ -64,9 +64,9 @@ class KLLProfileTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "att1", 1.0, - 1.0, - 1.0, - 1.0, + Some(1.0), + Some(1.0), + Some(1.0), 6, DataTypeInstances.Fractional, false, @@ -109,9 +109,9 @@ class KLLProfileTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "att1", 1.0, - 1.0, - 1.0, - 1.0, + Some(1.0), + Some(1.0), + Some(1.0), 30, DataTypeInstances.Fractional, false, diff --git a/src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala new file mode 100644 index 000000000..a64f8071d --- /dev/null +++ b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala @@ -0,0 +1,481 @@ +/** + * Copyright 2021 Logical Clocks AB. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may not + * use this file except in compliance with the License. A copy of the License + * is located at + * + * http://aws.amazon.com/apache2.0/ + * + * or in the "license" file accompanying this file. This file is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ *
+ */
+
+package com.amazon.deequ.KLL
+
+import com.amazon.deequ.SparkContextSpec
+import com.amazon.deequ.analyzers.{DataTypeInstances, KLLParameters, KLLSketch}
+import com.amazon.deequ.metrics.{BucketDistribution, BucketValue, Distribution, DistributionValue}
+import com.amazon.deequ.profiles.{ColumnProfiler, ColumnProfiles, NumericColumnProfile, StandardColumnProfile}
+import com.amazon.deequ.utils.FixtureSupport
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types._
+import org.scalatest.{Matchers, WordSpec}
+
+class KLLProfileTestApprox extends WordSpec with Matchers with SparkContextSpec
+  with FixtureSupport {
+
+  def assertProfilesEqual(expected: NumericColumnProfile, actual: NumericColumnProfile): Unit = {
+
+    assert(expected.column == actual.column)
+    assert(expected.completeness == actual.completeness)
+    assert(math.abs(expected.approximateNumDistinctValues -
+      actual.approximateNumDistinctValues) <= 1)
+    assert(expected.uniqueness == actual.uniqueness)
+    assert(expected.distinctness == actual.distinctness)
+    assert(expected.entropy == actual.entropy)
+    assert(expected.dataType == actual.dataType)
+    assert(expected.isDataTypeInferred == actual.isDataTypeInferred)
+    assert(expected.typeCounts == actual.typeCounts)
+    assert(expected.histogram == actual.histogram)
+    assert(expected.mean == actual.mean)
+    assert(expected.maximum == actual.maximum)
+    assert(expected.minimum == actual.minimum)
+    assert(expected.sum == actual.sum)
+    assert(expected.stdDev == actual.stdDev)
+    assert(expected.kll == actual.kll)
+    assert(expected.approxPercentiles == actual.approxPercentiles)
+    assert(expected.correlation == actual.correlation)
+  }
+
+
+  def assertStandardProfilesEqual(expected: StandardColumnProfile,
+                                  actual: StandardColumnProfile): Unit = {
+
+    assert(expected.column == actual.column)
+    assert(expected.completeness == actual.completeness)
+    assert(expected.uniqueness == actual.uniqueness)
+    assert(expected.distinctness == actual.distinctness)
+    assert(expected.entropy == actual.entropy)
+    assert(math.abs(expected.approximateNumDistinctValues -
+      actual.approximateNumDistinctValues) <= 1)
+    assert(expected.dataType == actual.dataType)
+    assert(expected.isDataTypeInferred == actual.isDataTypeInferred)
+    assert(expected.typeCounts == actual.typeCounts)
+    assert(expected.histogram == actual.histogram)
+  }
+
+  "Column Profiler" should {
+
+    "return correct NumericColumnProfiles for numeric columns with correct DataType" in
+      withSparkSession { session =>
+
+        val data = getDfWithNumericFractionalValues(session)
+
+        val actualColumnProfile = ColumnProfiler.profileOptimized(data, Option(Seq("att1",
+          "att2")), kllParameters = Some(KLLParameters(KLLSketch.DEFAULT_SKETCH_SIZE, KLLSketch
+          .DEFAULT_SHRINKING_FACTOR, 20)), histogram = true)
+          .profiles("att1")
+
+        val expectedColumnProfile = NumericColumnProfile(
+          "att1",
+          1.0,
+          None,
+          None,
+          None,
+          6,
+          DataTypeInstances.Fractional,
+          false,
+          Map.empty,
+          actualColumnProfile.histogram,
+          Some(BucketDistribution(List(BucketValue(1.0, 1.25, 1),
+            BucketValue(1.25, 1.5, 0),
+            BucketValue(1.5, 1.75, 0),
+            BucketValue(1.75, 2.0, 0),
+            BucketValue(2.0, 2.25, 1),
+            BucketValue(2.25, 2.5, 0),
+            BucketValue(2.5, 2.75, 0),
+            BucketValue(2.75, 3.0, 0),
+            BucketValue(3.0, 3.25, 1),
+            BucketValue(3.25, 3.5, 0),
+            BucketValue(3.5, 3.75, 0),
+            BucketValue(3.75, 4.0, 0),
+            BucketValue(4.0, 4.25, 1),
+            BucketValue(4.25, 4.5, 0),
+            BucketValue(4.5, 4.75, 0),
+            BucketValue(4.75, 5.0, 0),
+            BucketValue(5.0, 5.25, 1),
+            BucketValue(5.25,
5.5, 0), + BucketValue(5.5, 5.75, 0), + BucketValue(5.75, 6.0, 1)), + List(0.64, 2048.0), + Array(Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0)))), + Some(3.5), + Some(6.0), + Some(1.0), + Some(21.0), + Some(1.707825127659933), + Some(Seq(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, + 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, + 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, + 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, + 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, + 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0)), + Some(Map[String, Double]("att1" -> 1.0, "att2" -> 0.9263710192499128)) + ) + + assertProfilesEqual(expectedColumnProfile, + actualColumnProfile.asInstanceOf[NumericColumnProfile]) + } + + "return correct JSON for NumericColumnProfiles" in + withSparkSession { session => + + val data = getDfWithNumericFractionalValues(session) + + val profile = ColumnProfiler.profileOptimized(data, Option(Seq("att1", "att2")), + kllParameters = Some(KLLParameters(KLLSketch.DEFAULT_SKETCH_SIZE, KLLSketch + .DEFAULT_SHRINKING_FACTOR, 20)), histogram = true) + val profiles = profile.profiles.map{pro => pro._2}.toSeq + val json_profile = ColumnProfiles.toJson(profiles) + val correct_profile = "{\"columns\":[{\"column\":\"att1\",\"dataType\":\"Fractional\"," + + "\"isDataTypeInferred\":\"false\",\"completeness\":1.0," + + "\"approximateNumDistinctValues\":6,\"histogram\":[{\"value\":\"6.0\",\"count\":1," + + "\"ratio\":0.16666666666666666},{\"value\":\"3.0\",\"count\":1," + + "\"ratio\":0.16666666666666666},{\"value\":\"2.0\",\"count\":1," + + "\"ratio\":0.16666666666666666},{\"value\":\"4.0\",\"count\":1," + + "\"ratio\":0.16666666666666666},{\"value\":\"1.0\",\"count\":1," + + "\"ratio\":0.16666666666666666},{\"value\":\"5.0\",\"count\":1," + + "\"ratio\":0.16666666666666666}],\"mean\":3.5,\"maximum\":6.0,\"minimum\":1.0," + + "\"sum\":21.0,\"stdDev\":1.707825127659933,\"correlations\":[{\"column\":\"att2\"," + + "\"correlation\":0.9263710192499128},{\"column\":\"att1\",\"correlation\":1.0}]," + + "\"kll\":{\"buckets\":[{\"low_value\":1.0,\"high_value\":1.25,\"count\":1," + + "\"ratio\":0.16666666666666666},{\"low_value\":1.25,\"high_value\":1.5,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":1.5,\"high_value\":1.75,\"count\":0,\"ratio\":0.0}," + + "{\"low_value\":1.75,\"high_value\":2.0,\"count\":0,\"ratio\":0.0},{\"low_value\":2.0," + + "\"high_value\":2.25,\"count\":1,\"ratio\":0.16666666666666666},{\"low_value\":2.25," + + "\"high_value\":2.5,\"count\":0,\"ratio\":0.0},{\"low_value\":2.5,\"high_value\":2.75," + + "\"count\":0,\"ratio\":0.0},{\"low_value\":2.75,\"high_value\":3.0,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":3.0,\"high_value\":3.25,\"count\":1," + + "\"ratio\":0.16666666666666666},{\"low_value\":3.25,\"high_value\":3.5,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":3.5,\"high_value\":3.75,\"count\":0,\"ratio\":0.0}," + + "{\"low_value\":3.75,\"high_value\":4.0,\"count\":0,\"ratio\":0.0},{\"low_value\":4.0," + + "\"high_value\":4.25,\"count\":1,\"ratio\":0.16666666666666666},{\"low_value\":4.25," + + "\"high_value\":4.5,\"count\":0,\"ratio\":0.0},{\"low_value\":4.5,\"high_value\":4.75," + + "\"count\":0,\"ratio\":0.0},{\"low_value\":4.75,\"high_value\":5.0,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":5.0,\"high_value\":5.25,\"count\":1," + + 
"\"ratio\":0.16666666666666666},{\"low_value\":5.25,\"high_value\":5.5,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":5.5,\"high_value\":5.75,\"count\":0,\"ratio\":0.0}," + + "{\"low_value\":5.75,\"high_value\":6.0,\"count\":1,\"ratio\":0.16666666666666666}]," + + "\"sketch\":{\"parameters\":{\"c\":0.64,\"k\":2048.0},\"data\":\"[[1.0,2.0,3.0,4.0,5.0," + + "6.0]]\"}},\"approxPercentiles\":[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0," + + "1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0," + + "3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0," + + "4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0," + + "5.0,5.0,5.0,5.0,5.0,5.0,5.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0," + + "6.0,6.0]},{\"column\":\"att2\",\"dataType\":\"Fractional\"," + + "\"isDataTypeInferred\":\"false\",\"completeness\":1.0," + + "\"approximateNumDistinctValues\":4,\"histogram\":[{\"value\":\"0.0\",\"count\":3," + + "\"ratio\":0.5},{\"value\":\"6.0\",\"count\":1,\"ratio\":0.16666666666666666}," + + "{\"value\":\"7.0\",\"count\":1,\"ratio\":0.16666666666666666},{\"value\":\"5.0\"," + + "\"count\":1,\"ratio\":0.16666666666666666}],\"mean\":3.0,\"maximum\":7.0," + + "\"minimum\":0.0,\"sum\":18.0,\"stdDev\":3.0550504633038935," + + "\"correlations\":[{\"column\":\"att2\",\"correlation\":1.0},{\"column\":\"att1\"," + + "\"correlation\":0.9263710192499128}],\"kll\":{\"buckets\":[{\"low_value\":0.0," + + "\"high_value\":0.35,\"count\":3,\"ratio\":0.5},{\"low_value\":0.35,\"high_value\":0.7," + + "\"count\":0,\"ratio\":0.0},{\"low_value\":0.7,\"high_value\":1.05,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":1.05,\"high_value\":1.4,\"count\":0,\"ratio\":0.0}," + + "{\"low_value\":1.4,\"high_value\":1.75,\"count\":0,\"ratio\":0.0},{\"low_value\":1.75," + + "\"high_value\":2.1,\"count\":0,\"ratio\":0.0},{\"low_value\":2.1,\"high_value\":2.45," + + "\"count\":0,\"ratio\":0.0},{\"low_value\":2.45,\"high_value\":2.8,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":2.8,\"high_value\":3.15,\"count\":0,\"ratio\":0.0}," + + "{\"low_value\":3.15,\"high_value\":3.5,\"count\":0,\"ratio\":0.0},{\"low_value\":3.5," + + "\"high_value\":3.85,\"count\":0,\"ratio\":0.0},{\"low_value\":3.85,\"high_value\":4.2," + + "\"count\":0,\"ratio\":0.0},{\"low_value\":4.2,\"high_value\":4.55,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":4.55,\"high_value\":4.9,\"count\":0,\"ratio\":0.0}," + + "{\"low_value\":4.9,\"high_value\":5.25,\"count\":1,\"ratio\":0.16666666666666666}," + + "{\"low_value\":5.25,\"high_value\":5.6,\"count\":0,\"ratio\":0.0},{\"low_value\":5.6," + + "\"high_value\":5.95,\"count\":0,\"ratio\":0.0},{\"low_value\":5.95,\"high_value\":6.3," + + "\"count\":1,\"ratio\":0.16666666666666666},{\"low_value\":6.3,\"high_value\":6.65," + + "\"count\":0,\"ratio\":0.0},{\"low_value\":6.65,\"high_value\":7.0,\"count\":1," + + "\"ratio\":0.16666666666666666}],\"sketch\":{\"parameters\":{\"c\":0.64,\"k\":2048.0}," + + "\"data\":\"[[0.0,0.0,0.0,5.0,6.0,7.0]]\"}},\"approxPercentiles\":[0.0,0.0,0.0,0.0,0.0," + + "0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0," + + "0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0," + + "0.0,0.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,6.0,6.0," + + "6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,7.0,7.0,7.0,7.0,7.0,7.0," + + "7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0]}]}" + 
assert(json_profile == correct_profile) + } + + "return correct NumericColumnProfiles with uniqueness, distinctness and entropy " in + withSparkSession { session => + + val data = getDfWithNumericFractionalValues(session) + + val actualColumnProfile = ColumnProfiler.profileOptimized(data, Option(Seq("att1")), + exactUniqueness = true, exactUniquenessCols = Some(Seq("att1")), kllParameters = Some + (KLLParameters(KLLSketch.DEFAULT_SKETCH_SIZE, KLLSketch + .DEFAULT_SHRINKING_FACTOR, 20)), histogram = true).profiles("att1") + + val expectedColumnProfile = NumericColumnProfile( + "att1", + 1.0, + Some(1.0), + Some(1.791759469228055), + Some(1.0), + 6, + DataTypeInstances.Fractional, + false, + Map.empty, + actualColumnProfile.histogram, + Some(BucketDistribution(List(BucketValue(1.0, 1.25, 1), + BucketValue(1.25, 1.5, 0), + BucketValue(1.5, 1.75, 0), + BucketValue(1.75, 2.0, 0), + BucketValue(2.0, 2.25, 1), + BucketValue(2.25, 2.5, 0), + BucketValue(2.5, 2.75, 0), + BucketValue(2.75, 3.0, 0), + BucketValue(3.0, 3.25, 1), + BucketValue(3.25, 3.5, 0), + BucketValue(3.5, 3.75, 0), + BucketValue(3.75, 4.0, 0), + BucketValue(4.0, 4.25, 1), + BucketValue(4.25, 4.5, 0), + BucketValue(4.5, 4.75, 0), + BucketValue(4.75, 5.0, 0), + BucketValue(5.0, 5.25, 1), + BucketValue(5.25, 5.5, 0), + BucketValue(5.5, 5.75, 0), + BucketValue(5.75, 6.0, 1)), + List(0.64, 2048.0), + Array(Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0)))), + Some(3.5), + Some(6.0), + Some(1.0), + Some(21.0), + Some(1.707825127659933), + Some(Seq(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, + 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, + 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, + 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, + 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, + 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0)), + Some(Map[String, Double]("att1" -> 1.0)) + ) + + assertProfilesEqual(expectedColumnProfile, + actualColumnProfile.asInstanceOf[NumericColumnProfile]) + } + + "return correct StandardColumnProfile plus histogram for String column" in + withSparkSession { session => + + val data = getDfWithNumericFractionalValues(session) + + val actualColumnProfile = ColumnProfiler.profileOptimized(data, Option(Seq("item")), + exactUniqueness = true, exactUniquenessCols = Some(Seq("item")), histogram = true) + .profiles("item") + + val expectedColumnProfile = StandardColumnProfile( + "item", + 1.0, + Some(1.0), + Some(1.791759469228055), + Some(1.0), + 6, + DataTypeInstances.String, + false, + Map.empty, + Some(Distribution(Map("4" -> DistributionValue(1, 0.16666666666666666), + "5" -> DistributionValue(1, 0.16666666666666666), + "6" -> DistributionValue(1, 0.16666666666666666), + "1" -> DistributionValue(1, 0.16666666666666666), + "2" -> DistributionValue(1, 0.16666666666666666), + "3" -> DistributionValue(1, 0.16666666666666666)), 6)) + ) + + assertStandardProfilesEqual(expectedColumnProfile, + actualColumnProfile.asInstanceOf[StandardColumnProfile]) + } + + "return correct StandardColumnProfile plus histogram for Decimal column" in + withSparkSession { session => + + val schema = + StructType(Seq(StructField(name = "num", dataType = DecimalType.SYSTEM_DEFAULT), + StructField(name = "num2", dataType = DecimalType.SYSTEM_DEFAULT))) + + val rows = session.sparkContext.parallelize(Seq( + Row(BigDecimal(1), 
BigDecimal(4)), + Row(BigDecimal(2), BigDecimal(3)), + Row(BigDecimal(3), BigDecimal(2)), + Row(BigDecimal(4), BigDecimal(1)))) + + val data = session.createDataFrame(rows, schema) + + val actualColumnProfile = ColumnProfiler.profileOptimized(data, Option(Seq("num", "num2")), + histogram = true).profiles("num").asInstanceOf[NumericColumnProfile] + + val expectedColumnProfile = NumericColumnProfile( + "num", + 1.0, + None, + None, + None, + 4, + DataTypeInstances.Decimal, + false, + Map.empty, + None, + None, + Some(2.5), + Some(4), + Some(1), + Some(10), + Some(1.118033988749895), + None, + Some(Map("num2" -> -1.0, "num" -> 1.0)) + ) + + assertProfilesEqual(expectedColumnProfile, actualColumnProfile) + } + + "return correct StandardColumnProfile for Decimal column and correlations off" in + withSparkSession { session => + + val schema = + StructType(Seq(StructField(name = "num", dataType = DecimalType.SYSTEM_DEFAULT), + StructField(name = "num2", dataType = DecimalType.SYSTEM_DEFAULT))) + + val rows = session.sparkContext.parallelize(Seq( + Row(BigDecimal(1), BigDecimal(4)), + Row(BigDecimal(2), BigDecimal(3)), + Row(BigDecimal(3), BigDecimal(2)), + Row(BigDecimal(4), BigDecimal(1)))) + + val data = session.createDataFrame(rows, schema) + + val actualColumnProfile = ColumnProfiler.profileOptimized(data, Option(Seq("num", "num2")), + histogram = true, correlation = false).profiles("num").asInstanceOf[NumericColumnProfile] + + val expectedColumnProfile = NumericColumnProfile( + "num", + 1.0, + None, + None, + None, + 4, + DataTypeInstances.Decimal, + false, + Map.empty, + None, + None, + Some(2.5), + Some(4), + Some(1), + Some(10), + Some(1.118033988749895), + None, + None + ) + + assertProfilesEqual(expectedColumnProfile, actualColumnProfile) + } + + "return correct NumericColumnProfiles With KLL for numeric columns with correct DataType" in + withSparkSession { session => + + val data = getDfWithNumericFractionalValuesForKLL(session) + + val actualColumnProfile = ColumnProfiler.profile(data, Option(Seq("att1")), false, 1, + kllProfiling = true, + kllParameters = Option(KLLParameters(2, 0.64, 2))) + .profiles("att1") + + val expectedColumnProfile = NumericColumnProfile( + "att1", + 1.0, + Some(1.0), + Some(3.4011973816621546), + Some(1.0), + 30, + DataTypeInstances.Fractional, + false, + Map.empty, + None, + Some(BucketDistribution(List(BucketValue(1.0, 15.5, 16), + BucketValue(15.5, 30.0, 14)), + List(0.64, 2.0), + Array(Array(27.0, 28.0, 29.0, 30.0), + Array(25.0), + Array(1.0, 6.0, 10.0, 15.0, 19.0, 23.0)))), + Some(15.5), + Some(30.0), + Some(1.0), + Some(465.0), + Some(8.65544144839919), + Some(Seq(1.0, 1.0, 1.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, + 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 10.0, 10.0, 10.0, + 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, + 10.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, + 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 19.0, 19.0, 19.0, + 19.0, 19.0, 19.0, 19.0, 19.0, 19.0, 19.0, 19.0, 19.0, + 19.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, + 23.0, 23.0, 23.0, 23.0, 23.0, 25.0, 25.0, 25.0, 25.0, + 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, + 25.0, 27.0, 27.0, 27.0, 27.0, 27.0, 27.0, 28.0, 28.0, + 28.0, 28.0, 29.0, 29.0, 29.0, 30.0, 30.0, 30.0)), + Some(Map[String, Double]("att1" -> 1.0))) + + assertProfilesEqual(expectedColumnProfile, + actualColumnProfile.asInstanceOf[NumericColumnProfile]) + } + + "return KLL Sketches for ShortType columns" in withSparkSession { session => + val attribute = "attribute" + val data = 
com.amazon.deequ.dataFrameWithColumn( + attribute, + ShortType, + session, + Row(1: Short), + Row(2: Short), + Row(3: Short), + Row(4: Short), + Row(5: Short), + Row(6: Short), + Row(null) + ) + + val actualColumnProfile = ColumnProfiler.profile(data, + kllProfiling = true, + kllParameters = Option(KLLParameters(2, 0.64, 2))) + .profiles(attribute) + val numericalProfile = actualColumnProfile.asInstanceOf[NumericColumnProfile] + assert(numericalProfile.kll.isDefined) + val kll = numericalProfile.kll + assert(kll.get.buckets == List(BucketValue(1.0, 3.5, 4), BucketValue(3.5, 6.0, 2))) + assert(kll.get.parameters == List(0.64, 2.0)) + assert(kll.get.data.length == 2) + val target = Array(Array(5.0, 6.0), Array(1.0, 3.0)) + for (i <- kll.get.data.indices) { + assert(kll.get.data(i).sameElements(target(i))) + } + } + } +} + diff --git a/src/test/scala/com/amazon/deequ/VerificationResultTest.scala b/src/test/scala/com/amazon/deequ/VerificationResultTest.scala index 93aa73201..1f891e68e 100644 --- a/src/test/scala/com/amazon/deequ/VerificationResultTest.scala +++ b/src/test/scala/com/amazon/deequ/VerificationResultTest.scala @@ -168,6 +168,7 @@ class VerificationResultTest extends WordSpec with Matchers with SparkContextSpe |"constraint_message":"Value: 1.0 does not meet the constraint requirement! | Should be smaller than 0.8!"}]""" .stripMargin.replaceAll("\n", "") + .stripMargin.replaceAll("\r", "") assertSameResultsJson(checkResultsAsJson, expectedJson) } diff --git a/src/test/scala/com/amazon/deequ/analyzers/AnalyzerTests.scala b/src/test/scala/com/amazon/deequ/analyzers/AnalyzerTests.scala index 03787b886..7674654b4 100644 --- a/src/test/scala/com/amazon/deequ/analyzers/AnalyzerTests.scala +++ b/src/test/scala/com/amazon/deequ/analyzers/AnalyzerTests.scala @@ -55,7 +55,7 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with "compute correct metrics" in withSparkSession { sparkSession => val dfMissing = getDfMissing(sparkSession) - assert(Completeness("someMissingColumn").preconditions.size == 2, + assert(Completeness("someMissingColumn").preconditions.size == 1, "should check column name availability") val result1 = Completeness("att1").calculate(dfMissing) assert(result1 == DoubleMetric(Entity.Column, @@ -346,7 +346,9 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with val nonZeroValuesWithStringKeys = nonZeroValues.toSeq .map { case (instance, distValue) => instance.toString -> distValue } - val dataTypes = DataTypeInstances.values.map { _.toString } + val dataTypes = DataTypeInstances.values.filterNot(_.equals(DataTypeInstances.Decimal)).map { + _.toString + } val zeros = dataTypes .diff { nonZeroValuesWithStringKeys.map { case (distKey, _) => distKey }.toSet } @@ -572,7 +574,6 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with Row(BigDecimal(678)))) val data = session.createDataFrame(rows, schema) - val result = Minimum("num").calculate(data) assert(result.value.isSuccess) diff --git a/src/test/scala/com/amazon/deequ/analyzers/StatesTest.scala b/src/test/scala/com/amazon/deequ/analyzers/StatesTest.scala index efae77f51..b8861bff5 100644 --- a/src/test/scala/com/amazon/deequ/analyzers/StatesTest.scala +++ b/src/test/scala/com/amazon/deequ/analyzers/StatesTest.scala @@ -36,17 +36,6 @@ class StatesTest extends AnyWordSpec with Matchers with SparkContextSpec with Fi val stateAB = stateA.sum(stateB) - println(stateA.frequencies.schema) - stateA.frequencies.collect().foreach { println } - println() 
- - println(stateB.frequencies.schema) - stateB.frequencies.collect().foreach { println } - println() - - println(stateAB.frequencies.schema) - stateAB.frequencies.collect().foreach { println } - val mergedFrequencies = stateAB.frequencies.collect() .map { row => row.getString(0) -> row.getLong(1) } .toMap diff --git a/src/test/scala/com/amazon/deequ/checks/CheckTest.scala b/src/test/scala/com/amazon/deequ/checks/CheckTest.scala index 70e998ee5..b2e45bc51 100644 --- a/src/test/scala/com/amazon/deequ/checks/CheckTest.scala +++ b/src/test/scala/com/amazon/deequ/checks/CheckTest.scala @@ -55,8 +55,6 @@ class CheckTest extends AnyWordSpec with Matchers with SparkContextSpec with Fix val context = runChecks(getDfCompleteAndInCompleteColumns(sparkSession), check1, check2, check3) - context.metricMap.foreach { println } - assertEvaluatesTo(check1, context, CheckStatus.Success) assertEvaluatesTo(check2, context, CheckStatus.Error) assertEvaluatesTo(check3, context, CheckStatus.Warning) @@ -82,8 +80,6 @@ class CheckTest extends AnyWordSpec with Matchers with SparkContextSpec with Fix val context = runChecks(getDfCompleteAndInCompleteColumns(sparkSession), check1, check2, check3) - context.metricMap.foreach { println } - assertEvaluatesTo(check1, context, CheckStatus.Success) assertEvaluatesTo(check2, context, CheckStatus.Error) assertEvaluatesTo(check3, context, CheckStatus.Warning) @@ -130,8 +126,6 @@ class CheckTest extends AnyWordSpec with Matchers with SparkContextSpec with Fix val context = runChecks(getDfMissing(sparkSession), check1, check2, check3) - context.metricMap.foreach { println } - assertEvaluatesTo(check1, context, CheckStatus.Success) assertEvaluatesTo(check2, context, CheckStatus.Error) assertEvaluatesTo(check3, context, CheckStatus.Warning) @@ -823,7 +817,6 @@ class CheckTest extends AnyWordSpec with Matchers with SparkContextSpec with Fix val check = Check(CheckLevel.Error, "some description") .containsCreditCardNumber(col, _ == 1.0) val context = runChecks(df, check) - context.allMetrics.foreach(println) assertEvaluatesTo(check, context, CheckStatus.Success) } diff --git a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerRunnerTest.scala b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerRunnerTest.scala index c5b3164f8..6ca25b95a 100644 --- a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerRunnerTest.scala +++ b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerRunnerTest.scala @@ -60,7 +60,7 @@ class ColumnProfilerRunnerTest extends WordSpec with Matchers with SparkContextS (results, stat.jobCount) } - assert(jobNumberAllCalculations == 3) + assert(jobNumberAllCalculations == 1) assert(jobNumberReusing == 0) assertConstraintSuggestionResultsEquals(separateResults, resultsReusingMetrics) } @@ -191,6 +191,7 @@ class ColumnProfilerRunnerTest extends WordSpec with Matchers with SparkContextS val results = ColumnProfilerRunner() .onData(df) .withKLLProfiling() + .nonOptimized() .run() assert(results.profiles("att1").asInstanceOf[NumericColumnProfile].kll.isDefined) @@ -198,6 +199,57 @@ class ColumnProfilerRunnerTest extends WordSpec with Matchers with SparkContextS assert(results.profiles("att3").asInstanceOf[NumericColumnProfile].kll.isDefined) } + "should run optimized Profiler with two exact uniqueness columns" in + withMonitorableSparkSession {(sparkSession, sparkMonitor) => + + val df = getDfWithNumericValues(sparkSession) + + val (results: ColumnProfiles, jobNumberAllCalculations) = sparkMonitor + .withMonitoringSession { stat => + val results = 
ColumnProfilerRunner() + .onData(df) + .withExactUniqueness(true) + .restrictExactUniquenessColumns(Seq("att1", "att2")) + .run() + + (results, stat.jobCount) + } + + assert(jobNumberAllCalculations == 5) + assert(results.profiles("att1").asInstanceOf[NumericColumnProfile].uniqueness.isDefined) + assert(results.profiles("att2").asInstanceOf[NumericColumnProfile].uniqueness.isDefined) + assert(results.profiles("att3").asInstanceOf[NumericColumnProfile].uniqueness.isEmpty) + + } + + "should run less jobs with optimized Profiler" in + withMonitorableSparkSession { (sparkSession, sparkMonitor) => + + val df = getDfWithNumericValues(sparkSession) + + val jobNumberUnoptimized = sparkMonitor + .withMonitoringSession { stat => + val results = ColumnProfilerRunner() + .onData(df) + .nonOptimized() + .run() + + stat.jobCount + } + + val jobNumberOptimized = sparkMonitor + .withMonitoringSession { stat => + val results = ColumnProfilerRunner() + .onData(df) + .run() + + stat.jobCount + } + + assert(jobNumberUnoptimized == 10) + assert(jobNumberOptimized == 1) + } + } private[this] def assertConstraintSuggestionResultsEquals( diff --git a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala index e0441be2a..a02382ebe 100644 --- a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala +++ b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala @@ -61,9 +61,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = StringColumnProfile( "att2", 2.0 / 3.0, - 0.5, - 0.5623351446188083, - 0.25, + Some(0.5), + Some(0.5623351446188083), + Some(0.25), 2, DataTypeInstances.String, true, @@ -115,9 +115,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = StringColumnProfile( "item", 1.0, - 1.0, - 1.791759469228055, - 1.0, + Some(1.0), + Some(1.791759469228055), + Some(1.0), 6, DataTypeInstances.String, false, @@ -141,9 +141,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = StringColumnProfile( "att2", 2.0 / 3.0, - 0.5, - 0.5623351446188083, - 0.25, + Some(0.5), + Some(0.5623351446188083), + Some(0.25), 2, DataTypeInstances.String, true, @@ -173,9 +173,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "item", 1.0, - 1.0, - 1.0, - 1.0, + Some(1.0), + Some(1.0), + Some(1.0), 6, DataTypeInstances.Integral, true, @@ -217,9 +217,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "item", 1.0, - 1.0, - 1.0, - 1.0, + Some(1.0), + Some(1.0), + Some(1.0), 6, DataTypeInstances.Integral, true, @@ -262,9 +262,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "item", 1.0, - 1.0, - 1.0, - 1.0, + Some(1.0), + Some(1.0), + Some(1.0), 6, DataTypeInstances.Integral, true, @@ -342,9 +342,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = NumericColumnProfile( "att1", 1.0, - 1.0, - 1.0, - 1.0, + Some(1.0), + Some(1.0), + Some(1.0), 6, DataTypeInstances.Fractional, false, @@ -379,9 +379,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec val expectedColumnProfile = StringColumnProfile( "att2", 2.0 / 3.0, - 0.5, - 0.5623351446188083, 
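// A hedged usage sketch, not part of this patch: with distinctness, entropy
// and uniqueness now Option-valued on ColumnProfile (they are only filled in
// when the grouping analyzers run, e.g. under withExactUniqueness), consumers
// have to handle the absent case. A stripped-down stand-in type keeps the
// sketch self-contained.
final case class ProfileView(column: String, uniqueness: Option[Double])

def describeUniqueness(profile: ProfileView): String =
  profile.uniqueness match {
    case Some(value) => f"uniqueness of ${profile.column} is $value%.2f"
    case None        => s"uniqueness of ${profile.column} was not computed"
  }

// describeUniqueness(ProfileView("att1", Some(1.0)))  // "uniqueness of att1 is 1.00"
// describeUniqueness(ProfileView("att3", None))       // "uniqueness of att3 was not computed"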
- 0.25, + Some(0.5), + Some(0.5623351446188083), + Some(0.25), 2, DataTypeInstances.String, isDataTypeInferred = true, @@ -590,9 +590,9 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec StandardColumnProfile( "PassengerId", 1.0, - 1.0, - 1.0, - 1.0, + Some(1.0), + Some(1.0), + Some(1.0), 891, DataTypeInstances.Integral, false, @@ -601,25 +601,25 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec StandardColumnProfile( "Survived", 1.0, - 1.0, - 1.0, - 1.0, + Some(1.0), + Some(1.0), + Some(1.0), 2, DataTypeInstances.Integral, false, Map.empty, None), - StandardColumnProfile("Pclass", 1.0, 1.0, 1.0, 1.0, 3, + StandardColumnProfile("Pclass", 1.0, Some(1.0), Some(1.0), Some(1.0), 3, DataTypeInstances.Integral, false, Map.empty, None), - StandardColumnProfile("Name", 1.0, 1.0, 1.0, 1.0, 0, + StandardColumnProfile("Name", 1.0, Some(1.0), Some(1.0), Some(1.0), 0, DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Sex", 1.0, 1.0, 1.0, 1.0, 2, + StandardColumnProfile("Sex", 1.0, Some(1.0), Some(1.0), Some(1.0), 2, DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Ticket", 1.0, 1.0, 1.0, 1.0, 681, + StandardColumnProfile("Ticket", 1.0, Some(1.0), Some(1.0), Some(1.0), 681, DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Fare", 1.0, 1.0, 1.0, 1.0, 0, + StandardColumnProfile("Fare", 1.0, Some(1.0), Some(1.0), Some(1.0), 0, DataTypeInstances.Fractional, false, Map.empty, None), - StandardColumnProfile("Cabin", 0.22, 1.0, 1.0, 1.0, 0, + StandardColumnProfile("Cabin", 0.22, Some(1.0), Some(1.0), Some(1.0), 0, DataTypeInstances.String, true, Map.empty, None) ) diff --git a/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionResultTest.scala b/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionResultTest.scala index 6a98bf3c6..5927608af 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionResultTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionResultTest.scala @@ -278,6 +278,7 @@ class ConstraintSuggestionResultTest extends WordSpec with Matchers with SparkCo | ] |}""" .stripMargin.replaceAll("\n", "") + .stripMargin.replaceAll("\r", "") assertJsonStringsAreEqual(constraintSuggestionJson, expectedJson) } @@ -366,6 +367,7 @@ class ConstraintSuggestionResultTest extends WordSpec with Matchers with SparkCo | ] |}""" .stripMargin.replaceAll("\n", "") + .stripMargin.replaceAll("\r", "") assertJsonStringsAreEqual(evaluationResultsJson, expectedJson) } @@ -453,6 +455,7 @@ class ConstraintSuggestionResultTest extends WordSpec with Matchers with SparkCo | ] |}""" .stripMargin.replaceAll("\n", "") + .stripMargin.replaceAll("\r", "") assertJsonStringsAreEqual(evaluationResultsJson, expectedJson) } diff --git a/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunnerTest.scala b/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunnerTest.scala index 9ec88f90b..1cc5883e7 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunnerTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunnerTest.scala @@ -78,7 +78,7 @@ class ConstraintSuggestionRunnerTest extends WordSpec with Matchers with SparkCo (results, stat.jobCount) } - assert(jobNumberAllCalculations == 3) + assert(jobNumberAllCalculations == 10) assert(jobNumberReusing == 0) assertConstraintSuggestionResultsEquals(separateResults, resultsReusingMetrics) } diff --git 
a/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionsIntegrationTest.scala b/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionsIntegrationTest.scala index d90b16ef7..920c5ff27 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionsIntegrationTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/ConstraintSuggestionsIntegrationTest.scala @@ -34,9 +34,7 @@ case class Record( propertyA: String, measurement2: String, measurement3: String, - description: String, - allNullColumn: String, - allNullColumn2: java.lang.Double + description: String ) class ConstraintSuggestionsIntegrationTest extends WordSpec with SparkContextSpec { @@ -76,7 +74,7 @@ class ConstraintSuggestionsIntegrationTest extends WordSpec with SparkContextSpe val randomLength = minLength + rng.nextInt(maxLength - minLength + 1) val description = rng.nextString(randomLength) - Record(id, marketplace, measurement, propertyA, measurement2, measurement3, description, null, null) + Record(id, marketplace, measurement, propertyA, measurement2, measurement3, description) } val data = session.createDataFrame(records) @@ -114,28 +112,6 @@ class ConstraintSuggestionsIntegrationTest extends WordSpec with SparkContextSpe analyzer == Completeness("marketplace") && assertionFunc(1.0) } - // Categorical range for "marketplace" - assertConstraintExistsIn(constraintSuggestionResult) { (analyzer, assertionFunc) => - - assertionFunc(1.0) && - analyzer.isInstanceOf[Compliance] && - analyzer.asInstanceOf[Compliance] - .instance.startsWith(s"'marketplace' has value range") - } - - // Categorical range for "marketplace" with values - assert( - constraintSuggestionResult.constraintSuggestions - .getOrElse("marketplace", Seq.empty) - .exists { - case value: ConstraintSuggestionWithValue[Seq[String]] => - val constraintWithValue = value.value - println(constraintWithValue) - constraintWithValue.sorted == categories.toSeq.sorted - case _ => false - } - ) - // IS NOT NULL for "measurement" assertConstraintExistsIn(constraintSuggestionResult) { (analyzer, assertionFunc) => analyzer == Completeness("measurement") && assertionFunc(1.0) diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala index 9a90af7ca..d18282901 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala @@ -34,9 +34,9 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "CompleteIfCompleteRule" should { "be applied correctly" in { - val complete = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + val complete = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, None) - val incomplete = StandardColumnProfile("col1", .25, 1.0, 1.0, 1.0, 100, + val incomplete = StandardColumnProfile("col1", .25, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, None) val completeInteger = @@ -131,9 +131,9 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "RetainCompletenessRule" should { "be applied correctly" in { - val complete = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + val complete = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, None) - val incomplete = StandardColumnProfile("col1", .25, 1.0, 1.0, 1.0, 100, + val incomplete = 
StandardColumnProfile("col1", .25, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, None) assert(!RetainCompletenessRule().shouldBeApplied(complete, 1000)) @@ -170,8 +170,11 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val codeForConstraint = RetainCompletenessRule().candidate(fakeColumnProfile, 100) .codeForConstraint - val expectedCodeForConstraint = """.hasCompleteness("att1", _ >= 0.4, - | Some("It should be above 0.4!"))""".stripMargin.replaceAll("\n", "") + val expectedCodeForConstraint = + """.hasCompleteness("att1", _ >= 0.4, + | Some("It should be above 0.4!"))""" + .stripMargin.replaceAll("\n", "") + .stripMargin.replaceAll("\r", "") assert(expectedCodeForConstraint == codeForConstraint) @@ -192,13 +195,13 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "UniqueIfApproximatelyUniqueRule" should { "be applied correctly" in { - val unique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + val unique = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, None) - val maybeUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, + val maybeUnique = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 95, String, false, Map.empty, None) - val maybeNonUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 91, + val maybeNonUnique = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 91, String, false, Map.empty, None) - val nonUnique = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, + val nonUnique = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 20, String, false, Map.empty, None) assert(UniqueIfApproximatelyUniqueRule().shouldBeApplied(unique, 100)) @@ -259,24 +262,24 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "RetainTypeRule" should { "be applied correctly" in { - val string = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + val string = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, true, Map.empty, None) - val boolean = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + val boolean = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Boolean, true, Map.empty, None) - val fractional = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + val fractional = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Fractional, true, Map.empty, None) - val integer = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + val integer = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Integral, true, Map.empty, None) - val unknown = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, + val unknown = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Unknown, true, Map.empty, None) - val stringNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, - String, false, Map.empty, None) - val booleanNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, - Boolean, false, Map.empty, None) - val fractionalNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, - Fractional, false, Map.empty, None) - val integerNonInferred = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, - Integral, false, Map.empty, None) + val stringNonInferred = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), + 100, String, false, Map.empty, None) + val booleanNonInferred = 
StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), + 100, Boolean, false, Map.empty, None) + val fractionalNonInferred = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), + Some(1.0), 100, Fractional, false, Map.empty, None) + val integerNonInferred = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), + 100, Integral, false, Map.empty, None) assert(!RetainTypeRule().shouldBeApplied(string, 100)) assert(!RetainTypeRule().shouldBeApplied(unknown, 100)) @@ -393,26 +396,26 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val noDistribution = Distribution(Map.empty, 0) - val stringWithNonSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, - String, false, Map.empty, Some(nonSkewedDist)) - val integralWithNonSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, - 100, DataTypeInstances.Integral, false, Map.empty, Some(nonSkewedIntegralDist)) - val stringWithFlgDist = StandardColumnProfile("flg", 1.0, 1.0, 1.0, 1.0, + val stringWithNonSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), + Some(1.0), 100, String, false, Map.empty, Some(nonSkewedDist)) + val integralWithNonSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), + Some(1.0), 100, DataTypeInstances.Integral, false, Map.empty, Some(nonSkewedIntegralDist)) + val stringWithFlgDist = StandardColumnProfile("flg", 1.0, Some(1.0), Some(1.0), Some(1.0), 2, String, false, Map.empty, Some(flgDist)) - val integralWithFlgDist = StandardColumnProfile("flg", 1.0, 1.0, 1.0, 1.0, + val integralWithFlgDist = StandardColumnProfile("flg", 1.0, Some(1.0), Some(1.0), Some(1.0), 2, DataTypeInstances.Integral, false, Map.empty, Some(flgDist)) - val stringWithSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, - String, false, Map.empty, Some(skewedDist)) - val stringNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, + val stringWithSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), + Some(1.0), 100, String, false, Map.empty, Some(skewedDist)) + val stringNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 95, String, false, Map.empty, None) - val boolNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 94, + val boolNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 94, Boolean, false, Map.empty, None) - val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, - Boolean, false, Map.empty, Some(noDistribution)) - val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, - 100, DataTypeInstances.Integral, false, Map.empty, Some(skewedDist)) - val integralNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, + val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), + 20, Boolean, false, Map.empty, Some(noDistribution)) + val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), + Some(1.0), 100, DataTypeInstances.Integral, false, Map.empty, Some(skewedDist)) + val integralNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 95, DataTypeInstances.Integral, false, Map.empty, None) assert(CategoricalRangeRule().shouldBeApplied(stringWithNonSkewedDist, 100)) @@ -576,33 +579,36 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val noDistribution = Distribution(Map.empty, 0) val stringWithNonSkewedDistWithFractionalCategoricalRange = StandardColumnProfile( - "col1", 
1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, + "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, Some(nonSkewedDistWithFractionalCategoricalRange)) val stringWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile( - "col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, + "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, Some(nonSkewedDistWithActualCategoricalRange)) val stringWithSomewhatSkewedDist = StandardColumnProfile( - "col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, Some(somewhatSkewedDist)) + "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, + Some(somewhatSkewedDist)) val stringWithSkewedDist = StandardColumnProfile( - "col1", 1.0, 1.0, 1.0, 1.0, 100, String, false, Map.empty, Some(skewedDist)) - val stringNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 95, - String, false, Map.empty, None) - val boolNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 94, + "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, + Some(skewedDist)) + val stringNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), + 95, String, false, Map.empty, None) + val boolNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 94, Boolean, false, Map.empty, None) - val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 20, - Boolean, false, Map.empty, Some(noDistribution)) + val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), + 20, Boolean, false, Map.empty, Some(noDistribution)) val integralWithNonSkewedDistWithFractionalCategoricalRange = StandardColumnProfile("col1", - 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, + 1.0, Some(1.0), Some(1.0), Some(1.0), 100, DataTypeInstances.Integral, false, Map.empty, Some(nonSkewedIntegralDistWithFractionalCategoricalRange)) val integralWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile( - "col1", 1.0, 1.0, 1.0, 1.0, 100, DataTypeInstances.Integral, false, Map.empty, - Some(nonSkewedIntegralDistWithActualCategoricalRange)) - val integralWithSomewhatSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, - 100, DataTypeInstances.Integral, false, Map.empty, Some(somewhatSkewedIntegralDist)) - val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, - 100, DataTypeInstances.Integral, false, Map.empty, Some(skewedIntegralDist)) - val integralNoDist = StandardColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, + "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, DataTypeInstances.Integral, false, + Map.empty, Some(nonSkewedIntegralDistWithActualCategoricalRange)) + val integralWithSomewhatSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), + Some(1.0), 100, DataTypeInstances.Integral, false, Map.empty, + Some(somewhatSkewedIntegralDist)) + val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), + Some(1.0), 100, DataTypeInstances.Integral, false, Map.empty, Some(skewedIntegralDist)) + val integralNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 95, DataTypeInstances.Integral, false, Map.empty, None) assert(FractionalCategoricalRangeRule().shouldBeApplied(stringWithSomewhatSkewedDist, 100)) @@ -700,8 +706,8 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val codeForConstraint = FractionalCategoricalRangeRule().candidate(fakeColumnProfile, 100) 
.codeForConstraint - val expectedCodeForConstraint = ".isContainedIn(\"categoricalColumn\", Array(\"_b%%__\"," + - " \"'_[a_[]}!@'\"), _ >= 0.9, Some(\"It should be above 0.9!\"))" + val expectedCodeForConstraint = ".isContainedIn(\"categoricalColumn\", Array(\"_b%%__\", " + + "\"'_[a_[]}!@'\"), _ >= 0.9, Some(\"It should be above 0.9!\"))" assert(expectedCodeForConstraint == codeForConstraint) @@ -726,7 +732,7 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "NonNegativeNumbersRule and PositiveNumbersRule" should { "be applied correctly" in { def columnProfileWithMinimum(minimum: Double): NumericColumnProfile = { - NumericColumnProfile("col1", 1.0, 1.0, 1.0, 1.0, 100, Fractional, + NumericColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Fractional, isDataTypeInferred = false, Map.empty, None, None, Some(10), Some(100), Some(minimum), Some(10000), Some(1.0), None, None) } From 07f15ef9ae901423606ed441341d975879b30cfe Mon Sep 17 00:00:00 2001 From: Fabio Buso Date: Thu, 26 Aug 2021 22:07:35 +0200 Subject: [PATCH 13/21] Increase scala-style max method parameters check --- deequ-scalastyle.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deequ-scalastyle.xml b/deequ-scalastyle.xml index c726413bc..f97dfc64e 100644 --- a/deequ-scalastyle.xml +++ b/deequ-scalastyle.xml @@ -35,7 +35,7 @@ - + From 6ce9015727286b8dedbd8371a027320d0944ff9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Till=20D=C3=B6hmen?= Date: Tue, 14 Sep 2021 10:43:23 +0200 Subject: [PATCH 14/21] Support for Decimal-type histograms (#10) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: doehmen-admin Co-authored-by: Till Döhmen --- .../deequ/analyzers/runners/KLLRunner.scala | 10 +- .../deequ/profiles/ColumnProfiler.scala | 9 +- .../deequ/KLL/KLLProfileTestApprox.scala | 160 ++++++++---------- .../amazon/deequ/utils/FixtureSupport.scala | 12 ++ 4 files changed, 101 insertions(+), 90 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/analyzers/runners/KLLRunner.scala b/src/main/scala/com/amazon/deequ/analyzers/runners/KLLRunner.scala index 64a008009..8f3739ec9 100644 --- a/src/main/scala/com/amazon/deequ/analyzers/runners/KLLRunner.scala +++ b/src/main/scala/com/amazon/deequ/analyzers/runners/KLLRunner.scala @@ -18,7 +18,7 @@ package com.amazon.deequ.analyzers.runners import com.amazon.deequ.analyzers.{Analyzer, KLLParameters, KLLSketch, KLLState, QuantileNonSample, State, StateLoader, StatePersister} import com.amazon.deequ.metrics.Metric -import org.apache.spark.sql.types.{ByteType, DoubleType, FloatType, IntegerType, LongType, ShortType, StructType} +import org.apache.spark.sql.types.{ByteType, DecimalType, DoubleType, FloatType, IntegerType, LongType, ShortType, StructType} import org.apache.spark.sql.{DataFrame, Row} @SerialVersionUID(1L) @@ -84,6 +84,13 @@ class FloatQuantileNonSample(sketchSize: Int, shrinkingFactor: Double) override def itemAsDouble(item: Any): Double = item.asInstanceOf[Float].toDouble } +@SerialVersionUID(1L) +class DecimalQuantileNonSample(sketchSize: Int, shrinkingFactor: Double) + extends UntypedQuantileNonSample(sketchSize, shrinkingFactor) with Serializable { + override def itemAsDouble(item: Any): Double = item.asInstanceOf[java.math.BigDecimal] + .doubleValue() +} + object KLLRunner { def computeKLLSketchesInExtraPass( @@ -139,6 +146,7 @@ object KLLRunner { case ShortType => new ShortQuantileNonSample(sketchSize, shrinkingFactor) case IntegerType => new 
IntQuantileNonSample(sketchSize, shrinkingFactor) case LongType => new LongQuantileNonSample(sketchSize, shrinkingFactor) + case DecimalType() => new DecimalQuantileNonSample(sketchSize, shrinkingFactor) // TODO at the moment, we will throw exceptions for Decimals case _ => throw new IllegalArgumentException(s"Cannot handle ${schema(column).dataType}") } diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala index 9de7e3b25..0e1143a4b 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala @@ -339,7 +339,7 @@ object ColumnProfiler { analyzers ++= Seq(Minimum(name), Maximum(name), Mean(name), StandardDeviation(name), Sum(name)) // Add KLL analyzer. - if (histogram && predefinedTypes(name) != Decimal) { + if (histogram) { analyzers += KLLSketch(name, kllParameters) } if (correlation && (maxCorrelationCols.isEmpty || (numericColumnNames.length <= @@ -842,7 +842,7 @@ object ColumnProfiler { /* Identifies all columns, which: * - * (1) have string, boolean, double, float, integer, long, or short data type + * (1) have string, boolean, double, float, integer, long, decimal, or short data type * (2) have less than `lowCardinalityHistogramThreshold` approximate distinct values */ private[this] def findTargetColumnsForHistograms( @@ -855,14 +855,15 @@ object ColumnProfiler { StringType, BooleanType, DoubleType, FloatType, IntegerType, LongType, ShortType ) val originalStringNumericOrBooleanColumns = schema - .filter { field => validSparkDataTypesForHistograms.contains(field.dataType) } + .filter { field => validSparkDataTypesForHistograms.contains(field.dataType) || + genericStatistics.typeOf(field.name) == Decimal } .map { field => field.name } .toSet genericStatistics.approximateNumDistincts .filter { case (column, _) => originalStringNumericOrBooleanColumns.contains(column) && - Set(String, Boolean, Integral, Fractional).contains(genericStatistics.typeOf + Set(String, Boolean, Integral, Fractional, Decimal).contains(genericStatistics.typeOf (column)) } .filter { case (_, count) => count <= lowCardinalityHistogramThreshold } diff --git a/src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala index a64f8071d..572e177b3 100644 --- a/src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala +++ b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala @@ -68,8 +68,81 @@ class KLLProfileTestApprox extends WordSpec with Matchers with SparkContextSpec assert(expected.histogram == actual.histogram) } + "Column Profiler" should { + "return correct NumericColumnProfiles for decimal column" in + withSparkSession { session => + + val data = getDfWithDecimalFractionalValues(session) + + val actualColumnProfile = ColumnProfiler.profileOptimized(data, Option(Seq("att1", + "att2")), kllParameters = Some(KLLParameters(KLLSketch.DEFAULT_SKETCH_SIZE, KLLSketch + .DEFAULT_SHRINKING_FACTOR, 20)), histogram = true) + .profiles("att1") + + val expectedColumnProfile = NumericColumnProfile( + "att1", + 1.0, + None, + None, + None, + 6, + DataTypeInstances.Decimal, + false, + Map.empty, + Some(Distribution(Map[String, DistributionValue]( + "4.000000000000000000" -> DistributionValue(1, 0.16666666666666666), + "1.000000000000000000" -> DistributionValue(1, 0.16666666666666666), + "5.000000000000000000" -> DistributionValue(1, 0.16666666666666666), + "6.000000000000000000" 
-> DistributionValue(1, 0.16666666666666666), + "2.000000000000000000" -> DistributionValue(1, 0.16666666666666666), + "3.000000000000000000" -> DistributionValue(1, 0.16666666666666666)), 6)), + Some(BucketDistribution(List(BucketValue(1.0, 1.25, 1), + BucketValue(1.25, 1.5, 0), + BucketValue(1.5, 1.75, 0), + BucketValue(1.75, 2.0, 0), + BucketValue(2.0, 2.25, 1), + BucketValue(2.25, 2.5, 0), + BucketValue(2.5, 2.75, 0), + BucketValue(2.75, 3.0, 0), + BucketValue(3.0, 3.25, 1), + BucketValue(3.25, 3.5, 0), + BucketValue(3.5, 3.75, 0), + BucketValue(3.75, 4.0, 0), + BucketValue(4.0, 4.25, 1), + BucketValue(4.25, 4.5, 0), + BucketValue(4.5, 4.75, 0), + BucketValue(4.75, 5.0, 0), + BucketValue(5.0, 5.25, 1), + BucketValue(5.25, 5.5, 0), + BucketValue(5.5, 5.75, 0), + BucketValue(5.75, 6.0, 1)), + List(0.64, 2048.0), + Array(Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0)))), + Some(3.5), + Some(6.0), + Some(1.0), + Some(21.0), + Some(1.707825127659933), + Some(Seq(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, + 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, + 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, + 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, + 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, + 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, + 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0)), + Some(Map[String, Double]("att1" -> 1.0, "att2" -> 0.9263710192499128)) + ) + + assertProfilesEqual(expectedColumnProfile, + actualColumnProfile.asInstanceOf[NumericColumnProfile]) + } + "return correct NumericColumnProfiles for numeric columns with correct DataType" in withSparkSession { session => @@ -301,7 +374,8 @@ class KLLProfileTestApprox extends WordSpec with Matchers with SparkContextSpec DataTypeInstances.String, false, Map.empty, - Some(Distribution(Map("4" -> DistributionValue(1, 0.16666666666666666), + Some(Distribution(Map[String, DistributionValue]( + "4" -> DistributionValue(1, 0.16666666666666666), "5" -> DistributionValue(1, 0.16666666666666666), "6" -> DistributionValue(1, 0.16666666666666666), "1" -> DistributionValue(1, 0.16666666666666666), @@ -313,90 +387,6 @@ class KLLProfileTestApprox extends WordSpec with Matchers with SparkContextSpec actualColumnProfile.asInstanceOf[StandardColumnProfile]) } - "return correct StandardColumnProfile plus histogram for Decimal column" in - withSparkSession { session => - - val schema = - StructType(Seq(StructField(name = "num", dataType = DecimalType.SYSTEM_DEFAULT), - StructField(name = "num2", dataType = DecimalType.SYSTEM_DEFAULT))) - - val rows = session.sparkContext.parallelize(Seq( - Row(BigDecimal(1), BigDecimal(4)), - Row(BigDecimal(2), BigDecimal(3)), - Row(BigDecimal(3), BigDecimal(2)), - Row(BigDecimal(4), BigDecimal(1)))) - - val data = session.createDataFrame(rows, schema) - - val actualColumnProfile = ColumnProfiler.profileOptimized(data, Option(Seq("num", "num2")), - histogram = true).profiles("num").asInstanceOf[NumericColumnProfile] - - val expectedColumnProfile = NumericColumnProfile( - "num", - 1.0, - None, - None, - None, - 4, - DataTypeInstances.Decimal, - false, - Map.empty, - None, - None, - Some(2.5), - Some(4), - Some(1), - Some(10), - Some(1.118033988749895), - None, - Some(Map("num2" -> -1.0, "num" -> 1.0)) - ) - - assertProfilesEqual(expectedColumnProfile, actualColumnProfile) - } - - "return correct StandardColumnProfile for Decimal column and 
correlations off" in - withSparkSession { session => - - val schema = - StructType(Seq(StructField(name = "num", dataType = DecimalType.SYSTEM_DEFAULT), - StructField(name = "num2", dataType = DecimalType.SYSTEM_DEFAULT))) - - val rows = session.sparkContext.parallelize(Seq( - Row(BigDecimal(1), BigDecimal(4)), - Row(BigDecimal(2), BigDecimal(3)), - Row(BigDecimal(3), BigDecimal(2)), - Row(BigDecimal(4), BigDecimal(1)))) - - val data = session.createDataFrame(rows, schema) - - val actualColumnProfile = ColumnProfiler.profileOptimized(data, Option(Seq("num", "num2")), - histogram = true, correlation = false).profiles("num").asInstanceOf[NumericColumnProfile] - - val expectedColumnProfile = NumericColumnProfile( - "num", - 1.0, - None, - None, - None, - 4, - DataTypeInstances.Decimal, - false, - Map.empty, - None, - None, - Some(2.5), - Some(4), - Some(1), - Some(10), - Some(1.118033988749895), - None, - None - ) - - assertProfilesEqual(expectedColumnProfile, actualColumnProfile) - } - "return correct NumericColumnProfiles With KLL for numeric columns with correct DataType" in withSparkSession { session => diff --git a/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala b/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala index 9b6ad9d4e..7b56c744e 100644 --- a/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala +++ b/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala @@ -274,6 +274,18 @@ trait FixtureSupport { ).toDF("item", "att1", "att2") } + def getDfWithDecimalFractionalValues(sparkSession: SparkSession): DataFrame = { + import sparkSession.implicits._ + Seq( + ("1", BigDecimal(1.0), BigDecimal(0.0)), + ("2", BigDecimal(2.0), BigDecimal(0.0)), + ("3", BigDecimal(3.0), BigDecimal(0.0)), + ("4", BigDecimal(4.0), BigDecimal(5.0)), + ("5", BigDecimal(5.0), BigDecimal(6.0)), + ("6", BigDecimal(6.0), BigDecimal(7.0)) + ).toDF("item", "att1", "att2") + } + def getDfWithNumericFractionalValuesForKLL(sparkSession: SparkSession): DataFrame = { import sparkSession.implicits._ Seq( From 0cd154bfa33fe9088015ca14232e4a0a358498ea Mon Sep 17 00:00:00 2001 From: moritzmeister Date: Wed, 1 Dec 2021 17:10:12 +0100 Subject: [PATCH 15/21] Fix NaN bug for histograms --- .../scala/com/amazon/deequ/profiles/ColumnProfile.scala | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala index 84df99511..324108653 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala @@ -206,8 +206,9 @@ object ColumnProfiles { val store = new JsonObject() store.add("parameters", entry) - val gson = new Gson() - val dataJson = gson.toJson(kllSketch.data) + val gson = new GsonBuilder().serializeNulls().create(); + val dataJson = gson.toJson(kllSketch.data.map( + subarr => subarr.map(value => normalizeDouble(value)))) store.addProperty("data", dataJson) @@ -218,10 +219,10 @@ object ColumnProfiles { val approxPercentilesJson = new JsonArray() numericColumnProfile.approxPercentiles.foreach { _.foreach { percentile => - approxPercentilesJson.add(new JsonPrimitive(percentile)) + approxPercentilesJson.add( + if (percentile.isNaN) null else new JsonPrimitive(normalizeDouble(percentile))) } } - columnProfileJson.add("approxPercentiles", approxPercentilesJson) case _ => From d8a78e30e1e362a68c39b0f48ce8a29c550ddc2d Mon Sep 17 00:00:00 2001 From: moritzmeister Date: Mon, 25 Apr 2022 
16:58:23 +0200 Subject: [PATCH 16/21] columns need to be filtered also when getting results --- .../scala/com/amazon/deequ/profiles/ColumnProfiler.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala index 0e1143a4b..a06395216 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala @@ -216,7 +216,7 @@ object ColumnProfiler { // We compute exact histograms for all low-cardinality string columns, find those here val targetColumnsForHistograms = findTargetColumnsForHistograms(data.schema, - genericStatistics, lowCardinalityHistogramThreshold) + genericStatistics, lowCardinalityHistogramThreshold, restrictToColumns) // Find out, if we have values for those we can reuse val analyzerContextExistingValues = @@ -398,7 +398,7 @@ object ColumnProfiler { // We compute exact histograms for all low-cardinality string columns, find those here val targetColumnsForHistograms = findTargetColumnsForHistograms(data.schema, - genericStatistics, lowCardinalityHistogramThreshold) + genericStatistics, lowCardinalityHistogramThreshold, restrictToColumns) // Find out, if we have values for those we can reuse val analyzerContextExistingValues = @@ -848,13 +848,15 @@ object ColumnProfiler { private[this] def findTargetColumnsForHistograms( schema: StructType, genericStatistics: GenericColumnStatistics, - lowCardinalityHistogramThreshold: Long) + lowCardinalityHistogramThreshold: Long, + restrictToColumns: Option[Seq[String]] = None) : Seq[String] = { val validSparkDataTypesForHistograms: Set[SparkDataType] = Set( StringType, BooleanType, DoubleType, FloatType, IntegerType, LongType, ShortType ) val originalStringNumericOrBooleanColumns = schema + .filter{ field => restrictToColumns.isEmpty || restrictToColumns.get.contains(field.name) } .filter { field => validSparkDataTypesForHistograms.contains(field.dataType) || genericStatistics.typeOf(field.name) == Decimal } .map { field => field.name } From 1674da56bf11e78064c5c8561b652ab25507197f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Till=20D=C3=B6hmen?= Date: Fri, 20 May 2022 16:18:01 +0200 Subject: [PATCH 17/21] fixed NaN issues and improved statistics JSON --- .../amazon/deequ/analyzers/Histogram.scala | 2 +- .../amazon/deequ/profiles/ColumnProfile.scala | 43 ++++++++- .../deequ/profiles/ColumnProfiler.scala | 37 +++++--- .../com/amazon/deequ/KLL/KLLProfileTest.scala | 2 + .../deequ/KLL/KLLProfileTestApprox.scala | 73 +++++++++------- .../profiles/ColumnProfilerNaNTest.scala | 48 ++++++++++ .../deequ/profiles/ColumnProfilerTest.scala | 22 +++-- .../rules/ConstraintRulesTest.scala | 87 ++++++++++--------- .../amazon/deequ/utils/FixtureSupport.scala | 38 +++++++- 9 files changed, 252 insertions(+), 100 deletions(-) create mode 100644 src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala diff --git a/src/main/scala/com/amazon/deequ/analyzers/Histogram.scala b/src/main/scala/com/amazon/deequ/analyzers/Histogram.scala index 42a7e72e5..78707a2b3 100644 --- a/src/main/scala/com/amazon/deequ/analyzers/Histogram.scala +++ b/src/main/scala/com/amazon/deequ/analyzers/Histogram.scala @@ -133,7 +133,7 @@ case class Histogram( } object Histogram { - val NullFieldReplacement = "NullValue" + val NullFieldReplacement = "-null-" val MaximumAllowedDetailBins = 1000 val count_function = "count" val sum_function = "sum" diff --git 
a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala index 324108653..6cfe29e7d 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala @@ -20,6 +20,8 @@ import com.amazon.deequ.analyzers.DataTypeInstances import com.amazon.deequ.metrics.{BucketDistribution, Distribution} import com.google.gson.{Gson, GsonBuilder, JsonArray, JsonObject, JsonPrimitive} +import scala.collection.immutable.ListMap + /* Profiling results for the columns which will be given to the constraint suggestion engine */ abstract class ColumnProfile { def column: String @@ -28,6 +30,7 @@ abstract class ColumnProfile { def entropy: Option[Double] def uniqueness: Option[Double] def approximateNumDistinctValues: Long + def exactNumDistinctValues: Option[Long] def dataType: DataTypeInstances.Value def isDataTypeInferred: Boolean def typeCounts: Map[String, Long] @@ -41,6 +44,7 @@ case class StandardColumnProfile( entropy: Option[Double], uniqueness: Option[Double], approximateNumDistinctValues: Long, + exactNumDistinctValues: Option[Long], dataType: DataTypeInstances.Value, isDataTypeInferred: Boolean, typeCounts: Map[String, Long], @@ -66,6 +70,7 @@ case class NumericColumnProfile( entropy: Option[Double], uniqueness: Option[Double], approximateNumDistinctValues: Long, + exactNumDistinctValues: Option[Long], dataType: DataTypeInstances.Value, isDataTypeInferred: Boolean, typeCounts: Map[String, Long], @@ -87,7 +92,16 @@ case class ColumnProfiles( object ColumnProfiles { + def toJson(columnProfiles: ColumnProfiles): String = { + toJson(columnProfiles.profiles.values.toSeq, columnProfiles.numRecords) + } + def toJson(columnProfiles: Seq[ColumnProfile]): String = { + // for backwards compatability with hsfs API + toJson(columnProfiles, -1) + } + + def toJson(columnProfiles: Seq[ColumnProfile], numRecords: Long): String = { val json = new JsonObject() @@ -108,6 +122,13 @@ object ColumnProfiles { } columnProfileJson.addProperty("completeness", normalizeDouble(profile.completeness)) + + if (numRecords >= 0) { + columnProfileJson.addProperty("numRecordsNonNull", + math.round(normalizeDouble(profile.completeness * numRecords))) + columnProfileJson.addProperty("numRecordsNull", + numRecords - math.round(normalizeDouble(profile.completeness * numRecords))) + } if (profile.distinctness.isDefined) { columnProfileJson.addProperty("distinctness", normalizeDouble(profile.distinctness.get)) } @@ -121,11 +142,19 @@ object ColumnProfiles { columnProfileJson.addProperty("approximateNumDistinctValues", profile.approximateNumDistinctValues) + if (profile.exactNumDistinctValues.isDefined) { + columnProfileJson.addProperty("exactNumDistinctValues", profile.exactNumDistinctValues.get) + } + if (profile.histogram.isDefined) { val histogram = profile.histogram.get val histogramJson = new JsonArray() - histogram.values.foreach { case (name, distributionValue) => + // sort histogram by descending quantity, then by key + val sorted = ListMap(histogram.values.toSeq.sortBy(kv => (kv._2.absolute, kv._1)) + (Ordering.Tuple2(Ordering[Long].reverse, Ordering.String)): _*) + + sorted.foreach { case (name, distributionValue) => val histogramEntry = new JsonObject() histogramEntry.addProperty("value", name) histogramEntry.addProperty("count", distributionValue.absolute) @@ -186,11 +215,16 @@ object ColumnProfiles { if (profile.histogram.isEmpty) { val histogramJson = new JsonArray() + + // increase 
precision for small bucket sizes + val fp = if (kllSketch.buckets.nonEmpty && scala.math.abs(kllSketch.buckets.head + .highValue - kllSketch.buckets.head.lowValue) > 0.05) "%.2f" else "%f" + kllSketch.buckets.foreach{bucket => val histogramEntry = new JsonObject() - histogramEntry.addProperty("value", "%.2f".formatLocal(java.util.Locale.US, - bucket.lowValue) + "-" + "%.2f".formatLocal(java.util.Locale.US, bucket - .highValue)) + histogramEntry.addProperty("value", fp.formatLocal(java.util.Locale.US, + bucket.lowValue) + " to " + fp.formatLocal(java.util.Locale.US, + bucket.highValue)) histogramEntry.addProperty("count", bucket.count) histogramEntry.addProperty("ratio", bucket.count/totalCount) histogramJson.add(histogramEntry) @@ -251,4 +285,5 @@ object ColumnProfiles { numeric } } + } diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala index a06395216..b1293aa91 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala @@ -18,6 +18,7 @@ package com.amazon.deequ.profiles import scala.util.Success import scala.collection.mutable.ListBuffer +import scala.collection.immutable.ListMap import com.amazon.deequ.analyzers.DataTypeInstances._ import com.amazon.deequ.analyzers._ import com.amazon.deequ.analyzers.runners.AnalysisRunBuilder @@ -53,6 +54,7 @@ private[deequ] case class GenericColumnStatistics( knownTypes: Map[String, DataTypeInstances.Value], typeDetectionHistograms: Map[String, Map[String, Long]], approximateNumDistincts: Map[String, Long], + exactNumDistincts: Map[String, Long], completenesses: Map[String, Double], distinctness: Map[String, Double], entropy: Map[String, Double], @@ -252,7 +254,7 @@ object ColumnProfiler { /** * Profile a (potentially very large) dataset. 
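 * A minimal usage sketch, assuming the same imports the KLL tests in this
 * series use (KLLParameters, KLLSketch) and an input DataFrame df:
 * {{{
 *   val profiles = ColumnProfiler.profileOptimized(
 *     df,
 *     restrictToColumns = Option(Seq("att1", "att2")),
 *     kllParameters = Some(KLLParameters(KLLSketch.DEFAULT_SKETCH_SIZE,
 *       KLLSketch.DEFAULT_SHRINKING_FACTOR, 20)),
 *     histogram = true,
 *     exactUniqueness = true)
 * }}}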
* - * @param data data dataset as dataframe + * @param dataInp data dataset as dataframe * @param restrictToColumns an contain a subset of columns to profile, otherwise * all columns will be considered * @param printStatusUpdates @@ -269,7 +271,7 @@ object ColumnProfiler { */ // scalastyle:off argcount private[deequ] def profileOptimized( - data: DataFrame, + dataInp: DataFrame, restrictToColumns: Option[Seq[String]] = None, printStatusUpdates: Boolean = false, lowCardinalityHistogramThreshold: Int = ColumnProfiler @@ -290,15 +292,16 @@ object ColumnProfiler { // Ensure that all desired columns exist restrictToColumns.foreach { restrictToColumns => restrictToColumns.foreach { columnName => - require(data.schema.fieldNames.contains(columnName), s"Unable to find column $columnName") + require(dataInp.schema.fieldNames.contains(columnName), s"Unable to find column " + + s"$columnName") } } // Find columns we want to profile - val relevantColumns = getRelevantColumns(data.schema, restrictToColumns) + val relevantColumns = getRelevantColumns(dataInp.schema, restrictToColumns) // We assume that data types are predefined by the schema, and skip the data type detection - val predefinedTypes = data.schema.fields + val predefinedTypes = dataInp.schema.fields .filter { column => relevantColumns.contains(column.name) } .map { field => val knownType = field.dataType match { @@ -319,6 +322,10 @@ object ColumnProfiler { val numericColumnNames = relevantColumns .filter { name => Set(Integral, Fractional, Decimal).contains(predefinedTypes(name)) } + // replace NaNs with null in numeric columns + val na_replacement = numericColumnNames.map((_, "null")).toMap + val data = dataInp.na.fill(na_replacement) + // First pass if (printStatusUpdates) { println("### PROFILING: Computing generic column statistics in pass (1/2)...") @@ -356,7 +363,8 @@ object ColumnProfiler { (exactUniquenessCols.isDefined && exactUniquenessCols.get.contains(name))) && predefinedTypes(name) != Unknown) { // Add grouping analyzers. 
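// A self-contained sanity sketch, not part of this patch, of the grouped
// metrics wired up below, written in plain Spark. On the att2 fixture
// [0, 0, 0, 5, 6, 7] used by the JSON tests in this series it yields
// distinctness 4/6 and uniqueness 3/6, matching the expected profile output.
import org.apache.spark.sql.DataFrame

def groupedMetricsSketch(df: DataFrame, column: String): (Double, Double, Long) = {
  val numRows = df.count().toDouble
  val frequencies = df.groupBy(column).count()             // count per distinct value
  val exactDistinct = frequencies.count()                  // what CountDistinct reports
  val singletons = frequencies.filter(frequencies("count") === 1).count()
  // (distinctness, uniqueness, exact distinct count)
  (exactDistinct / numRows, singletons / numRows, exactDistinct)
}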
- analyzers ++= Seq(Uniqueness(name), Distinctness(name), Entropy(name)) + analyzers ++= Seq(Uniqueness(name), Distinctness(name), Entropy(name), + CountDistinct(name)) } analyzers @@ -655,24 +663,29 @@ object ColumnProfiler { analyzer.column -> metric.value.get.toLong } + val exactNumDistincts = results.metricMap + .collect { case (analyzer: CountDistinct, metric: DoubleMetric) => + analyzer.columns.head -> metric.value.get.toLong + } + val completenesses = results.metricMap .collect { case (analyzer: Completeness, metric: DoubleMetric) => analyzer.column -> metric.value.get } val entropy = results.metricMap - .collect { case (analyzer: Entropy, metric: DoubleMetric) => + .collect { case (analyzer: Entropy, metric: DoubleMetric) if metric.value.isSuccess => analyzer.column -> metric.value.get } val uniqueness = results.metricMap - .collect { case (analyzer: Uniqueness, metric: DoubleMetric) => + .collect { case (analyzer: Uniqueness, metric: DoubleMetric) if metric.value.isSuccess => // we only compute uniqueness for single columns analyzer.columns.head -> metric.value.get } val distinctness = results.metricMap - .collect { case (analyzer: Distinctness, metric: DoubleMetric) => + .collect { case (analyzer: Distinctness, metric: DoubleMetric) if metric.value.isSuccess => analyzer.columns.head -> metric.value.get } @@ -698,7 +711,8 @@ object ColumnProfiler { .toMap GenericColumnStatistics(numRecords, inferredTypes, knownTypes, typeDetectionHistograms, - approximateNumDistincts, completenesses, distinctness, entropy, uniqueness, predefinedTypes) + approximateNumDistincts, exactNumDistincts, completenesses, distinctness, entropy, + uniqueness, predefinedTypes) } @@ -988,6 +1002,7 @@ object ColumnProfiler { val entropy = genericStats.entropy.get(name) val uniqueness = genericStats.uniqueness.get(name) val approxNumDistinct = genericStats.approximateNumDistincts(name) + val exactNumDistinct = genericStats.exactNumDistincts.get(name) val dataType = genericStats.typeOf(name) val isDataTypeInferred = genericStats.inferredTypes.contains(name) val histogram = categoricalStats.histograms.get(name) @@ -1004,6 +1019,7 @@ object ColumnProfiler { entropy, uniqueness, approxNumDistinct, + exactNumDistinct, dataType, isDataTypeInferred, typeCounts, @@ -1039,6 +1055,7 @@ object ColumnProfiler { entropy, uniqueness, approxNumDistinct, + exactNumDistinct, dataType, isDataTypeInferred, typeCounts, diff --git a/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala index e462b26d9..81cedeb3c 100644 --- a/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala +++ b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTest.scala @@ -68,6 +68,7 @@ class KLLProfileTest extends WordSpec with Matchers with SparkContextSpec Some(1.0), Some(1.0), 6, + Some(6), DataTypeInstances.Fractional, false, Map.empty, @@ -113,6 +114,7 @@ class KLLProfileTest extends WordSpec with Matchers with SparkContextSpec Some(1.0), Some(1.0), 30, + Some(30), DataTypeInstances.Fractional, false, Map.empty, diff --git a/src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala index 572e177b3..a47b14497 100644 --- a/src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala +++ b/src/test/scala/com/amazon/deequ/KLL/KLLProfileTestApprox.scala @@ -88,6 +88,7 @@ class KLLProfileTestApprox extends WordSpec with Matchers with SparkContextSpec None, None, 6, + None, DataTypeInstances.Decimal, false, Map.empty, @@ -160,6 
+161,7 @@ class KLLProfileTestApprox extends WordSpec with Matchers with SparkContextSpec None, None, 6, + None, DataTypeInstances.Fractional, false, Map.empty, @@ -216,17 +218,17 @@ class KLLProfileTestApprox extends WordSpec with Matchers with SparkContextSpec val profile = ColumnProfiler.profileOptimized(data, Option(Seq("att1", "att2")), kllParameters = Some(KLLParameters(KLLSketch.DEFAULT_SKETCH_SIZE, KLLSketch - .DEFAULT_SHRINKING_FACTOR, 20)), histogram = true) - val profiles = profile.profiles.map{pro => pro._2}.toSeq - val json_profile = ColumnProfiles.toJson(profiles) + .DEFAULT_SHRINKING_FACTOR, 20)), histogram = true, exactUniqueness = true) + val json_profile = ColumnProfiles.toJson(profile) val correct_profile = "{\"columns\":[{\"column\":\"att1\",\"dataType\":\"Fractional\"," + - "\"isDataTypeInferred\":\"false\",\"completeness\":1.0," + - "\"approximateNumDistinctValues\":6,\"histogram\":[{\"value\":\"6.0\",\"count\":1," + - "\"ratio\":0.16666666666666666},{\"value\":\"3.0\",\"count\":1," + - "\"ratio\":0.16666666666666666},{\"value\":\"2.0\",\"count\":1," + - "\"ratio\":0.16666666666666666},{\"value\":\"4.0\",\"count\":1," + - "\"ratio\":0.16666666666666666},{\"value\":\"1.0\",\"count\":1," + + "\"isDataTypeInferred\":\"false\",\"completeness\":1.0,\"numRecordsNonNull\":6," + + "\"numRecordsNull\":0,\"distinctness\":1.0,\"entropy\":1.791759469228055," + + "\"uniqueness\":1.0,\"approximateNumDistinctValues\":6,\"exactNumDistinctValues\":6," + + "\"histogram\":[{\"value\":\"1.0\",\"count\":1,\"ratio\":0.16666666666666666}," + + "{\"value\":\"2.0\",\"count\":1,\"ratio\":0.16666666666666666},{\"value\":\"3.0\"," + + "\"count\":1,\"ratio\":0.16666666666666666},{\"value\":\"4.0\",\"count\":1," + "\"ratio\":0.16666666666666666},{\"value\":\"5.0\",\"count\":1," + + "\"ratio\":0.16666666666666666},{\"value\":\"6.0\",\"count\":1," + "\"ratio\":0.16666666666666666}],\"mean\":3.5,\"maximum\":6.0,\"minimum\":1.0," + "\"sum\":21.0,\"stdDev\":1.707825127659933,\"correlations\":[{\"column\":\"att2\"," + "\"correlation\":0.9263710192499128},{\"column\":\"att1\",\"correlation\":1.0}]," + @@ -255,30 +257,32 @@ class KLLProfileTestApprox extends WordSpec with Matchers with SparkContextSpec "4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0," + "5.0,5.0,5.0,5.0,5.0,5.0,5.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0," + "6.0,6.0]},{\"column\":\"att2\",\"dataType\":\"Fractional\"," + - "\"isDataTypeInferred\":\"false\",\"completeness\":1.0," + - "\"approximateNumDistinctValues\":4,\"histogram\":[{\"value\":\"0.0\",\"count\":3," + - "\"ratio\":0.5},{\"value\":\"6.0\",\"count\":1,\"ratio\":0.16666666666666666}," + - "{\"value\":\"7.0\",\"count\":1,\"ratio\":0.16666666666666666},{\"value\":\"5.0\"," + - "\"count\":1,\"ratio\":0.16666666666666666}],\"mean\":3.0,\"maximum\":7.0," + - "\"minimum\":0.0,\"sum\":18.0,\"stdDev\":3.0550504633038935," + - "\"correlations\":[{\"column\":\"att2\",\"correlation\":1.0},{\"column\":\"att1\"," + - "\"correlation\":0.9263710192499128}],\"kll\":{\"buckets\":[{\"low_value\":0.0," + - "\"high_value\":0.35,\"count\":3,\"ratio\":0.5},{\"low_value\":0.35,\"high_value\":0.7," + - "\"count\":0,\"ratio\":0.0},{\"low_value\":0.7,\"high_value\":1.05,\"count\":0," + - "\"ratio\":0.0},{\"low_value\":1.05,\"high_value\":1.4,\"count\":0,\"ratio\":0.0}," + - "{\"low_value\":1.4,\"high_value\":1.75,\"count\":0,\"ratio\":0.0},{\"low_value\":1.75," + - 
"\"high_value\":2.1,\"count\":0,\"ratio\":0.0},{\"low_value\":2.1,\"high_value\":2.45," + - "\"count\":0,\"ratio\":0.0},{\"low_value\":2.45,\"high_value\":2.8,\"count\":0," + - "\"ratio\":0.0},{\"low_value\":2.8,\"high_value\":3.15,\"count\":0,\"ratio\":0.0}," + - "{\"low_value\":3.15,\"high_value\":3.5,\"count\":0,\"ratio\":0.0},{\"low_value\":3.5," + - "\"high_value\":3.85,\"count\":0,\"ratio\":0.0},{\"low_value\":3.85,\"high_value\":4.2," + - "\"count\":0,\"ratio\":0.0},{\"low_value\":4.2,\"high_value\":4.55,\"count\":0," + - "\"ratio\":0.0},{\"low_value\":4.55,\"high_value\":4.9,\"count\":0,\"ratio\":0.0}," + - "{\"low_value\":4.9,\"high_value\":5.25,\"count\":1,\"ratio\":0.16666666666666666}," + - "{\"low_value\":5.25,\"high_value\":5.6,\"count\":0,\"ratio\":0.0},{\"low_value\":5.6," + - "\"high_value\":5.95,\"count\":0,\"ratio\":0.0},{\"low_value\":5.95,\"high_value\":6.3," + - "\"count\":1,\"ratio\":0.16666666666666666},{\"low_value\":6.3,\"high_value\":6.65," + - "\"count\":0,\"ratio\":0.0},{\"low_value\":6.65,\"high_value\":7.0,\"count\":1," + + "\"isDataTypeInferred\":\"false\",\"completeness\":1.0,\"numRecordsNonNull\":6," + + "\"numRecordsNull\":0,\"distinctness\":0.6666666666666666,\"entropy\":1.242453324894," + + "\"uniqueness\":0.5,\"approximateNumDistinctValues\":4,\"exactNumDistinctValues\":4," + + "\"histogram\":[{\"value\":\"0.0\",\"count\":3,\"ratio\":0.5},{\"value\":\"5.0\"," + + "\"count\":1,\"ratio\":0.16666666666666666},{\"value\":\"6.0\",\"count\":1," + + "\"ratio\":0.16666666666666666},{\"value\":\"7.0\",\"count\":1," + + "\"ratio\":0.16666666666666666}],\"mean\":3.0,\"maximum\":7.0,\"minimum\":0.0," + + "\"sum\":18.0,\"stdDev\":3.0550504633038935,\"correlations\":[{\"column\":\"att2\"," + + "\"correlation\":1.0},{\"column\":\"att1\",\"correlation\":0.9263710192499128}]," + + "\"kll\":{\"buckets\":[{\"low_value\":0.0,\"high_value\":0.35,\"count\":3," + + "\"ratio\":0.5},{\"low_value\":0.35,\"high_value\":0.7,\"count\":0,\"ratio\":0.0}," + + "{\"low_value\":0.7,\"high_value\":1.05,\"count\":0,\"ratio\":0.0},{\"low_value\":1.05," + + "\"high_value\":1.4,\"count\":0,\"ratio\":0.0},{\"low_value\":1.4,\"high_value\":1.75," + + "\"count\":0,\"ratio\":0.0},{\"low_value\":1.75,\"high_value\":2.1,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":2.1,\"high_value\":2.45,\"count\":0,\"ratio\":0.0}," + + "{\"low_value\":2.45,\"high_value\":2.8,\"count\":0,\"ratio\":0.0},{\"low_value\":2.8," + + "\"high_value\":3.15,\"count\":0,\"ratio\":0.0},{\"low_value\":3.15,\"high_value\":3.5," + + "\"count\":0,\"ratio\":0.0},{\"low_value\":3.5,\"high_value\":3.85,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":3.85,\"high_value\":4.2,\"count\":0,\"ratio\":0.0}," + + "{\"low_value\":4.2,\"high_value\":4.55,\"count\":0,\"ratio\":0.0},{\"low_value\":4.55," + + "\"high_value\":4.9,\"count\":0,\"ratio\":0.0},{\"low_value\":4.9,\"high_value\":5.25," + + "\"count\":1,\"ratio\":0.16666666666666666},{\"low_value\":5.25,\"high_value\":5.6," + + "\"count\":0,\"ratio\":0.0},{\"low_value\":5.6,\"high_value\":5.95,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":5.95,\"high_value\":6.3,\"count\":1," + + "\"ratio\":0.16666666666666666},{\"low_value\":6.3,\"high_value\":6.65,\"count\":0," + + "\"ratio\":0.0},{\"low_value\":6.65,\"high_value\":7.0,\"count\":1," + "\"ratio\":0.16666666666666666}],\"sketch\":{\"parameters\":{\"c\":0.64,\"k\":2048.0}," + "\"data\":\"[[0.0,0.0,0.0,5.0,6.0,7.0]]\"}},\"approxPercentiles\":[0.0,0.0,0.0,0.0,0.0," + 
"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0," + @@ -306,6 +310,7 @@ class KLLProfileTestApprox extends WordSpec with Matchers with SparkContextSpec Some(1.791759469228055), Some(1.0), 6, + Some(6), DataTypeInstances.Fractional, false, Map.empty, @@ -371,6 +376,7 @@ class KLLProfileTestApprox extends WordSpec with Matchers with SparkContextSpec Some(1.791759469228055), Some(1.0), 6, + Some(6), DataTypeInstances.String, false, Map.empty, @@ -404,6 +410,7 @@ class KLLProfileTestApprox extends WordSpec with Matchers with SparkContextSpec Some(3.4011973816621546), Some(1.0), 30, + Some(30), DataTypeInstances.Fractional, false, Map.empty, diff --git a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala new file mode 100644 index 000000000..c30c83399 --- /dev/null +++ b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala @@ -0,0 +1,48 @@ +/** + * Copyright 2021 Logical Clocks AB. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may not + * use this file except in compliance with the License. A copy of the License + * is located at + * + * http://aws.amazon.com/apache2.0/ + * + * or in the "license" file accompanying this file. This file is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + * + */ + +package com.amazon.deequ.profiles + +import com.amazon.deequ.SparkContextSpec +import com.amazon.deequ.utils.FixtureSupport +import org.scalatest.{Matchers, color} +import org.scalatest.wordspec.AnyWordSpec +class ColumnProfilerNaNTest extends AnyWordSpec with Matchers with SparkContextSpec with FixtureSupport { + + "Column Profiler NaN Test" should { + "return results for data frame with NaN and null values without failure" in withSparkSession { + sparkSession => + val df = getDfWithNas(sparkSession) + + val runner: ColumnProfilerRunBuilder = new ColumnProfilerRunner() + .onData(df) + .withCorrelation(true, 50) + .withHistogram(true, 20) + .withExactUniqueness (true) + + val result = runner.run() + + val matches = result.profiles.map { case (colname: String, profile: ColumnProfile) => + val nacount = df.filter(df(colname).isNull || df(colname).isNaN).count() + val nacount_profile = result.numRecords - scala.math.round(profile.completeness * + result.numRecords) + nacount == nacount_profile + }.toSeq + + assert(matches.forall(_ == true)) + } + } +} \ No newline at end of file diff --git a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala index a02382ebe..4bbf5abb0 100644 --- a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala +++ b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerTest.scala @@ -65,6 +65,7 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec Some(0.5623351446188083), Some(0.25), 2, + None, DataTypeInstances.String, true, Map( @@ -119,6 +120,7 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec Some(1.791759469228055), Some(1.0), 6, + None, DataTypeInstances.String, false, Map(), @@ -145,6 +147,7 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec Some(0.5623351446188083), Some(0.25), 2, + None, DataTypeInstances.String, true, Map( @@ -177,6 
+180,7 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec Some(1.0), Some(1.0), 6, + Some(6), DataTypeInstances.Integral, true, Map( @@ -221,6 +225,7 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec Some(1.0), Some(1.0), 6, + Some(6), DataTypeInstances.Integral, true, Map( @@ -266,6 +271,7 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec Some(1.0), Some(1.0), 6, + Some(6), DataTypeInstances.Integral, true, Map( @@ -346,6 +352,7 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec Some(1.0), Some(1.0), 6, + Some(6), DataTypeInstances.Fractional, false, Map.empty, @@ -383,6 +390,7 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec Some(0.5623351446188083), Some(0.25), 2, + None, DataTypeInstances.String, isDataTypeInferred = true, Map( @@ -594,6 +602,7 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec Some(1.0), Some(1.0), 891, + Some(891), DataTypeInstances.Integral, false, Map.empty, @@ -605,21 +614,22 @@ class ColumnProfilerTest extends WordSpec with Matchers with SparkContextSpec Some(1.0), Some(1.0), 2, + Some(2), DataTypeInstances.Integral, false, Map.empty, None), - StandardColumnProfile("Pclass", 1.0, Some(1.0), Some(1.0), Some(1.0), 3, + StandardColumnProfile("Pclass", 1.0, Some(1.0), Some(1.0), Some(1.0), 3, Some(3), DataTypeInstances.Integral, false, Map.empty, None), - StandardColumnProfile("Name", 1.0, Some(1.0), Some(1.0), Some(1.0), 0, + StandardColumnProfile("Name", 1.0, Some(1.0), Some(1.0), Some(1.0), 0, Some(0), DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Sex", 1.0, Some(1.0), Some(1.0), Some(1.0), 2, + StandardColumnProfile("Sex", 1.0, Some(1.0), Some(1.0), Some(1.0), 2, Some(2), DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Ticket", 1.0, Some(1.0), Some(1.0), Some(1.0), 681, + StandardColumnProfile("Ticket", 1.0, Some(1.0), Some(1.0), Some(1.0), 681, Some(681), DataTypeInstances.String, true, Map.empty, None), - StandardColumnProfile("Fare", 1.0, Some(1.0), Some(1.0), Some(1.0), 0, + StandardColumnProfile("Fare", 1.0, Some(1.0), Some(1.0), Some(1.0), 0, Some(0), DataTypeInstances.Fractional, false, Map.empty, None), - StandardColumnProfile("Cabin", 0.22, Some(1.0), Some(1.0), Some(1.0), 0, + StandardColumnProfile("Cabin", 0.22, Some(1.0), Some(1.0), Some(1.0), 0, Some(0), DataTypeInstances.String, true, Map.empty, None) ) diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala index d18282901..be29496dc 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala @@ -35,9 +35,9 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "be applied correctly" in { val complete = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, - String, false, Map.empty, None) + Some(100), String, false, Map.empty, None) val incomplete = StandardColumnProfile("col1", .25, Some(1.0), Some(1.0), Some(1.0), 100, - String, false, Map.empty, None) + Some(100), String, false, Map.empty, None) val completeInteger = getFakeNumericColumnProfileWithMinMaxMeanAndStdDev( @@ -132,9 +132,9 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "be applied correctly" in { 
val complete = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, - String, false, Map.empty, None) + Some(100), String, false, Map.empty, None) val incomplete = StandardColumnProfile("col1", .25, Some(1.0), Some(1.0), Some(1.0), 100, - String, false, Map.empty, None) + Some(100), String, false, Map.empty, None) assert(!RetainCompletenessRule().shouldBeApplied(complete, 1000)) assert(RetainCompletenessRule().shouldBeApplied(incomplete, 1000)) @@ -196,13 +196,13 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "be applied correctly" in { val unique = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, - String, false, Map.empty, None) + Some(100), String, false, Map.empty, None) val maybeUnique = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 95, - String, false, Map.empty, None) + Some(95), String, false, Map.empty, None) val maybeNonUnique = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 91, - String, false, Map.empty, None) + Some(91), String, false, Map.empty, None) val nonUnique = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 20, - String, false, Map.empty, None) + Some(20), String, false, Map.empty, None) assert(UniqueIfApproximatelyUniqueRule().shouldBeApplied(unique, 100)) assert(UniqueIfApproximatelyUniqueRule().shouldBeApplied(maybeUnique, 100)) @@ -263,23 +263,23 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "be applied correctly" in { val string = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, - String, true, Map.empty, None) + Some(100), String, true, Map.empty, None) val boolean = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, - Boolean, true, Map.empty, None) + Some(100), Boolean, true, Map.empty, None) val fractional = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, - Fractional, true, Map.empty, None) + Some(100), Fractional, true, Map.empty, None) val integer = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, - Integral, true, Map.empty, None) + Some(100), Integral, true, Map.empty, None) val unknown = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, - Unknown, true, Map.empty, None) + Some(100), Unknown, true, Map.empty, None) val stringNonInferred = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), - 100, String, false, Map.empty, None) + 100, Some(100), String, false, Map.empty, None) val booleanNonInferred = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), - 100, Boolean, false, Map.empty, None) + 100, Some(100), Boolean, false, Map.empty, None) val fractionalNonInferred = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), - Some(1.0), 100, Fractional, false, Map.empty, None) + Some(1.0), 100, Some(100), Fractional, false, Map.empty, None) val integerNonInferred = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), - 100, Integral, false, Map.empty, None) + 100, Some(100), Integral, false, Map.empty, None) assert(!RetainTypeRule().shouldBeApplied(string, 100)) assert(!RetainTypeRule().shouldBeApplied(unknown, 100)) @@ -397,26 +397,27 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val noDistribution = Distribution(Map.empty, 0) val stringWithNonSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), - Some(1.0), 100, String, false, Map.empty, Some(nonSkewedDist)) 
+ Some(1.0), 100, Some(100), String, false, Map.empty, Some(nonSkewedDist)) val integralWithNonSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), - Some(1.0), 100, DataTypeInstances.Integral, false, Map.empty, Some(nonSkewedIntegralDist)) + Some(1.0), 100, Some(100), DataTypeInstances.Integral, false, Map.empty, + Some(nonSkewedIntegralDist)) val stringWithFlgDist = StandardColumnProfile("flg", 1.0, Some(1.0), Some(1.0), Some(1.0), - 2, String, false, Map.empty, Some(flgDist)) + 2, Some(2), String, false, Map.empty, Some(flgDist)) val integralWithFlgDist = StandardColumnProfile("flg", 1.0, Some(1.0), Some(1.0), Some(1.0), - 2, DataTypeInstances.Integral, false, Map.empty, Some(flgDist)) + 2, Some(2), DataTypeInstances.Integral, false, Map.empty, Some(flgDist)) val stringWithSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), - Some(1.0), 100, String, false, Map.empty, Some(skewedDist)) + Some(1.0), 100, Some(100), String, false, Map.empty, Some(skewedDist)) val stringNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 95, - String, false, Map.empty, None) + Some(95), String, false, Map.empty, None) val boolNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 94, - Boolean, false, Map.empty, None) + Some(94), Boolean, false, Map.empty, None) val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), - 20, Boolean, false, Map.empty, Some(noDistribution)) + 20, Some(20), Boolean, false, Map.empty, Some(noDistribution)) val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), - Some(1.0), 100, DataTypeInstances.Integral, false, Map.empty, Some(skewedDist)) + Some(1.0), 100, Some(100), DataTypeInstances.Integral, false, Map.empty, Some(skewedDist)) val integralNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), - 95, DataTypeInstances.Integral, false, Map.empty, None) + 95, Some(95), DataTypeInstances.Integral, false, Map.empty, None) assert(CategoricalRangeRule().shouldBeApplied(stringWithNonSkewedDist, 100)) assert(CategoricalRangeRule().shouldBeApplied(integralWithNonSkewedDist, 100)) @@ -579,37 +580,37 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext val noDistribution = Distribution(Map.empty, 0) val stringWithNonSkewedDistWithFractionalCategoricalRange = StandardColumnProfile( - "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, + "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Some(100), String, false, Map.empty, Some(nonSkewedDistWithFractionalCategoricalRange)) val stringWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile( - "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, + "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Some(100), String, false, Map.empty, Some(nonSkewedDistWithActualCategoricalRange)) val stringWithSomewhatSkewedDist = StandardColumnProfile( - "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, + "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Some(100), String, false, Map.empty, Some(somewhatSkewedDist)) val stringWithSkewedDist = StandardColumnProfile( - "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, String, false, Map.empty, + "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Some(100), String, false, Map.empty, Some(skewedDist)) val stringNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), - 95, String, false, Map.empty, None) 
+ 95, Some(95), String, false, Map.empty, None) val boolNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 94, - Boolean, false, Map.empty, None) + Some(94), Boolean, false, Map.empty, None) val boolWithEmptyDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), - 20, Boolean, false, Map.empty, Some(noDistribution)) + 20, Some(20), Boolean, false, Map.empty, Some(noDistribution)) val integralWithNonSkewedDistWithFractionalCategoricalRange = StandardColumnProfile("col1", - 1.0, Some(1.0), Some(1.0), Some(1.0), 100, DataTypeInstances.Integral, false, Map.empty, - Some(nonSkewedIntegralDistWithFractionalCategoricalRange)) + 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Some(100), DataTypeInstances.Integral, false, + Map.empty, Some(nonSkewedIntegralDistWithFractionalCategoricalRange)) val integralWithNonSkewedDistWithActualCategoricalRange = StandardColumnProfile( - "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, DataTypeInstances.Integral, false, - Map.empty, Some(nonSkewedIntegralDistWithActualCategoricalRange)) + "col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Some(100), DataTypeInstances.Integral, + false, Map.empty, Some(nonSkewedIntegralDistWithActualCategoricalRange)) val integralWithSomewhatSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), - Some(1.0), 100, DataTypeInstances.Integral, false, Map.empty, + Some(1.0), 100, Some(100), DataTypeInstances.Integral, false, Map.empty, Some(somewhatSkewedIntegralDist)) val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), - Some(1.0), 100, DataTypeInstances.Integral, false, Map.empty, Some(skewedIntegralDist)) + Some(1.0), 100, Some(100), DataTypeInstances.Integral, false, Map.empty, Some(skewedIntegralDist)) val integralNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), - 95, DataTypeInstances.Integral, false, Map.empty, None) + 95, Some(95), DataTypeInstances.Integral, false, Map.empty, None) assert(FractionalCategoricalRangeRule().shouldBeApplied(stringWithSomewhatSkewedDist, 100)) assert(FractionalCategoricalRangeRule().shouldBeApplied( @@ -732,8 +733,8 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "NonNegativeNumbersRule and PositiveNumbersRule" should { "be applied correctly" in { def columnProfileWithMinimum(minimum: Double): NumericColumnProfile = { - NumericColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Fractional, - isDataTypeInferred = false, Map.empty, None, None, Some(10), Some(100), + NumericColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 100, Some(100), + Fractional, isDataTypeInferred = false, Map.empty, None, None, Some(10), Some(100), Some(minimum), Some(10000), Some(1.0), None, None) } diff --git a/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala b/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala index 7b56c744e..41a30c420 100644 --- a/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala +++ b/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala @@ -19,9 +19,8 @@ package com.amazon.deequ.utils import com.amazon.deequ.analyzers.DataTypeInstances import com.amazon.deequ.profiles.NumericColumnProfile import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.Row -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, MapType, StringType, StructField, StructType} +import 
org.apache.spark.sql.{DataFrame, Row, SparkSession} import scala.util.Random @@ -337,6 +336,39 @@ trait FixtureSupport { "onlyUniqueWithOtherNonUnique", "halfUniqueCombinedWithNonUnique") } + def getDfWithNas(sparkSession: SparkSession): DataFrame = { + import org.apache.spark.sql.functions._ + + val schema = StructType( Array( + StructField("nullstr", StringType, true), + StructField("nullstrmixed", StringType, true), + StructField("nullint", IntegerType, true), + StructField("nullintmixed", IntegerType, true), + StructField("nulldbl", DoubleType, true), + StructField("nulldblna", DoubleType, true), + StructField("nulldblnamixed", DoubleType, true), + StructField("nullna", DoubleType, true) + )) + + val data = Seq( + Row(null, "b", null, 2, null, java.lang.Double.NaN, 2.0, java.lang.Double.NaN), + Row(null, null, null, null, null, null, null, java.lang.Double.NaN), + Row(null, "c", null, 1, null, java.lang.Double.NaN, 1.0, java.lang.Double.NaN), + Row(null, null, null, null, null, null, java.lang.Double.NaN, java.lang.Double.NaN), + Row(null, "a", null, 0, null, null, 1.0, java.lang.Double.NaN), + Row(null, "a", null, 0, null, null, 1.0, java.lang.Double.NaN) + ) + + val nulldf = sparkSession.createDataFrame( + sparkSession.sparkContext.parallelize(data), + schema + ) + + nulldf.withColumn("nullstrmixed2", + when(col("nullstrmixed").equalTo("null"), null) + .otherwise(col("nullstrmixed"))) + } + def getDfWithDistinctValues(sparkSession: SparkSession): DataFrame = { import sparkSession.implicits._ From b21d74ed530a9dc564a628bb59566c99c5aa1ea8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Till=20D=C3=B6hmen?= Date: Fri, 27 May 2022 15:10:03 +0200 Subject: [PATCH 18/21] fixed stylecheck --- .../scala/com/amazon/deequ/profiles/ColumnProfile.scala | 4 ++-- .../com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala | 6 ++++-- .../deequ/suggestions/rules/ConstraintRulesTest.scala | 3 ++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala index 6cfe29e7d..d57cc7b2c 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala @@ -98,7 +98,7 @@ object ColumnProfiles { def toJson(columnProfiles: Seq[ColumnProfile]): String = { // for backwards compatability with hsfs API - toJson(columnProfiles, -1) + toJson(columnProfiles, -1) } def toJson(columnProfiles: Seq[ColumnProfile], numRecords: Long): String = { @@ -218,7 +218,7 @@ object ColumnProfiles { // increase precision for small bucket sizes val fp = if (kllSketch.buckets.nonEmpty && scala.math.abs(kllSketch.buckets.head - .highValue - kllSketch.buckets.head.lowValue) > 0.05) "%.2f" else "%f" + .highValue - kllSketch.buckets.head.lowValue) > 0.05) { "%.2f" } else { "%f" } kllSketch.buckets.foreach{bucket => val histogramEntry = new JsonObject() diff --git a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala index c30c83399..cd936a4a3 100644 --- a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala +++ b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala @@ -20,7 +20,9 @@ import com.amazon.deequ.SparkContextSpec import com.amazon.deequ.utils.FixtureSupport import org.scalatest.{Matchers, color} import org.scalatest.wordspec.AnyWordSpec -class ColumnProfilerNaNTest extends AnyWordSpec with Matchers with SparkContextSpec 
with FixtureSupport { + +class ColumnProfilerNaNTest extends AnyWordSpec with Matchers with SparkContextSpec with + FixtureSupport { "Column Profiler NaN Test" should { "return results for data frame with NaN and null values without failure" in withSparkSession { @@ -45,4 +47,4 @@ class ColumnProfilerNaNTest extends AnyWordSpec with Matchers with SparkContextS assert(matches.forall(_ == true)) } } -} \ No newline at end of file +} diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala index be29496dc..a9655d248 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala @@ -608,7 +608,8 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext Some(1.0), 100, Some(100), DataTypeInstances.Integral, false, Map.empty, Some(somewhatSkewedIntegralDist)) val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), - Some(1.0), 100, Some(100), DataTypeInstances.Integral, false, Map.empty, Some(skewedIntegralDist)) + Some(1.0), 100, Some(100), DataTypeInstances.Integral, false, Map.empty, Some + (skewedIntegralDist)) val integralNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 95, Some(95), DataTypeInstances.Integral, false, Map.empty, None) From 8edc0e998ed995300abd1594e463c3398d9d5894 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Till=20D=C3=B6hmen?= Date: Fri, 20 May 2022 16:18:01 +0200 Subject: [PATCH 19/21] fixed NaN issues and improved statistics JSON --- src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala | 2 +- .../com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala | 4 +--- .../amazon/deequ/suggestions/rules/ConstraintRulesTest.scala | 3 +-- src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala | 1 - 4 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala index d57cc7b2c..9c2905a0c 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala @@ -218,7 +218,7 @@ object ColumnProfiles { // increase precision for small bucket sizes val fp = if (kllSketch.buckets.nonEmpty && scala.math.abs(kllSketch.buckets.head - .highValue - kllSketch.buckets.head.lowValue) > 0.05) { "%.2f" } else { "%f" } + .highValue - kllSketch.buckets.head.lowValue) > 0.05) "%.2f" else "%f" kllSketch.buckets.foreach{bucket => val histogramEntry = new JsonObject() diff --git a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala index cd936a4a3..ac596bc9a 100644 --- a/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala +++ b/src/test/scala/com/amazon/deequ/profiles/ColumnProfilerNaNTest.scala @@ -20,9 +20,7 @@ import com.amazon.deequ.SparkContextSpec import com.amazon.deequ.utils.FixtureSupport import org.scalatest.{Matchers, color} import org.scalatest.wordspec.AnyWordSpec - -class ColumnProfilerNaNTest extends AnyWordSpec with Matchers with SparkContextSpec with - FixtureSupport { +class ColumnProfilerNaNTest extends AnyWordSpec with Matchers with SparkContextSpec with FixtureSupport { "Column Profiler NaN Test" should { "return results for data frame with NaN and null values without failure" in 
withSparkSession { diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala index a9655d248..be29496dc 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala @@ -608,8 +608,7 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext Some(1.0), 100, Some(100), DataTypeInstances.Integral, false, Map.empty, Some(somewhatSkewedIntegralDist)) val integralWithSkewedDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), - Some(1.0), 100, Some(100), DataTypeInstances.Integral, false, Map.empty, Some - (skewedIntegralDist)) + Some(1.0), 100, Some(100), DataTypeInstances.Integral, false, Map.empty, Some(skewedIntegralDist)) val integralNoDist = StandardColumnProfile("col1", 1.0, Some(1.0), Some(1.0), Some(1.0), 95, Some(95), DataTypeInstances.Integral, false, Map.empty, None) diff --git a/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala b/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala index 41a30c420..c80497eb7 100644 --- a/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala +++ b/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala @@ -18,7 +18,6 @@ package com.amazon.deequ.utils import com.amazon.deequ.analyzers.DataTypeInstances import com.amazon.deequ.profiles.NumericColumnProfile -import org.apache.spark.sql.types.StructType import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, MapType, StringType, StructField, StructType} import org.apache.spark.sql.{DataFrame, Row, SparkSession} From 05b4e1af810e2506a8b253dc9684114e32fe1b21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20de=20la=20R=C3=BAa=20Mart=C3=ADnez?= Date: Mon, 30 Oct 2023 14:40:39 +0100 Subject: [PATCH 20/21] Resolve conflicts for 2.0.4 - spark3.3 --- pom.xml | 4 +++- .../scala/com/amazon/deequ/profiles/ColumnProfile.scala | 6 +++++- .../scala/com/amazon/deequ/profiles/ColumnProfiler.scala | 7 ++++++- src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala | 7 ++++++- 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index 40c378eb1..2a775b6d4 100644 --- a/pom.xml +++ b/pom.xml @@ -6,6 +6,8 @@ com.logicalclocks deequ_${scala.major.version} + + 2.0.4.1-SNAPSHOT deequ @@ -90,7 +92,7 @@ 2.4.2 3.0.0 3.1.1.0 - 3.3.0.0 + 3.3.0 provided diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala index 9c2905a0c..0a88a5a9c 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfile.scala @@ -54,7 +54,11 @@ case class StandardColumnProfile( case class StringColumnProfile( column: String, completeness: Double, + distinctness: Option[Double], + entropy: Option[Double], + uniqueness: Option[Double], approximateNumDistinctValues: Long, + exactNumDistinctValues: Option[Long], dataType: DataTypeInstances.Value, isDataTypeInferred: Boolean, typeCounts: Map[String, Long], @@ -218,7 +222,7 @@ object ColumnProfiles { // increase precision for small bucket sizes val fp = if (kllSketch.buckets.nonEmpty && scala.math.abs(kllSketch.buckets.head - .highValue - kllSketch.buckets.head.lowValue) > 0.05) "%.2f" else "%f" + .highValue - kllSketch.buckets.head.lowValue) > 0.05) { "%.2f" } else { "%f" } kllSketch.buckets.foreach{bucket => val histogramEntry = new 
JsonObject() diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala index b1293aa91..79fa9adc9 100644 --- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala +++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala @@ -390,6 +390,7 @@ object ColumnProfiler { firstPassResults, predefinedTypes) + val stringStatistics = extractStringStatistics(firstPassResults) val numericStatistics = if (correlation) { extractNumericStatistics(firstPassResults, correlationCalculatedColumnNames) @@ -434,7 +435,7 @@ object ColumnProfiler { case _ => Map.empty[String, Distribution] } - createProfiles(relevantColumns, genericStatistics, numericStatistics, + createProfiles(relevantColumns, genericStatistics, stringStatistics, numericStatistics, CategoricalColumnStatistics(secondPassResults)) } @@ -1038,7 +1039,11 @@ object ColumnProfiler { StringColumnProfile( name, completeness, + distinctness, + entropy, + uniqueness, approxNumDistinct, + exactNumDistinct, dataType, isDataTypeInferred, typeCounts, diff --git a/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala b/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala index c80497eb7..75e696dc0 100644 --- a/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala +++ b/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala @@ -486,7 +486,11 @@ trait FixtureSupport { NumericColumnProfile( column = columnName, completeness = completeness, + distinctness = Some(1.0), + entropy = Some(1.0), + uniqueness = Some(1.0), approximateNumDistinctValues = 1000, + exactNumDistinctValues = Some(1000L), dataType = dataType, isDataTypeInferred = false, typeCounts = Map[String, Long](), @@ -497,7 +501,8 @@ trait FixtureSupport { minimum = Some(minimum), sum = Some(1000.879), stdDev = Some(1.023), - approxPercentiles = None + approxPercentiles = None, + correlation = None ) } } From 766d412e1d6bb3b66cb3e4238298bd5b82e54cb1 Mon Sep 17 00:00:00 2001 From: Fabio Buso Date: Sat, 18 Nov 2023 17:18:38 +0100 Subject: [PATCH 21/21] Set better artifact name --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2a775b6d4..82a3b75fb 100644 --- a/pom.xml +++ b/pom.xml @@ -8,7 +8,7 @@ deequ_${scala.major.version} - 2.0.4.1-SNAPSHOT + 2.0.4.0-spark-3.3 deequ Deequ is a library built on top of Apache Spark for defining "unit tests for data",
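
The patches above leave the fork's profiler with three opt-in statistics groups: column correlations, value histograms, and exact uniqueness/distinctness/entropy. A minimal end-to-end sketch of the resulting API follows, pieced together from the builder calls in ColumnProfilerNaNTest and the toJson(Seq[ColumnProfile], Long) overload visible in patch 18; the input DataFrame and the thresholds 50 and 20 are illustrative assumptions copied from the tests, not documented limits.

import org.apache.spark.sql.DataFrame
import com.amazon.deequ.profiles.{ColumnProfilerRunner, ColumnProfiles}

object ProfilerSketch {
  // Profiles a DataFrame with the fork's extended statistics enabled and
  // serializes the result to the JSON layout asserted in KLLProfileTestApprox
  // (distinctness, entropy, uniqueness, exactNumDistinctValues, correlations, ...).
  def profileToJson(df: DataFrame): String = {
    val result = new ColumnProfilerRunner()
      .onData(df)
      .withCorrelation(true, 50)   // 50: value used in the tests; assumed cap on columns considered
      .withHistogram(true, 20)     // 20: value used in the tests; assumed histogram cardinality limit
      .withExactUniqueness(true)
      .run()

    // result.profiles is a Map[String, ColumnProfile]; result.numRecords is the row count.
    ColumnProfiles.toJson(result.profiles.values.toSeq, result.numRecords)
  }
}

The same run result also backs the NaN handling check: for every column, the number of null/NaN values equals numRecords minus round(completeness * numRecords), which is exactly the invariant ColumnProfilerNaNTest asserts.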