From 0a7d04de58b17a05210a4c5ab856a629c8c9d77e Mon Sep 17 00:00:00 2001
From: Paul Koch
Date: Sun, 22 Dec 2024 15:30:25 -0800
Subject: [PATCH] add new testing utilities for random dataset generation and
 add randomized stress test

---
 .../PartitionMultiDimensionalStraight.cpp     |  24 ----
 .../libebm/PartitionMultiDimensionalTree.cpp  |   3 -
 .../PartitionOneDimensionalBoosting.cpp       |   5 -
 shared/libebm/ebm_internal.hpp                |   4 -
 .../libebm/tests/boosting_unusual_inputs.cpp  | 111 ++++++++++++++++++
 shared/libebm/tests/libebm_test.cpp           |  75 ++++++++++--
 shared/libebm/tests/libebm_test.hpp           |  31 +++++
 7 files changed, 209 insertions(+), 44 deletions(-)

diff --git a/shared/libebm/PartitionMultiDimensionalStraight.cpp b/shared/libebm/PartitionMultiDimensionalStraight.cpp
index c9499c7ec..eed9b427f 100644
--- a/shared/libebm/PartitionMultiDimensionalStraight.cpp
+++ b/shared/libebm/PartitionMultiDimensionalStraight.cpp
@@ -106,10 +106,6 @@ template<bool bHessian, size_t cCompilerScores> class PartitionMultiDimensionalStraightInternal final {
 
    EBM_ASSERT(std::numeric_limits<FloatCalc>::min() <= hessianMin);
 
-#ifndef NDEBUG
-   bool bAnySplits = false;
-#endif // NDEBUG
-
    const bool bUseLogitBoost = bHessian && !(CalcInteractionFlags_DisableNewton & flags);
 
    // if a negative value were to occur, then it would be due to numeric instability, so clip it to zero here
@@ -205,10 +201,6 @@ template<bool bHessian, size_t cCompilerScores> class PartitionMultiDimensionalStraightInternal final {
          }
 
         {
-#ifndef NDEBUG
-            bAnySplits = true;
-#endif // NDEBUG
-
            const FloatCalc w00 = static_cast<FloatCalc>(bin00.GetWeight());
            const FloatCalc w01 = static_cast<FloatCalc>(bin01.GetWeight());
            const FloatCalc w10 = static_cast<FloatCalc>(bin10.GetWeight());
@@ -427,22 +419,6 @@ template<bool bHessian, size_t cCompilerScores> class PartitionMultiDimensionalStraightInternal final {
                     regLambda,
                     deltaStepMax);
               }
-
-               // bestGain should be positive, or NaN, BUT it can be slightly negative due to floating point noise
-               // it could also be -inf if the parent/total bin overflows, but the children parts did not.
-               // bestGain can also be substantially negative if we didn't find any legal cuts and
-               // then we subtracted the base partial gain here from zero
-
-               // if no legal splits were found, then bestGain will be zero. In theory we should
-               // therefore not subtract the parent partial gain, but doing so does no harm since we later set any
-               // negative interaction score to zero in the caller of this function. Due to that we don't
-               // need to check here, since any value we subtract from zero will lead to a negative number and
-               // then will be zeroed by our caller
-               // BUT, for debugging purposes, check here for that condition so that we can check for illegal negative
-               // gain.
-
-               EBM_ASSERT(std::isnan(bestGain) || -std::numeric_limits<FloatCalc>::infinity() == bestGain ||
-                     k_epsilonNegativeGainAllowed <= bestGain || !bAnySplits);
            }
         }

diff --git a/shared/libebm/PartitionMultiDimensionalTree.cpp b/shared/libebm/PartitionMultiDimensionalTree.cpp
index 1787f978f..0b88e8e7f 100644
--- a/shared/libebm/PartitionMultiDimensionalTree.cpp
+++ b/shared/libebm/PartitionMultiDimensionalTree.cpp
@@ -864,13 +864,10 @@ template<bool bHessian, size_t cCompilerScores> class PartitionMultiDimensionalTreeInternal final {
         }
 
         EBM_ASSERT(std::numeric_limits<FloatCalc>::infinity() != bestGain);
-         EBM_ASSERT(std::isnan(bestGain) || -std::numeric_limits<FloatCalc>::infinity() == bestGain ||
-               k_epsilonNegativeGainAllowed <= bestGain);
 
         if(LIKELY(/* NaN */ std::numeric_limits<FloatCalc>::lowest() <= bestGain)) {
            EBM_ASSERT(!std::isnan(bestGain));
            EBM_ASSERT(!std::isinf(bestGain));
-            EBM_ASSERT(k_epsilonNegativeGainAllowed <= bestGain);
 
            *pTotalGain = 0;
            if(LIKELY(k_gainMin <= bestGain)) {
diff --git a/shared/libebm/PartitionOneDimensionalBoosting.cpp b/shared/libebm/PartitionOneDimensionalBoosting.cpp
index 8a4c48cc4..23355c16a 100644
--- a/shared/libebm/PartitionOneDimensionalBoosting.cpp
+++ b/shared/libebm/PartitionOneDimensionalBoosting.cpp
@@ -616,12 +616,7 @@ static int FindBestSplitGain(RandomDeterministic* const pRng,
       ++iScoreParent;
    } while(cScores != iScoreParent);
 
-   // bestGain could be -inf if the partial gain on the children reached a number close to +inf and then
-   // the children were -inf due to floating point noise.
-   EBM_ASSERT(std::isnan(bestGain) || -std::numeric_limits<FloatCalc>::infinity() == bestGain ||
-         k_epsilonNegativeGainAllowed <= bestGain);
    EBM_ASSERT(std::numeric_limits<FloatCalc>::infinity() != bestGain);
-   EBM_ASSERT(std::numeric_limits<FloatCalc>::min() <= k_gainMin);
 
    if(UNLIKELY(/* NaN */ !LIKELY(k_gainMin <= bestGain))) {
       // do not allow splits on gains that are too small
diff --git a/shared/libebm/ebm_internal.hpp b/shared/libebm/ebm_internal.hpp
index 9431cf3bc..c68a47e75 100644
--- a/shared/libebm/ebm_internal.hpp
+++ b/shared/libebm/ebm_internal.hpp
@@ -44,10 +44,6 @@ typedef double FloatPrecomp;
 
 static constexpr double k_illegalGainDouble = std::numeric_limits<double>::lowest();
 
-#ifndef NDEBUG
-static constexpr FloatCalc k_epsilonNegativeGainAllowed = FloatCalc{-1e-7};
-#endif // NDEBUG
-
 extern double FloatTickIncrementInternal(double deprecisioned[1]) noexcept;
 extern double FloatTickDecrementInternal(double deprecisioned[1]) noexcept;
diff --git a/shared/libebm/tests/boosting_unusual_inputs.cpp b/shared/libebm/tests/boosting_unusual_inputs.cpp
index 45fc48db1..a3ed4c7a5 100644
--- a/shared/libebm/tests/boosting_unusual_inputs.cpp
+++ b/shared/libebm/tests/boosting_unusual_inputs.cpp
@@ -2060,3 +2060,114 @@ TEST_CASE("lossguide, boosting, regression") {
    termScore = test.GetCurrentTermScore(0, {0}, 0);
    CHECK_APPROX(termScore, 0.40592050000000002);
 }
+
+TEST_CASE("stress test, boosting") {
+   auto rng = MakeRng(0);
+   const IntEbm cTrainSamples = 200;
+   const IntEbm cValidationSamples = 100;
+   const std::vector<FeatureTest> features = {
+         FeatureTest(10, false, false, false),
+         FeatureTest(10, false, false, true),
+         FeatureTest(10, false, true, false),
+         FeatureTest(10, false, true, true),
+         FeatureTest(10, true, false, false),
+         FeatureTest(10, true, false, true),
+         FeatureTest(10, true, true, false),
+         FeatureTest(10, true, true, true),
+   };
+
+   auto terms = MakeMains(features);
+   terms.push_back({0, 0});
+   if(2 <= features.size()) {
+      terms.push_back({0, 1});
+      terms.push_back({1, 0});
+   }
+   if(3 <= features.size()) {
+      // terms.push_back({0, 1, 2}); // TODO: enable when fast enough
+   }
+   if(4 <= features.size()) {
+      // terms.push_back({0, 1, 2, 3}); // TODO: enable when fast enough
+   }
+   const size_t cRounds = 200;
+   std::vector<TermBoostFlags> boostFlagsAny{// TermBoostFlags_PurifyGain,
+         TermBoostFlags_DisableNewtonGain,
+         TermBoostFlags_DisableCategorical,
+         // TermBoostFlags_PurifyUpdate,
+         // TermBoostFlags_GradientSums, // does not return a metric
+         TermBoostFlags_DisableNewtonUpdate,
+         TermBoostFlags_RandomSplits};
+   std::vector<TermBoostFlags> boostFlagsChoose{TermBoostFlags_Default,
+         TermBoostFlags_MissingLow,
+         TermBoostFlags_MissingHigh,
+         TermBoostFlags_MissingSeparate,
+         TermBoostFlags_MissingDrop};
+
+   double validationMetric = 1.0;
+   for(IntEbm classesCount = Task_Regression; classesCount < 5; ++classesCount) {
+      if(classesCount != Task_Regression && classesCount < 1) {
+         continue;
+      }
+      const auto train = MakeRandomDataset(rng, classesCount, cTrainSamples, features);
+      const auto validation = MakeRandomDataset(rng, classesCount, cValidationSamples, features);
+      for(IntEbm innerBagCount = 0; innerBagCount < 3; ++innerBagCount) {
+         TestBoost test = TestBoost(classesCount,
+               features,
+               terms,
+               train,
+               validation,
+               innerBagCount,
+               k_testCreateBoosterFlags_Default,
+               AccelerationFlags_NONE);
+
+         double validationMetricIteration = 0.0;
+         for(size_t iRound = 0; iRound < cRounds; ++iRound) {
+            for(IntEbm iTerm = 0; iTerm < static_cast<IntEbm>(terms.size()); ++iTerm) {
+               const IntEbm cRealBins = features[terms[iTerm][0]].CountRealBins();
+               const IntEbm cDimensions = terms[iTerm].size();
+               const TermBoostFlags boostFlags =
+                     static_cast<TermBoostFlags>(ChooseAny(rng, boostFlagsAny) | ChooseFrom(rng, boostFlagsChoose));
+
+               const double learningRate = 0.015625;
+               const IntEbm minSamplesLeaf = TestRand(rng, 5) + 1;
+               const double minHessian = 0 == TestRand(rng, 5) ? 0.015625 : 0.0;
+               const double regAlpha = 0 == TestRand(rng, 5) ? 0.015625 : 0.0;
+               const double regLambda = 0 == TestRand(rng, 5) ? 0.015625 : 0.0;
+               const double maxDeltaStep = 0 == TestRand(rng, 5) ? 1.0 : 0.0;
+               const double categoricalSmoothing = 10.0;
+               const IntEbm maxCategoricalThreshold = 1 + TestRand(rng, cRealBins + 1);
+               const double categoricalInclusionPercent = 0 == TestRand(rng, 2) ? 0.75 : 1.0;
+
+               // we allow 1 cut more than the number of bins to test excessive leaves.
+               const IntEbm cLeaves = 1 + TestRand(rng, cRealBins + 1);
+               const std::vector<IntEbm> leaves(cDimensions, cLeaves);
+
+               const MonotoneDirection direction =
+                     0 == TestRand(rng, 5) ? static_cast<MonotoneDirection>(TestRand(rng, 2) * 2 - 1) : 0;
+               const std::vector<MonotoneDirection> monotonicity(cDimensions, direction);
+
+               validationMetricIteration = test.Boost(iTerm,
+                                                     boostFlags,
+                                                     learningRate,
+                                                     minSamplesLeaf,
+                                                     minHessian,
+                                                     regAlpha,
+                                                     regLambda,
+                                                     maxDeltaStep,
+                                                     categoricalSmoothing,
+                                                     maxCategoricalThreshold,
+                                                     categoricalInclusionPercent,
+                                                     leaves,
+                                                     monotonicity)
+                                                .validationMetric;
+            }
+         }
+         if(classesCount == 1) {
+            CHECK(std::numeric_limits<double>::infinity() == validationMetricIteration);
+         } else {
+            validationMetric *= validationMetricIteration;
+         }
+      }
+   }
+
+   CHECK(validationMetric == 62013566170252.117);
+}
diff --git a/shared/libebm/tests/libebm_test.cpp b/shared/libebm/tests/libebm_test.cpp
index 46cdfdabd..2519e15f0 100644
--- a/shared/libebm/tests/libebm_test.cpp
+++ b/shared/libebm/tests/libebm_test.cpp
@@ -607,18 +607,22 @@ BoostRet TestBoost::Boost(const IntEbm indexTerm,
 
    std::vector<double> scoreTensor(cUpdateScores);
 
-   memset(&scoreTensor[0], 0xFF, sizeof(double) * cUpdateScores);
-   error = GetTermUpdate(m_boosterHandle, &scoreTensor[0]);
+   if(0 != cUpdateScores) {
+      memset(scoreTensor.data(), 0xFF, sizeof(double) * cUpdateScores);
+   }
+   error = GetTermUpdate(m_boosterHandle, scoreTensor.data());
    if(Error_None != error) {
       throw TestException(error, "SetTermUpdate");
    }
 
    if(0 != (TermBoostFlags_GradientSums & flags)) {
       // if sums are on, then we MUST change the term update
-      memset(&scoreTensor[0], 0, sizeof(double) * cUpdateScores);
+      if(0 != cUpdateScores) {
+         memset(scoreTensor.data(), 0, sizeof(double) * cUpdateScores);
+      }
    }
 
-   error = SetTermUpdate(m_boosterHandle, indexTerm, &scoreTensor[0]);
+   error = SetTermUpdate(m_boosterHandle, indexTerm, scoreTensor.data());
    if(Error_None != error) {
       throw TestException(error, "SetTermUpdate");
    }
@@ -629,14 +633,18 @@
    }
 
    if(0 <= indexTerm) {
-      memset(&scoreTensor[0], 0xFF, sizeof(double) * cUpdateScores);
-      error = GetBestTermScores(m_boosterHandle, indexTerm, &scoreTensor[0]);
+      if(0 != cUpdateScores) {
+         memset(scoreTensor.data(), 0xFF, sizeof(double) * cUpdateScores);
+      }
+      error = GetBestTermScores(m_boosterHandle, indexTerm, scoreTensor.data());
       if(Error_None != error) {
          throw TestException(error, "ApplyTermUpdate");
       }
 
-      memset(&scoreTensor[0], 0xFF, sizeof(double) * cUpdateScores);
-      error = GetCurrentTermScores(m_boosterHandle, indexTerm, &scoreTensor[0]);
+      if(0 != cUpdateScores) {
+         memset(scoreTensor.data(), 0xFF, sizeof(double) * cUpdateScores);
+      }
+      error = GetCurrentTermScores(m_boosterHandle, indexTerm, scoreTensor.data());
       if(Error_None != error) {
         throw TestException(error, "ApplyTermUpdate");
       }
@@ -1004,6 +1012,57 @@
    std::cout << std::endl << std::endl;
 }
 
+extern IntEbm ChooseAny(std::vector<unsigned char>& rng, const std::vector<TermBoostFlags>& options) {
+   IntEbm ret = 0;
+   for(const IntEbm option : options) {
+      if(0 == TestRand(rng, 3)) {
+         ret |= option;
+      }
+   }
+   return ret;
+}
+
+extern IntEbm ChooseFrom(std::vector<unsigned char>& rng, const std::vector<TermBoostFlags>& options) {
+   return options[TestRand(rng, options.size())];
+}
+
+extern std::vector<TestSample> MakeRandomDataset(std::vector<unsigned char>& rng,
+      const IntEbm cClasses,
+      const size_t cSamples,
+      const std::vector<FeatureTest>& features) {
+   std::vector<TestSample> samples;
+
+   for(size_t iSample = 0; iSample < cSamples; ++iSample) {
+      std::vector<IntEbm> sampleBinIndexes;
+      for(const FeatureTest& feature : features) {
+         IntEbm iBin = TestRand(rng, feature.CountRealBins());
+         if(!feature.m_bMissing) {
+            ++iBin;
+         }
+         sampleBinIndexes.push_back(iBin);
+      }
+
+      double target;
+      if(Task_GeneralClassification <= cClasses) {
+         target = static_cast<double>(TestRand(rng, cClasses));
+      } else {
+         target = TestRand(rng);
+      }
+
+      samples.push_back(TestSample(sampleBinIndexes, target));
+   }
+   return samples;
+}
+
+extern std::vector<std::vector<IntEbm>> MakeMains(const std::vector<FeatureTest>& features) {
+   const IntEbm cFeatures = static_cast<IntEbm>(features.size());
+   std::vector<std::vector<IntEbm>> termFeatures;
+   for(IntEbm iFeature = 0; iFeature < cFeatures; ++iFeature) {
+      termFeatures.push_back({iFeature});
+   }
+   return termFeatures;
+}
+
 int main() {
    SetLogCallback(&LogCallback);
    SetTraceLevel(Trace_Verbose);
diff --git a/shared/libebm/tests/libebm_test.hpp b/shared/libebm/tests/libebm_test.hpp
index e247e6ee9..8d7a8da81 100644
--- a/shared/libebm/tests/libebm_test.hpp
+++ b/shared/libebm/tests/libebm_test.hpp
@@ -179,6 +179,8 @@ class FeatureTest final {
    const bool m_bUnseen;
    const bool m_bNominal;
 
+   inline IntEbm CountRealBins() const { return m_countBins - (m_bMissing ? 0 : 1) - (m_bUnseen ? 0 : 1); }
+
    inline FeatureTest(
         const IntEbm countBins, const bool bMissing = true, const bool bUnseen = true, const bool bNominal = false) :
         m_countBins(countBins), m_bMissing(bMissing), m_bUnseen(bUnseen), m_bNominal(bNominal) {}
@@ -536,4 +538,33 @@ void DisplayCuts(IntEbm countSamples,
       double minFeatureVal,
       double maxFeatureVal);
 
+std::vector<TestSample> MakeRandomDataset(std::vector<unsigned char>& rng,
+      const IntEbm cClasses,
+      const size_t cSamples,
+      const std::vector<FeatureTest>& features);
+
+std::vector<std::vector<IntEbm>> MakeMains(const std::vector<FeatureTest>& features);
+
+IntEbm ChooseAny(std::vector<unsigned char>& rng, const std::vector<TermBoostFlags>& options);
+IntEbm ChooseFrom(std::vector<unsigned char>& rng, const std::vector<TermBoostFlags>& options);
+
+inline static std::vector<unsigned char> MakeRng(const SeedEbm seed) {
+   std::vector<unsigned char> rng(static_cast<size_t>(MeasureRNG()));
+   InitRNG(seed, &rng[0]);
+   return rng;
+}
+
+inline IntEbm TestRand(std::vector<unsigned char>& rng, const IntEbm count) {
+   // this isn't balanced, but good enough for tests
+   SeedEbm randomNum;
+   GenerateSeed(&rng[0], &randomNum);
+   return static_cast<IntEbm>(static_cast<UIntEbm>(randomNum) % static_cast<UIntEbm>(count));
+}
+
+inline double TestRand(std::vector<unsigned char>& rng) {
+   double ret;
+   GenerateGaussianRandom(&rng[0], 100.0, 1, &ret);
+   return ret;
+}
+
 #endif // LIBEBM_TEST_HPP
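For reviewers, here is a minimal sketch of how the new helpers compose in isolation, outside the stress test. It is illustrative only: ExampleRandomDatasetUsage is a hypothetical name, the feature list and seed are arbitrary, and it assumes the test harness header libebm_test.hpp (which, per this patch, declares MakeRng, MakeRandomDataset, MakeMains, ChooseAny, and ChooseFrom) is on the include path.

// Hypothetical usage sketch; not part of the patch.
#include "libebm_test.hpp"

#include <vector>

static void ExampleRandomDatasetUsage() {
   // opaque RNG blob sized by MeasureRNG() and seeded deterministically
   auto rng = MakeRng(42);

   const std::vector<FeatureTest> features = {
         FeatureTest(10, true, true, false), // 10 bins, missing + unseen, continuous
         FeatureTest(10, true, true, true), // 10 bins, missing + unseen, nominal
   };

   // one main term per feature, plus one pairwise interaction
   auto terms = MakeMains(features);
   terms.push_back({0, 1});

   // 50 random samples with a 3-class classification target
   const auto train = MakeRandomDataset(rng, 3, 50, features);

   // OR together a random subset of one flag list, then pick exactly one from another
   const std::vector<TermBoostFlags> flagsAny{TermBoostFlags_RandomSplits};
   const std::vector<TermBoostFlags> flagsChoose{TermBoostFlags_Default, TermBoostFlags_MissingLow};
   const TermBoostFlags boostFlags =
         static_cast<TermBoostFlags>(ChooseAny(rng, flagsAny) | ChooseFrom(rng, flagsChoose));

   (void)train;
   (void)terms;
   (void)boostFlags;
}

Keeping the RNG as an opaque byte buffer initialized through the libebm C API, rather than a std::mt19937 or similar, means the stress test draws every random decision from the library's own deterministic generator, which is what makes the final CHECK against a fixed validation metric reproducible across platforms.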