From 08f3679b1dfd5e89759bbc8835cb7be9931c4e59 Mon Sep 17 00:00:00 2001 From: Andrew Pikler Date: Sun, 10 Jun 2018 16:22:13 +0300 Subject: [PATCH] Interpolate merging digest percentiles by centroid weight --- .../tdunning/math/stats/MergingDigest.java | 29 +++++++++++++++---- .../math/stats/MergingDigestTest.java | 26 +++++++++++++++-- 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/core/src/main/java/com/tdunning/math/stats/MergingDigest.java b/core/src/main/java/com/tdunning/math/stats/MergingDigest.java index d43f0c6f..a65a1e06 100644 --- a/core/src/main/java/com/tdunning/math/stats/MergingDigest.java +++ b/core/src/main/java/com/tdunning/math/stats/MergingDigest.java @@ -678,7 +678,9 @@ public double quantile(double q) { // at the boundaries, we return min or max if (index < weight[0] / 2) { assert weight[0] > 0; - return min + 2 * index / weight[0] * (mean[0] - min); + double z1 = (index / totalWeight) * weight[0]; + double z2 = ((weight[0] / 2 - index) / totalWeight) * getWeightForMinOrMax(weight[0]); + return weightedAverage(min, z2, mean[0], z1); } // in between we interpolate between centroids @@ -687,8 +689,8 @@ public double quantile(double q) { double dw = (weight[i] + weight[i + 1]) / 2; if (weightSoFar + dw > index) { // centroids i and i+1 bracket our current point - double z1 = index - weightSoFar; - double z2 = weightSoFar + dw - index; + double z1 = ((index - weightSoFar) / totalWeight) * weight[i + 1]; + double z2 = ((weightSoFar + dw - index) / totalWeight) * weight[i]; return weightedAverage(mean[i], z2, mean[i + 1], z1); } weightSoFar += dw; @@ -698,11 +700,28 @@ public double quantile(double q) { // weightSoFar = totalWeight - weight[n-1]/2 (very nearly) // so we interpolate out to max value ever seen - double z1 = index - totalWeight - weight[n - 1] / 2.0; - double z2 = weight[n - 1] / 2 - z1; + double z1 = ((totalWeight - index) / totalWeight) * weight[n - 1]; + double z2 = (weight[n - 1] / 2 - z1) * getWeightForMinOrMax(weight[n - 1]); return weightedAverage(mean[n - 1], z1, max, z2); } + private double getWeightForMinOrMax(double maxWeight) { + return Math.min(maxWeight, centroidScaleToQuantile(1 / compression)); + } + + /** + * Takes a centroid scale (the centroid number) and returns the ideal + * starting percentile for it. + * + * @param centroidNumber the number of the centroid + * @return the ideal starting percentile for this centroid + */ + private double centroidScaleToQuantile(double centroidNumber) { + double k = centroidNumber / compression; + double sinePrecalc = Math.sin((k*Math.PI)/2); + return sinePrecalc * sinePrecalc; + } + @Override public int centroidCount() { return lastUsedCell; diff --git a/core/src/test/java/com/tdunning/math/stats/MergingDigestTest.java b/core/src/test/java/com/tdunning/math/stats/MergingDigestTest.java index 0f81b8ca..c03fdc1b 100644 --- a/core/src/test/java/com/tdunning/math/stats/MergingDigestTest.java +++ b/core/src/test/java/com/tdunning/math/stats/MergingDigestTest.java @@ -122,7 +122,7 @@ public void printQuantiles() throws FileNotFoundException { td.setMinMax(0, 10); td.add(1); td.add(2); - td.add(5, 2); + td.add(5); td.add(6); td.add(9); td.add(10); @@ -132,7 +132,7 @@ public void printQuantiles() throws FileNotFoundException { quantiles.printf("x,q\n"); cdfs.printf("x,q\n"); - for (double q = 0; q < 1; q += 1e-3) { + for (double q = 1.0 / 12.0; q < 1; q += 1e-3) { double x = td.quantile(q); quantiles.printf("%.3f,%.3f\n", x, q); @@ -145,7 +145,27 @@ public void printQuantiles() throws FileNotFoundException { } } - assertEquals(2.0 / 7, td.cdf(3), 1e-9); + assertEquals(0.25 + 1.0/18.0, td.cdf(3), 1e-9); + } + + @Test + public void testCloseTo1QuantileBigWeights() { + MergingDigest td = new MergingDigest(200); + td.add(1, 1); + td.add(2, 100); + td.setMinMax(0, 1000); + final double quantile = td.quantile(.999); + assertTrue("Quantile val incorrect: " + quantile, quantile < 101); + } + + @Test + public void testCloseToZeroQuantileBigWeights() { + MergingDigest td = new MergingDigest(200); + td.add(1000, 100); + td.add(1001, 1); + td.setMinMax(0, 1002); + final double quantile = td.quantile(.001); + assertTrue("Quantile val incorrect: " + quantile, quantile > 999); } @Override