Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
125811: sql: add dynamic sample size max/min private settings and logging r=Uzair5162 a=Uzair5162

Logs the histogram sample size when it is determined dynamically. Also makes the bounds on the dynamic sample size computation configurable with private cluster settings. These are private for now since users should modify the `sql_stats_histogram_samples_count` table setting or `sql.stats.histogram_samples.count` cluster setting if they want custom sample sizes.

See also: cockroachdb#123972

Release note: None

Co-authored-by: Uzair Ahmad <[email protected]>
  • Loading branch information
craig[bot] and Uzair5162 committed Jun 18, 2024
2 parents dd4e4ca + 5a139ea commit d7afa42
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 6 deletions.
51 changes: 46 additions & 5 deletions pkg/sql/distsql_plan_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql/catalog"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/colinfo"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb"
Expand All @@ -35,6 +36,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/stats"
"github.com/cockroachdb/cockroach/pkg/sql/stats/bounds"
"github.com/cockroachdb/cockroach/pkg/sql/types"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/logtags"
)
Expand Down Expand Up @@ -74,13 +76,33 @@ var maxTimestampAge = settings.RegisterDurationSetting(
5*time.Minute,
)

// Bounds applied by computeNumberSamples when the number of samples to collect
// for histogram construction is determined automatically. Both settings are
// registered with Reserved visibility.
var (
	// minAutoHistogramSamples is the lower bound on the automatically
	// determined sample size.
	minAutoHistogramSamples = settings.RegisterIntSetting(
		settings.ApplicationLevel,
		"sql.stats.histogram_samples.min",
		"minimum sample size to be selected when sample size is automatically determined",
		10000,
		settings.NonNegativeIntWithMaximum(math.MaxUint32),
		settings.WithVisibility(settings.Reserved),
	)

	// maxAutoHistogramSamples is the upper bound on the automatically
	// determined sample size.
	maxAutoHistogramSamples = settings.RegisterIntSetting(
		settings.ApplicationLevel,
		"sql.stats.histogram_samples.max",
		"maximum sample size to be selected when sample size is automatically determined",
		300000,
		settings.NonNegativeIntWithMaximum(math.MaxUint32),
		settings.WithVisibility(settings.Reserved),
	)
)

// computeNumberSamples dynamically determines the number of samples to collect
// based on the estimated number of rows in the table. The formula 582n^0.29 is
// based on empirical data collected by running the sampler with different
// sample sizes on a variety of table sizes and observing the proportion of
// heavy hitters (most frequent elements) represented in the sample. It was
// derived by fitting a best-fit curve to the table below. The number of samples
// returned is bounded between minAutoHistogramSamples and
// maxAutoHistogramSamples (10,000 and 300,000 by default).
// +---------------+-------------+
// | Table Size | Sample Size |
// +---------------+-------------+
Expand All @@ -103,13 +125,27 @@ var maxTimestampAge = settings.RegisterDurationSetting(
// ~65% down to 1000x, ~10% down to 100x
// - 1b rows/300k samples: ~100% coverage of multiplicities down to 100000x,
// ~95% down to 10000x, ~25% down to 1000x
func computeNumberSamples(numRows uint64) uint32 {
func computeNumberSamples(ctx context.Context, numRows uint64, st *cluster.Settings) uint32 {
maxSampleSize := maxAutoHistogramSamples.Get(&st.SV)
minSampleSize := minAutoHistogramSamples.Get(&st.SV)

if maxSampleSize < minSampleSize {
log.Infof(
ctx,
"using default sample size bounds since max sample size %d is less than min sample size %d",
maxSampleSize,
minSampleSize,
)
maxSampleSize = maxAutoHistogramSamples.Default()
minSampleSize = minAutoHistogramSamples.Default()
}

numSamples := math.Max(
math.Min(
582.0*math.Pow(float64(numRows), 0.29),
300000.0,
float64(maxSampleSize),
),
10000.0,
float64(minSampleSize),
)
return uint32(numSamples)
}
Expand Down Expand Up @@ -156,7 +192,12 @@ func (dsp *DistSQLPlanner) createAndAttachSamplers(
} else if clusterSampleCount := histogramSamples.Get(&dsp.st.SV); clusterSampleCount != histogramSamples.Default() {
histogramSamplesCount = uint32(clusterSampleCount)
} else {
histogramSamplesCount = computeNumberSamples(rowsExpected)
histogramSamplesCount = computeNumberSamples(
ctx,
rowsExpected,
dsp.st,
)
log.Infof(ctx, "using computed sample size of %d for histogram construction", histogramSamplesCount)
}
sampler.SampleSize = histogramSamplesCount
// This could be anything >= 2 to produce a histogram, but the max number
Expand Down
7 changes: 6 additions & 1 deletion pkg/sql/distsql_plan_stats_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,19 @@
package sql

import (
"context"
"math"
"testing"

"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
)

func TestComputeNumberSamples(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
ctx := context.Background()

testData := []struct {
numRows int
Expand All @@ -43,7 +46,9 @@ func TestComputeNumberSamples(t *testing.T) {
t.Fatalf("expected %d samples, got %d", expectedNumSamples, computedNumSamples)
}
}

st := cluster.MakeTestingClusterSettings()
for _, td := range testData {
checkComputeNumberSamples(int(computeNumberSamples(uint64(td.numRows))), td.expectedNumSamples)
checkComputeNumberSamples(int(computeNumberSamples(ctx, uint64(td.numRows), st)), td.expectedNumSamples)
}
}
66 changes: 66 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/distsql_stats
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,72 @@ SELECT (count(*) // 10) * 10 FROM [SHOW HISTOGRAM $hist_id_injected]
----
20000

# Verify that we can configure the minimum and maximum automatically-determined
# sample size.

statement ok
SET CLUSTER SETTING sql.stats.histogram_samples.min = 15000

statement ok
CREATE STATISTICS s_dynamic_min FROM big

let $hist_id_dynamic_min
SELECT histogram_id FROM [SHOW STATISTICS FOR TABLE big] WHERE statistics_name = 's_dynamic_min';

# Perform integer division by 10 because there may be 2 extra buckets added
# on either end of the histogram to account for the 20000 distinct values.
query I
SELECT (count(*) // 10) * 10 FROM [SHOW HISTOGRAM $hist_id_dynamic_min]
----
15000

statement ok
RESET CLUSTER SETTING sql.stats.histogram_samples.min

statement ok
SET CLUSTER SETTING sql.stats.histogram_samples.max = 10500

statement ok
CREATE STATISTICS s_dynamic_max FROM big

let $hist_id_dynamic_max
SELECT histogram_id FROM [SHOW STATISTICS FOR TABLE big] WHERE statistics_name = 's_dynamic_max';

# Perform integer division by 10 because there may be 2 extra buckets added
# on either end of the histogram to account for the 20000 distinct values.
query I
SELECT (count(*) // 10) * 10 FROM [SHOW HISTOGRAM $hist_id_dynamic_max]
----
10500

# Verify that the default sample size bounds are used if the minimum is
# greater than the maximum.

statement ok
SET CLUSTER SETTING sql.stats.histogram_samples.min = 11000

statement ok
CREATE STATISTICS s_dynamic_default_bounds FROM big

let $hist_id_dynamic_default_bounds
SELECT histogram_id FROM [SHOW STATISTICS FOR TABLE big] WHERE statistics_name = 's_dynamic_default_bounds';

# Perform integer division by 10 because there may be 2 extra buckets added
# on either end of the histogram to account for the 20000 distinct values.
query I
SELECT (count(*) // 10) * 10 FROM [SHOW HISTOGRAM $hist_id_dynamic_default_bounds]
----
10840

statement ok
RESET CLUSTER SETTING sql.stats.histogram_samples.min

statement ok
RESET CLUSTER SETTING sql.stats.histogram_samples.max

# Verify that specifying the number of samples in the cluster setting overrides
# the dynamically determined number of samples.

statement ok
SET CLUSTER SETTING sql.stats.histogram_samples.count = 20000

Expand Down

0 comments on commit d7afa42

Please sign in to comment.