Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add KS tests for weighted sampling #1530

Open
wants to merge 19 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion distr_test/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ edition = "2021"
publish = false

[dev-dependencies]
rand_distr = { path = "../rand_distr", version = "=0.5.0-alpha.1", default-features = false }
rand_distr = { path = "../rand_distr", version = "=0.5.0-alpha.1", default-features = false, features = ["alloc"] }
rand = { path = "..", version = "=0.9.0-alpha.1", features = ["small_rng"] }
num-traits = "0.2.19"
# Special functions for testing distributions
Expand Down
235 changes: 235 additions & 0 deletions distr_test/tests/weighted.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
// Copyright 2024 Developers of the Rand project.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

mod ks;
use ks::test_discrete;
use rand::distr::{Distribution, WeightedIndex};
use rand::seq::{IndexedRandom, IteratorRandom};
use rand_distr::{WeightedAliasIndex, WeightedTreeIndex};

/// Takes the unnormalized pdf and creates the cdf of a discrete distribution
fn make_cdf(num: usize, f: impl Fn(i64) -> f64) -> impl Fn(i64) -> f64 {
dhardy marked this conversation as resolved.
Show resolved Hide resolved
let mut cdf = Vec::with_capacity(num);
let mut ac = 0.0;
for i in 0..num {
ac += f(i as i64);
cdf.push(ac);
}

let frac = 1.0 / ac;
for x in &mut cdf {
*x *= frac;
}

move |i| {
if i < 0 {
0.0
} else {
cdf[i as usize]
}
}
}

#[test]
fn weighted_index() {
fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let distr = WeightedIndex::new((0..num).map(|i| weight(i as i64))).unwrap();
test_discrete(0, distr, make_cdf(num, weight));
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
}

#[test]
fn weighted_alias_index() {
fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let weights = (0..num).map(|i| weight(i as i64)).collect();
let distr = WeightedAliasIndex::new(weights).unwrap();
test_discrete(0, distr, make_cdf(num, weight));
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
}

#[test]
fn weighted_tree_index() {
fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let distr = WeightedTreeIndex::new((0..num).map(|i| weight(i as i64))).unwrap();
test_discrete(0, distr, make_cdf(num, weight));
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
}

#[test]
fn choose_weighted_indexed() {
struct Adapter<F: Fn(i64) -> f64>(Vec<i64>, F);
impl<F: Fn(i64) -> f64> Distribution<i64> for Adapter<F> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
*IndexedRandom::choose_weighted(&self.0[..], rng, |i| (self.1)(*i)).unwrap()
}
}

fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight);
test_discrete(0, distr, make_cdf(num, &weight));
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
}

#[test]
fn choose_one_weighted_indexed() {
struct Adapter<F: Fn(i64) -> f64>(Vec<i64>, F);
impl<F: Fn(i64) -> f64> Distribution<i64> for Adapter<F> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
*IndexedRandom::choose_multiple_weighted(&self.0[..], rng, 1, |i| (self.1)(*i))
.unwrap()
.next()
.unwrap()
}
}

fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight);
test_discrete(0, distr, make_cdf(num, &weight));
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
}

#[test]
fn choose_two_weighted_indexed() {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is probably more complex than needed, but looks correct.
It's probably worth implementing chi squared at some point, but this should also be quite sensitive.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is probably more complex than needed, but looks correct.

You mean the use of an Adapter? Yes, but I'd sooner do this than revise the KS test API (which is well adapted for other usages).

It's probably worth implementing chi squared at some point, but this should also be quite sensitive.

A fair point.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean using KS for these distributions (chi squared would be more straight forward), Adapter I think is fine.

struct Adapter<F: Fn(i64) -> f64>(Vec<i64>, F);
impl<F: Fn(i64) -> f64> Distribution<i64> for Adapter<F> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
let mut iter =
IndexedRandom::choose_multiple_weighted(&self.0[..], rng, 2, |i| (self.1)(*i))
.unwrap();
let mut a = *iter.next().unwrap();
let mut b = *iter.next().unwrap();
assert!(iter.next().is_none());
if b < a {
std::mem::swap(&mut a, &mut b);
}
a * self.0.len() as i64 + b
}
}

fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight);

let pmf1 = (0..num).map(|i| weight(i as i64)).collect::<Vec<f64>>();
let sum: f64 = pmf1.iter().sum();
let frac = 1.0 / sum;

let mut ac = 0.0;
let mut cdf = Vec::with_capacity(num * num);
for a in 0..num {
for b in 0..num {
if a < b {
let pa = pmf1[a] * frac;
let pab = pa * pmf1[b] / (sum - pmf1[a]);

let pb = pmf1[b] * frac;
let pba = pb * pmf1[a] / (sum - pmf1[b]);

ac += pab + pba;
}
cdf.push(ac);
}
}
assert!((cdf.last().unwrap() - 1.0).abs() < 1e-9);

let cdf = |i| {
if i < 0 {
0.0
} else {
cdf[i as usize]
}
};

test_discrete(0, distr, cdf);
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
test_weights(10, |i| ((i + 1) as f64).powi(-8));
}

#[test]
fn choose_iterator() {
struct Adapter<I>(I);
impl<I: Clone + Iterator<Item = i64>> Distribution<i64> for Adapter<I> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
IteratorRandom::choose(self.0.clone(), rng).unwrap()
}
}

let distr = Adapter((0..100).map(|i| i as i64));
test_discrete(0, distr, make_cdf(100, |_| 1.0));
}

#[test]
fn choose_stable_iterator() {
struct Adapter<I>(I);
impl<I: Clone + Iterator<Item = i64>> Distribution<i64> for Adapter<I> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
IteratorRandom::choose_stable(self.0.clone(), rng).unwrap()
}
}

let distr = Adapter((0..100).map(|i| i as i64));
test_discrete(0, distr, make_cdf(100, |_| 1.0));
}

#[test]
fn choose_two_iterator() {
struct Adapter<I>(I);
impl<I: Clone + Iterator<Item = i64>> Distribution<i64> for Adapter<I> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
let mut buf = [0; 2];
IteratorRandom::choose_multiple_fill(self.0.clone(), rng, &mut buf);
buf.sort_unstable();
assert!(buf[0] < 99 && buf[1] >= 1);
let a = buf[0];
4950 - (99 - a) * (100 - a) / 2 + buf[1] - a - 1
}
}

let distr = Adapter((0..100).map(|i| i as i64));

test_discrete(
0,
distr,
|i| if i < 0 { 0.0 } else { (i + 1) as f64 / 4950.0 },
);
}
49 changes: 32 additions & 17 deletions src/seq/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -333,8 +333,8 @@ where
/// ordering). The weights are to be provided by the input function `weights`,
/// which will be called once for each index.
///
/// This implementation uses the algorithm described by Efraimidis and Spirakis
/// in this paper: <https://doi.org/10.1016/j.ipl.2005.11.003>
/// This implementation is based on the algorithm A-ExpJ as found in
/// [Efraimidis and Spirakis, 2005](https://doi.org/10.1016/j.ipl.2005.11.003).
/// It uses `O(length + amount)` space and `O(length)` time.
///
/// Error cases:
Expand Down Expand Up @@ -387,37 +387,52 @@ where

impl<N> Eq for Element<N> {}

let mut candidates = Vec::with_capacity(length.as_usize());
let mut candidates = Vec::with_capacity(amount.as_usize());
let mut index = N::zero();
while index < length {
while index < length && candidates.len() < amount.as_usize() {
let weight = weight(index.as_usize()).into();
if weight > 0.0 {
let key = rng.random::<f64>().powf(1.0 / weight);
// We use the log of the key used in A-ExpJ to improve precision
// for small weights:
let key = rng.random::<f64>().ln() / weight;
candidates.push(Element { index, key });
} else if !(weight >= 0.0) {
return Err(WeightError::InvalidWeight);
}

index += N::one();
}
candidates.sort_unstable();

let avail = candidates.len();
if avail < amount.as_usize() {
if candidates.len() < amount.as_usize() {
return Err(WeightError::InsufficientNonZero);
}

// Partially sort the array to find the `amount` elements with the greatest
// keys. Do this by using `select_nth_unstable` to put the elements with
// the *smallest* keys at the beginning of the list in `O(n)` time, which
// provides equivalent information about the elements with the *greatest* keys.
let (_, mid, greater) = candidates.select_nth_unstable(avail - amount.as_usize());
let mut x = rng.random::<f64>().ln() / candidates[0].key;
benjamin-lieser marked this conversation as resolved.
Show resolved Hide resolved
while index < length {
let weight = weight(index.as_usize()).into();
if weight > 0.0 {
x -= weight;
if x <= 0.0 {
let t = (candidates[0].key * weight).exp();
let key = rng.random_range(t..1.0).ln() / weight;
candidates[0] = Element { index, key };
// TODO: consider using a binary tree instead of sorting at each
// step. This should be faster for some THRESHOLD < amount.
candidates.sort_unstable();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it is very likely that a tree data structure would perform much faster if amount is big enough. Not sure where the threshold is. Depending on sort_unstable is could even perform particularly worse on an almost sorted slice.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, though I won't address it now.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://doc.rust-lang.org/std/collections/struct.BinaryHeap.html
should be always faster as it also stores its elements in a Vec should be an easy change.


x = rng.random::<f64>().ln() / candidates[0].key;
}
} else if !(weight >= 0.0) {
return Err(WeightError::InvalidWeight);
}

let mut result: Vec<N> = Vec::with_capacity(amount.as_usize());
result.push(mid.index);
for element in greater {
result.push(element.index);
index += N::one();
}
Ok(IndexVec::from(result))

Ok(IndexVec::from(
candidates.iter().map(|elt| elt.index).collect(),
))
}

/// Randomly sample exactly `amount` indices from `0..length`, using Floyd's
Expand Down
16 changes: 8 additions & 8 deletions src/seq/slice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -732,17 +732,17 @@ mod test {
use super::*;

// The theoretical probabilities of the different outcomes are:
// AB: 0.5 * 0.5 = 0.250
// AC: 0.5 * 0.5 = 0.250
// BA: 0.25 * 0.67 = 0.167
// BC: 0.25 * 0.33 = 0.082
// CA: 0.25 * 0.67 = 0.167
// CB: 0.25 * 0.33 = 0.082
let choices = [('a', 2), ('b', 1), ('c', 1)];
// AB: 0.5 * 0.667 = 0.3333
// AC: 0.5 * 0.333 = 0.1667
// BA: 0.333 * 0.75 = 0.25
// BC: 0.333 * 0.25 = 0.0833
// CA: 0.167 * 0.6 = 0.1
// CB: 0.167 * 0.4 = 0.0667
let choices = [('a', 3), ('b', 2), ('c', 1)];
let mut rng = crate::test::rng(414);

let mut results = [0i32; 3];
let expected_results = [4167, 4167, 1666];
let expected_results = [5833, 2667, 1500];
for _ in 0..10000 {
let result = choices
.choose_multiple_weighted(&mut rng, 2, |item| item.1)
Expand Down
39 changes: 39 additions & 0 deletions tests/choose_multiple_weighted.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
use rand::{prelude::IndexedRandom, rng};

#[test]
fn main() {
let values: Vec<(i32, f64)> =
vec![(1, 0.080), (2, 0.0078), (3, 0.012)]
.into_iter()
.map(|v| (v.0, f64::powf(v.1, 4.0)))
.collect();

for v in &values {
println!("value={} has weight={:.10}", v.0, v.1);
}
let weight_sum: f64 = values.iter().map(|(_, w)| w).sum();
let weights: Vec<f64> = values.iter().map(|(_, w)| w / weight_sum).collect();
println!("expected: {weights:?}");

let largest_weight = values[0].1;

let mut results = [0u32; 9];

for _ in 0..100_000 {
let mut iter = values.choose_multiple_weighted(&mut rng(), 2, |a| a.1 / largest_weight).unwrap();
let a = iter.next().unwrap().0;
let b = iter.next().unwrap().0;
assert!(iter.next().is_none());

let i = (a - 1) * 3 + b - 1;
results[i as usize] += 1;
}

println!("got following results:");
for i in 0..9 {
let i0 = i / 3 + 1;
let i1 = i % 3 + 1;
println!("({}, {}):\t{}", i0, i1, results[i]);
}
assert!(false);
}
Loading