Design exploration #2
base: master
.gitignore
@@ -0,0 +1,7 @@
/target
**/*.rs.bk
Cargo.lock

# IDEs
.idea/
tags
Cargo.toml
@@ -0,0 +1,13 @@
[package]
name = "linfa"
version = "0.1.0"
authors = ["LukeMathWalker <[email protected]>"]
edition = "2018"

[dependencies]

[dev-dependencies]
ndarray = "0.12.1"
ndarray-rand = "0.9.0"
rand = "*"
derive_more = "0.13.0"
examples/running_mean.rs
@@ -0,0 +1,59 @@
extern crate linfa;
extern crate ndarray;
extern crate ndarray_rand;
extern crate rand;
#[macro_use]
extern crate derive_more;

use crate::standard_scaler::{Config, OnlineOptimizer, ScalingError, StandardScaler};
use linfa::{Fit, IncrementalFit, Transformer};
use ndarray::{stack, Array1, ArrayBase, Axis, Data, Ix1};
use ndarray_rand::RandomExt;
use rand::distributions::Uniform;

mod standard_scaler;

fn generate_batch(n_samples: usize) -> (Array1<f64>, Array1<f64>) {
    let distribution = Uniform::new(0., 10.);
    let x = Array1::random(n_samples, distribution);
    let y = Array1::random(n_samples, distribution);
    (x, y)
}

fn check<S>(scaler: &StandardScaler, x: &ArrayBase<S, Ix1>) -> Result<(), ScalingError>
where
    S: Data<Elem = f64>,
{
    let old_batch_mean = x.mean_axis(Axis(0)).into_scalar();
    let new_batch_mean = scaler.transform(&x)?.mean_axis(Axis(0)).into_scalar();
    let old_batch_std = x.std_axis(Axis(0), 1.).into_scalar();
    let new_batch_std = scaler.transform(&x)?.std_axis(Axis(0), 1.).into_scalar();
    println!(
        "The mean.\nBefore scaling: {:?}\nAfter scaling: {:?}\n",
        old_batch_mean, new_batch_mean
    );
    println!(
        "The std deviation.\nBefore scaling: {:?}\nAfter scaling: {:?}\n",
        old_batch_std, new_batch_std
    );
    Ok(())
}

/// Run it with: cargo run --example running_mean
fn main() -> Result<(), ScalingError> {
    let n_samples = 20;
    let (x, y) = generate_batch(n_samples);

    let mut optimizer = OnlineOptimizer::default();
    let standard_scaler = optimizer.fit(&x, &y, Config::default())?;
Review comment: Passing the config in at fit time might make it difficult to compose estimators. Say the estimator is a pipeline of estimators: we wouldn't want to pass all the config in a fit. Having two steps, a) building the pipeline and b) fitting it, is more natural IMO.

Reply: The two things are not mutually exclusive, I'd say. You could compose the configuration of all steps in the pipeline and then pass that in when you want to fit it; it shouldn't look very different.
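To make the reply concrete, here is a minimal sketch of what "compose the configuration, then fit" could look like under this design. `PipelineConfig` and `sketch` are hypothetical names reusing the example's helpers, not part of this PR:

```rust
// Hypothetical sketch: per-step configs are composed up front into a single
// struct, and only handed over at fit time, mirroring the single-estimator API.
struct PipelineConfig {
    scaler: Config,
    // ...configs for any other pipeline steps would live here
}

fn sketch() -> Result<(), ScalingError> {
    let (x, y) = generate_batch(20);
    // Step a) build the composed configuration...
    let pipeline_config = PipelineConfig {
        scaler: Config::default(),
    };
    // Step b) ...and pass it in when fitting.
    let mut optimizer = OnlineOptimizer::default();
    let scaler = optimizer.fit(&x, &y, pipeline_config.scaler)?;
    check(&scaler, &x)
}
```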
    check(&standard_scaler, &x)?;

    let (x2, y2) = generate_batch(n_samples);
    let standard_scaler = optimizer.incremental_fit(&x2, &y2, standard_scaler)?;
Review comment: I find this conceptually difficult to follow. If anything I would have expected `standard_scaler.incremental_fit(&x2, &y2, optimizer)`, not the other way around.

Reply: Yeah, that's a good point. It should be easy enough to flip it.
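For reference, a sketch of the flipped call the reviewer describes, with the transformer as the receiver; this is hypothetical, not what the PR implements:

```rust
// Hypothetical flipped API: "update this scaler using that optimizer".
impl StandardScaler {
    pub fn incremental_fit<S>(
        self,
        inputs: &ArrayBase<S, Ix1>,
        targets: &Array1<f64>,
        optimizer: &mut OnlineOptimizer,
    ) -> Result<StandardScaler, ScalingError>
    where
        S: Data<Elem = f64>,
    {
        // Delegate to the existing trait implementation.
        optimizer.incremental_fit(inputs, targets, self)
    }
}

// The call site would then read:
// let standard_scaler = standard_scaler.incremental_fit(&x2, &y2, &mut optimizer)?;
```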
    let whole_x = stack(Axis(0), &[x.view(), x2.view()]).expect("Failed to stack arrays");
    check(&standard_scaler, &whole_x)?;

    Ok(())
}
examples/standard_scaler/config.rs
@@ -0,0 +1,24 @@
use crate::standard_scaler::{Input, Output, StandardScaler};
use linfa::Blueprint;
use ndarray::Data;

pub struct Config {
    // Delta degrees of freedom.
    // With ddof = 1, you get the sample standard deviation
    // With ddof = 0, you get the population standard deviation
    pub ddof: f64,
}

/// Defaults to computing the sample standard deviation.
impl Default for Config {
    fn default() -> Self {
        Self { ddof: 1. }
    }
}

impl<S> Blueprint<Input<S>, Output> for Config
where
    S: Data<Elem = f64>,
{
    type Transformer = StandardScaler;
}
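As a quick illustration of what `ddof` does (a sketch, not part of the diff): the variance denominator is `n - ddof`, so for three samples the two settings differ visibly.

```rust
use ndarray::{array, Axis};

fn ddof_demo() {
    let x = array![2.0, 4.0, 6.0];
    // ddof = 0: population standard deviation, divides by n.
    let population = x.std_axis(Axis(0), 0.).into_scalar(); // ~1.633
    // ddof = 1: sample standard deviation, divides by n - 1.
    let sample = x.std_axis(Axis(0), 1.).into_scalar(); // 2.0
    assert!(population < sample);
}
```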
examples/standard_scaler/error.rs
@@ -0,0 +1,7 @@
use std::error::Error;

/// Fast-and-dirty error struct
#[derive(Debug, Eq, PartialEq, From, Display)]
pub struct ScalingError {}

impl Error for ScalingError {}
examples/standard_scaler/mod.rs
@@ -0,0 +1,15 @@
use ndarray::{Array1, ArrayBase, Ix1};

/// Short-hand notations
type Input<S> = ArrayBase<S, Ix1>;
type Output = Array1<f64>;

mod config;
mod error;
mod optimizer;
mod transformer;

pub use config::Config;
pub use error::ScalingError;
pub use optimizer::OnlineOptimizer;
pub use transformer::StandardScaler;
examples/standard_scaler/optimizer.rs
@@ -0,0 +1,93 @@
use crate::standard_scaler::{Config, Input, Output, ScalingError, StandardScaler};
use linfa::{Fit, IncrementalFit};
use ndarray::{Axis, Data};

/// It keeps track of the number of samples seen so far, to allow for
/// incremental computation of mean and standard deviation.
pub struct OnlineOptimizer {
    pub n_samples: u64,
}

/// Initialize n_samples to 0.
impl Default for OnlineOptimizer {
    fn default() -> Self {
        Self { n_samples: 0 }
    }
}

impl<S> Fit<Config, Input<S>, Output> for OnlineOptimizer
where
    S: Data<Elem = f64>,
{
    type Error = ScalingError;

    fn fit(
        &mut self,
        inputs: &Input<S>,
        _targets: &Output,
        blueprint: Config,
Review comment: Why not call this `params`?

Reply: Params is kind of an overloaded term: in this case, I'd say that we are passing hyperparameters (e.g. number of convolutional layers in a CNN), not parameters (e.g. the network weights).

Reply: Why do you prefer "model configuration" to "hyperparameter" for that purpose?
    ) -> Result<StandardScaler, Self::Error> {
        if inputs.len() == 0 {
            return Err(ScalingError {});
        }
        // Compute relevant quantities
        let mean = inputs.mean_axis(Axis(0)).into_scalar();
        let standard_deviation = inputs.std_axis(Axis(0), blueprint.ddof).into_scalar();
        // Initialize n_samples using the array length
        self.n_samples = inputs.len() as u64;
        // Return new, tuned scaler
        Ok(StandardScaler {
            ddof: blueprint.ddof,
            mean,
            standard_deviation,
        })
    }
}

impl<S> IncrementalFit<StandardScaler, Input<S>, Output> for OnlineOptimizer
where
    S: Data<Elem = f64>,
{
    type Error = ScalingError;

    fn incremental_fit(
        &mut self,
        inputs: &Input<S>,
        _targets: &Output,
        transformer: StandardScaler,
    ) -> Result<StandardScaler, Self::Error> {
        if inputs.len() == 0 {
            // Nothing to be done
            return Ok(transformer);
        }

        let ddof = transformer.ddof;

        // Compute relevant quantities for the new batch
        let batch_n_samples = inputs.len();
        let batch_mean = inputs.mean_axis(Axis(0)).into_scalar();
        let batch_std = inputs.std_axis(Axis(0), ddof).into_scalar();

        // Update: combine the statistics of the two batches using the pairwise
        // mean/variance combination (as in Chan et al.'s parallel algorithm)
        let mean_delta = batch_mean - transformer.mean;
        let new_n_samples = self.n_samples + (batch_n_samples as u64);
        let new_mean =
            transformer.mean + mean_delta * (batch_n_samples as f64) / (new_n_samples as f64);
        let new_std = ((transformer.standard_deviation.powi(2) * (self.n_samples as f64 - ddof)
            + batch_std.powi(2) * (batch_n_samples as f64 - ddof)
            + mean_delta.powi(2) * (self.n_samples as f64) * (batch_n_samples as f64)
                / (new_n_samples as f64))
            / (new_n_samples as f64 - ddof))
            .sqrt();

        // Update n_samples
        self.n_samples = new_n_samples;

        // Return tuned scaler
        Ok(StandardScaler {
            ddof,
            mean: new_mean,
            standard_deviation: new_std,
        })
    }
}
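The update above folds the new batch's statistics into the running ones, so fitting incrementally should agree exactly (up to float error) with a single fit on the concatenated data. A minimal test sketch of that property, assuming the example's module layout:

```rust
#[cfg(test)]
mod tests {
    use crate::standard_scaler::{Config, OnlineOptimizer, ScalingError};
    use linfa::{Fit, IncrementalFit};
    use ndarray::{array, stack, Axis};

    #[test]
    fn incremental_fit_matches_batch_fit() -> Result<(), ScalingError> {
        let a = array![1., 2., 3.];
        let b = array![4., 5., 6., 7.];
        let targets = array![0.]; // the scaler ignores targets

        // Fit on the first batch, then fold in the second one incrementally.
        let mut online = OnlineOptimizer::default();
        let scaler = online.fit(&a, &targets, Config::default())?;
        let scaler = online.incremental_fit(&b, &targets, scaler)?;

        // Fit in one go on the concatenation of both batches.
        let whole = stack(Axis(0), &[a.view(), b.view()]).expect("Failed to stack arrays");
        let mut batch = OnlineOptimizer::default();
        let reference = batch.fit(&whole, &targets, Config::default())?;

        assert!((scaler.mean - reference.mean).abs() < 1e-12);
        assert!((scaler.standard_deviation - reference.standard_deviation).abs() < 1e-12);
        Ok(())
    }
}
```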
examples/standard_scaler/transformer.rs
@@ -0,0 +1,29 @@
use crate::standard_scaler::{Input, Output, ScalingError};
use linfa::Transformer;
use ndarray::Data;

/// Given an input, it rescales it to have zero mean and unit variance.
///
/// We use 64-bit floats for simplicity.
pub struct StandardScaler {
    // Delta degrees of freedom.
    // With ddof = 1, you get the sample standard deviation
    // With ddof = 0, you get the population standard deviation
    pub ddof: f64,
    pub mean: f64,
    pub standard_deviation: f64,
}

impl<S> Transformer<Input<S>, Output> for StandardScaler
where
    S: Data<Elem = f64>,
{
    type Error = ScalingError;

    fn transform(&self, inputs: &Input<S>) -> Result<Output, Self::Error> {
        Ok((inputs - self.mean) / self.standard_deviation)
    }
}
src/lib.rs
@@ -0,0 +1,133 @@
use std::error;
use std::iter;

/// The basic `Transformer` trait.
///
/// It is training-agnostic: a transformer takes an input and returns an output.
///
/// There might be multiple ways to discover the best settings for every
/// particular algorithm (e.g. training a logistic regressor using
/// a pseudo-inverse matrix vs using gradient descent).
/// It doesn't matter: the end result, the transformer, is a set of parameters.
/// The way those parameters originated is an orthogonal concept.
///
/// In the same way, it has no notion of loss or "correct" predictions.
/// Those concepts are embedded elsewhere.
///
/// It's generic over input and output types:
/// - you can transform a fully in-memory dataset;
/// - you can transform a stream of data;
/// - you can return a class;
/// - you can return a probability distribution.
///
/// The mechanism for selecting the desired output, when not self-evident from the downstream
/// usage, should be the same as for the `::collect()` method.
pub trait Transformer<I, O> {
    type Error: error::Error;

    fn transform(&self, inputs: &I) -> Result<O, Self::Error>;
}

/// One step closer to the peak.
///
/// `Fit` is generic over a type `B` implementing the `Blueprint` trait: `B::Transformer` is used to
/// constrain what type of inputs and targets are acceptable.
///
/// `fit` takes an instance of `B` as one of its inputs, `blueprint`: it's consumed with move
/// semantics and a new transformer is returned.
///
/// Different types implementing `Fit` can work on the same `Blueprint` type!
///
/// It's a transition in the transformer state machine: from `Blueprint` to `Transformer`.
///
/// It's generic over input and output types:
/// - you can fit on a fully in-memory dataset;
/// - you can fit on a stream of data;
/// - you can use integer-encoded class membership as a target;
/// - you can use one-hot-encoded class membership as a target.
pub trait Fit<B, I, O>
where
    B: Blueprint<I, O>,
{
    type Error: error::Error;

    fn fit(&mut self, inputs: &I, targets: &O, blueprint: B)
        -> Result<B::Transformer, Self::Error>;
}

/// We are not done with that `Transformer` yet.
///
/// `IncrementalFit` is generic over a type `T` implementing the `Transformer` trait: `T` is used to
/// constrain what type of inputs and targets are acceptable.
///
/// `incremental_fit` takes an instance of `T` as one of its inputs, `transformer`: it's consumed with move
/// semantics and a new transformer is returned.
///
/// It's a transition in the transformer state machine: from `Transformer` to `Transformer`.
///
/// It's generic over input and output types:
/// - you can fit on a fully in-memory dataset;
/// - you can fit on a stream of data;
/// - you can use integer-encoded class membership as a target;
/// - you can use one-hot-encoded class membership as a target.
pub trait IncrementalFit<T, I, O>
where
    T: Transformer<I, O>,
{
    type Error: error::Error;

    fn incremental_fit(
        &mut self,
        inputs: &I,
        targets: &O,
        transformer: T,
    ) -> Result<T, Self::Error>;
}

/// Where `Transformer`s are forged.
///
/// `Blueprint` is a marker trait: it identifies what types can be used as starting points for
/// building `Transformer`s. It's the initial stage of the transformer state machine.
///
/// Every `Blueprint` is associated with a single `Transformer` type (is it wise to do so?).
///
/// For the same transformer type `T`, nothing prevents a user from providing more than one `Blueprint`:
/// multiple initialization strategies can sometimes be used to build the same transformer type.
///
/// Each of these strategies can take different (hyper)parameters, even though they return an
/// instance of the same transformer type in the end.
pub trait Blueprint<I, O> {
    type Transformer: Transformer<I, O>;
}

/// Where you need to go meta (hyperparameters!).
///
/// `BlueprintGenerator`s can be used to explore different combinations of hyperparameters
/// when you are working with a certain `Transformer` type.
///
/// `BlueprintGenerator::generate` returns, if successful, an `IntoIterator` type
/// yielding instances of blueprints.
pub trait BlueprintGenerator<B, I, O>
where
    B: Blueprint<I, O>,
{
    type Error: error::Error;
    type Output: IntoIterator<Item = B>;

    fn generate(&self) -> Result<Self::Output, Self::Error>;
}

/// Any `Blueprint` can be used as a `BlueprintGenerator`, as long as it's cloneable:
/// it returns an iterator with a single element, a clone of itself.
impl<B, I, O> BlueprintGenerator<B, I, O> for B
where
    B: Blueprint<I, O> + Clone,
{
    // Random error, didn't have time to get a proper one
    type Error = std::io::Error;
    type Output = iter::Once<B>;

    fn generate(&self) -> Result<Self::Output, Self::Error> {
        Ok(iter::once(self.clone()))
    }
}
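To see how `BlueprintGenerator` could drive a hyperparameter search under this design, here is a hypothetical generator for the scaler's `Config` (the `DdofGrid` type is illustrative, not part of the PR):

```rust
use crate::standard_scaler::{Config, Input, Output};
use linfa::BlueprintGenerator;
use ndarray::Data;

// Hypothetical generator: yields one Config blueprint per candidate ddof value.
#[derive(Clone)]
pub struct DdofGrid {
    pub ddofs: Vec<f64>,
}

impl<S> BlueprintGenerator<Config, Input<S>, Output> for DdofGrid
where
    S: Data<Elem = f64>,
{
    // Placeholder error type, mirroring the blanket impl above
    type Error = std::io::Error;
    type Output = Vec<Config>;

    fn generate(&self) -> Result<Self::Output, Self::Error> {
        Ok(self.ddofs.iter().map(|&ddof| Config { ddof }).collect())
    }
}
```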
Review comment: So if we have several models, this means we would need to use the full paths, e.g. `use crate::standard_scaler::{Config, OnlineOptimizer, StandardScaler};`, which might become somewhat difficult to manage? Also, purely from the user-experience and readability side (I understand this has other advantages), I find the builder pattern in rustlearn somewhat simpler, because one doesn't have to deal with the optimizer.
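For contrast, a rustlearn-style builder around the same scaler might look roughly like the following; `StandardScalerBuilder` is a hypothetical sketch that simply hides the optimizer, not a proposed API:

```rust
use crate::standard_scaler::{Config, OnlineOptimizer, ScalingError, StandardScaler};
use linfa::Fit;
use ndarray::Array1;

// Hypothetical builder-style sketch: hyperparameters are set through a
// chainable settings struct, and no optimizer appears at the call site.
pub struct StandardScalerBuilder {
    ddof: f64,
}

impl StandardScalerBuilder {
    pub fn new() -> Self {
        Self { ddof: 1. }
    }
    pub fn ddof(mut self, ddof: f64) -> Self {
        self.ddof = ddof;
        self
    }
    // Fitting consumes the builder and returns the fitted transformer directly;
    // the optimizer still exists, but is tucked away inside.
    pub fn fit(self, x: &Array1<f64>) -> Result<StandardScaler, ScalingError> {
        let mut optimizer = OnlineOptimizer::default();
        optimizer.fit(x, &Array1::zeros(0), Config { ddof: self.ddof })
    }
}
```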