diff --git a/.packcheck.ignore b/.packcheck.ignore index 366286e..0340fc6 100644 --- a/.packcheck.ignore +++ b/.packcheck.ignore @@ -6,3 +6,4 @@ cabal.project cabal.project.user cabal.project.Werror default.nix +stack.yaml diff --git a/appveyor.yml b/appveyor.yml index 0859c06..4e4f9ac 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -36,8 +36,9 @@ environment: # version. #STACKVER: "1.6.5" STACK_UPGRADE: "y" - RESOLVER: "nightly-2023-12-07" + RESOLVER: "lts-22.33" STACK_ROOT: "c:\\sr" + STACK_YAML: "stack.yaml" # ------------------------------------------------------------------------ # cabal options diff --git a/benchmark/Main.hs b/benchmark/Main.hs index a0435b0..ef02f09 100644 --- a/benchmark/Main.hs +++ b/benchmark/Main.hs @@ -2,14 +2,18 @@ import Control.DeepSeq (NFData) import Streamly.Data.Fold (Fold) +import Streamly.Data.Scanl (Scanl) import Streamly.Data.Stream (Stream) import System.Random (randomRIO) import qualified Streamly.Data.Fold as Fold +import qualified Streamly.Internal.Data.Fold as Fold + (windowFold, windowFoldWith) +import qualified Streamly.Internal.Data.Scanl as Scanl import qualified Streamly.Data.Stream as Stream import qualified Streamly.Data.Array as Array -import qualified Streamly.Internal.Data.Ring as Ring import qualified Streamly.Statistics as Statistics +import qualified Streamly.Statistics.Scanl as StatScan import Gauge @@ -47,24 +51,34 @@ benchWithFold len name f = benchWith source len name f benchWithFoldInt :: Int -> String -> Fold IO Int Int -> Benchmark benchWithFoldInt len name f = benchWith source len name f +{-# INLINE benchWithScanSrc #-} +benchWithScanSrc :: (Num a) => + (Int -> a -> Stream IO a) -> Int -> String -> Scanl IO a a -> Benchmark +benchWithScanSrc src len name f = + bench name + $ nfIO + $ randomRIO (1, 1 :: Int) + >>= Stream.fold Fold.drain + . Stream.postscanl f . src len . fromIntegral + {-# INLINE benchWithPostscan #-} -benchWithPostscan :: Int -> String -> Fold IO Double Double -> Benchmark +benchWithPostscan :: Int -> String -> Scanl IO Double Double -> Benchmark benchWithPostscan len name f = bench name $ nfIO $ randomRIO (1, 1) >>= - Stream.fold Fold.drain . Stream.postscan f . source len + Stream.fold Fold.drain . Stream.postscanl f . source len {-# INLINE benchWithResample #-} benchWithResample :: Int -> String -> Benchmark benchWithResample len name = bench name $ nfIO $ do i <- randomRIO (1, 1) - arr <- Stream.fold Array.write (source len i :: Stream IO Double) + arr <- Stream.fold Array.create (source len i :: Stream IO Double) Stream.fold Fold.drain $ Stream.unfold Statistics.resample arr {-# INLINE benchWithFoldResamples #-} benchWithFoldResamples :: Int -> String -> Fold IO Double Double -> Benchmark benchWithFoldResamples len name f = bench name $ nfIO $ do i <- randomRIO (1, 1) - arr <- Stream.fold Array.write (source len i :: Stream IO Double) + arr <- Stream.fold Array.create (source len i :: Stream IO Double) Stream.fold Fold.drain $ Statistics.foldResamples len arr f {-# INLINE numElements #-} @@ -77,128 +91,127 @@ mkBenchmarks :: mkBenchmarks mkBench = [ mkBench numElements "minimum (window size 100)" - (Ring.slidingWindow 100 Statistics.minimum) + (Fold.windowFold 100 Statistics.minimum) , mkBench numElements "minimum (window size 1000)" - (Ring.slidingWindow 1000 Statistics.minimum) + (Fold.windowFold 1000 Statistics.minimum) , benchWith sourceDescendingInt numElements "minimum descending (window size 1000)" - (Ring.slidingWindow 1000 Statistics.minimum) + (Fold.windowFold 1000 Statistics.minimum) , mkBench numElements "maximum (window size 100)" - (Ring.slidingWindow 100 Statistics.maximum) + (Fold.windowFold 100 Statistics.maximum) , mkBench numElements "maximum (window size 1000)" - (Ring.slidingWindow 1000 Statistics.maximum) + (Fold.windowFold 1000 Statistics.maximum) , benchWith sourceDescendingInt numElements "maximum descending (window size 1000)" - (Ring.slidingWindow 1000 Statistics.maximum) + (Fold.windowFold 1000 Statistics.maximum) , mkBench numElements "range (window size 100)" - (Ring.slidingWindow 100 Statistics.range) + (Fold.windowFold 100 Statistics.range) , mkBench numElements "range (window size 1000)" - (Ring.slidingWindow 1000 Statistics.range) + (Fold.windowFold 1000 Statistics.range) , mkBench numElements "sum (window size 100)" - (Ring.slidingWindow 100 Statistics.sum) + (Fold.windowFold 100 Statistics.sum) , mkBench numElements "sum (window size 1000)" - (Ring.slidingWindow 1000 Statistics.sum) + (Fold.windowFold 1000 Statistics.sum) , mkBench numElements "sum (entire stream)" (Statistics.cumulative Statistics.sum) , mkBench numElements "sum (Data.Fold)" Fold.sum , mkBench numElements "mean (window size 100)" - (Ring.slidingWindow 100 Statistics.mean) + (Fold.windowFold 100 Statistics.mean) , mkBench numElements "mean (window size 1000)" - (Ring.slidingWindow 1000 Statistics.mean) + (Fold.windowFold 1000 Statistics.mean) , mkBench numElements "mean (entire stream)" (Statistics.cumulative Statistics.mean) , mkBench numElements "mean (Data.Fold)" Fold.mean , mkBench numElements "welfordMean (window size 100)" - (Ring.slidingWindow 100 Statistics.welfordMean) + (Fold.windowFold 100 Statistics.welfordMean) , mkBench numElements "welfordMean (window size 1000)" - (Ring.slidingWindow 1000 Statistics.welfordMean) + (Fold.windowFold 1000 Statistics.welfordMean) , mkBench numElements "welfordMean (entire stream)" (Statistics.cumulative Statistics.welfordMean) , mkBench numElements "geometricMean (window size 100)" - (Ring.slidingWindow 100 Statistics.geometricMean) + (Fold.windowFold 100 Statistics.geometricMean) , mkBench numElements "geometricMean (window size 1000)" - (Ring.slidingWindow 1000 Statistics.geometricMean) + (Fold.windowFold 1000 Statistics.geometricMean) , mkBench numElements "geometricMean (entire stream)" (Statistics.cumulative Statistics.geometricMean) , mkBench numElements "harmonicMean (window size 100)" - (Ring.slidingWindow 100 Statistics.harmonicMean) + (Fold.windowFold 100 Statistics.harmonicMean) , mkBench numElements "harmonicMean (window size 1000)" - (Ring.slidingWindow 1000 Statistics.harmonicMean) + (Fold.windowFold 1000 Statistics.harmonicMean) , mkBench numElements "harmonicMean (entire stream)" (Statistics.cumulative Statistics.harmonicMean) , mkBench numElements "quadraticMean (window size 100)" - (Ring.slidingWindow 100 Statistics.quadraticMean) + (Fold.windowFold 100 Statistics.quadraticMean) , mkBench numElements "quadraticMean (window size 1000)" - (Ring.slidingWindow 1000 Statistics.quadraticMean) + (Fold.windowFold 1000 Statistics.quadraticMean) , mkBench numElements "quadraticMean (entire stream)" (Statistics.cumulative Statistics.quadraticMean) , mkBench numElements "powerSum 2 (window size 100)" - (Ring.slidingWindow 100 (Statistics.powerSum 2)) + (Fold.windowFold 100 (Statistics.powerSum 2)) , mkBench numElements "powerSum 2 (entire stream)" (Statistics.cumulative (Statistics.powerSum 2)) , mkBench numElements "rawMoment 2 (window size 100)" - (Ring.slidingWindow 100 (Statistics.powerSum 2)) + (Fold.windowFold 100 (Statistics.powerSum 2)) , mkBench numElements "rawMoment 2 (entire stream)" (Statistics.cumulative (Statistics.rawMoment 2)) , mkBench numElements "powerMean 1 (window size 100)" - (Ring.slidingWindow 100 (Statistics.powerMean 1)) + (Fold.windowFold 100 (Statistics.powerMean 1)) , mkBench numElements "powerMean 2 (window size 100)" - (Ring.slidingWindow 100 (Statistics.powerMean 2)) + (Fold.windowFold 100 (Statistics.powerMean 2)) , mkBench numElements "powerMean 10 (window size 100)" - (Ring.slidingWindow 100 (Statistics.powerMean 10)) + (Fold.windowFold 100 (Statistics.powerMean 10)) , mkBench numElements "powerMeanFrac (-1) (window size 100)" - (Ring.slidingWindow 100 (Statistics.powerMeanFrac (-1))) + (Fold.windowFold 100 (Statistics.powerMeanFrac (-1))) , mkBench numElements "powerMeanFrac 1 (window size 100)" - (Ring.slidingWindow 100 (Statistics.powerMeanFrac 1)) + (Fold.windowFold 100 (Statistics.powerMeanFrac 1)) , mkBench numElements "powerMeanFrac 2 (window size 100)" - (Ring.slidingWindow 100 (Statistics.powerMeanFrac 2)) + (Fold.windowFold 100 (Statistics.powerMeanFrac 2)) , mkBench numElements "powerMeanFrac 10 (window size 100)" - (Ring.slidingWindow 100 (Statistics.powerMeanFrac 10)) + (Fold.windowFold 100 (Statistics.powerMeanFrac 10)) , mkBench numElements "ewma (entire stream)" (Statistics.ewma 0.5) - -- XXX Disabled because this is not scannable - -- , mkBench numElements "ewmaAfterMean (entire stream)" - -- (Statistics.ewmaAfterMean 10 0.5) + , mkBench numElements "ewmaAfterMean (entire stream)" + (Statistics.ewmaAfterMean 10 0.5) , mkBench numElements "ewmaRampUpSmoothing (entire stream)" (Statistics.ewmaRampUpSmoothing 0.5 0.5) , mkBench numElements "variance (window size 100)" - (Ring.slidingWindow 100 Statistics.variance) + (Fold.windowFold 100 Statistics.variance) , mkBench numElements "variance (entire stream)" (Statistics.cumulative Statistics.variance) -- , mkBench numElements "variance (Data.Fold)" Fold.variance , mkBench numElements "sampleVariance (window size 100)" - (Ring.slidingWindow 100 Statistics.sampleVariance) + (Fold.windowFold 100 Statistics.sampleVariance) , mkBench numElements "sampleVariance (entire stream)" (Statistics.cumulative Statistics.sampleVariance) , mkBench numElements "stdDev (window size 100)" - (Ring.slidingWindow 100 Statistics.stdDev) + (Fold.windowFold 100 Statistics.stdDev) , mkBench numElements "stdDev (entire stream)" (Statistics.cumulative Statistics.stdDev) -- , mkBench numElements "stdDev (Data.Fold)" Fold.stdDev , mkBench numElements "sampleStdDev (window size 100)" - (Ring.slidingWindow 100 Statistics.sampleStdDev) + (Fold.windowFold 100 Statistics.sampleStdDev) , mkBench numElements "sampleStdDev (entire stream)" (Statistics.cumulative Statistics.sampleStdDev) , mkBench numElements "stdErrMean (window size 100)" - (Ring.slidingWindow 100 Statistics.stdErrMean) + (Fold.windowFold 100 Statistics.stdErrMean) , mkBench numElements "stdErrMean (entire stream)" (Statistics.cumulative Statistics.stdErrMean) @@ -206,17 +219,163 @@ mkBenchmarks mkBench = -- because of the use of Tee. #ifndef FUSION_PLUGIN , mkBench numElements "skewness (window size 100)" - (Ring.slidingWindow 100 Statistics.skewness) + (Fold.windowFold 100 Statistics.skewness) , mkBench numElements "skewness (entire stream)" (Statistics.cumulative Statistics.skewness) , mkBench numElements "kurtosis (window size 100)" - (Ring.slidingWindow 100 Statistics.kurtosis) + (Fold.windowFold 100 Statistics.kurtosis) , mkBench numElements "kurtosis (entire stream)" (Statistics.cumulative Statistics.kurtosis) #endif , mkBench numElements "md (window size 100)" - (Ring.slidingWindowWith 100 Statistics.md) + (Fold.windowFoldWith 100 Statistics.md) + + ] + +mkScans :: + (Int -> String -> Scanl IO Double Double -> Benchmark) + -> [Benchmark] +mkScans mkBench = + [ + mkBench numElements "minimum (window size 100)" + (Scanl.windowScan 100 StatScan.windowMinimum) + , mkBench numElements "minimum (window size 1000)" + (Scanl.windowScan 1000 StatScan.windowMinimum) + , benchWithScanSrc sourceDescendingInt numElements + "minimum descending (window size 1000)" + (Scanl.windowScan 1000 StatScan.windowMinimum) + + , mkBench numElements "maximum (window size 100)" + (Scanl.windowScan 100 StatScan.windowMaximum) + , mkBench numElements "maximum (window size 1000)" + (Scanl.windowScan 1000 StatScan.windowMaximum) + , benchWithScanSrc sourceDescendingInt numElements + "maximum descending (window size 1000)" + (Scanl.windowScan 1000 StatScan.windowMaximum) + + , mkBench numElements "range (window size 100)" + (Scanl.windowScan 100 StatScan.windowRange) + , mkBench numElements "range (window size 1000)" + (Scanl.windowScan 1000 StatScan.windowRange) + + , mkBench numElements "sum (window size 100)" + (Scanl.windowScan 100 Scanl.windowSum) + , mkBench numElements "sum (window size 1000)" + (Scanl.windowScan 1000 Scanl.windowSum) + , mkBench numElements "sum (entire stream)" + (Scanl.cumulative Scanl.windowSum) + , mkBench numElements "sum (Data.Fold)" Scanl.sum + + , mkBench numElements "mean (window size 100)" + (Scanl.windowScan 100 Scanl.windowMean) + , mkBench numElements "mean (window size 1000)" + (Scanl.windowScan 1000 Scanl.windowMean) + , mkBench numElements "mean (entire stream)" + (Scanl.cumulative Scanl.windowMean) + , mkBench numElements "mean (Data.Fold)" Scanl.mean + + , mkBench numElements "welfordMean (window size 100)" + (Scanl.windowScan 100 StatScan.windowWelfordMean) + , mkBench numElements "welfordMean (window size 1000)" + (Scanl.windowScan 1000 StatScan.windowWelfordMean) + , mkBench numElements "welfordMean (entire stream)" + (Scanl.cumulative StatScan.windowWelfordMean) + + , mkBench numElements "geometricMean (window size 100)" + (Scanl.windowScan 100 StatScan.windowGeometricMean) + , mkBench numElements "geometricMean (window size 1000)" + (Scanl.windowScan 1000 StatScan.windowGeometricMean) + , mkBench numElements "geometricMean (entire stream)" + (Scanl.cumulative StatScan.windowGeometricMean) + + , mkBench numElements "harmonicMean (window size 100)" + (Scanl.windowScan 100 StatScan.windowHarmonicMean) + , mkBench numElements "harmonicMean (window size 1000)" + (Scanl.windowScan 1000 StatScan.windowHarmonicMean) + , mkBench numElements "harmonicMean (entire stream)" + (Scanl.cumulative StatScan.windowHarmonicMean) + + , mkBench numElements "quadraticMean (window size 100)" + (Scanl.windowScan 100 StatScan.windowQuadraticMean) + , mkBench numElements "quadraticMean (window size 1000)" + (Scanl.windowScan 1000 StatScan.windowQuadraticMean) + , mkBench numElements "quadraticMean (entire stream)" + (Scanl.cumulative StatScan.windowQuadraticMean) + + , mkBench numElements "powerSum 2 (window size 100)" + (Scanl.windowScan 100 (Scanl.windowPowerSum 2)) + , mkBench numElements "powerSum 2 (entire stream)" + (Scanl.cumulative (Scanl.windowPowerSum 2)) + + , mkBench numElements "rawMoment 2 (window size 100)" + (Scanl.windowScan 100 (StatScan.windowRawMoment 2)) + , mkBench numElements "rawMoment 2 (entire stream)" + (Scanl.cumulative (StatScan.windowRawMoment 2)) + + , mkBench numElements "powerMean 1 (window size 100)" + (Scanl.windowScan 100 (StatScan.windowPowerMean 1)) + , mkBench numElements "powerMean 2 (window size 100)" + (Scanl.windowScan 100 (StatScan.windowPowerMean 2)) + , mkBench numElements "powerMean 10 (window size 100)" + (Scanl.windowScan 100 (StatScan.windowPowerMean 10)) + + , mkBench numElements "powerMeanFrac (-1) (window size 100)" + (Scanl.windowScan 100 (StatScan.windowPowerMeanFrac (-1))) + , mkBench numElements "powerMeanFrac 1 (window size 100)" + (Scanl.windowScan 100 (StatScan.windowPowerMeanFrac 1)) + , mkBench numElements "powerMeanFrac 2 (window size 100)" + (Scanl.windowScan 100 (StatScan.windowPowerMeanFrac 2)) + , mkBench numElements "powerMeanFrac 10 (window size 100)" + (Scanl.windowScan 100 (StatScan.windowPowerMeanFrac 10)) + + , mkBench numElements "ewma (entire stream)" + (StatScan.ewma 0.5) + , mkBench numElements "ewmaRampUpSmoothing (entire stream)" + (StatScan.ewmaRampUpSmoothing 0.5 0.5) + + , mkBench numElements "variance (window size 100)" + (Scanl.windowScan 100 StatScan.windowVariance) + , mkBench numElements "variance (entire stream)" + (Scanl.cumulative StatScan.windowVariance) + -- , mkBench numElements "variance (Data.Fold)" Fold.variance + + , mkBench numElements "sampleVariance (window size 100)" + (Scanl.windowScan 100 StatScan.windowSampleVariance) + , mkBench numElements "sampleVariance (entire stream)" + (Scanl.cumulative StatScan.windowSampleVariance) + + , mkBench numElements "stdDev (window size 100)" + (Scanl.windowScan 100 StatScan.windowStdDev) + , mkBench numElements "stdDev (entire stream)" + (Scanl.cumulative StatScan.windowStdDev) + -- , mkBench numElements "stdDev (Data.Fold)" Fold.stdDev + + , mkBench numElements "sampleStdDev (window size 100)" + (Scanl.windowScan 100 StatScan.windowSampleStdDev) + , mkBench numElements "sampleStdDev (entire stream)" + (Scanl.cumulative StatScan.windowSampleStdDev) + + , mkBench numElements "stdErrMean (window size 100)" + (Scanl.windowScan 100 StatScan.windowStdErrMean) + , mkBench numElements "stdErrMean (entire stream)" + (Scanl.cumulative StatScan.windowStdErrMean) + +-- These benchmarks take a lot of time/memory with fusion-plugin possibly +-- because of the use of Tee. +#ifndef FUSION_PLUGIN + , mkBench numElements "skewness (window size 100)" + (Scanl.windowScan 100 StatScan.windowSkewness) + , mkBench numElements "skewness (entire stream)" + (Scanl.cumulative StatScan.windowSkewness) + + , mkBench numElements "kurtosis (window size 100)" + (Scanl.windowScan 100 StatScan.windowKurtosis) + , mkBench numElements "kurtosis (entire stream)" + (Scanl.cumulative StatScan.windowKurtosis) +#endif + , mkBench numElements "md (window size 100)" + (Scanl.windowScanWith 100 StatScan.windowMd) ] @@ -227,11 +386,11 @@ main = bgroup "fold" $ mkBenchmarks benchWithFold , bgroup "fold_Int" [ benchWithFoldInt numElements "sumInt (window size 100)" - (Ring.slidingWindow 100 Statistics.sumInt) + (Fold.windowFold 100 Statistics.sumInt) , benchWithFoldInt numElements "sum for Int (window size 100)" - (Ring.slidingWindow 100 Statistics.sum) + (Fold.windowFold 100 Statistics.sum) ] - , bgroup "scan" $ mkBenchmarks benchWithPostscan + , bgroup "scan" $ mkScans benchWithPostscan -- XXX These benchmarks measure the cost of creating the array as well, -- we can do that outside the benchmark. , bgroup "resample" diff --git a/cabal.project.user b/cabal.project.user index f62f67d..d6d62c7 100644 --- a/cabal.project.user +++ b/cabal.project.user @@ -1,14 +1,14 @@ packages: streamly-statistics.cabal --- source-repository-package --- type: git --- location: https://github.com/composewell/streamly.git --- tag: master --- --- source-repository-package --- type: git --- location: https://github.com/composewell/streamly.git --- tag: master --- subdir: core +source-repository-package + type: git + location: https://github.com/composewell/streamly.git + tag: master + +source-repository-package + type: git + location: https://github.com/composewell/streamly.git + tag: master + subdir: core jobs: 1 diff --git a/src/Streamly/Statistics.hs b/src/Streamly/Statistics.hs index 3751073..f702889 100644 --- a/src/Streamly/Statistics.hs +++ b/src/Streamly/Statistics.hs @@ -202,7 +202,8 @@ import qualified Deque.Strict as Deque import qualified Streamly.Data.Fold as Fold import qualified Streamly.Data.Array as Array import qualified Streamly.Data.MutArray as MA -import qualified Streamly.Internal.Data.MutArray as MA (unsafeSwapIndices) +import qualified Streamly.Internal.Data.MutArray as MA + (unsafeSwapIndices, unsafeGetIndex, unsafePutIndex) import qualified Streamly.Internal.Data.Fold as Window import qualified Streamly.Data.Stream as Stream @@ -221,18 +222,25 @@ import Prelude hiding (length, sum, minimum, maximum) -- Re-exports ------------------------------------------------------------------------------- +-- XXX Deprecate these once the streamly functions are released. + +-- {-# DEPRECATED lmap "Use Streamly.Data.Fold.windowLmap instead" #-} lmap :: (c -> a) -> Fold m (a, Maybe a) b -> Fold m (c, Maybe c) b lmap = Window.windowLmap +-- {-# DEPRECATED length "Use Streamly.Data.Fold.windowLength instead" #-} length :: (Monad m, Num b) => Fold m (a, Maybe a) b length = Window.windowLength +-- {-# DEPRECATED sum "Use Streamly.Data.Fold.windowSum instead" #-} sum :: (Monad m, Num a) => Fold m (a, Maybe a) a sum = Window.windowSum +-- {-# DEPRECATED sumInt "Use Streamly.Data.Fold.windowSumInt instead" #-} sumInt :: (Monad m, Integral a) => Fold m (a, Maybe a) a sumInt = Window.windowSumInt +-- {-# DEPRECATED powerSum "Use Streamly.Data.Fold.windowPowerSum instead" #-} powerSum :: (Monad m, Num a) => Int -> Fold m (a, Maybe a) a powerSum = Window.windowPowerSum @@ -326,13 +334,13 @@ fft marr let butterfly i | i >= len = flight (j + 1) (a + e) | otherwise = do let i1 = i + l1 - xi1 :+ yi1 <- MA.getIndexUnsafe i1 marr + xi1 :+ yi1 <- MA.unsafeGetIndex i1 marr let !c = cos a !s = sin a d = (c * xi1 - s * yi1) :+ (s * xi1 + c * yi1) - ci <- MA.getIndexUnsafe i marr - MA.putIndexUnsafe i1 marr (ci - d) - MA.putIndexUnsafe i marr (ci + d) + ci <- MA.unsafeGetIndex i marr + MA.unsafePutIndex i1 marr (ci - d) + MA.unsafePutIndex i marr (ci + d) butterfly (i + l2) butterfly j flight 0 0 @@ -341,6 +349,10 @@ fft marr -- Location ------------------------------------------------------------------------------- +-- XXX prefix window to these folds and implement these using the scans. or +-- just remove these as the corresponding scans can be converted to folds. If +-- we remove these we can just export the scans through this module itself. + -- Theoretically, we can approximate minimum in a rolling window by using a -- 'powerMean' with sufficiently large negative power. -- diff --git a/src/Streamly/Statistics/Scanl.hs b/src/Streamly/Statistics/Scanl.hs new file mode 100644 index 0000000..a17b837 --- /dev/null +++ b/src/Streamly/Statistics/Scanl.hs @@ -0,0 +1,726 @@ +-- | +-- Module : Streamly.Statistics.Scanl +-- Copyright : (c) 2024 Composewell Technologies +-- License : Apache-2.0 +-- Maintainer : streamly@composewell.com +-- Stability : experimental +-- Portability : GHC +-- +-- See "Streamly.Statistics" for general information. This module provides +-- scans instead of folds. + +{-# LANGUAGE ScopedTypeVariables #-} +module Streamly.Statistics.Scanl + ( + -- * Incremental Scans + -- | Scans of type @Scanl m (a, Maybe a) b@ are incremental sliding window + -- scans. An input of type @(a, Nothing)@ indicates that the input element + -- @a@ is being inserted in the window without ejecting an old value + -- increasing the window size by 1. An input of type @(a, Just a)@ + -- indicates that the first element is being inserted in the window and the + -- second element is being removed from the window, the window size remains + -- the same. The window size can only increase and never decrease. + -- + -- You can compute the statistics over the entire stream using sliding + -- window folds by keeping the second element of the input tuple as + -- @Nothing@. + -- + -- Also see "Streamly.Data.Scanl" for some basic window scans. + + -- * Summary Statistics + -- | See https://en.wikipedia.org/wiki/Summary_statistics . + + -- ** Location + -- | See https://en.wikipedia.org/wiki/Location_parameter . + -- + -- See https://en.wikipedia.org/wiki/Central_tendency . + windowMinimum + , windowMaximum + , windowRawMoment + , windowRawMomentFrac + + -- Pythagorean means (https://en.wikipedia.org/wiki/Pythagorean_means) + , windowWelfordMean + , windowGeometricMean + , windowHarmonicMean + + , windowQuadraticMean + + -- Generalized mean + , windowPowerMean + , windowPowerMeanFrac + + -- ** Weighted Means + -- | Exponential Smoothing. + , ewma + , ewmaRampUpSmoothing + + -- ** Spread + -- | Second order central moment is a statistical measure of dispersion. + -- The \(k\)th moment about the mean (or \(k\)th central moment) is defined + -- as: + -- + -- \(\mu_k = \frac{1}{n}\sum_{i=1}^n {(x_{i}-\mu)}^k\) + -- + -- See https://mathworld.wolfram.com/CentralMoment.html . + -- + -- See https://en.wikipedia.org/wiki/Statistical_dispersion . + , windowRange + , windowMd + , windowVariance + , windowStdDev + + -- ** Shape + -- | Third and fourth order central moments are a measure of shape. + -- + -- See https://en.wikipedia.org/wiki/Shape_parameter . + -- + -- See https://en.wikipedia.org/wiki/Standardized_moment . + , windowSkewness + , windowKurtosis + + -- XXX Move to Statistics.Sample or Statistics.Estimation module? + -- ** Estimation + , windowSampleVariance + , windowSampleStdDev + , windowStdErrMean + + -- ** Probability Distribution + , windowFrequency + ) +where + +import Control.Monad.IO.Class (MonadIO(..)) +import Data.Function ((&)) +import Data.Map.Strict (Map) +import Data.Maybe (fromMaybe) +import Streamly.Internal.Data.Fold (Step(..)) +import Streamly.Internal.Data.Scanl (Scanl(..)) +import Streamly.Internal.Data.Tuple.Strict (Tuple'(..), Tuple3'(..)) + +import qualified Data.Map.Strict as Map +import qualified Deque.Strict as Deque +import qualified Streamly.Data.Fold as Fold +import qualified Streamly.Data.MutArray as MA +import qualified Streamly.Internal.Data.Scanl as Scanl +import qualified Streamly.Data.Stream as Stream + +import Prelude hiding (length, sum, minimum, maximum) + +-- TODO: Overflow checks. Would be good if we can directly replace the +-- operations with overflow checked operations. +-- +-- See https://hackage.haskell.org/package/safe-numeric +-- See https://hackage.haskell.org/package/safeint +-- +-- TODO We have many of these functions in Streamly.Data.Fold as well. Need to +-- think about deduplication. + +------------------------------------------------------------------------------- +-- Location +------------------------------------------------------------------------------- + +-- Theoretically, we can approximate minimum in a rolling window by using a +-- 'powerMean' with sufficiently large negative power. +-- +-- XXX If we need to know the minimum in the window only once in a while then +-- we can use linear search when it is extracted and not pay the cost all the +-- time. + +-- | The minimum element in a rolling window. +-- +-- For smaller window sizes (< 30) Streamly.Internal.Data.Fold.windowMinimum performs +-- better. If you want to compute the minimum of the entire stream Fold.min +-- from streamly package would be much faster. +-- +-- /Time/: \(\mathcal{O}(n*w)\) where \(w\) is the window size. +-- +{-# INLINE windowMinimum #-} +windowMinimum :: (Monad m, Ord a) => Scanl m (a, Maybe a) a +windowMinimum = Scanl step initial extract extract + + where + + initial = + return + $ Partial + $ Tuple3' (0 :: Int) (0 :: Int) (mempty :: Deque.Deque (Int, a)) + + step (Tuple3' i w q) (a, ma) = + case ma of + Nothing -> + return + $ Partial + $ Tuple3' + (i + 1) + (w + 1) + (headCheck i q (w + 1) & dqloop (i, a)) + Just _ -> + return + $ Partial + $ Tuple3' (i + 1) w (headCheck i q w & dqloop (i,a)) + + {-# INLINE headCheck #-} + headCheck i q w = + case Deque.uncons q of + Nothing -> q + Just (ia', q') -> + if fst ia' <= i - w + then q' + else q + + dqloop ia q = + case Deque.unsnoc q of + Nothing -> Deque.snoc ia q + -- XXX This can be improved for the case of `=` + Just (ia', q') -> + if snd ia <= snd ia' + then dqloop ia q' + else Deque.snoc ia q + + extract (Tuple3' _ _ q) = + return + $ snd + $ fromMaybe (0, error "minimum: Empty stream") + $ Deque.head q + +-- Theoretically, we can approximate maximum in a rolling window by using a +-- 'powerMean' with sufficiently large positive power. + +-- | The maximum element in a rolling window. +-- +-- For smaller window sizes (< 30) Streamly.Internal.Data.Fold.windowMaximum performs +-- better. If you want to compute the maximum of the entire stream +-- Streamly.Data.Fold.maximum from streamly package would be much faster. +-- +-- /Time/: \(\mathcal{O}(n*w)\) where \(w\) is the window size. +-- +{-# INLINE windowMaximum #-} +windowMaximum :: (Monad m, Ord a) => Scanl m (a, Maybe a) a +windowMaximum = Scanl step initial extract extract + + where + + initial = + return + $ Partial + $ Tuple3' (0 :: Int) (0 :: Int) (mempty :: Deque.Deque (Int, a)) + + step (Tuple3' i w q) (a, ma) = + case ma of + Nothing -> + return + $ Partial + $ Tuple3' + (i + 1) + (w + 1) + (headCheck i q (w + 1) & dqloop (i, a)) + Just _ -> + return + $ Partial + $ Tuple3' (i + 1) w (headCheck i q w & dqloop (i,a)) + + {-# INLINE headCheck #-} + headCheck i q w = + case Deque.uncons q of + Nothing -> q + Just (ia', q') -> + if fst ia' <= i - w + then q' + else q + + dqloop ia q = + case Deque.unsnoc q of + Nothing -> Deque.snoc ia q + -- XXX This can be improved for the case of `=` + Just (ia', q') -> + if snd ia >= snd ia' + then dqloop ia q' + else Deque.snoc ia q + + extract (Tuple3' _ _ q) = + return + $ snd + $ fromMaybe (0, error "maximum: Empty stream") + $ Deque.head q + +------------------------------------------------------------------------------- +-- Mean +------------------------------------------------------------------------------- + +-- | Recompute mean from old mean when an item is added to the sample. +{-# INLINE meanAdd #-} +meanAdd :: Fractional a => Int -> a -> a -> a +meanAdd n oldMean newItem = + let delta = (newItem - oldMean) / fromIntegral (n + 1) + in oldMean + delta + +-- We do not carry rounding errors, therefore, this would be less numerically +-- stable than the kbn mean. + +-- | Recompute mean from old mean when an item in the sample is replaced. +{-# INLINE meanReplace #-} +meanReplace :: Fractional a => Int -> a -> a -> a -> a +meanReplace n oldMean oldItem newItem = + let n1 = fromIntegral n + -- Compute two deltas instead of a single (newItem - oldItem) because + -- the latter would be too small causing rounding errors. + delta1 = (newItem - oldMean) / n1 + delta2 = (oldItem - oldMean) / n1 + in (oldMean + delta1) - delta2 + +-- | Same as 'mean' but uses Welford's algorithm to compute the mean +-- incrementally. +-- +-- It maintains a running mean instead of a running sum and adjusts the mean +-- based on a new value. This is slower than 'mean' because of using the +-- division operation on each step and it is numerically unstable (as of now). +-- The advantage over 'mean' could be no overflow if the numbers are large, +-- because we do not maintain a sum, but that is a highly unlikely corner case. +-- +-- /Internal/ +{-# INLINE windowWelfordMean #-} +windowWelfordMean :: forall m a. (Monad m, Fractional a) => + Scanl m (a, Maybe a) a +windowWelfordMean = Scanl step initial extract extract + + where + + initial = + return + $ Partial + $ Tuple' + (0 :: a) -- running mean + (0 :: Int) -- count of items in the window + + step (Tuple' oldMean w) (new, mOld) = + return + $ Partial + $ case mOld of + Nothing -> Tuple' (meanAdd w oldMean new) (w + 1) + Just old -> Tuple' (meanReplace w oldMean old new) w + + extract (Tuple' x _) = return x + +------------------------------------------------------------------------------- +-- Moments +------------------------------------------------------------------------------- + +-- XXX We may have chances of overflow if the powers are high or the numbers +-- are large. A limited mitigation could be to use welford style avg +-- computation. Do we need an overflow detection? + +-- | Raw moment is the moment about 0. The \(k\)th raw moment is defined as: +-- +-- \(\mu'_k = \frac{\sum_{i=1}^n x_{i}^k}{n}\) +-- +-- >>> rawMoment k = Fold.teeWith (/) (Fold.windowPowerSum p) Fold.windowLength +-- +-- See https://en.wikipedia.org/wiki/Moment_(mathematics) . +-- +-- /Space/: \(\mathcal{O}(1)\) +-- +-- /Time/: \(\mathcal{O}(n)\) +{-# INLINE windowRawMoment #-} +windowRawMoment :: (Monad m, Fractional a) => Int -> Scanl m (a, Maybe a) a +windowRawMoment k = + Scanl.teeWith (/) (Scanl.windowPowerSum k) Scanl.windowLength + +-- | Like 'rawMoment' but powers can be negative or fractional. This is +-- slower than 'rawMoment' for positive intergal powers. +-- +-- >>> rawMomentFrac p = Fold.teeWith (/) (Fold.windowPowerSumFrac p) Fold.windowLength +-- +{-# INLINE windowRawMomentFrac #-} +windowRawMomentFrac :: (Monad m, Floating a) => a -> Scanl m (a, Maybe a) a +windowRawMomentFrac k = + Scanl.teeWith (/) (Scanl.windowPowerSumFrac k) Scanl.windowLength + +-- XXX Overflow can happen when large powers or large numbers are used. We can +-- keep a running mean instead of running sum but that won't mitigate the +-- overflow possibility by much. The overflow can still happen when computing +-- the mean incrementally. + +-- | The \(k\)th power mean of numbers \(x_1, x_2, \ldots, x_n\) is: +-- +-- \(M_k = \left( \frac{1}{n} \sum_{i=1}^n x_i^k \right)^{\frac{1}{k}}\) +-- +-- \(powerMean(k) = (rawMoment(k))^\frac{1}{k}\) +-- +-- >>> powerMean k = (** (1 / fromIntegral k)) <$> rawMoment k +-- +-- All other means can be expressed in terms of power mean. It is also known as +-- the generalized mean. +-- +-- See https://en.wikipedia.org/wiki/Generalized_mean +-- +{-# INLINE windowPowerMean #-} +windowPowerMean :: (Monad m, Floating a) => Int -> Scanl m (a, Maybe a) a +windowPowerMean k = (** (1 / fromIntegral k)) <$> windowRawMoment k + +-- | Like 'powerMean' but powers can be negative or fractional. This is +-- slower than 'powerMean' for positive intergal powers. +-- +-- >>> powerMeanFrac k = (** (1 / k)) <$> rawMomentFrac k +-- +{-# INLINE windowPowerMeanFrac #-} +windowPowerMeanFrac :: (Monad m, Floating a) => a -> Scanl m (a, Maybe a) a +windowPowerMeanFrac k = (** (1 / k)) <$> windowRawMomentFrac k + +-- | The harmonic mean of the positive numbers \(x_1, x_2, \ldots, x_n\) is +-- defined as: +-- +-- \(HM = \frac{n}{\frac1{x_1} + \frac1{x_2} + \cdots + \frac1{x_n}}\) +-- +-- \(HM = \left(\frac{\sum\limits_{i=1}^n x_i^{-1}}{n}\right)^{-1}\) +-- +-- >>> harmonicMean = Fold.teeWith (/) length (lmap recip sum) +-- >>> harmonicMean = powerMeanFrac (-1) +-- +-- See https://en.wikipedia.org/wiki/Harmonic_mean . +-- +{-# INLINE windowHarmonicMean #-} +windowHarmonicMean :: (Monad m, Fractional a) => Scanl m (a, Maybe a) a +windowHarmonicMean = + Scanl.teeWith (/) + Scanl.windowLength (Scanl.windowLmap recip Scanl.windowSum) + +-- | Geometric mean, defined as: +-- +-- \(GM = \sqrt[n]{x_1 x_2 \cdots x_n}\) +-- +-- \(GM = \left(\prod_{i=1}^n x_i\right)^\frac{1}{n}\) +-- +-- or, equivalently, as the arithmetic mean in log space: +-- +-- \(GM = e ^{{\frac{\sum_{i=1}^{n}\ln a_i}{n}}}\) +-- +-- >>> geometricMean = exp <$> lmap log mean +-- +-- See https://en.wikipedia.org/wiki/Geometric_mean . +{-# INLINE windowGeometricMean #-} +windowGeometricMean :: (Monad m, Floating a) => Scanl m (a, Maybe a) a +windowGeometricMean = exp <$> Scanl.windowLmap log Scanl.windowMean + +-- | The quadratic mean or root mean square (rms) of the numbers +-- \(x_1, x_2, \ldots, x_n\) is defined as: +-- +-- \(RMS = \sqrt{ \frac{1}{n} \left( x_1^2 + x_2^2 + \cdots + x_n^2 \right) }.\) +-- +-- >>> quadraticMean = powerMean 2 +-- +-- See https://en.wikipedia.org/wiki/Root_mean_square . +-- +{-# INLINE windowQuadraticMean #-} +windowQuadraticMean :: (Monad m, Floating a) => Scanl m (a, Maybe a) a +windowQuadraticMean = windowPowerMean 2 + +------------------------------------------------------------------------------- +-- Weighted Means +------------------------------------------------------------------------------- + +-- XXX Is this numerically stable? We can use the kbn summation here. + +-- | ewmaStep smoothing-factor old-value new-value +{-# INLINE ewmaStep #-} +ewmaStep :: Double -> Double -> Double -> Double +ewmaStep k x0 x1 = (1 - k) * x0 + k * x1 + +-- XXX Compute this in a sliding window? + +-- | @ewma smoothingFactor@. +-- +-- @ewma@ of an empty stream is 0. +-- +-- Exponential weighted moving average, \(s_n\), of \(n\) values, +-- \(x_1,\ldots,x_n\), is defined recursively as: +-- +-- \(\begin{align} s_0& = x_0\\ s_n & = \alpha x_{n} + (1-\alpha)s_{n-1},\quad n>0 \end{align}\) +-- +-- If we expand the recursive term it becomes an exponential series: +-- +-- \(s_n = \alpha \left[x_n + (1-\alpha)x_{n-1} + (1-\alpha)^2 x_{n-2} + \cdots + (1-\alpha)^{n-1} x_1 \right] + (1-\alpha)^n x_0\) +-- +-- where \(\alpha\), the smoothing factor, is in the range \(0 <\alpha < 1\). +-- More the value of \(\alpha\), the more weight is given to newer values. As +-- a special case if it is 0 then the weighted sum would always be the same as +-- the oldest value, if it is 1 then the sum would always be the same as the +-- newest value. +-- +-- See https://en.wikipedia.org/wiki/Moving_average +-- +-- See https://en.wikipedia.org/wiki/Exponential_smoothing +-- +{-# INLINE ewma #-} +ewma :: Monad m => Double -> Scanl m Double Double +ewma k = extract <$> Scanl.mkScanl step (Tuple' 0 1) + + where + + step (Tuple' x0 k1) x = Tuple' (ewmaStep k1 x0 x) k + + extract (Tuple' x _) = x + +-- | @ewma n k@ is like 'ewma' but uses 1 as the initial smoothing factor and +-- then exponentially smooths it to @k@ using @n@ as the smoothing factor. +-- +-- This is significantly faster than 'ewmaAfterMean'. +-- +{-# INLINE ewmaRampUpSmoothing #-} +ewmaRampUpSmoothing :: Monad m => Double -> Double -> Scanl m Double Double +ewmaRampUpSmoothing n k1 = extract <$> Scanl.mkScanl step initial + + where + + initial = Tuple' 0 1 + + step (Tuple' x0 k0) x1 = + let x = ewmaStep k0 x0 x1 + k = ewmaStep n k0 k1 + in Tuple' x k + + extract (Tuple' x _) = x + +------------------------------------------------------------------------------- +-- Spread/Dispersion +------------------------------------------------------------------------------- + +-- | The difference between the maximum and minimum elements of a rolling window. +-- +-- >>> range = Fold.teeWith (-) maximum minimum +-- +-- If you want to compute the range of the entire stream @Fold.teeWith (-) +-- Fold.maximum Fold.minimum@ from the streamly package would be much faster. +-- +-- /Space/: \(\mathcal{O}(n)\) where @n@ is the window size. +-- +-- /Time/: \(\mathcal{O}(n*w)\) where \(w\) is the window size. +-- +{-# INLINE windowRange #-} +windowRange :: (Monad m, Num a, Ord a) => Scanl m (a, Maybe a) a +windowRange = Scanl.teeWith (-) windowMaximum windowMinimum + +-- | @md n@ computes the mean absolute deviation (or mean deviation) in a +-- sliding window of last @n@ elements in the stream. +-- +-- The mean absolute deviation of the numbers \(x_1, x_2, \ldots, x_n\) is: +-- +-- \(MD = \frac{1}{n}\sum_{i=1}^n |x_i-\mu|\) +-- +-- Note: It is expensive to compute MD in a sliding window. We need to +-- maintain a ring buffer of last n elements and maintain a running mean, when +-- the result is extracted we need to compute the difference of all elements +-- from the mean and get the average. Using standard deviation may be +-- computationally cheaper. +-- +-- See https://en.wikipedia.org/wiki/Average_absolute_deviation . +-- +-- /Pre-release/ +{-# INLINE windowMd #-} +windowMd :: MonadIO m => + Scanl m ((Double, Maybe Double), m (MA.MutArray Double)) Double +windowMd = + Scanl.rmapM computeMD + $ Scanl.tee + (Scanl.lmap fst Scanl.windowMean) (Scanl.lmap snd Scanl.latest) + + where + + computeMD (mn, rng) = + case rng of + Just action -> do + arr <- action + Stream.fold Fold.mean + $ fmap (\a -> abs (mn - a)) + $ Stream.unfold MA.reader arr + Nothing -> return 0.0 + +-- | The variance \(\sigma^2\) of a population of \(n\) equally likely values +-- is defined as the average of the squares of deviations from the mean +-- \(\mu\). In other words, second moment about the mean: +-- +-- \(\sigma^2 = \frac{1}{n}\sum_{i=1}^n {(x_{i}-\mu)}^2\) +-- +-- \(\sigma^2 = rawMoment(2) - \mu^2\) +-- +-- \(\mu_2 = -(\mu'_1)^2 + \mu'_2\) +-- +-- Note that the variance would be biased if applied to estimate the population +-- variance from a sample of the population. See 'sampleVariance'. +-- +-- See https://en.wikipedia.org/wiki/Variance. +-- +-- /Space/: \(\mathcal{O}(1)\) +-- +-- /Time/: \(\mathcal{O}(n)\) +{-# INLINE windowVariance #-} +windowVariance :: (Monad m, Fractional a) => Scanl m (a, Maybe a) a +windowVariance = + Scanl.teeWith + (\p2 m -> p2 - m ^ (2 :: Int)) (windowRawMoment 2) Scanl.windowMean + +-- | Standard deviation \(\sigma\) is the square root of 'variance'. +-- +-- This is the population standard deviation or uncorrected sample standard +-- deviation. +-- +-- >>> stdDev = sqrt <$> variance +-- +-- See https://en.wikipedia.org/wiki/Standard_deviation . +-- +-- /Space/: \(\mathcal{O}(1)\) +-- +-- /Time/: \(\mathcal{O}(n)\) +{-# INLINE windowStdDev #-} +windowStdDev :: (Monad m, Floating a) => Scanl m (a, Maybe a) a +windowStdDev = sqrt <$> windowVariance + +-- XXX Need a tee3 operation for better performance. + +-- | Skewness \(\gamma\) is the standardized third central moment defined as: +-- +-- \(\tilde{\mu}_3 = \frac{\mu_3}{\sigma^3}\) +-- +-- The third central moment can be computed in terms of raw moments: +-- +-- \(\mu_3 = 2(\mu'_1)^3 - 3\mu'_1\mu'_2 + \mu'_3\) +-- +-- Substituting \(\mu'_1 = \mu\), and \(\mu'_2 = \mu^2 + \sigma^2\): +-- +-- \(\mu_3 = -\mu^3 - 3\mu\sigma^2 + \mu'_3\) +-- +-- Skewness is a measure of symmetry of the probability distribution. It is 0 +-- for a symmetric distribution, negative for a distribution that is skewed +-- towards left, positive for a distribution skewed towards right. +-- +-- For a normal like distribution the median can be found around +-- \(\mu - \frac{\gamma\sigma}{6}\) and the mode can be found around +-- \(\mu - \frac{\gamma \sigma}{2}\). +-- +-- See https://en.wikipedia.org/wiki/Skewness . +-- +{-# INLINE windowSkewness #-} +windowSkewness :: (Monad m, Floating a) => Scanl m (a, Maybe a) a +windowSkewness = + (\rm3 sd mu -> + rm3 / sd ^ (3 :: Int) - 3 * (mu / sd) - (mu / sd) ^ (3 :: Int) + ) + <$> windowRawMoment 3 + <*> windowStdDev + <*> Scanl.windowMean + +-- XXX We can compute the 2nd, 3rd, 4th raw moments by repeatedly multiplying +-- instead of computing the powers every time. +-- XXX Need a tee4 operation for better performance. + +-- | Kurtosis \(\kappa\) is the standardized fourth central moment, defined as: +-- +-- \(\tilde{\mu}_4 = \frac{\mu_4}{\sigma^4}\) +-- +-- The fourth central moment can be computed in terms of raw moments: +-- +-- \(\mu_4 = -3(\mu'_1)^4 + 6(\mu'_1)^2\mu'_2 - 4\mu'_1\mu'_3\ + \mu'_4\) +-- +-- Substituting \(\mu'_1 = \mu\), and \(\mu'_2 = \mu^2 + \sigma^2\): +-- +-- \(\mu_4 = 3\mu^4 + 6\mu^2\sigma^2 - 4\mu\mu'_3 + \mu'_4\) +-- +-- It is always non-negative. It is 0 for a point distribution, low for light +-- tailed (platykurtic) distributions and high for heavy tailed (leptokurtic) +-- distributions. +-- +-- \(\kappa >= \gamma^2 + 1\) +-- +-- For a normal distribution \(\kappa = 3\sigma^4\). +-- +-- See https://en.wikipedia.org/wiki/Kurtosis . +-- +{-# INLINE windowKurtosis #-} +windowKurtosis :: (Monad m, Floating a) => Scanl m (a, Maybe a) a +windowKurtosis = + (\rm4 rm3 sd mu -> + ( 3 * mu ^ (4 :: Int) + + 6 * mu ^ (2 :: Int) * sd ^ (2 :: Int) + - 4 * mu * rm3 + + rm4) / (sd ^ (4 :: Int)) + ) + <$> windowRawMoment 4 + <*> windowRawMoment 3 + <*> windowStdDev + <*> Scanl.windowMean + +------------------------------------------------------------------------------- +-- Estimation +------------------------------------------------------------------------------- + +-- | Unbiased sample variance i.e. the variance of a sample corrected to +-- better estimate the variance of the population, defined as: +-- +-- \(s^2 = \frac{1}{n - 1}\sum_{i=1}^n {(x_{i}-\mu)}^2\) +-- +-- \(s^2 = \frac{n}{n - 1} \times \sigma^2\). +-- +-- See https://en.wikipedia.org/wiki/Bessel%27s_correction. +-- +{-# INLINE windowSampleVariance #-} +windowSampleVariance :: (Monad m, Fractional a) => Scanl m (a, Maybe a) a +windowSampleVariance = + Scanl.teeWith (\n s2 -> n * s2 / (n - 1)) Scanl.windowLength windowVariance + +-- | Sample standard deviation: +-- +-- \(s = \sqrt{sampleVariance}\) +-- +-- >>> sampleStdDev = sqrt <$> sampleVariance +-- +-- See https://en.wikipedia.org/wiki/Unbiased_estimation_of_standard_deviation +-- . +-- +{-# INLINE windowSampleStdDev #-} +windowSampleStdDev :: (Monad m, Floating a) => Scanl m (a, Maybe a) a +windowSampleStdDev = sqrt <$> windowSampleVariance + +-- | Standard error of the sample mean (SEM), defined as: +-- +-- \( SEM = \frac{sampleStdDev}{\sqrt{n}} \) +-- +-- See https://en.wikipedia.org/wiki/Standard_error . +-- +-- /Space/: \(\mathcal{O}(1)\) +-- +-- /Time/: \(\mathcal{O}(n)\) +{-# INLINE windowStdErrMean #-} +windowStdErrMean :: (Monad m, Floating a) => Scanl m (a, Maybe a) a +windowStdErrMean = + Scanl.teeWith (\sd n -> sd / sqrt n) windowSampleStdDev Scanl.windowLength + +------------------------------------------------------------------------------- +-- Probability Distribution +------------------------------------------------------------------------------- + +-- XXX We can use a Windowed classifyWith operation, that will allow us to +-- express windowed frequency, mode, histograms etc idiomatically. + +-- | Count the frequency of elements in a sliding window. +-- +-- >>> input = Stream.fromList [1,1,3,4,4::Int] +-- >>> f = Ring.slidingWindow 4 Statistics.frequency +-- >>> Stream.fold f input +-- fromList [(1,1),(3,1),(4,2)] +-- +{-# INLINE windowFrequency #-} +windowFrequency :: (Monad m, Ord a) => Scanl m (a, Maybe a) (Map a Int) +windowFrequency = Scanl.mkScanl step Map.empty + + where + + decrement v = + if v == 1 + then Nothing + else Just (v - 1) + + step refCountMap (new, mOld) = + let m1 = Map.insertWith (+) new 1 refCountMap + in case mOld of + Just k -> Map.update decrement k m1 + Nothing -> m1 diff --git a/stack.yaml b/stack.yaml new file mode 100644 index 0000000..d12ae90 --- /dev/null +++ b/stack.yaml @@ -0,0 +1,19 @@ +resolver: lts-22.33 +packages: +- '.' +#extra-deps: +# - streamly-core-0.3.0 +# - streamly-0.11.0 + +extra-deps: +- github: composewell/streamly + commit: "9539dcca361bb05eaa04cfe5be99ce38d8a1f98f" +- github: composewell/streamly + commit: "9539dcca361bb05eaa04cfe5be99ce38d8a1f98f" + subdirs: + - core + +# Look at https://stackoverflow.com/questions/70045586/could-not-find-module-system-console-mintty-win32-when-compiling-test-framework +flags: + mintty: + Win32-2-13-1: false diff --git a/streamly-statistics.cabal b/streamly-statistics.cabal index 02a3697..035d198 100644 --- a/streamly-statistics.cabal +++ b/streamly-statistics.cabal @@ -107,10 +107,12 @@ common ghc-options library import: ghc-options - exposed-modules: Streamly.Statistics + exposed-modules: + Streamly.Statistics + , Streamly.Statistics.Scanl build-depends: base >= 4.9 && < 5 - , streamly-core >= 0.2.0 - , containers >= 0.5 && < 0.7 + , streamly-core >= 0.3.0 + , containers >= 0.5 && < 0.8 , random >= 1.2 && < 1.3 , mwc-random >= 0.15 && < 0.16 , deque >= 0.4.4 && < 0.4.5 @@ -122,13 +124,13 @@ test-suite test hs-source-dirs: test main-is: Main.hs build-depends: streamly-statistics - , streamly-core >= 0.2.0 + , streamly-core >= 0.3.0 , base >= 4.9 && < 5 - , QuickCheck >= 2.10 && < 2.15 + , QuickCheck >= 2.10 && < 2.16 , hspec >= 2.0 && < 3 , hspec-core >= 2.0 && < 3 , random >= 1.0.0 && < 2 - , containers >= 0.5 && < 0.7 + , containers >= 0.5 && < 0.8 -- XXX Should remove these dependencies , vector >= 0.11 && < 0.14 , statistics >= 0.15 && < 0.17 @@ -140,12 +142,12 @@ benchmark benchmark hs-source-dirs: benchmark main-is: Main.hs build-depends: streamly-statistics - , streamly-core >= 0.2.0 + , streamly-core >= 0.3.0 , base >= 4.9 && < 5 , random >= 1.0.0 && < 2 - , deepseq >= 1.4.1 && < 1.5 - , tasty-bench >= 0.3 && < 0.4 - , tasty >= 1.4.1 && < 1.5 + , deepseq >= 1.4.1 && < 1.6 + , tasty-bench >= 0.3 && < 0.5 + , tasty >= 1.4.1 && < 1.6 mixins: tasty-bench (Test.Tasty.Bench as Gauge ,Test.Tasty.Bench as Gauge.Main diff --git a/test/Main.hs b/test/Main.hs index 095babe..12f795f 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -18,10 +18,13 @@ import qualified Statistics.Sample.Powers as STAT import qualified Statistics.Transform as STAT import qualified Streamly.Data.Array as Array import qualified Streamly.Data.Fold as Fold +import qualified Streamly.Internal.Data.Fold as Fold + (windowFold, windowFoldWith) import qualified Streamly.Data.MutArray as MA -import qualified Streamly.Internal.Data.Ring as Ring import qualified Streamly.Data.Stream as Stream import qualified Streamly.Data.Stream as S +import qualified Streamly.Internal.Data.Scanl as Scanl +import qualified Streamly.Statistics.Scanl as Stat import Prelude hiding (sum, maximum, minimum) @@ -68,7 +71,7 @@ testDistributions func fld = let var2 = func . STAT.powers 2 $ V.fromList ls strm = S.fromList ls var1 <- - liftIO $ S.fold (Ring.slidingWindow list_length fld) strm + liftIO $ S.fold (Fold.windowFold list_length fld) strm assert (validate $ abs (var1 - var2)) testVariance :: Property @@ -81,9 +84,9 @@ testFuncMD :: Fold.Fold IO ((Double, Maybe Double), IO (MA.MutArray Double)) Double -> Spec testFuncMD f = do let c = S.fromList [10.0, 11.0, 12.0, 14.0] - a1 <- runIO $ S.fold (Ring.slidingWindowWith 2 f) c - a2 <- runIO $ S.fold (Ring.slidingWindowWith 3 f) c - a3 <- runIO $ S.fold (Ring.slidingWindowWith 4 f) c + a1 <- runIO $ S.fold (Fold.windowFoldWith 2 f) c + a2 <- runIO $ S.fold (Fold.windowFoldWith 3 f) c + a3 <- runIO $ S.fold (Fold.windowFoldWith 4 f) c it ("MD should be 1.0 , 1.1111111111111114 , 1.25 but actual is " ++ show a1 ++ " " ++ show a2 ++ " " ++ show a3) ( validate (abs (a1 - 1.0)) @@ -95,7 +98,7 @@ testFuncKurt :: Spec testFuncKurt = do let c = S.fromList [21.3 :: Double, 38.4, 12.7, 41.6] - krt <- runIO $ S.fold (Ring.slidingWindow 4 kurtosis) c + krt <- runIO $ S.fold (Fold.windowFold 4 kurtosis) c it ( "kurtosis should be 1.2762447351370185 Actual is " ++ show krt ) @@ -208,7 +211,7 @@ main = hspec $ do deviationLimit = 1 testFunc f = do let c = S.fromList testCase - a <- runIO $ S.fold (Ring.slidingWindow winSize f) c + a <- runIO $ S.fold (Fold.windowFold winSize f) c b <- runIO $ S.fold f $ S.drop (numElem - winSize) $ fmap (, Nothing) c let c1 = a - b @@ -226,9 +229,13 @@ main = hspec $ do testFunc tc f sI sW = do let c = S.fromList tc - a <- runIO $ S.fold Fold.toList $ S.postscan f $ fmap (, Nothing) c - b <- runIO $ S.fold Fold.toList $ S.postscan - (Ring.slidingWindow winSize f) c + a <- runIO + $ S.fold Fold.toList + $ S.postscanl f + $ fmap (, Nothing) c + b <- runIO + $ S.fold Fold.toList + $ S.postscanl (Scanl.windowScan winSize f) c it "Infinite" $ a == sI it ("Finite " ++ show winSize) $ b == sW @@ -254,27 +261,27 @@ main = hspec $ do describe "minimum" $ do let scanInf = [31, 31, 31, 26, 26, 26, 26] :: [Double] scanWin = [31, 31, 31, 26, 26, 26, 53] :: [Double] - testFunc testCase1 minimum scanInf scanWin + testFunc testCase1 Stat.windowMinimum scanInf scanWin describe "maximum" $ do let scanInf = [31, 41, 59, 59, 59, 59, 97] :: [Double] scanWin = [31, 41, 59, 59, 59, 58, 97] :: [Double] - testFunc testCase1 maximum scanInf scanWin + testFunc testCase1 Stat.windowMaximum scanInf scanWin describe "range" $ do let scanInf = [0, 10, 28, 33, 33, 33, 71] :: [Double] scanWin = [0, 10, 28, 33, 33, 32, 44] :: [Double] - testFunc testCase1 range scanInf scanWin + testFunc testCase1 Stat.windowRange scanInf scanWin describe "sum" $ do let scanInf = [1, 2, 3, 4, 5, 12] :: [Double] scanWin = [1, 2, 3, 3, 3, 9] :: [Double] - testFunc testCase2 sum scanInf scanWin + testFunc testCase2 Scanl.windowSum scanInf scanWin describe "mean" $ do let scanInf = [1, 1, 1, 1, 1, 2] :: [Double] scanWin = [1, 1, 1, 1, 1, 3] :: [Double] - testFunc testCase2 mean scanInf scanWin + testFunc testCase2 Scanl.windowMean scanInf scanWin describe "welfordMean" $ do let scanInf = [1, 1, 1, 1, 1, 2] :: [Double] scanWin = [1, 1, 1, 1, 1, 3] :: [Double] - testFunc testCase2 welfordMean scanInf scanWin + testFunc testCase2 Stat.windowWelfordMean scanInf scanWin -- Probability Distribution describe "frequency"