0.4.1

CyrilJl · Jun 9, 2024 · e0bce24 · e0bce24
1 parent 838c0a9
commit e0bce24
Show file tree

Hide file tree

Showing 7 changed files with 103 additions and 16 deletions.
diff --git a/batchstats/__init__.py b/batchstats/__init__.py
@@ -4,4 +4,4 @@
 __all__ = ['BatchCov', 'BatchMax', 'BatchMean', 'BatchMin', 'BatchPeakToPeak', 'BatchStat', 'BatchStd', 'BatchSum',
            'BatchVar', 'BatchNanMean', 'BatchNanStat', 'BatchNanSum']
 
-__version__ = '0.4'
+__version__ = '0.4.1'
diff --git a/batchstats/_misc.py b/batchstats/_misc.py
@@ -20,6 +20,27 @@ class UnequalSamplesNumber(ValueError):
     pass
 
 
+class DifferentAxisError(ValueError):
+    """
+    Error raised when two BatchStats objects are merged but have different `axis`.
+    """
+    pass
+
+
+class DifferentShapesError(ValueError):
+    """
+    Error raised when two BatchStats objects are merged but have different shapes.
+    """
+    pass
+
+
+class DifferentStatsError(ValueError):
+    """
+    Error raised when two BatchStats objects are merged but hare not of the same type.
+    """
+    pass
+
+
 def any_nan(x, axis=None):
     """
     Check if there are any NaN values in the input array.

diff --git a/batchstats/core.py b/batchstats/core.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-from ._misc import any_nan, check_params
+from ._misc import DifferentAxisError, DifferentShapesError, DifferentStatsError, any_nan, check_params
 
 
 class BatchStat:
@@ -51,6 +51,15 @@ def _process_batch(self, batch, assume_valid=False):
     def __repr__(self):
         return f"{self.__class__.__name__}()"
 
+    def merge_test(self, other, field: str):
+        if type(self) != type(other):
+            raise DifferentStatsError()
+        if self.axis != other.axis:
+            raise DifferentAxisError()
+        if hasattr(self, field) and hasattr(other, field):
+            if getattr(self, field).shape != getattr(other, field).shape:
+                raise DifferentShapesError()
+
 
 class BatchNanStat:
     """

diff --git a/batchstats/stats.py b/batchstats/stats.py
@@ -53,6 +53,18 @@ def __call__(self) -> np.ndarray:
         else:
             return self.sum.copy()
 
+    def __add__(self, other):
+        self.merge_test(other, field='sum')
+        if self.n_samples == 0:
+            return other
+        elif other.n_samples == 0:
+            return self
+        else:
+            ret = BatchSum(axis=self.axis)
+            ret.n_samples = self.n_samples + other.n_samples
+            ret.sum = self.sum + other.sum
+            return ret
+
 
 class BatchMax(BatchStat):
     """
@@ -101,6 +113,18 @@ def __call__(self) -> np.ndarray:
         else:
             return self.max.copy()
 
+    def __add__(self, other):
+        self.merge_test(other, field='max')
+        if self.n_samples == 0:
+            return other
+        elif other.n_samples == 0:
+            return self
+        else:
+            ret = BatchMax(axis=self.axis)
+            ret.n_samples = self.n_samples + other.n_samples
+            ret.max = np.maximum(self.max, other.max)
+            return ret
+
 
 class BatchMin(BatchStat):
     """
@@ -149,6 +173,18 @@ def __call__(self) -> np.ndarray:
         else:
             return self.min.copy()
 
+    def __add__(self, other):
+        self.merge_test(other, field='min')
+        if self.n_samples == 0:
+            return other
+        elif other.n_samples == 0:
+            return self
+        else:
+            ret = BatchMin(axis=self.axis)
+            ret.n_samples = self.n_samples + other.n_samples
+            ret.max = np.minimum(self.max, other.max)
+            return ret
+
 
 class BatchMean(BatchStat):
     """
@@ -178,7 +214,8 @@ def update_batch(self, batch, assume_valid=False):
             if self.mean is None:
                 self.mean = np.mean(valid_batch, axis=self.axis)
             else:
-                self.mean = ((self.n_samples - n) * self.mean + np.sum(valid_batch, axis=self.axis)) / self.n_samples
+                mean_batch = np.mean(valid_batch, axis=self.axis)
+                self.mean = ((self.n_samples - n) * self.mean + np.sum(valid_batch-mean_batch, axis=self.axis) + n*mean_batch) / self.n_samples
         return self
 
     def __call__(self) -> np.ndarray:
@@ -197,6 +234,19 @@ def __call__(self) -> np.ndarray:
         else:
             return self.mean.copy()
 
+    def __add__(self, other):
+        self.merge_test(other, field='mean')
+        if self.n_samples == 0:
+            return other
+        elif other.n_samples == 0:
+            return self
+        else:
+            ret = BatchMean(axis=self.axis)
+            ret.n_samples = self.n_samples + other.n_samples
+            ret.mean = self.n_samples*self.mean + other.n_samples*other.mean
+            ret.mean /= ret.n_samples
+            return ret
+
 
 class BatchPeakToPeak(BatchStat):
     """

diff --git a/docs/source/future.rst b/docs/source/future.rst
@@ -1,5 +1,13 @@
 .. Future Development
 
+What's New ?
+============
+
+Version 0.4.1 (June 9, 2024)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+- Improved numerical stability of the ``BatchMean``'s algorithm, based on `Numerically Stable Parallel Computation of (Co-)Variance by Schubert and Gertz<https://ds.ifi.uni-heidelberg.de/files/Team/eschubert/publications/SSDBM18-covariance-authorcopy.pdf>`_
+
+
 Future Plans
 ============
 

diff --git a/setup.py b/setup.py
@@ -1,3 +1,4 @@
+import numpy as np
 from setuptools import find_packages, setup
 
 # Read version from the __init__.py file
@@ -7,14 +8,12 @@
             version = line.strip().split()[-1][1:-1]
             break
 
-setup(
-    name='batchstats',
-    version=version,
-    author='Cyril Joly',
-    description='Efficient batch statistics computation library for Python.',
-    long_description=open('README.md').read(),
-    long_description_content_type='text/markdown',
-    url='https://github.com/CyrilJl/BatchStats',
-    packages=find_packages(),
-    install_requires=['numpy'],
-)
+setup(name='batchstats',
+      version=version,
+      author='Cyril Joly',
+      description='Efficient batch statistics computation library for Python.',
+      long_description=open('README.md').read(),
+      long_description_content_type='text/markdown',
+      url='https://github.com/CyrilJl/BatchStats',
+      packages=find_packages(),
+      install_requires=['numpy'])
diff --git a/tests/test_stats.py b/tests/test_stats.py
@@ -7,13 +7,13 @@
 @pytest.fixture
 def data():
     m, n = 1_000_000, 50
-    return np.random.randn(m, n)
+    return 1e1*np.random.randn(m, n) + 1e3
 
 
 @pytest.fixture
 def data_2d_features():
     m, n, o = 100_000, 50, 60
-    return np.random.randn(m, n, o)
+    return 1e1*np.random.randn(m, n, o) + 1e3
 
 
 @pytest.fixture