Skip to content

Commit

Permalink
Added stats module originally developed for omc3 (#35)
Browse files Browse the repository at this point in the history
  • Loading branch information
lmalina authored Aug 16, 2023
1 parent 3bb1e22 commit 79776a7
Show file tree
Hide file tree
Showing 3 changed files with 437 additions and 0 deletions.
346 changes: 346 additions & 0 deletions pySC/utils/stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,346 @@
"""
Stats
-----
Helper module providing statistical methods to compute various weighted averages along specified
axis and their errors as well as unbiased error estimator of infinite normal distribution from
finite-sized sample.
"""
import numpy as np
from scipy.special import erf
from scipy.stats import t

from pySC.utils import logging_tools

LOGGER = logging_tools.get_logger(__name__)

CONFIDENCE_LEVEL = (1 + erf(1 / np.sqrt(2))) / 2
PI2: float = 2 * np.pi
PI2I: float = 2j * np.pi


def circular_mean(data, period=PI2, errors=None, axis=None):
    """
    Computes the weighted circular average along the specified axis.

    Parameters:
        data: array-like
            Contains the data to be averaged
        period: scalar, optional, default (2 * np.pi)
            Periodicity period of data
        errors: array-like, optional
            Contains errors associated with the values in data, used to calculate weights
        axis: int or tuple of ints, optional
            Axis or axes along which to average data

    Returns:
        The weighted circular average along the specified axis.
    """
    # Map each value onto the unit circle, take the weighted mean of the
    # resulting phasors, then convert the mean angle back to the data period.
    unit_phasors = np.exp(data * PI2I / period)
    mean_phasor = np.average(unit_phasors, axis=axis,
                             weights=weights_from_errors(errors, period=period))
    return np.angle(mean_phasor) * period / PI2


def circular_nanmean(data, period=PI2, errors=None, axis=None):
    """Wrapper around circular_mean with added NaN handling (NaN entries in data are masked out)."""
    return circular_mean(data=np.ma.array(data, mask=np.isnan(data)),
                         period=period,
                         # errors are masked with data's NaN mask so both arrays stay aligned
                         errors= None if errors is None else np.ma.array(errors, mask=np.isnan(data)),
                         axis=axis)


def circular_error(data, period=PI2, errors=None, axis=None, t_value_corr=True):
    """
    Computes the error of the weighted circular average along the specified axis.

    Parameters:
        data: array-like
            Contains the data to be averaged
        period: scalar, optional
            Periodicity period of data, default is (2 * np.pi)
        errors: array-like, optional
            Contains errors associated with the values in data, used to calculate weights
        axis: int or tuple of ints, optional
            Axis or axes along which to average data
        t_value_corr: bool, optional
            Specifies if the error is corrected for small sample size, default True

    Returns:
        Returns the error of weighted circular average along the specified axis.
    """
    # Map data onto the unit circle and compute the weighted mean phasor.
    phases = data * PI2I / period
    weights = weights_from_errors(errors, period=period)
    complex_phases = np.exp(phases)
    complex_average = np.average(complex_phases, axis=axis, weights=weights)

    # Weighted sample variance of the phasors around their mean; the mean is
    # reshaped (reduced axes set to length 1) so it broadcasts against the data.
    (sample_variance, sum_of_weights) = np.average(
        np.square(np.abs(complex_phases - complex_average.reshape(_get_shape(
            complex_phases.shape, axis)))), weights=weights, axis=axis, returned=True)
    if weights is not None:
        # Add the propagated measurement-error contribution (1 / sum of weights).
        sample_variance = sample_variance + 1. / sum_of_weights
    # Unbias the variance; nan_to_num guards against |mean phasor| == 0 division.
    error_of_complex_average = np.sqrt(sample_variance * unbias_variance(data, weights, axis=axis))
    phase_error = np.nan_to_num(error_of_complex_average / np.abs(complex_average))
    if t_value_corr:
        phase_error = phase_error * t_value_correction(effective_sample_size(data, weights, axis=axis))
    # Errors exceeding a quarter turn are capped at 0.3 of the period.
    return np.where(phase_error > 0.25 * PI2, 0.3 * period, phase_error * period / PI2)


def circular_nanerror(data, period=PI2, errors=None, axis=None, t_value_corr=True):
    """Wrapper around circular_error with added NaN handling (NaN entries in data are masked out)."""
    return circular_error(data=np.ma.array(data, mask=np.isnan(data)),
                          period=period,
                          # errors are masked with data's NaN mask so both arrays stay aligned
                          errors=None if errors is None else np.ma.array(errors, mask=np.isnan(data)),
                          axis=axis,
                          t_value_corr=t_value_corr)


def weighted_mean(data, errors=None, axis=None):
    """
    Computes the weighted average along the specified axis.

    Parameters:
        data: array-like
            Contains the data to be averaged
        errors: array-like, optional
            Contains errors associated with the values in data, used to calculate weights
        axis: int or tuple of ints, optional
            Axis or axes along which to average data

    Returns:
        The weighted average along the specified axis.
    """
    # weights_from_errors returns None for unusable errors, in which case
    # np.average falls back to a plain mean.
    return np.average(data, axis=axis, weights=weights_from_errors(errors))


def weighted_nanmean(data, errors=None, axis=None):
    """Wrapper around weighted_mean with added NaN handling (NaN entries in data are masked out)."""
    return weighted_mean(data=np.ma.array(data, mask=np.isnan(data)),
                         # errors are masked with data's NaN mask so both arrays stay aligned
                         errors=None if errors is None else np.ma.array(errors, mask=np.isnan(data)),
                         axis=axis)


def _get_shape(orig_shape, axis):
new_shape = np.array(orig_shape)
if axis is None:
new_shape[:] = 1
else:
new_shape[np.array(axis)] = 1
return tuple(new_shape)


def weighted_error(data, errors=None, axis=None, t_value_corr=True):
    """
    Computes the error of the weighted average along the specified axis.

    This is similar to calculating the standard deviation on the data,
    but with both, the average to which the deviation is calculated,
    as well as then the averaging over the deviations weighted by
    weights based on the errors.
    In addition, the weighted variance is unbiased by an unbias-factor
    n / (n-1), where n is the effective sample size (see :meth:`effective_sample_size`).
    Additionally, a (student) t-value correction can be performed (done by default)
    which corrects the estimate for small data sets.

    Parameters:
        data: array-like
            Contains the data on which the weighted error on the average is calculated.
        errors: array-like, optional
            Contains errors associated with the values in data, used to calculate weights
        axis: int or tuple of ints, optional
            Axis or axes along which to average data
        t_value_corr: bool, optional
            Specifies if the error is corrected for small sample size, default True

    Returns:
        Returns the error of weighted average along the specified axis.
    """
    weights = weights_from_errors(errors)
    weighted_average = np.average(data, axis=axis, weights=weights)
    # Reduced axes are restored with length 1 so the mean broadcasts against data.
    weighted_average = weighted_average.reshape(_get_shape(data.shape, axis))
    (sample_variance, sum_of_weights) = np.ma.average(
        np.square(np.abs(data - weighted_average)),
        weights=weights, axis=axis, returned=True
    )
    if weights is not None:
        # Add the propagated measurement-error contribution (1 / sum of weights).
        sample_variance = sample_variance + 1 / sum_of_weights
    # Unbias the variance; nan_to_num guards against NaNs from degenerate input.
    error = np.nan_to_num(np.sqrt(sample_variance * unbias_variance(data, weights, axis=axis)))
    if t_value_corr:
        error = error * t_value_correction(effective_sample_size(data, weights, axis=axis))
    return error


def circular_rms(data, period=PI2, axis=None):
    """
    Computes the circular root mean square along the specified axis.

    Parameters:
        data: array-like
            Contains the data to be averaged
        period: scalar, optional
            Periodicity period of data, default is (2 * np.pi)
        axis: int or tuple of ints, optional
            Axis or axes along which to average data

    Returns:
        The circular root mean square along the specified axis.
    """
    # Squared data, normalised to the period, is averaged circularly with
    # period 1 and scaled back.
    # NOTE(review): the result scales with sqrt(period), not period —
    # confirm this normalisation is intended.
    normalised_mean_square = circular_mean(np.square(data / period), period=1, axis=axis)
    return np.sqrt(normalised_mean_square * period)


def rms(data, axis=None):
    """
    Computes the root mean square along the specified axis.

    Parameters:
        data: array-like
            Contains the data to be averaged
        axis: int or tuple of ints, optional
            Axis or axes along which to average data

    Returns:
        The root mean square along the specified axis.
    """
    # An unweighted RMS is the weighted RMS with no errors supplied.
    return weighted_rms(data, errors=None, axis=axis)


def weighted_rms(data, errors=None, axis=None):
    """
    Computes the weighted root mean square along the specified axis.

    Parameters:
        data: array-like
            Contains the data to be averaged
        errors: array-like, optional
            Contains errors associated with the values in data, used to calculate weights
        axis: int or tuple of ints, optional
            Axis or axes along which to average data

    Returns:
        The weighted root mean square along the specified axis.
    """
    mean_square = np.average(np.square(data), weights=weights_from_errors(errors), axis=axis)
    return np.sqrt(mean_square)


def weighted_nanrms(data, errors=None, axis=None):
    """Wrapper around weighted_rms with added NaN handling (NaN entries in data are masked out)."""
    return weighted_rms(data=np.ma.array(data, mask=np.isnan(data)),
                        # errors are masked with data's NaN mask so both arrays stay aligned
                        errors=None if errors is None else np.ma.array(errors, mask=np.isnan(data)),
                        axis=axis)


def weights_from_errors(errors, period=PI2):
    """
    Computes weights from measurement errors; weights are not produced if the
    errors contain zeros or NaNs (a warning is logged instead).

    Parameters:
        errors: array-like
            Contains errors which are used to calculate weights
        period: scalar, optional
            Periodicity period of data, default is (2 * np.pi)

    Returns:
        Array of weights (inverse squared errors, rescaled to a 2-pi period),
        or None if no usable weights can be computed.
    """
    if errors is None:
        return None
    if np.any(np.isnan(errors)):
        LOGGER.warning("NaNs found, weights are not used.")
        return None
    if np.any(np.logical_not(errors)):
        LOGGER.warning("Zeros found, weights are not used.")
        return None
    scaled_errors = errors * PI2 / period
    return 1 / np.square(scaled_errors)


def effective_sample_size(data, weights, axis=None):
    r"""
    Computes the effective sample size of weighted data along the specified axis;
    the minimum value returned is 2 to avoid non-reasonable error blow-up.

    It is calculated via Kish's approximate formula
    from the (not necessarily normalized) weights :math:`w_i` (see wikipedia):

    .. math::
        n_\mathrm{eff} = \frac{\left(\sum_i w_i\right)^2}{\sum_i w_i^2}

    What it represents:
    "In most instances, weighting causes a decrease in the statistical significance of results.
    The effective sample size is a measure of the precision of the survey
    (e.g., even if you have a sample of 1,000 people, an effective sample size of 100 would indicate
    that the weighted sample is no more robust than a well-executed un-weighted
    simple random sample of 100 people)." -
    https://wiki.q-researchsoftware.com/wiki/Weights,_Effective_Sample_Size_and_Design_Effects

    Parameters:
        data: array-like
        weights: array-like
            Contains weights associated with the values in data
        axis: int or tuple of ints, optional
            Axis or axes along which the effective sample size is computed

    Returns:
        The effective sample size along the specified axis, clamped to >= 2.
    """
    if weights is None:
        # Unweighted case: plain element count along the reduced axes.
        sample_size = np.sum(np.ones(data.shape), axis=axis)
    else:
        weight_total = np.sum(weights, axis=axis)
        sample_size = np.square(weight_total) / np.sum(np.square(weights), axis=axis)
    # Clamp to a minimum of 2 to keep later 1/(n-1) factors reasonable.
    return np.where(sample_size > 2, sample_size, 2)


def unbias_variance(data, weights, axis=None):
    r"""
    Computes a correction factor to unbias the variance of the weighted average
    of data along the specified axis, e.g. transform the standard deviation

    .. math::
        \sigma^2 = \frac{1}{N} \sum_{i=1}^N (x_i - x_\mathrm{mean})^2

    into an un-biased estimator

    .. math::
        \sigma^2 = \frac{1}{N-1} \sum_{i=1}^N (x_i - x_\mathrm{mean})^2

    Parameters:
        data: array-like
        weights: array-like
            Contains weights associated with the values in data
        axis: int or tuple of ints, optional
            Axis or axes along which the effective sample size is computed

    Returns:
        The multiplicative unbias factor n / (n - 1), with n the effective
        sample size along the specified axis.
    """
    sample_size = effective_sample_size(data, weights, axis=axis)
    try:
        return sample_size / (sample_size - 1)
    # BUGFIX: the original `except ZeroDivisionError or RuntimeWarning:` caught
    # only ZeroDivisionError (`or` evaluates to its first truthy operand);
    # a tuple is required to catch both. Note: effective_sample_size clamps
    # n >= 2, so the division should not fail; the guard is kept defensively.
    except (ZeroDivisionError, RuntimeWarning):
        return np.zeros(sample_size.shape)


def t_value_correction(sample_size):
    """
    Calculates the multiplicative correction factor to determine the standard
    deviation of a normally distributed quantity from the standard deviation of
    its finite-sized sample. The minimum allowed sample size is 2 to avoid
    non-reasonable error blow-up; for smaller sample sizes 2 is used instead.

    Note (jdilly): In other words, this transforms the area of 1 sigma under
    the given student t distribution to the 1 sigma area of a normal
    distribution (this transformation is where the ``CONFIDENCE_LEVEL``
    comes in).

    Args:
        sample_size: array-like

    Returns:
        Multiplicative correction factor(s) of the same shape as sample_size.
        Can contain NaNs.
    """
    # Clamp the sample size to >= 2; degrees of freedom is sample_size - 1.
    clamped_size = np.where(sample_size > 2, sample_size, 2)
    return t.ppf(CONFIDENCE_LEVEL, clamped_size - 1)
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
# Dependencies for the package itself
DEPENDENCIES = [
"numpy>=1.22.0",
"scipy>=1.8.0",
"matplotlib>=3.5.0",
"accelerator-toolbox>=0.4.0"

Expand Down
Loading

0 comments on commit 79776a7

Please sign in to comment.