-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
1. add some simple utilities for goodness-of-fit studies `ostap.stat…
…s.gof`
- Loading branch information
1 parent
96b2b5c
commit 7b00e44
Showing
2 changed files
with
236 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,230 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
# ============================================================================= | ||
## @file ostap/stats/gof.py | ||
# Set of utulities for goodness-of-fit studies | ||
# @author Vanya BELYAEV [email protected] | ||
# @date 2023-12-06 | ||
# ============================================================================= | ||
""" Simple utulities for goodness-of-fit studies | ||
""" | ||
# ============================================================================= | ||
__version__ = "$Revision$" | ||
__author__ = "Vanya BELYAEV [email protected]" | ||
__date__ = "2023-12-06" | ||
__all__ = ( | ||
'nEff' , ## get number of effective entries | ||
'mean_var' , ## mean and variance for (weighted) arrays | ||
'normalize' , ## "normalize" variables in dataset/structured array | ||
) | ||
# ============================================================================= | ||
from ostap.core.core import VE, Ostap | ||
from ostap.core.ostap_types import string_types | ||
# ============================================================================= | ||
try : | ||
import numpy as np | ||
_np_floats = np.float16, np.float32, np.float64, np.float128 | ||
except ImportError : | ||
np = None | ||
# ============================================================================= | ||
# logging | ||
# ============================================================================= | ||
from ostap.logger.logger import getLogger | ||
if '__main__' == __name__ : logger = getLogger( 'ostap.stats.gof' ) | ||
else : logger = getLogger( __name__ ) | ||
# ============================================================================= | ||
logger.debug( 'Simple utilities for goodness-of-fit studies') | ||
|
||
|
||
# ============================================================================ | ||
## Get the mean and variance for (1D) data array with optional (1D) weight array | ||
# @code | ||
# ds = ... ## dataste as structured array | ||
# mean, cov2 = mean_var ( ds ['x'] ) | ||
# @endcode | ||
# - with weight | ||
# @code | ||
# ds = ... ## dataset as structured array with weight | ||
# mean, cov2 = mean_var ( ds ['x'] , ds['weight'] ) | ||
# @endcode | ||
def mean_var ( data , weight = None ) : | ||
"""Get the mean and variance for 1D-data array with optional 1D-weight array | ||
>>> ds = ... ## dataste as structured array | ||
>>> mean, cov2 = mean_var ( ds ['x'] ) | ||
- with weight | ||
>>> ds = ... ## dataset as structured array with weight | ||
>>> mean, cov2 = mean_var ( ds ['x'] , ds['weight'] ) | ||
""" | ||
# | ||
if weight is None : | ||
mean = np.mean ( data , axis = 0 , dtype = np.float64 ) | ||
var = np.var ( data , axis = 0 , dtype = np.float64 ) | ||
return mean , var | ||
# | ||
mean = np.average ( data , weights = weight , axis = 0 ) | ||
var = np.average ( ( data - mean ) ** 2 , weights = weight , axis = 0 ) | ||
# | ||
return mean , var | ||
|
||
# ============================================================================= | ||
## Get the effectibe number of entries for 1D-array | ||
# \f{ n_{eff} = \frac{ \left\langle x \right\rangle^2} | ||
# { \left\langle x^2 \right\rangle } \f} | ||
def nEff ( weights ) : | ||
"""Get the effectibe number of entries for 1D-array | ||
n_eff = ( sum ( x ) ) ^2 / sum ( x^2 ) | ||
""" | ||
|
||
s1 = np.sum ( weights , dtype = np.float64 ) | ||
s2 = np.sum ( weights ** 2 , dtype = np.float64 ) | ||
|
||
return s1 * s1 / s2 | ||
|
||
# ============================================================================= | ||
## Get the "normalized" input datasets | ||
# All floating felds are calculated as | ||
# \f[ x = \frac{x - \left\langle x \right\rangle}{\sigma} \f] | ||
# where \f$ \left\langle x \right\rangle\f$ is mena value | ||
# and \f$ \sigma \f$ is a standard deviation. | ||
# | ||
# @code | ||
# ds = ... # data set as structured array | ||
# dsn = normalize ( ds ) | ||
# @endcode | ||
# | ||
# - If several datasets are specified, all floating names must be the same | ||
# and the mean and sigma are either taken either from the first dataset, | ||
# if <code>first=True</code> or as combined through all datasets otherwise | ||
# | ||
# @code | ||
# ds1 = ... # data set as structured array | ||
# ds2 = ... # data set as structured array | ||
# ds3 = ... # data set as structured array | ||
# ds1n, ds2n, ds3n = normalize ( ds1 , ds2 , ds3 , first = True ) | ||
# @endcode | ||
# | ||
# - If <code>weight</code> is specified, this floating column is considered | ||
# as the weight | ||
# | ||
# @code | ||
# ds = ... # data set as structured array with weight | ||
# dsn = normalize ( ds , weight = 'weight' ) | ||
# @endcode | ||
# | ||
# @code | ||
# ds1 = ... # data set as structured array without weight | ||
# ds2 = ... # data set as structured array with weight | ||
# ds1n , ds2n = normalize ( ds1 , ds2 , weight = ( None , 'weight' ) ) | ||
# @endcode | ||
# | ||
# @attention Only the floating point columns are transformed! | ||
# @attention Input datasets are expected to be numpy structured arrays | ||
# | ||
# @code | ||
# ds = ... # data set as structured array | ||
# dsn = normalize ( ds ) | ||
# @endcode | ||
def normalize ( ds , *others , weight = () , first = True ) : | ||
""" Get the `normalized' input datasets | ||
All floating felds are calculated as | ||
x = (x - <x>)/sigma | ||
- <x> is a mean value | ||
- is a standard deviation. | ||
- If several datasets are specified, all floating names must be the same | ||
and the mean and sigma are either taken either from the first dataset, | ||
if `first=True` or as combined through all datasets, otherwise | ||
- If `weight` is specified, this floating column is concidered | ||
as the weight | ||
- attention Only the floating point columns are transformed! | ||
- attention Input datasets are expected to be numpy structured arrays | ||
""" | ||
|
||
if not weight : | ||
weight = ( len ( others ) + 1 ) * [ '' ] | ||
elif isinstance ( weight , string_types ) : | ||
weight = [ weight] + len ( others ) * [ '' ] | ||
|
||
assert ( len ( weight ) == len ( others ) + 1 ) and \ | ||
all ( ( not w ) or isinstance ( w , string_types ) for w in weight ) , \ | ||
'Invalid specification of weight!' | ||
|
||
weight = list ( weight ) | ||
for i,w in enumerate ( weight ) : | ||
if not w : weight [ i ] = '' | ||
weight = tuple ( weight ) | ||
|
||
data = ( ds , ) + others | ||
result = [] | ||
|
||
## collect the floating columns | ||
columns = [] | ||
w0 = weight [ 0 ] | ||
for n,t in ds.dtype.fields.items () : | ||
if t[0] in _np_floats and n != w0 : columns.append ( n ) | ||
|
||
|
||
vmeans = [] | ||
for i , c in enumerate ( columns ) : | ||
mean, var = mean_var ( ds [ c ] , None if not weight [ 0 ] else ds [ weight [ 0] ] ) | ||
vmeans.append ( VE ( mean , var ) ) | ||
|
||
## Number of events/effective entries | ||
nevents = 1.0 * ds.shape [ 0 ] if not weight [ 0 ] else nEff ( ds [ weight [ 0 ] ] ) | ||
|
||
if not first and others : | ||
nevents = ds.shape[0] | ||
for k , dd in others : | ||
|
||
nn = 1.0 * dd.shape [ 0 ] if not weight [ k ] else nEff ( dd [ weight [ k ] ] ) | ||
|
||
for i , c in enumerate ( columns ) : | ||
|
||
mean, var = mean_var ( dd [ c ] , None if not weight [ k ] else dd [ weight [ k ] ] ) | ||
|
||
vv = VE ( mean , var ) | ||
nn = dd.shape [ 0 ] | ||
|
||
vmean [ i ] = Ostap.Math.two_samples ( vmean [ i ] , nevents , vv , nn ) | ||
|
||
nevents += nn | ||
|
||
|
||
result = [] | ||
for k , d in enumerate ( ( ds , *others ) ) : | ||
nds = d.copy () | ||
result.append ( nds ) | ||
for ic,c in enumerate ( columns ) : | ||
a = nds [ c ] | ||
vv = vmeans [ ic ] | ||
mean = vv.value () | ||
sigma = vv.error () | ||
|
||
nds [ c ] = ( a - mean ) / sigma | ||
|
||
|
||
|
||
return result [ 0 ] if not others else tuple ( result ) | ||
|
||
|
||
# ============================================================================= | ||
if '__main__' == __name__ : | ||
|
||
from ostap.utils.docme import docme | ||
docme ( __name__ , logger = logger ) | ||
|
||
|
||
# ============================================================================= | ||
## The END | ||
# ============================================================================= | ||
|
||
|
||
|