Skip to content

Commit

Permalink
1. add some simple utilities for goodness-of-fit studies `ostap.stat…
Browse files Browse the repository at this point in the history
…s.gof`
  • Loading branch information
VanyaBelyaev committed Dec 6, 2023
1 parent 96b2b5c commit 7b00e44
Show file tree
Hide file tree
Showing 2 changed files with 236 additions and 3 deletions.
9 changes: 6 additions & 3 deletions ReleaseNotes/release_notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,17 @@
1. small fixes for `ostap.utuls.split_ranges`
1. add conversion to int for `RooAbsCategory`
1. add iterator/contains/len functions for `RooAbsDataStore`

1. add some simple utilities for goodness-of-fit studies `ostap.stats.gof`

## Backward incompatible:

1. change the interface for fuctions from `ostap.stats.ustat` module
1. change the interface for `Ostap::UStat` class
1. change the interface for functions from the `ostap.stats.ustat` module
1. change the interface for the `Ostap::UStat` class

## Bug fixes:

1. fix a newly introduced bug in `ostap.utils.utils.split_range`
`

# v1.10.0.2

Expand Down
230 changes: 230 additions & 0 deletions ostap/stats/gof.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# =============================================================================
## @file ostap/stats/gof.py
# Set of utulities for goodness-of-fit studies
# @author Vanya BELYAEV [email protected]
# @date 2023-12-06
# =============================================================================
""" Simple utulities for goodness-of-fit studies
"""
# =============================================================================
__version__ = "$Revision$"
__author__ = "Vanya BELYAEV [email protected]"
__date__ = "2023-12-06"
__all__ = (
'nEff' , ## get number of effective entries
'mean_var' , ## mean and variance for (weighted) arrays
'normalize' , ## "normalize" variables in dataset/structured array
)
# =============================================================================
from ostap.core.core import VE, Ostap
from ostap.core.ostap_types import string_types
# =============================================================================
try :
import numpy as np
_np_floats = np.float16, np.float32, np.float64, np.float128
except ImportError :
np = None
# =============================================================================
# logging
# =============================================================================
from ostap.logger.logger import getLogger
if '__main__' == __name__ : logger = getLogger( 'ostap.stats.gof' )
else : logger = getLogger( __name__ )
# =============================================================================
logger.debug( 'Simple utilities for goodness-of-fit studies')


# ============================================================================
## Get the mean and variance for (1D) data array with optional (1D) weight array
# @code
# ds = ... ## dataste as structured array
# mean, cov2 = mean_var ( ds ['x'] )
# @endcode
# - with weight
# @code
# ds = ... ## dataset as structured array with weight
# mean, cov2 = mean_var ( ds ['x'] , ds['weight'] )
# @endcode
def mean_var ( data , weight = None ) :
"""Get the mean and variance for 1D-data array with optional 1D-weight array
>>> ds = ... ## dataste as structured array
>>> mean, cov2 = mean_var ( ds ['x'] )
- with weight
>>> ds = ... ## dataset as structured array with weight
>>> mean, cov2 = mean_var ( ds ['x'] , ds['weight'] )
"""
#
if weight is None :
mean = np.mean ( data , axis = 0 , dtype = np.float64 )
var = np.var ( data , axis = 0 , dtype = np.float64 )
return mean , var
#
mean = np.average ( data , weights = weight , axis = 0 )
var = np.average ( ( data - mean ) ** 2 , weights = weight , axis = 0 )
#
return mean , var

# =============================================================================
## Get the effectibe number of entries for 1D-array
# \f{ n_{eff} = \frac{ \left\langle x \right\rangle^2}
# { \left\langle x^2 \right\rangle } \f}
def nEff ( weights ) :
"""Get the effectibe number of entries for 1D-array
n_eff = ( sum ( x ) ) ^2 / sum ( x^2 )
"""

s1 = np.sum ( weights , dtype = np.float64 )
s2 = np.sum ( weights ** 2 , dtype = np.float64 )

return s1 * s1 / s2

# =============================================================================
## Get the "normalized" input datasets
# All floating felds are calculated as
# \f[ x = \frac{x - \left\langle x \right\rangle}{\sigma} \f]
# where \f$ \left\langle x \right\rangle\f$ is mena value
# and \f$ \sigma \f$ is a standard deviation.
#
# @code
# ds = ... # data set as structured array
# dsn = normalize ( ds )
# @endcode
#
# - If several datasets are specified, all floating names must be the same
# and the mean and sigma are either taken either from the first dataset,
# if <code>first=True</code> or as combined through all datasets otherwise
#
# @code
# ds1 = ... # data set as structured array
# ds2 = ... # data set as structured array
# ds3 = ... # data set as structured array
# ds1n, ds2n, ds3n = normalize ( ds1 , ds2 , ds3 , first = True )
# @endcode
#
# - If <code>weight</code> is specified, this floating column is considered
# as the weight
#
# @code
# ds = ... # data set as structured array with weight
# dsn = normalize ( ds , weight = 'weight' )
# @endcode
#
# @code
# ds1 = ... # data set as structured array without weight
# ds2 = ... # data set as structured array with weight
# ds1n , ds2n = normalize ( ds1 , ds2 , weight = ( None , 'weight' ) )
# @endcode
#
# @attention Only the floating point columns are transformed!
# @attention Input datasets are expected to be numpy structured arrays
#
# @code
# ds = ... # data set as structured array
# dsn = normalize ( ds )
# @endcode
def normalize ( ds , *others , weight = () , first = True ) :
""" Get the `normalized' input datasets
All floating felds are calculated as
x = (x - <x>)/sigma
- <x> is a mean value
- is a standard deviation.
- If several datasets are specified, all floating names must be the same
and the mean and sigma are either taken either from the first dataset,
if `first=True` or as combined through all datasets, otherwise
- If `weight` is specified, this floating column is concidered
as the weight
- attention Only the floating point columns are transformed!
- attention Input datasets are expected to be numpy structured arrays
"""

if not weight :
weight = ( len ( others ) + 1 ) * [ '' ]
elif isinstance ( weight , string_types ) :
weight = [ weight] + len ( others ) * [ '' ]

assert ( len ( weight ) == len ( others ) + 1 ) and \
all ( ( not w ) or isinstance ( w , string_types ) for w in weight ) , \
'Invalid specification of weight!'

weight = list ( weight )
for i,w in enumerate ( weight ) :
if not w : weight [ i ] = ''
weight = tuple ( weight )

data = ( ds , ) + others
result = []

## collect the floating columns
columns = []
w0 = weight [ 0 ]
for n,t in ds.dtype.fields.items () :
if t[0] in _np_floats and n != w0 : columns.append ( n )


vmeans = []
for i , c in enumerate ( columns ) :
mean, var = mean_var ( ds [ c ] , None if not weight [ 0 ] else ds [ weight [ 0] ] )
vmeans.append ( VE ( mean , var ) )

## Number of events/effective entries
nevents = 1.0 * ds.shape [ 0 ] if not weight [ 0 ] else nEff ( ds [ weight [ 0 ] ] )

if not first and others :
nevents = ds.shape[0]
for k , dd in others :

nn = 1.0 * dd.shape [ 0 ] if not weight [ k ] else nEff ( dd [ weight [ k ] ] )

for i , c in enumerate ( columns ) :

mean, var = mean_var ( dd [ c ] , None if not weight [ k ] else dd [ weight [ k ] ] )

vv = VE ( mean , var )
nn = dd.shape [ 0 ]

vmean [ i ] = Ostap.Math.two_samples ( vmean [ i ] , nevents , vv , nn )

nevents += nn


result = []
for k , d in enumerate ( ( ds , *others ) ) :
nds = d.copy ()
result.append ( nds )
for ic,c in enumerate ( columns ) :
a = nds [ c ]
vv = vmeans [ ic ]
mean = vv.value ()
sigma = vv.error ()

nds [ c ] = ( a - mean ) / sigma



return result [ 0 ] if not others else tuple ( result )


# =============================================================================
if '__main__' == __name__ :

from ostap.utils.docme import docme
docme ( __name__ , logger = logger )


# =============================================================================
## The END
# =============================================================================



0 comments on commit 7b00e44

Please sign in to comment.