diff --git a/ReleaseNotes/release_notes.md b/ReleaseNotes/release_notes.md index 782ce1cb..7542e5a9 100644 --- a/ReleaseNotes/release_notes.md +++ b/ReleaseNotes/release_notes.md @@ -12,14 +12,17 @@ 1. small fixes for `ostap.utuls.split_ranges` 1. add conversion to int for `RooAbsCategory` 1. add iterator/contains/len functions for `RooAbsDataStore` - + 1. add some simple utilities for goodness-of-fit studies `ostap.stats.gof` + ## Backward incompatible: - 1. change the interface for fuctions from `ostap.stats.ustat` module - 1. change the interface for `Ostap::UStat` class + 1. change the interface for functions from the `ostap.stats.ustat` module + 1. change the interface for the `Ostap::UStat` class ## Bug fixes: + 1. fix a newly introduced bug in `ostap.utils.utils.split_range` +` # v1.10.0.2 diff --git a/ostap/stats/gof.py b/ostap/stats/gof.py new file mode 100644 index 00000000..5990a741 --- /dev/null +++ b/ostap/stats/gof.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# ============================================================================= +## @file ostap/stats/gof.py +# Set of utulities for goodness-of-fit studies +# @author Vanya BELYAEV Ivan.Belyaev@cern.ch +# @date 2023-12-06 +# ============================================================================= +""" Simple utulities for goodness-of-fit studies +""" +# ============================================================================= +__version__ = "$Revision$" +__author__ = "Vanya BELYAEV Ivan.Belyaev@cern.ch" +__date__ = "2023-12-06" +__all__ = ( + 'nEff' , ## get number of effective entries + 'mean_var' , ## mean and variance for (weighted) arrays + 'normalize' , ## "normalize" variables in dataset/structured array + ) +# ============================================================================= +from ostap.core.core import VE, Ostap +from ostap.core.ostap_types import string_types +# ============================================================================= +try : + import numpy as np + _np_floats = np.float16, np.float32, np.float64, np.float128 +except ImportError : + np = None +# ============================================================================= +# logging +# ============================================================================= +from ostap.logger.logger import getLogger +if '__main__' == __name__ : logger = getLogger( 'ostap.stats.gof' ) +else : logger = getLogger( __name__ ) +# ============================================================================= +logger.debug( 'Simple utilities for goodness-of-fit studies') + + +# ============================================================================ +## Get the mean and variance for (1D) data array with optional (1D) weight array +# @code +# ds = ... ## dataste as structured array +# mean, cov2 = mean_var ( ds ['x'] ) +# @endcode +# - with weight +# @code +# ds = ... ## dataset as structured array with weight +# mean, cov2 = mean_var ( ds ['x'] , ds['weight'] ) +# @endcode +def mean_var ( data , weight = None ) : + """Get the mean and variance for 1D-data array with optional 1D-weight array + + >>> ds = ... ## dataste as structured array + >>> mean, cov2 = mean_var ( ds ['x'] ) + + - with weight + + >>> ds = ... ## dataset as structured array with weight + >>> mean, cov2 = mean_var ( ds ['x'] , ds['weight'] ) + """ + # + if weight is None : + mean = np.mean ( data , axis = 0 , dtype = np.float64 ) + var = np.var ( data , axis = 0 , dtype = np.float64 ) + return mean , var + # + mean = np.average ( data , weights = weight , axis = 0 ) + var = np.average ( ( data - mean ) ** 2 , weights = weight , axis = 0 ) + # + return mean , var + +# ============================================================================= +## Get the effectibe number of entries for 1D-array +# \f{ n_{eff} = \frac{ \left\langle x \right\rangle^2} +# { \left\langle x^2 \right\rangle } \f} +def nEff ( weights ) : + """Get the effectibe number of entries for 1D-array + + n_eff = ( sum ( x ) ) ^2 / sum ( x^2 ) + """ + + s1 = np.sum ( weights , dtype = np.float64 ) + s2 = np.sum ( weights ** 2 , dtype = np.float64 ) + + return s1 * s1 / s2 + +# ============================================================================= +## Get the "normalized" input datasets +# All floating felds are calculated as +# \f[ x = \frac{x - \left\langle x \right\rangle}{\sigma} \f] +# where \f$ \left\langle x \right\rangle\f$ is mena value +# and \f$ \sigma \f$ is a standard deviation. +# +# @code +# ds = ... # data set as structured array +# dsn = normalize ( ds ) +# @endcode +# +# - If several datasets are specified, all floating names must be the same +# and the mean and sigma are either taken either from the first dataset, +# if first=True or as combined through all datasets otherwise +# +# @code +# ds1 = ... # data set as structured array +# ds2 = ... # data set as structured array +# ds3 = ... # data set as structured array +# ds1n, ds2n, ds3n = normalize ( ds1 , ds2 , ds3 , first = True ) +# @endcode +# +# - If weight is specified, this floating column is considered +# as the weight +# +# @code +# ds = ... # data set as structured array with weight +# dsn = normalize ( ds , weight = 'weight' ) +# @endcode +# +# @code +# ds1 = ... # data set as structured array without weight +# ds2 = ... # data set as structured array with weight +# ds1n , ds2n = normalize ( ds1 , ds2 , weight = ( None , 'weight' ) ) +# @endcode +# +# @attention Only the floating point columns are transformed! +# @attention Input datasets are expected to be numpy structured arrays +# +# @code +# ds = ... # data set as structured array +# dsn = normalize ( ds ) +# @endcode +def normalize ( ds , *others , weight = () , first = True ) : + """ Get the `normalized' input datasets + All floating felds are calculated as + + x = (x - )/sigma + + - is a mean value + - is a standard deviation. + + - If several datasets are specified, all floating names must be the same + and the mean and sigma are either taken either from the first dataset, + if `first=True` or as combined through all datasets, otherwise + + - If `weight` is specified, this floating column is concidered + as the weight + + - attention Only the floating point columns are transformed! + - attention Input datasets are expected to be numpy structured arrays + """ + + if not weight : + weight = ( len ( others ) + 1 ) * [ '' ] + elif isinstance ( weight , string_types ) : + weight = [ weight] + len ( others ) * [ '' ] + + assert ( len ( weight ) == len ( others ) + 1 ) and \ + all ( ( not w ) or isinstance ( w , string_types ) for w in weight ) , \ + 'Invalid specification of weight!' + + weight = list ( weight ) + for i,w in enumerate ( weight ) : + if not w : weight [ i ] = '' + weight = tuple ( weight ) + + data = ( ds , ) + others + result = [] + + ## collect the floating columns + columns = [] + w0 = weight [ 0 ] + for n,t in ds.dtype.fields.items () : + if t[0] in _np_floats and n != w0 : columns.append ( n ) + + + vmeans = [] + for i , c in enumerate ( columns ) : + mean, var = mean_var ( ds [ c ] , None if not weight [ 0 ] else ds [ weight [ 0] ] ) + vmeans.append ( VE ( mean , var ) ) + + ## Number of events/effective entries + nevents = 1.0 * ds.shape [ 0 ] if not weight [ 0 ] else nEff ( ds [ weight [ 0 ] ] ) + + if not first and others : + nevents = ds.shape[0] + for k , dd in others : + + nn = 1.0 * dd.shape [ 0 ] if not weight [ k ] else nEff ( dd [ weight [ k ] ] ) + + for i , c in enumerate ( columns ) : + + mean, var = mean_var ( dd [ c ] , None if not weight [ k ] else dd [ weight [ k ] ] ) + + vv = VE ( mean , var ) + nn = dd.shape [ 0 ] + + vmean [ i ] = Ostap.Math.two_samples ( vmean [ i ] , nevents , vv , nn ) + + nevents += nn + + + result = [] + for k , d in enumerate ( ( ds , *others ) ) : + nds = d.copy () + result.append ( nds ) + for ic,c in enumerate ( columns ) : + a = nds [ c ] + vv = vmeans [ ic ] + mean = vv.value () + sigma = vv.error () + + nds [ c ] = ( a - mean ) / sigma + + + + return result [ 0 ] if not others else tuple ( result ) + + +# ============================================================================= +if '__main__' == __name__ : + + from ostap.utils.docme import docme + docme ( __name__ , logger = logger ) + + +# ============================================================================= +## The END +# ============================================================================= + + +