Skip to content

Commit

Permalink
1. extend Point-to-Point Dissimilarity GoF test for 1D case
Browse files Browse the repository at this point in the history
  1. extend 1D GoF test to include Point-to-Point Dissimilarity GoF test
  • Loading branch information
VanyaBelyaev committed Oct 6, 2024
1 parent 1026e57 commit 7b17672
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 49 deletions.
2 changes: 2 additions & 0 deletions ReleaseNotes/release_notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
1. modify `point-to-point-dissimilarity` GoF method: split into chubnks for large datasets, use parallel processing for permutations
1. add keyword arguments to `WorkManager.iexecute` method
1. add an option to run parallel permutations using `WorkManager` parallelisation instead of `joblib`
1. extend Point-to-Point Dissimilarity GoF test for 1D case
1. extend 1D GoF test to include Point-to-Point Dissimilarity GoF test

## Backward incompatible

Expand Down
2 changes: 1 addition & 1 deletion ostap/stats/gof1d.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,7 +477,7 @@ def run ( self , nToys = 1000 , silent = False ) :
vct_cdf = self.vcdf

from ostap.utils.progress_bar import progress_bar
for i in progress_bar ( nToys , silent = silent ) :
for i in progress_bar ( nToys , silent = silent , description = 'Run toys:') :

dset = self.__pdf.generate ( self.N , sample = True )
data = dset.tonumpy ( varname ) [ varname ]
Expand Down
15 changes: 11 additions & 4 deletions ostap/stats/gof_np.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ def __call__ ( self , N , silent = True ) :
n1 = len ( self.ds1 )
pooled = np.concatenate ( [ self.ds1 , self.ds2 ] )
counter = EffCounter()
for i in progress_bar ( N , silent = silent ) :
for i in progress_bar ( N , silent = silent , description = 'Permutations:') :
np.random.shuffle ( pooled )
tv = self.gof.t_value ( pooled [ : n1 ] , pooled [ n1: ] )
counter += bool ( self.t_value < tv )
Expand All @@ -267,7 +267,7 @@ def joblib_run ( self , NN , silent = True ) :
with warnings.catch_warnings():
warnings.simplefilter ( "ignore" ) ## , category = DeprecationWarning )
results = jl.Parallel ( **conf ) ( input )
for r in progress_bar ( results , max_value = nj , silent = silent ) :
for r in progress_bar ( results , max_value = nj , silent = silent , description = 'Jobs:') :
counter += r
##
return counter
Expand Down Expand Up @@ -306,8 +306,7 @@ class PPDNP(AGoFNP,GoFNP) :
""" Implementation of concrete method "Point-To-Point Dissimilarity"
for probing of Goodness-Of-Fit
- see M.Williams, "How good are your fits? Unbinned multivariate goodness-of-fit tests in high energy physics"
- see https://doi.org/10.1088/1748-0221/5/09/P09004
- see http://arxiv.org/abs/arXiv:1003.1768
- see https://doi.org/10.1088/1748-0221/5/09/P09004 - see http://arxiv.org/abs/arXiv:1003.1768
"""
def __init__ ( self ,
mc2mc = False ,
Expand Down Expand Up @@ -423,6 +422,10 @@ def __call__ ( self , data1 , data2 , normalize = True ) :
## convert to unstructured datasets
uds1 = s2u ( data1 , copy = False )
uds2 = s2u ( data2 , copy = False )

## For 1D-arrays add a fictiev second dimension to please `cdist`-function
if 1 == uds1.shape [ 1 ] : uds1 = np.c_[ uds1 , np.zeros ( len ( uds1 ) ) ]
if 1 == uds2.shape [ 1 ] : uds2 = np.c_[ uds2 , np.zeros ( len ( uds2 ) ) ]

return self.t_value ( uds1 , uds2 )

Expand All @@ -448,6 +451,10 @@ def pvalue ( self , data1 , data2 , normalize = True ) :
uds1 = s2u ( ds1 , copy = False )
uds2 = s2u ( ds2 , copy = False )

## For 1D-arrays add a fictive second dimension to please `cdist`-function
if 1 == uds1.shape [ 1 ] : uds1 = np.c_[ uds1 , np.zeros ( len ( uds1 ) ) ]
if 1 == uds2.shape [ 1 ] : uds2 = np.c_[ uds2 , np.zeros ( len ( uds2 ) ) ]

t_value = self.t_value ( uds1 , uds2 )

permutator = PERMUTATOR ( self , t_value , uds1 , uds2 )
Expand Down
162 changes: 118 additions & 44 deletions ostap/stats/tests/test_stats_gof1d.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,20 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ==============================================================================
# @file ostap/stats/tests/test_stats_gof1d.py
# Test Goddness-of-fits 1D
# Test Goddness-of-fits for 1D fits
# Copyright (c) Ostap developpers.
# ==============================================================================
""" # Test averages for inconsistend data
""" Test Goddness-of-fits for 1D fits
"""
# ==============================================================================
import ostap.fitting.models as M
import ostap.stats.gof1d as G1D
import ostap.stats.gofnd as GnD
import ostap.logger.table as T
from ostap.plotting.canvas import use_canvas
from ostap.logger.pretty import pretty_float
from ostap.utils.utils import vrange
import ROOT
# ==============================================================================
from ostap.logger.logger import getLogger
Expand All @@ -20,77 +25,141 @@
gauss = M.Gauss_pdf ( 'G' , xvar = xvar , mean = 5 , sigma = 1 )
model = M.Fit1D ( signal = gauss , background = 'flat' )

ND = 200

ND = 200
# =============================================================================
## data_g: pure gaussian
gauss.mean = 5
gauss.sigma = 0.75
data_g = gauss.generate ( ND , sample = True )
# =============================================================================
## data_b: 95% gaussian + 4% flat background
model.S = 0.95 * ND
model.B = 0.05 * ND
data_b = model.generate ( ND , sample = True )


fitconf = { 'draw' : True , 'nbins' : 50 , 'refit' : 5 , 'quiet' : True }
# ==============================================================================
def test_good_fit_1 ( ) :
## Run Point-to_pint dissimilatory Goodness-of-Fit test
def run_PPD ( pdf , data, result , logger ) :
""" Run Point-to_pint dissimilatory Goodness-of-Fit test"""

logger = getLogger ( 'test_good_fit_1' )
logger.info ( 'Make a test for presumably good fit' )

gauss.mean = 5
gauss.sigma = 0.75
data = gauss.generate ( ND , sample = True )
from ostap.stats.gof_np import np,sp,s2u,cdist
if not np or not sp or not s2u or not cdist:
logger.warning ('No numpy/scipy/s4u/cdist: skip the PPD estimate!')
return

# - Point to Point Sissimilarity test with Gaussian distance using different "sigma"
rows = [ ( 'PPD/sigma' , 't-value' , 'x[..]', 'p-value [%]' ) ]
Ns = 20
logger.info ( 'Run Point-to-Point Dissimilarity GoF-test for %d different values of sigma' % Ns )
for sigma in vrange ( 0.01 , 2.0 , Ns ) :

ppd = GnD.PPD ( Nperm = 1000 , sigma = sigma )
pdf.load_params ( result , silent = True )
tvalue = ppd ( pdf , data )
tvalue, pvalue = ppd.pvalue ( pdf , data )

tv , texpo = pretty_float ( tvalue )
pvalue *= 100
pvalue = '%4.1f +/- %.1f' % ( pvalue.value() , pvalue.error() )
row = '%.2f' % sigma , tv , '10^%+d' % texpo if texpo else '' , pvalue
rows.append ( row )

title= 'Goodness-of-fit PPD-test'
table = T.table ( rows , title = title , prefix = '# ')
logger.info ( '%s:\n%s' % ( title , table ) )


with use_canvas ( 'test_good_fit_1' , wait = 1 ) :
gauss.fitTo ( data , draw = True , nbins = 50 , quite = True )
# ==============================================================================
def test_good_fit_1 ( ) :
""" Make a test for presumably good fit: fit Gauss to Gauss
"""

logger = getLogger ( 'test_good_fit_1' )
logger.info ( 'Make a test for presumably good fit: fit Gauss to Gauss' )

with use_canvas ( 'test_good_fit_1: G -> G' , wait = 1 ) :
r , f = gauss.fitTo ( data_g , **fitconf )

with use_canvas ( 'test_good_fit_1: GoF' , wait = 1 ) :
gof = G1D.GoF1D ( gauss , data )

gauss.load_params ( r , silent = True )
gof = G1D.GoF1D ( gauss , data_g )
logger.info ( 'Goodness-of-fit:\n%s' % gof )

got = G1D.GoF1DToys ( gauss , data )
gauss.load_params ( r , silent = True )
got = G1D.GoF1DToys ( gauss , data_g )
logger.info ( 'Goodness-of-fit with toys:\n%s' % got )

## Try to use multidimensional methods
run_PPD ( gauss , data_g , r , logger )

# =============================================================================
def test_good_fit_2 ( ) :
""" Make a test for presumably good fit: fit Gauss+Bkg to Gauss
"""

logger = getLogger ( 'test_good_fit_2' )
logger.info ( 'Make a test for presumably good fit' )
logger.info ( 'Make a test for presumably good fit: fit Gauss+Bkg to Gauss' )

with use_canvas ( 'test_good_fit_2: G+B -> G' , wait = 1 ) :
r , f = model.fitTo ( data_g , **fitconf )

with use_canvas ( 'test_good_fit_2: GoF' , wait = 1 ) :
model.load_params ( r , silent = True )
gof = G1D.GoF1D ( model , data_g )
logger.info ( 'Goodness-of-fit:\n%s' % gof )

gauss.mean = 5
gauss.sigma = 0.75
data = gauss.generate ( ND , sample = True )
model.load_params ( r , silent = True )
got = G1D.GoF1DToys ( model , data_g , 500 )
logger.info ( 'Goodness-of-fit with toys:\n%s' % got )

## Try to use multidimensional methods
run_PPD ( model , data_g , r , logger )

with use_canvas ( 'test_good_fit_2' , wait = 1 ) :
model.fitTo ( data , draw = True , nbins = 50 , refit = 5 , quite = True )

# =============================================================================
def test_good_fit_3 ( ) :

logger = getLogger ( 'test_good_fit_3' )
logger.info ( 'Make a test for presumably good fit: fit Gauss+Bkg to Gauss+Bkg' )

with use_canvas ( 'test_good_fit_3: G+B -> G+G' , wait = 1 ) :
r , f = model.fitTo ( data_b , **fitconf )

with use_canvas ( 'test_good_fit_2: GoF' , wait = 1 ) :
gof = G1D.GoF1D ( model , data )
model.load_params ( r , silent = True )
gof = G1D.GoF1D ( model , data_b )
logger.info ( 'Goodness-of-fit:\n%s' % gof )

got = G1D.GoF1DToys ( model , data , 500 )
got.run ( 500 )
got.run ( 500 )
model.load_params ( r , silent = True )
got = G1D.GoF1DToys ( model , data_b , 500 )
logger.info ( 'Goodness-of-fit with toys:\n%s' % got )


## Try to use multidimensional methods
run_PPD ( model , data_b , r , logger )

# =============================================================================
def test_bad_fit_1 ( ) :

""" Make a test for presumably bad fit: fit Gauss to Gauss+Bkg
"""
logger = getLogger ( 'test_bad_fit_1' )
logger.info ( 'Make a test for presumably bad fit' )

gauss.mean = 5
gauss.sigma = 0.75
model.S = 0.96 * ND
model.B = 0.04 * ND

data = model.generate ( ND , sample = True )
logger.info ( 'Make a test for presumably bad fit: fit Gauss to Gauss+Bkg' )


with use_canvas ( 'test_bad_fit_1' , wait = 1 ) :
gauss.fitTo ( data , draw = True , nbins = 50 , quite = True )
with use_canvas ( 'test_bad_fit_1: G -> G+B' , wait = 1 ) :
r , f = gauss.fitTo ( data_b , **fitconf )

with use_canvas ( 'test_bad_fit_1: GoF' , wait = 1 ) :
gof = G1D.GoF1D ( gauss , data )

gauss.load_params ( r , silent = True )
gof = G1D.GoF1D ( gauss , data_b )
logger.info ( 'Goodness-of-fit:\n%s' % gof )
gof.draw()
got = G1D.GoF1DToys ( gauss , data , 5000 )
gof.draw()

gauss.load_params ( r , silent = True )
got = G1D.GoF1DToys ( gauss , data_b , 5000 )
got.run ( 10000 )
logger.info ( 'Goodness-of-fit with toys:\n%s' % got )

Expand All @@ -108,14 +177,19 @@ def test_bad_fit_1 ( ) :
dza = got.draw('ZA')
with use_canvas ( 'test_bad_fit_1: GoF/ZC' , wait = 3 ) :
dzc = got.draw('ZC')

## Try to use multidimensional methods
run_PPD ( gauss , data_b , r , logger )


# ===============================================================================
if '__main__' == __name__ :

test_good_fit_1 ()
test_good_fit_2 ()
test_bad_fit_1 ()

test_good_fit_1 () ## fit Gauss to Gauss
test_good_fit_2 () ## fit Gauss+Bkg to Gauss
test_good_fit_3 () ## fit Gauss+Bkg to Gauss+Bkg
test_bad_fit_1 () ## fit Gauss to Gauss+Bkg

# ===============================================================================
## The END
# ===============================================================================
Expand Down
1 change: 1 addition & 0 deletions ostap/stats/tests/test_stats_gofnd.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ==============================================================================
# @file ostap/stats/tests/test_stats_gof1d.py
Expand Down

0 comments on commit 7b17672

Please sign in to comment.