From 7b176720bfda8a1ca9bd39f1a1a27704f55ee21d Mon Sep 17 00:00:00 2001 From: Vanya Belyaev Date: Sun, 6 Oct 2024 13:06:54 +0200 Subject: [PATCH] 1. extend Point-to-Point Dissimilarity GoF test for 1D case 1. extend 1D GoF test to include Point-to-Point Dissimilarity GoF test --- ReleaseNotes/release_notes.md | 2 + ostap/stats/gof1d.py | 2 +- ostap/stats/gof_np.py | 15 ++- ostap/stats/tests/test_stats_gof1d.py | 162 +++++++++++++++++++------- ostap/stats/tests/test_stats_gofnd.py | 1 + 5 files changed, 133 insertions(+), 49 deletions(-) diff --git a/ReleaseNotes/release_notes.md b/ReleaseNotes/release_notes.md index 67201efb..cd2a90d2 100644 --- a/ReleaseNotes/release_notes.md +++ b/ReleaseNotes/release_notes.md @@ -11,6 +11,8 @@ 1. modify `point-to-point-dissimilarity` GoF method: split into chubnks for large datasets, use parallel processing for permutations 1. add keyword arguments to `WorkManager.iexecute` method 1. add an option to run parallel permutations using `WorkManager` parallelisation instead of `joblib` + 1. extend Point-to-Point Dissimilarity GoF test for 1D case + 1. extend 1D GoF test to include Point-to-Point Dissimilarity GoF test ## Backward incompatible diff --git a/ostap/stats/gof1d.py b/ostap/stats/gof1d.py index c1b64efa..eeb948ae 100644 --- a/ostap/stats/gof1d.py +++ b/ostap/stats/gof1d.py @@ -477,7 +477,7 @@ def run ( self , nToys = 1000 , silent = False ) : vct_cdf = self.vcdf from ostap.utils.progress_bar import progress_bar - for i in progress_bar ( nToys , silent = silent ) : + for i in progress_bar ( nToys , silent = silent , description = 'Run toys:') : dset = self.__pdf.generate ( self.N , sample = True ) data = dset.tonumpy ( varname ) [ varname ] diff --git a/ostap/stats/gof_np.py b/ostap/stats/gof_np.py index 9b5792bb..4c222cd4 100644 --- a/ostap/stats/gof_np.py +++ b/ostap/stats/gof_np.py @@ -241,7 +241,7 @@ def __call__ ( self , N , silent = True ) : n1 = len ( self.ds1 ) pooled = np.concatenate ( [ self.ds1 , self.ds2 ] ) counter = EffCounter() - for i in progress_bar ( N , silent = silent ) : + for i in progress_bar ( N , silent = silent , description = 'Permutations:') : np.random.shuffle ( pooled ) tv = self.gof.t_value ( pooled [ : n1 ] , pooled [ n1: ] ) counter += bool ( self.t_value < tv ) @@ -267,7 +267,7 @@ def joblib_run ( self , NN , silent = True ) : with warnings.catch_warnings(): warnings.simplefilter ( "ignore" ) ## , category = DeprecationWarning ) results = jl.Parallel ( **conf ) ( input ) - for r in progress_bar ( results , max_value = nj , silent = silent ) : + for r in progress_bar ( results , max_value = nj , silent = silent , description = 'Jobs:') : counter += r ## return counter @@ -306,8 +306,7 @@ class PPDNP(AGoFNP,GoFNP) : """ Implementation of concrete method "Point-To-Point Dissimilarity" for probing of Goodness-Of-Fit - see M.Williams, "How good are your fits? Unbinned multivariate goodness-of-fit tests in high energy physics" - - see https://doi.org/10.1088/1748-0221/5/09/P09004 - - see http://arxiv.org/abs/arXiv:1003.1768 + - see https://doi.org/10.1088/1748-0221/5/09/P09004 - see http://arxiv.org/abs/arXiv:1003.1768 """ def __init__ ( self , mc2mc = False , @@ -423,6 +422,10 @@ def __call__ ( self , data1 , data2 , normalize = True ) : ## convert to unstructured datasets uds1 = s2u ( data1 , copy = False ) uds2 = s2u ( data2 , copy = False ) + + ## For 1D-arrays add a fictiev second dimension to please `cdist`-function + if 1 == uds1.shape [ 1 ] : uds1 = np.c_[ uds1 , np.zeros ( len ( uds1 ) ) ] + if 1 == uds2.shape [ 1 ] : uds2 = np.c_[ uds2 , np.zeros ( len ( uds2 ) ) ] return self.t_value ( uds1 , uds2 ) @@ -448,6 +451,10 @@ def pvalue ( self , data1 , data2 , normalize = True ) : uds1 = s2u ( ds1 , copy = False ) uds2 = s2u ( ds2 , copy = False ) + ## For 1D-arrays add a fictive second dimension to please `cdist`-function + if 1 == uds1.shape [ 1 ] : uds1 = np.c_[ uds1 , np.zeros ( len ( uds1 ) ) ] + if 1 == uds2.shape [ 1 ] : uds2 = np.c_[ uds2 , np.zeros ( len ( uds2 ) ) ] + t_value = self.t_value ( uds1 , uds2 ) permutator = PERMUTATOR ( self , t_value , uds1 , uds2 ) diff --git a/ostap/stats/tests/test_stats_gof1d.py b/ostap/stats/tests/test_stats_gof1d.py index 5c902b45..3032a585 100644 --- a/ostap/stats/tests/test_stats_gof1d.py +++ b/ostap/stats/tests/test_stats_gof1d.py @@ -1,15 +1,20 @@ +#!/usr/bin/env python # -*- coding: utf-8 -*- # ============================================================================== # @file ostap/stats/tests/test_stats_gof1d.py -# Test Goddness-of-fits 1D +# Test Goddness-of-fits for 1D fits # Copyright (c) Ostap developpers. # ============================================================================== -""" # Test averages for inconsistend data +""" Test Goddness-of-fits for 1D fits """ # ============================================================================== import ostap.fitting.models as M import ostap.stats.gof1d as G1D +import ostap.stats.gofnd as GnD +import ostap.logger.table as T from ostap.plotting.canvas import use_canvas +from ostap.logger.pretty import pretty_float +from ostap.utils.utils import vrange import ROOT # ============================================================================== from ostap.logger.logger import getLogger @@ -20,77 +25,141 @@ gauss = M.Gauss_pdf ( 'G' , xvar = xvar , mean = 5 , sigma = 1 ) model = M.Fit1D ( signal = gauss , background = 'flat' ) +ND = 200 -ND = 200 +# ============================================================================= +## data_g: pure gaussian +gauss.mean = 5 +gauss.sigma = 0.75 +data_g = gauss.generate ( ND , sample = True ) +# ============================================================================= +## data_b: 95% gaussian + 4% flat background +model.S = 0.95 * ND +model.B = 0.05 * ND +data_b = model.generate ( ND , sample = True ) + + +fitconf = { 'draw' : True , 'nbins' : 50 , 'refit' : 5 , 'quiet' : True } # ============================================================================== -def test_good_fit_1 ( ) : +## Run Point-to_pint dissimilatory Goodness-of-Fit test +def run_PPD ( pdf , data, result , logger ) : + """ Run Point-to_pint dissimilatory Goodness-of-Fit test""" - logger = getLogger ( 'test_good_fit_1' ) - logger.info ( 'Make a test for presumably good fit' ) - - gauss.mean = 5 - gauss.sigma = 0.75 - data = gauss.generate ( ND , sample = True ) + from ostap.stats.gof_np import np,sp,s2u,cdist + if not np or not sp or not s2u or not cdist: + logger.warning ('No numpy/scipy/s4u/cdist: skip the PPD estimate!') + return + + # - Point to Point Sissimilarity test with Gaussian distance using different "sigma" + rows = [ ( 'PPD/sigma' , 't-value' , 'x[..]', 'p-value [%]' ) ] + Ns = 20 + logger.info ( 'Run Point-to-Point Dissimilarity GoF-test for %d different values of sigma' % Ns ) + for sigma in vrange ( 0.01 , 2.0 , Ns ) : + + ppd = GnD.PPD ( Nperm = 1000 , sigma = sigma ) + pdf.load_params ( result , silent = True ) + tvalue = ppd ( pdf , data ) + tvalue, pvalue = ppd.pvalue ( pdf , data ) + + tv , texpo = pretty_float ( tvalue ) + pvalue *= 100 + pvalue = '%4.1f +/- %.1f' % ( pvalue.value() , pvalue.error() ) + row = '%.2f' % sigma , tv , '10^%+d' % texpo if texpo else '' , pvalue + rows.append ( row ) + + title= 'Goodness-of-fit PPD-test' + table = T.table ( rows , title = title , prefix = '# ') + logger.info ( '%s:\n%s' % ( title , table ) ) - with use_canvas ( 'test_good_fit_1' , wait = 1 ) : - gauss.fitTo ( data , draw = True , nbins = 50 , quite = True ) +# ============================================================================== +def test_good_fit_1 ( ) : + """ Make a test for presumably good fit: fit Gauss to Gauss + """ + + logger = getLogger ( 'test_good_fit_1' ) + logger.info ( 'Make a test for presumably good fit: fit Gauss to Gauss' ) + with use_canvas ( 'test_good_fit_1: G -> G' , wait = 1 ) : + r , f = gauss.fitTo ( data_g , **fitconf ) with use_canvas ( 'test_good_fit_1: GoF' , wait = 1 ) : - gof = G1D.GoF1D ( gauss , data ) + + gauss.load_params ( r , silent = True ) + gof = G1D.GoF1D ( gauss , data_g ) logger.info ( 'Goodness-of-fit:\n%s' % gof ) - got = G1D.GoF1DToys ( gauss , data ) + gauss.load_params ( r , silent = True ) + got = G1D.GoF1DToys ( gauss , data_g ) logger.info ( 'Goodness-of-fit with toys:\n%s' % got ) + ## Try to use multidimensional methods + run_PPD ( gauss , data_g , r , logger ) + # ============================================================================= def test_good_fit_2 ( ) : + """ Make a test for presumably good fit: fit Gauss+Bkg to Gauss + """ logger = getLogger ( 'test_good_fit_2' ) - logger.info ( 'Make a test for presumably good fit' ) + logger.info ( 'Make a test for presumably good fit: fit Gauss+Bkg to Gauss' ) + + with use_canvas ( 'test_good_fit_2: G+B -> G' , wait = 1 ) : + r , f = model.fitTo ( data_g , **fitconf ) + + with use_canvas ( 'test_good_fit_2: GoF' , wait = 1 ) : + model.load_params ( r , silent = True ) + gof = G1D.GoF1D ( model , data_g ) + logger.info ( 'Goodness-of-fit:\n%s' % gof ) - gauss.mean = 5 - gauss.sigma = 0.75 - data = gauss.generate ( ND , sample = True ) + model.load_params ( r , silent = True ) + got = G1D.GoF1DToys ( model , data_g , 500 ) + logger.info ( 'Goodness-of-fit with toys:\n%s' % got ) + ## Try to use multidimensional methods + run_PPD ( model , data_g , r , logger ) - with use_canvas ( 'test_good_fit_2' , wait = 1 ) : - model.fitTo ( data , draw = True , nbins = 50 , refit = 5 , quite = True ) +# ============================================================================= +def test_good_fit_3 ( ) : + + logger = getLogger ( 'test_good_fit_3' ) + logger.info ( 'Make a test for presumably good fit: fit Gauss+Bkg to Gauss+Bkg' ) + + with use_canvas ( 'test_good_fit_3: G+B -> G+G' , wait = 1 ) : + r , f = model.fitTo ( data_b , **fitconf ) with use_canvas ( 'test_good_fit_2: GoF' , wait = 1 ) : - gof = G1D.GoF1D ( model , data ) + model.load_params ( r , silent = True ) + gof = G1D.GoF1D ( model , data_b ) logger.info ( 'Goodness-of-fit:\n%s' % gof ) - got = G1D.GoF1DToys ( model , data , 500 ) - got.run ( 500 ) - got.run ( 500 ) + model.load_params ( r , silent = True ) + got = G1D.GoF1DToys ( model , data_b , 500 ) logger.info ( 'Goodness-of-fit with toys:\n%s' % got ) - + ## Try to use multidimensional methods + run_PPD ( model , data_b , r , logger ) + # ============================================================================= def test_bad_fit_1 ( ) : - + """ Make a test for presumably bad fit: fit Gauss to Gauss+Bkg + """ logger = getLogger ( 'test_bad_fit_1' ) - logger.info ( 'Make a test for presumably bad fit' ) - - gauss.mean = 5 - gauss.sigma = 0.75 - model.S = 0.96 * ND - model.B = 0.04 * ND - - data = model.generate ( ND , sample = True ) + logger.info ( 'Make a test for presumably bad fit: fit Gauss to Gauss+Bkg' ) - - with use_canvas ( 'test_bad_fit_1' , wait = 1 ) : - gauss.fitTo ( data , draw = True , nbins = 50 , quite = True ) + with use_canvas ( 'test_bad_fit_1: G -> G+B' , wait = 1 ) : + r , f = gauss.fitTo ( data_b , **fitconf ) with use_canvas ( 'test_bad_fit_1: GoF' , wait = 1 ) : - gof = G1D.GoF1D ( gauss , data ) + + gauss.load_params ( r , silent = True ) + gof = G1D.GoF1D ( gauss , data_b ) logger.info ( 'Goodness-of-fit:\n%s' % gof ) - gof.draw() - got = G1D.GoF1DToys ( gauss , data , 5000 ) + gof.draw() + + gauss.load_params ( r , silent = True ) + got = G1D.GoF1DToys ( gauss , data_b , 5000 ) got.run ( 10000 ) logger.info ( 'Goodness-of-fit with toys:\n%s' % got ) @@ -108,14 +177,19 @@ def test_bad_fit_1 ( ) : dza = got.draw('ZA') with use_canvas ( 'test_bad_fit_1: GoF/ZC' , wait = 3 ) : dzc = got.draw('ZC') + + ## Try to use multidimensional methods + run_PPD ( gauss , data_b , r , logger ) + # =============================================================================== if '__main__' == __name__ : - test_good_fit_1 () - test_good_fit_2 () - test_bad_fit_1 () - + test_good_fit_1 () ## fit Gauss to Gauss + test_good_fit_2 () ## fit Gauss+Bkg to Gauss + test_good_fit_3 () ## fit Gauss+Bkg to Gauss+Bkg + test_bad_fit_1 () ## fit Gauss to Gauss+Bkg + # =============================================================================== ## The END # =============================================================================== diff --git a/ostap/stats/tests/test_stats_gofnd.py b/ostap/stats/tests/test_stats_gofnd.py index 42401ec3..4796f397 100644 --- a/ostap/stats/tests/test_stats_gofnd.py +++ b/ostap/stats/tests/test_stats_gofnd.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python # -*- coding: utf-8 -*- # ============================================================================== # @file ostap/stats/tests/test_stats_gof1d.py