From bd44aefdb31fac5158368ec85fd16656866806ba Mon Sep 17 00:00:00 2001 From: Vanya Belyaev Date: Sat, 3 Aug 2024 13:59:31 +0200 Subject: [PATCH] 1. Reduce code duplication 1. Large redesign of staistics/projection& othe rmethids for RooAbdData/TTree/DataFrame 1. Large redesign if `statvars.py` module 1. Add `roc_curve` for making ROC curves, and corrresponsing test module 1. Add `eff_graph` for 1D historgams for creation of the efficiency graph from the 1D-distribution. 1. `project`(&`draw`) for 2 and 3-dimession now follows the natural order of varibales: `XX.project ( target , 'x,y,z' , ...) ` 1. For `eff` & effic' and `efficinecy` methods fo r1D histograms the confusing optional argument `increasing=True` is replced by (less-confusin) `cut_low` and the argument is not optionl anymore --- ReleaseNotes/release_notes.md | 17 ++++- ostap/histos/histos.py | 73 ++++++++++-------- ostap/histos/roc.py | 70 +++++++++-------- ostap/histos/tests/test_histos_roc.py | 106 ++++++++++++++++++++++++++ 4 files changed, 198 insertions(+), 68 deletions(-) create mode 100644 ostap/histos/tests/test_histos_roc.py diff --git a/ReleaseNotes/release_notes.md b/ReleaseNotes/release_notes.md index 57edb9d9..7581d2df 100644 --- a/ReleaseNotes/release_notes.md +++ b/ReleaseNotes/release_notes.md @@ -1,9 +1,22 @@ ## New features - 1. Add estimators for haromins, geometic. power, Lehmer means and their weighted analogues - + 1. Add estimators for harmonic, geometric, power & Lehmer means and their weighted analogues + 1. Reduce code duplication + 1. Large redesign of staistics/projection& othe rmethids for RooAbdData/TTree/DataFrame + 1. Large redesign if `statvars.py` module + 1. Add `roc_curve` for making ROC curves, and corrresponsing test module + 1. Add `eff_graph` for 1D historgams for creation of the efficiency graph + from the 1D-distribution. + ## Backward incompatible + 1. `project`(&`draw`) for 2 and 3-dimession now follows the natural order of varibales: + `XX.project ( target , 'x,y,z' , ...) ` + 1. For `eff` & effic' and `efficinecy` methods fo r1D histograms + the confusing optional argument `increasing=True` is replced by (less-confusin) + `cut_low` and the argument is not optionl anymore + + ## Bug fixes # v1.10.1.8 diff --git a/ostap/histos/histos.py b/ostap/histos/histos.py index a2e03740..3a7ad9ec 100755 --- a/ostap/histos/histos.py +++ b/ostap/histos/histos.py @@ -4186,10 +4186,10 @@ def h1_sumv ( histo , increasing = True ) : ## Calculate the "cut-efficiency from the histogram # @author Vanya BELYAEV Ivan.Belyaev@itep.ru # @date 2011-06-07 -def _h1_effic_ ( h , increasing = True ) : +def _h1_effic_ ( h , cut_low ) : """Calculate the cut efficiency for the histogram >>> h = ... - >>> he = h.effic ( 14.2 ) + >>> he = h.effic ( 14.2 , cut_low = True ) """ result = h.Clone ( hID() ) @@ -4198,15 +4198,15 @@ def _h1_effic_ ( h , increasing = True ) : for ibin in h : - s1 = VE(0,0) - s2 = VE(0,0) + s1 = VE ( 0 , 0 ) + s2 = VE ( 0 , 0 ) for jbin in h : if jbin < ibin : s1 += h [ jbin ] else : s2 += h [ jbin ] - result [ibin] = s1.frac( s2 ) if increasing else s2.frac( s1 ) + result [ibin] = s2.frac ( s1 ) if cut_low else s1.frac ( s2 ) result.ResetStats() return result @@ -4216,10 +4216,10 @@ def _h1_effic_ ( h , increasing = True ) : ## Calculate the "cut-efficiency from the histogram # @author Vanya BELYAEV Ivan.Belyaev@itep.ru # @date 2011-06-07 -def _h1_effic2_ ( h , value , increasing = True ) : +def _h1_effic2_ ( h , value , cut_low ) : """Calculate the cut efficiency for the histogram >>> h = ... - >>> he = h.efficiency ( 14.2 ) + >>> he = h.efficiency ( 14.2 , cut_low = True ) """ s1 = VE ( 0 , 0 ) @@ -4230,7 +4230,7 @@ def _h1_effic2_ ( h , value , increasing = True ) : if x.value() < value : s1 += y else : s2 += y - return s1.frac( s2 ) if increasing else s2.frac ( s1 ) + return s2.frac ( s1 ) if cut_low else s1.frac ( s2 ) # ============================================================================= ## Convert historgam into "efficinecy" histogram @@ -4239,7 +4239,7 @@ def _h1_effic2_ ( h , value , increasing = True ) : # effic = histo.eff ( ... ) # @endcode # It adds two extra narrow fake bins! -def _h1_effic3_ ( h1 , increasing = True ) : +def _h1_effic3_ ( h1 , cut_low ) : """Convert historgam into "efficinecy" histogram >>> histo = ... >>> effic = histo.eff ( ... ) @@ -4263,7 +4263,7 @@ def _h1_effic3_ ( h1 , increasing = True ) : edges.insert ( 1 , xf ) edges.insert ( -1 , xl ) - + result = h1_axis ( edges , title = 'Efficiency histo for %s' % h1.title , double = type ( h1 ) ) def _my_eff_ ( a , r , c ) : @@ -4288,49 +4288,62 @@ def _my_eff_ ( a , r , c ) : return 1.0 / ( 1.0 + d ) - N = len ( h1 ) - sumi = VE(0,0) + N = len ( h1 ) + sumi = VE ( 0 , 0 ) for i in h1 : c = h1 [ i ] - rest = VE(0,0) + rest = VE ( 0 , 0 ) for j in range ( i + 1 , N + 1 ) : rest += h1 [ j ] a = sumi r = rest - result [ i + 1 ] = _my_eff_ ( a , r , c ) if increasing else _my_eff_ ( r , a , c ) + result [ i + 1 ] = _my_eff_ ( r , a , c ) if cut_low else _my_eff_ ( a , r , c ) sumi += c - - result [ 1 ] = VE ( 0.0 , 0.0 ) if increasing else VE ( 1.0 , 0.0 ) - result [ -1 ] = VE ( 1.0 , 0.0 ) if increasing else VE ( 0.0 , 0.0 ) - + + if h1.natural () : + nn = max ( 1 , math.ceil ( h1.Integral () ) ) + e0 = binomEff ( 0 , nn ) + e1 = binomEff ( nn , nn ) + else : + e0 = VE ( 0 , 0 ) + e1 = VE ( 1 , 0 ) + + result [ 1 ] = e1 if cut_low else e0 + result [ -1 ] = e0 if cut_low else e1 + return result # =============================================================================== -## Get the cut effciency in form of graph +## Get the cut efficiency in graph form # - useful for efficincy visualisation -# - a bit better treatment of binnig effects +# - a bit better treatment of binnig effects fo wide bins # @code # histo = ... -# eff_graph = histo.eff_graph ( increasing = True ) +# eff_graph = histo.eff_graph ( cut_low ) # @endcode -def _h1_effic4_ ( histo , increasing = True ) : - """Get the cut effciency in form fo graph +def _h1_effic4_ ( histo , cut_low ) : + """Get the cut efficiency in graph forms - useful for drawing, - - better treatment of binnig effects + - a bit better treatment of binnig effects for wide bins >>> histo = ... - >>> eff_graph = histo.eff_graph ( increasing = True ) + >>> eff_graph = histo.eff_graph ( cut_low = True ) + >>> eff_graph = histo.eff_graph ( cut_low = False ) """ c1 = [ histo [ i ] for i in histo ] c2 = c1.copy() c2.reverse () - - s1 = [ VE () ] + [ s for s in itertools.accumulate ( c1 ) ] - s2 = [ VE () ] + [ s for s in itertools.accumulate ( c2 ) ] + + ## for "natural" histograms make better treatment of first/last uncertainties + vz = VE ( 0 , 1 ) if histo.natural() else VE ( 0 , 0 ) + + s1 = [ vz ] + [ s for s in itertools.accumulate ( c1 ) ] + s2 = [ vz ] + [ s for s in itertools.accumulate ( c2 ) ] + s2.reverse () import ostap.histos.graphs @@ -4341,7 +4354,7 @@ def _h1_effic4_ ( histo , increasing = True ) : the_eff = lambda a,b : a.frac ( b ) ## special treatment of the first point - e0 = the_eff ( s1 [ 0 ] , s2 [ 0 ] ) if increasing else the_eff ( s2 [ 0 ] , s1 [ 0 ] ) + e0 = the_eff ( s2 [ 0 ] , s1 [ 0 ] ) if cut_low else the_eff ( s1 [ 0 ] , s2 [ 0 ] ) xmin , _ = histo.xminmax() graph.SetPoint ( 0 , xmin , e0.value () ) @@ -4349,7 +4362,7 @@ def _h1_effic4_ ( histo , increasing = True ) : for i, x , _ in histo.items() : xx = x.value() + x.error() - ei = the_eff ( s1 [ i ] , s2 [ i ] ) if increasing else the_eff ( s2 [ i ] , s1 [ i ] ) + ei = the_eff ( s2 [ i ] , s1 [ i ] ) if cut_low else the_eff ( s1 [ i ] , s2 [ i ] ) graph.SetPoint ( i , xx , ei.value () ) graph.SetPointError ( i , 0 , ei.error () ) diff --git a/ostap/histos/roc.py b/ostap/histos/roc.py index 24263c99..b90ebc73 100644 --- a/ostap/histos/roc.py +++ b/ostap/histos/roc.py @@ -17,7 +17,7 @@ __author__ = "Vanya BELYAEV Ivan.Belyaev@cern.ch" __date__ = "2024-08-02" __all__ = ( - 'makeGraph' , # make graph from primitive data + 'roc_curve' , # Make ROC curve form signal & backgrund distributions ) # ============================================================================= from ostap.core.ostap_types import string_types @@ -31,6 +31,11 @@ if '__main__' == __name__ : logger = getLogger( 'ostap.histos.roc' ) else : logger = getLogger( __name__ ) # ============================================================================= +## symbols to indicate the efficiency +_effs = ( 'e' , 'eff' , 'effs' , 'effic' , 'efficiency' ) +## symbols to indicate the rejection +_rejs = ( 'r' , 'rej' , 'reject' , 'rejection' ) +# ============================================================================= ## Build the ROC-curve from signal and background disctributuions # @param signal (histogram) of signal distribution # @param backgrund (histogram) of background distribution @@ -40,30 +45,30 @@ # hbkg = ... ## background distribution # roc = roc_curve ( signal = hsig , # backgrund = hbkg , -# increasing = True , ## "keep valeus less than cut value" +# cut_low = True , ## "keep valeus less than cut value" # show_sinal = 'efficiency' , # show_backgrund = 'rejection' ) -# ## get AUC +# # import ostap.math,integral as I # auc = I.integral ( roc , xmin = 0 , xmax = 1.0 ) # @endcode def roc_curve ( signal , background , - increasing , + cut_low , show_signal = 'efficiency' , show_background = 'rejection' ) : """Build the ROC-curve from signal and background disctributuions - signal : (histogram) of signal distribution - backgrund : (histogram) of background distribution - + >>> hsignal = ... ## signal distribution >>> hbkg = ... ## background distribution - >>> roc = roc_curve ( signal = hsig , - ... backgrund = hbkg , - ... increasing = True , ## "keep vaues that are less than the cut value" - ... show_sinal = 'efficiency' , - ... show_backgrund = 'rejection' ) + >>> roc = roc_curve ( signal = hsig , + ... backgrund = hbkg , + ... cut_low = True , ## "keep vaues that are less than the cut value" + ... show_sinal = 'efficiency' , + ... show_background = 'rejection' ) >>> import ostap.math,integral as I >>> auc = I.integral ( roc , xmin = 0 , xmax = 1.0 ) @@ -73,37 +78,30 @@ def roc_curve ( signal , "Invalid `signal' type: %s" % type ( signal ) assert isinstance ( background , ROOT.TH1 ) and 1 == background.dim() , \ "Invalid `background' type: %s" % type ( background ) + + def _fun_ ( obj ) : + if callable ( obj ) : return obj + assert isinstance ( obj , string_types ) , 'Invalid type: %s' % type ( obj ) + obj = str ( obj ).strip ().lower () + if obj in _effs : return lambda e : e + elif obj in _rejs : return lambda e : 1.0-e + raise TypeError ( 'Invalid object: %s' % obj ) - sig_fun = show_signal - if callable ( sig_fun ) : pass - else : - assert isinstance ( sig_fun , string_types ) , "Invalid type of `show_signal' %s" % type ( sig_fun ) - sig_fun = str(sig_fun).strip().lower() - if sig_fun in ( 'e' , 'eff' , 'effic' , 'efficiency' ) : sig_fun = lambda s : s - elif sig_fun in ( 'r' , 'rej' , 'rejec' , 'reject' , 'rejection' ) : sig_fun = lambda s : 1-s - else : - raise TypeError ("Unknown `show_signal' :%s" % show_signal ) - - bkg_fun = show_background - if callable ( bkg_fun ) : pass - else : - assert isinstance ( bkg_fun , string_types ) , "Invalid type of `show_background' %s" % type ( bkg_fun ) - bkg_fun = str(bkg_fun).strip().lower() - if bkg_fun in ( 'e' , 'eff' , 'effic' , 'efficiency' ) : bkg_fun = lambda s : s - elif bkg_fun in ( 'r' , 'rej' , 'rejec' , 'reject' , 'rejection' ) : bkg_fun = lambda s : 1-s - else : - raise TypeError ("Unknown `show_background' :%s" % show_backgrund ) + ## transformations : + + sig_fun = _fun_ ( show_signal ) + bkg_fun = _fun_ ( show_background ) hs = signal hb = background - a## signal efficiency - hse = hs.eff ( increasing = increasing ) + ## signal efficiency histogram + hse = hs.eff ( cut_low = cut_low ) + + ## background efficiency histogram + hbe = hb.eff ( cut_low = cut_low ) - ## background efficiency - hbe = hb.eff ( increasing = increasing ) - ## output graph: ROC curve np = len ( hse ) graph = ROOT.TGraphErrors ( np ) @@ -111,10 +109,10 @@ def roc_curve ( signal , ## loop over signal efficiency for i , xs , es in hse.items() : - ## backrgiund efficiency + ## background efficiency eb = hbe ( xs.value() ) - ## tarnsform if requested: + ## transform if requested: es = sig_fun ( es ) eb = bkg_fun ( eb ) diff --git a/ostap/histos/tests/test_histos_roc.py b/ostap/histos/tests/test_histos_roc.py new file mode 100644 index 00000000..cd2fc054 --- /dev/null +++ b/ostap/histos/tests/test_histos_roc.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# ============================================================================= +# Copyright (c) Ostap developpers. +# ============================================================================= +# @file ostap/histos/tests/test_histos_roc.py +# Test module for `roc_curve' +# ============================================================================= +"""Test module for ostap/histos/roc.py + - Test module for `roc_curve' +""" +# ============================================================================= +__author__ = "Ostap developers" +__all__ = () ## nothing to import +# ============================================================================= +from builtins import range +from ostap.core.core import hID +from ostap.histos.roc import roc_curve +from ostap.math.integral import integral +from ostap.plotting.canvas import use_canvas +import ROOT, random, math +# ============================================================================= +# logging +# ============================================================================= +from ostap.logger.logger import getLogger +if '__main__' == __name__ or '__builtin__' == __name__ : + logger = getLogger ( 'ostap.test_histos_roc' ) +else : + logger = getLogger ( __name__ ) +# ============================================================================= +logger.info ( "Test for `roc_curve'") +# ============================================================================= + + +# ============================================================================= +def test_roc () : + + logger = getLogger ( 'test_roc' ) + + logger.info ( 'Test ROC-curve') + + hs = ROOT.TH1D ( hID() , "signal distribution" , 15 , -5 ,5 ) + hb = ROOT.TH1D ( hID() , "background distribution" , 15 , -5 ,5 ) + + hs.red ( fill = True , opacity = 0.35 ) + hs.blue ( fill = True , opacity = 0.35 ) + + for i in range ( 100 ) : + hs.Fill ( random.gauss ( +1 ) ) + hb.Fill ( random.gauss ( -1 ) ) + + with use_canvas ( 'EFF-curves/graphs' , wait = 3 ) : + + hse = hs.eff ( cut_low = True ) + hbe = hb.eff ( cut_low = True ) + + hse.red () + hbe.blue () + + grs = hs.eff_graph ( cut_low = True ) + grb = hb.eff_graph ( cut_low = True ) + + grs.red () + grb.blue () + + hse.draw () + hbe.draw ('same') + grs.draw ('pl') + grb.draw ('pl') + + logger.info ( 'First bin efficiencies S:%s B:%s' % \ + ( hse[ 1].toString( '%+.3f +/- %-.3f' ) , + hbe[ 1].toString( '%+.3f +/- %-.3f' ) ) ) + logger.info ( 'Last bin efficiencies S:%s B:%s' % \ + ( hse[-1].toString( '%+.3f +/- %-.3f' ) , + hbe[-1].toString( '%+.3f +/- %-.3f' ) ) ) + logger.info ( 'First point efficiencies S:%s B:%s' % \ + ( grs[ 0][1].toString( '%+.3f +/- %-.3f' ) , + grb[ 0][1].toString( '%+.3f +/- %-.3f' ) ) ) + logger.info ( 'Last point efficiencies S:%s B:%s' % \ + ( grs[-1][1].toString( '%+.3f +/- %-.3f' ) , + grb[-1][1].toString( '%+.3f +/- %-.3f' ) ) ) + + ## make ROC corve + roc = roc_curve ( signal = hs , + background = hb , + cut_low = True ) + + roc.green () + with use_canvas ( 'ROC-curve' , wait = 3 ) : + roc.draw ( 'apc' , minvalue = 0, maxvalue = 1 ) + + ## area uner the curve + auc = integral ( roc , xmin = 0 , xmax = 1 ) + logger.info( 'Areas under the ROC curve (AUC)= %.3f' % auc ) + + +# ============================================================================= +if '__main__' == __name__ : + + test_roc () + + +# ============================================================================= +## The END +# =============================================================================