From e7bea8cf33d2f0b2f20b0c018d9aa975d1b3eb60 Mon Sep 17 00:00:00 2001 From: Vanya Belyaev Date: Thu, 8 Aug 2024 16:10:34 +0200 Subject: [PATCH] 1. add `loop` methdod for `RooAbsData` and implement `rows` in terms of `loop` 1. allow more recusion in `vars_and_cuts` function 1. add new test 1. From now for weighted datasets `dataset[i]` returns `(entry,weight)` tuple 1. from now iteration over weighted dataset gives `(entry,weight)` tuple 1. change sinature of `dataset.loop` , `dataset.rows` methods to return triplets `index, entry, weight` --- ReleaseNotes/release_notes.md | 6 +- ostap/fitting/dataset.py | 195 +++++++++++------- ostap/fitting/ds2numpy.py | 9 +- ostap/fitting/roostats.py | 5 +- ostap/fitting/tests/test_fitting_dataset.py | 90 ++++++++ ostap/fitting/tests/test_fitting_ds2numpy.py | 5 +- .../fitting/tests/test_fitting_duplicates.py | 55 ++--- ostap/trees/cuts.py | 15 +- 8 files changed, 261 insertions(+), 119 deletions(-) create mode 100644 ostap/fitting/tests/test_fitting_dataset.py diff --git a/ReleaseNotes/release_notes.md b/ReleaseNotes/release_notes.md index 1cf9ad09..47cb4dff 100644 --- a/ReleaseNotes/release_notes.md +++ b/ReleaseNotes/release_notes.md @@ -13,6 +13,9 @@ 1. Some tweaks for style configuration 1. update `ostap.utils.valerrors` & and new test 1. allow to use `width` keyword when `line_width` is not specified for `XXX.draw` method + 1. add `loop` methdod for `RooAbsData` and implement `rows` in terms of `loop` + 1. allow more recusion in `vars_and_cuts` function + 1. add new test ## Backward incompatibl @@ -23,7 +26,8 @@ `cut_low` and the argument is not optionl anymore 1. From now for weighted datasets `dataset[i]` returns `(entry,weight)` tuple 1. from now iteration over weighted dataset gives `(entry,weight)` tuple - + 1. change sinature of `dataset.loop` , `dataset.rows` methods to return triplets `index, entry, weight` + ## Bug fixes 1. fix a typo in `ostap.ploting.canvas` diff --git a/ostap/fitting/dataset.py b/ostap/fitting/dataset.py index 5c8f659b..2c111c3e 100644 --- a/ostap/fitting/dataset.py +++ b/ostap/fitting/dataset.py @@ -25,6 +25,7 @@ ) # ============================================================================= from builtins import range +from ostap.utils.progress_bar import progress_bar from collections import defaultdict from ostap.core.meta_info import root_info, ostap_version from ostap.core.core import ( Ostap, VE, SE , @@ -61,60 +62,113 @@ _minv = -0.99 * sys.float_info.max # ============================================================================= ## iterator for RooAbsData entries -# - For unweighted dataset `entry` is a RooArsSet # @cdoe # dataset = ... -# for entry in dataset : ... -# @endcode -# @code -# - For weighted datatsets, `entry` is a tuple of (RooArgSet,weight) -# weighted = ... -# for enry, weight in weighted : ... +# for entry, weight in dataset : ... # @endcode +# - For unweighted datatsets, `weight` is `None` # @author Vanya BELYAEV Ivan.Belyaev@itep.ru # @date 2011-06-07 def _rad_iter_ ( self ) : """Iterator for RooAbsData - - for unweighted dataset `entry` is a RooArsSet >>> dataset = ... - >>> for entry in dataset : ... - - for weighted datatsets, `entry` is a tuple of (RooArgSet,weight) - >>> weighted = ... - >>> for entry, weight in weighted : ... + >>> for entry,weight in dataset : ... + - for unweighted datatsets, `weight` is `None` """ _l = len ( self ) for i in range ( 0 , _l ) : yield self [ i ] +# =========================================================================== +## Iterator over "good" events in dataset +# @code +# dataset = ... +# for index , entry, weight in dataset.loop ( 'pt>1' ) : +# print (index, entry, weight) +# @endcode +def _rad_loop_ ( dataset , cuts = '' , cutrange = '' , first = 0 , last = LAST_ENTRY , progress = False ) : + """Iterator for `good' events in dataset + >>> dataset = ... + >>> for index, entry, weight in dataset.loop ( ''pt>1' ) : + >>> print (index, entry, weight) + """ + + first, last = evt_range ( len ( dataset ) , first , last ) + + assert isinstance ( cuts , expression_types ) or not cuts, \ + "Invalid type of cuts: %s" % type ( cuts ) + + assert isinstance ( cutrange , expression_types ) or not cutrange, \ + "Invalid type of cutrange: %s" % type ( cutrange ) + + cuts = str(cuts).strip() if cuts else '' + cutrange = str(cutrange).strip() if cutrange else '' + + fcuts = None + if cuts : fcuts = make_formula ( cuts , cuts , dataset.varlist() ) + + weighted = dataset.isWeighted () + store_errors = weighted and dataset.store_errors () + store_asym_errors = weighted and dataset.store_asym_errors () + simple_weight = weighted and ( not store_errors ) and ( not store_asym_errors ) + + ## loop over dataset + source = range ( first , last ) + if progress : source = progress_bar ( source ) + + nevents = 0 + for event in source : + + vars , ww = dataset [ event ] + if not vars : break + + if cutrange and not vars.allInRange ( cutrange ) : continue + + wc = fcuts.getVal() if fcuts else 1.0 + if not wc : continue + + entry , weight = dataset [ event ] + + if weight is None : weight = wc if cuts else weight + else : weight = weight * wc + + nevents += 1 + yield event , entry , weight + + del fcuts + ## report summary + if progress : logger.info ( 'loop: %d from %d entries' % ( nevents , last - first ) ) + + +ROOT.RooAbsData.loop = _rad_loop_ + + # ============================================================================= ## access to the entries in RooAbsData # @code # dataset = ... -# weighted = ... ## weighte dataset -# event = dataset [4] ## index -# event , weight = weighted [4] ## index -# +# event , weight = dataset [4] ## index # events = dataset[0:1000] ## slice # events = dataset[0:-1:10] ## slice # events = dataset[ (1,2,3,10) ] ## sequence of indices -# @eendcode +# @endcode +# - For unweighted ddatasets `weight` is `None` # @author Vanya BELYAEV Ivan.Belyaev@itep.ru # @date 2013-03-31 def _rad_getitem_ ( data , index ) : """Get the entry from RooDataSet - >>> dataset = ... ## normal dataset - >>> weighted = ... ## weighted dataset - - >>> event = dataset [4] ## index - >>> event, weight = weighted [4] ## index - + >>> dataset = ... + >>> event, weight = dataset [4] ## index + + - For unweighted ddatsets `weight` is `None` + >>> events = dataset[0:1000] ## slice >>> events = dataset[0:-1:10] ## slice >>> events = dataset[ (1,2,3,4,10) ] ## sequnce of indices """ - + N = len ( data ) if isinstance ( index , integer_types ) and index < 0 : index += N @@ -123,7 +177,7 @@ def _rad_getitem_ ( data , index ) : if isinstance ( index , integer_types ) and 0 <= index < N : entry = data.get ( index ) - if not data.isWeighted() : return entry ## RETURN + if not data.isWeighted() : return entry, None ## RETURN weight = data.weight() if data.store_asym_error () : @@ -186,6 +240,10 @@ def _rad_getitem_ ( data , index ) : raise IndexError ( 'Invalid index %s'% index ) +# ============================================================================== + + + # ============================================================================== ## Get (asymmetric) weigth errors for the current entry in dataset # @code @@ -405,7 +463,7 @@ def _rad_mul_ ( ds1 , ds2 ) : wel , weh = ds1.weightErrors() entry = ds1.get ( i ) , ds1.weight () , wel , weh elif store_error : - entry = ds1.get ( i ) , ds1.weight () , ds1.weightError () + entry = ds1.get ( i ) , ds1.weight () , ds1.weightError () elif weighted : entry = ds1.get ( i ) , ds1.weight () else: @@ -852,24 +910,20 @@ def _rds_make_unique_ ( dataset , criterium = criterium , seed = seed , report = report ) : - - if weighted : - entry , weight = dataset [ index ] - - if store_asym_errors and isinstance ( weight , VAE ) : - ds.add ( entry , weight.value , weight.neg_error , weight.pos_erroe ) - elif store_errors and isinstance ( weight , VE ) : - ds.add ( entry , weight.value () ) - elif isinsance ( weight , num_types ) : - ds.add ( entry , float ( weight ) ) - else : + entry, weight = dataset [ index ] + + if store_asym_errors and isinstance ( weight , VAE ) : + ds.add ( entry , weight.value , weight.neg_error , weight.pos_erroe ) + elif store_errors and isinstance ( weight , VE ) : + ds.add ( entry , weight.value () ) + elif weighted and isinstance ( weight , num_types ) : + ds.add ( entry , float ( weight ) ) + elif weighted : raise TypeError ( 'Inconsistent sae/se/w %s/%s/%s ' % ( store_asym_errors , store_errors , type ( weight ) ) ) else : - - entry = dataset [ index ] ds.add ( entry ) return ds @@ -3058,62 +3112,43 @@ def get_result ( data ) : return _array ( 'd' , data ) ## Iterator for rows in dataset # @code # dataset = ... -# for row , weight in dataset.rows ( 'pt, pt/p, mass ' , 'pt>1' ) : -# print (row, weight) +# for index, row , weight in dataset.rows ( 'pt, pt/p, mass ' , 'pt>1' ) : +# print (index, row, weight) # @endcode -def _rad_rows_ ( dataset , variables = [] , cuts = '' , cutrange = '' , first = 0 , last = LAST_ENTRY ) : +def _rad_rows_ ( dataset , + variables = [] , + cuts = '' , + cutrange = '' , + first = 0 , + last = LAST_ENTRY , + progress = False ) : """Iterator for rows in dataset >>> dataset = ... >>> for row , weight in dataset.rows ( 'pt, pt/p, mass ' , 'pt>1' ) : >>> print (row, weight) """ - first, last = evt_range ( len ( dataset ) , first , last ) - - varlst, cuts, _ = vars_and_cuts ( variables , cuts ) - - vars = strings ( varlst ) - + first , last = evt_range ( len ( dataset ) , first , last ) + varlst , cuts, _ = vars_and_cuts ( variables , cuts ) + formulas = [] varlist = dataset.varlist () - for v in vars : + for v in varlst : f0 = make_formula ( v , v , varlist ) formulas.append ( f0 ) - fcuts = None - if cuts : fcuts = make_formula ( cuts , cuts , varlist ) - - weighted = dataset.isWeighted () - store_errors = weighted and daatset.store_errors () - store_asym_errors = weighted and daatset.store_asym_errors () - simple_weight = weighted and ( not srore_errors ) and ( not store_asym_errors ) - ## loop over dataset - for event in range ( first , last ) : - - ww = None - if weighted : vars, ww = dataset [ event ] - else : vars = dataset [ event ] - - if not vars : break - - if cutrange and not vars.allInRange ( cutrange ) : continue - - wc = fcuts.getVal() if fcuts else 1.0 - if not wc : continue - - wd = ww if weighted and not ( ww is None ) else 1.0 + ## loop over dataset + for index, entry, weight in _rad_loop_ ( dataset , + cuts = cuts , + cutrange = cutrange , + first = first , + last = last , + progress = progress ) : - w = wc * wd - if simple_weight and not w : continue - - weight = w if weighted else None - result = tuple ( tuple ( float ( f ) for f in formulas ) ) - yield get_result ( result ) , weight - - del fcuts - del formulas + yield index , get_result ( result ) , weight + del formulas ROOT.RooAbsData.rows = _rad_rows_ diff --git a/ostap/fitting/ds2numpy.py b/ostap/fitting/ds2numpy.py index e147415e..f55c1890 100644 --- a/ostap/fitting/ds2numpy.py +++ b/ostap/fitting/ds2numpy.py @@ -197,7 +197,8 @@ def ds2numpy ( dataset , var_lst , silent = True , more_vars = {} ) : ## add PDF values if funcs : - for i, entry in enumerate ( source ) : + for i, item in enumerate ( source ) : + entry , weight = item for vname , func , obsvars in funcs : obsvars.assign ( entry ) data [ vname ] [ i ] = func.getVal() @@ -298,15 +299,15 @@ def ds2numpy ( dataset , var_lst , silent = False , more_vars = {} ) : ## make an explict loop: for i , item in enumerate ( progress_bar ( dataset , silent = silent ) ) : - if weighted : evt, the_weight = item - else : evt = item + evt, the_weight = item for v in evt : vname = v.name if vname in doubles : data [ vname ] [ i ] = float ( v ) elif vname in categories : data [ vname ] [ i ] = int ( v ) - if weighted and weight : data [ weight ] [ i ] = float ( the_weight ) + if weighted and weight and not ( the_weigth is None ) : + data [ weight ] [ i ] = float ( the_weight ) ## add PDF values for vname , func , obsvars in funcs : diff --git a/ostap/fitting/roostats.py b/ostap/fitting/roostats.py index fd2ad3cc..673cde96 100644 --- a/ostap/fitting/roostats.py +++ b/ostap/fitting/roostats.py @@ -793,10 +793,7 @@ def plot ( self ) : gr2.blue () ps = fc.GetPointsToScan() - - weighted = ps.isWeighted() - for entry in ps : - if weighted : entry , _ = entry + for entry, _ in ps : point = float ( entry[0] ) if fci.IsInInterval ( entry ) : gr1.append ( point , 1 ) else : gr2.append ( point , 0 ) diff --git a/ostap/fitting/tests/test_fitting_dataset.py b/ostap/fitting/tests/test_fitting_dataset.py new file mode 100644 index 00000000..3d463b34 --- /dev/null +++ b/ostap/fitting/tests/test_fitting_dataset.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# ============================================================================= +# Copyright (c) Ostap developers. +# ============================================================================= +# @file test_fitting_dataset.py +# - It tests some decorators fore dataset +# ============================================================================= +""" - It tests some decorators fore dataset +""" +# ============================================================================= +__author__ = "Ostap developers" +__all__ = () ## nothing to import +# ============================================================================= +import ostap.fitting.roofit +import ostap.fitting.models as Models +from ostap.core.core import cpp, VE, dsID, rooSilent +from ostap.utils.timing import timing +from builtins import range +from ostap.plotting.canvas import use_canvas +from ostap.utils.utils import wait +from ostap.fitting.background import make_bkg +import ostap.logger.table as T +import ROOT, random +# ============================================================================= +# logging +# ============================================================================= +from ostap.logger.logger import getLogger +if '__main__' == __name__ or '__builtin__' == __name__ : + logger = getLogger ( 'test_fitting_dataset' ) +else : + logger = getLogger ( __name__ ) +# ============================================================================= + +evt = ROOT.RooRealVar ( 'Evt' , '#event' , 0 , 1000000 ) +run = ROOT.RooRealVar ( 'Run' , '#run' , 0 , 1000000 ) +mass = ROOT.RooRealVar ( 'Mass' , 'mass-variable' , 0 , 100 ) +weight = ROOT.RooRealVar ( 'Weight' , 'some weight' , -10 , 10 ) + +varset = ROOT.RooArgSet ( evt , run , mass , weight ) +dataset = ROOT.RooDataSet ( dsID () , 'Test Data set-0' , varset ) + +for r in range ( 100 ) : + + run.setVal ( r ) + for e in range ( 100 ) : + + evt .setVal ( e ) + mass.setVal ( random.uniform ( 0 , 10 ) ) + weight.setVal ( random.gauss ( 1 , 0.1 ) ) + + dataset.add ( varset ) + + if ( 1 <= r <= 2 ) and 3 <= e <= 3 : + + mass.setVal ( random.uniform ( 0 , 10 ) ) + dataset.add ( varset ) + +# ========================================================================================= +weighted = dataset.makeWeighted ( 'Weight' ) + +# ========================================================================================= +## (1) print datasets +# ========================================================================================= +logger.info ( 'Print unweighted dataset:\n%s' % dataset .table ( prefix = '# ' ) ) +logger.info ( 'Print weighted dataset:\n%s' % weighted.table ( prefix = '# ' ) ) + +# ========================================================================================= +## (2) loop over some subset of entries +# ========================================================================================= +logger.info ('Loop over unweighted dataset') +for index, entry , weight in dataset .loop ( '(Evt<10) && (Mass<10)' , first = 100 , last = 1000 , progress = True ) : pass + +logger.info ('Loop over weighted dataset') +for index, entry , weight in weighted.loop ( '(Evt<10) && (Mass<10)' , first = 100 , last = 1000 , progress = True ) : pass + +# ========================================================================================= +## (3) loop and get certain information as arrays/rows +# ========================================================================================= +logger.info ('Loop over unweighted dataset') +for index, row , weight in dataset .rows ( 'Mass, 2*Mass, Mass/2' , '(Evt<5) && (Mass<5)' , first = 100 , last = 1000 , progress = False ) : + print ( index, row , weight ) +logger.info ('Loop over weighted dataset') +for index, row , weight in weighted .rows ( 'Mass, 2*Mass, Mass/2' , '(Evt<5) && (Mass<5)' , first = 100 , last = 1000 , progress = False ) : + print ( index, row , weight ) + + +# ============================================================================= +## The END +# ============================================================================= diff --git a/ostap/fitting/tests/test_fitting_ds2numpy.py b/ostap/fitting/tests/test_fitting_ds2numpy.py index 2a085f56..16a3941c 100644 --- a/ostap/fitting/tests/test_fitting_ds2numpy.py +++ b/ostap/fitting/tests/test_fitting_ds2numpy.py @@ -35,7 +35,7 @@ def test_small_ds(): # Заполняем датасет случайными данными random_generator = ROOT.TRandom3(42) # устанавливаем seed - for e in range( NN ): + for e in range ( NN ): x_val = random_generator.Uniform(0, 10) y_val = random_generator.Uniform(0, 10) x.setVal ( x_val ) @@ -44,6 +44,7 @@ def test_small_ds(): i.setVal ( f % 2 ) data.add ( varset ) + g1 = Models.Gauss_pdf ( 'G1' , xvar = x , mean = 5 , sigma = 1 ) g2 = Models.Gauss_pdf ( 'G2' , xvar = y , mean = 5 , sigma = 1 ) @@ -51,7 +52,7 @@ def test_small_ds(): 'gaus2' : g2 } ) print ( ws ) - + # ============================================================================= def test_small_ds_with_weights(): diff --git a/ostap/fitting/tests/test_fitting_duplicates.py b/ostap/fitting/tests/test_fitting_duplicates.py index 02683e58..ab5f1438 100644 --- a/ostap/fitting/tests/test_fitting_duplicates.py +++ b/ostap/fitting/tests/test_fitting_duplicates.py @@ -15,14 +15,9 @@ __all__ = () ## nothing to import # ============================================================================= import ostap.fitting.roofit -import ostap.fitting.models as Models -from ostap.core.core import cpp, VE, dsID, rooSilent +from ostap.core.core import dsID from ostap.utils.timing import timing from builtins import range -from ostap.plotting.canvas import use_canvas -from ostap.utils.utils import wait -from ostap.fitting.background import make_bkg -from ostap.logger.colorized import attention import ostap.logger.table as T import ROOT, random # ============================================================================= @@ -36,19 +31,23 @@ # ============================================================================= -evt = ROOT.RooRealVar ( 'Evt' , '#event' , 0 , 1000000 ) -run = ROOT.RooRealVar ( 'Run' , '#run' , 0 , 1000000 ) -mass = ROOT.RooRealVar ( 'Mass' , 'mass-variable' , 0 , 100 ) +evt = ROOT.RooRealVar ( 'Evt' , '#event' , 0 , 1000000 ) +run = ROOT.RooRealVar ( 'Run' , '#run' , 0 , 1000000 ) +mass = ROOT.RooRealVar ( 'Mass' , 'mass-variable' , 0 , 100 ) +weight = ROOT.RooRealVar ( 'Weight' , 'some weight' , 0 , 2 ) -varset = ROOT.RooArgSet ( evt , run , mass ) -dataset = ROOT.RooDataSet ( dsID() , 'Test Data set-0' , varset ) +varset = ROOT.RooArgSet ( evt , run , mass , weight ) +dataset = ROOT.RooDataSet ( dsID () , 'Test Data set-0' , varset ) for r in range ( 5 ) : - run.setVal ( r ) + + run.setVal ( r ) + for e in range ( 5 ) : - evt .setVal ( e ) - mass.setVal ( random.uniform ( 0 , 10 ) ) + evt .setVal ( e ) + mass.setVal ( random.uniform ( 0 , 10 ) ) + weight.setVal ( random.uniform ( 0 , 2 ) ) dataset.add ( varset ) @@ -56,14 +55,10 @@ mass.setVal ( random.uniform ( 0 , 10 ) ) dataset.add ( varset ) - - ## mass.setVal ( random.uniform ( 0 , 10 ) ) - ## dataset.add ( varset ) - rows = [ ( '#' , 'Run' , 'Evt' , 'Mass' ) ] -for i, e in enumerate ( dataset ) : - +for i, item in enumerate ( dataset ) : + e , _ = item row = '%3d' % i , \ '%3d' % float ( e.Run ) , \ '%3d' % float ( e.Evt ) , \ @@ -79,8 +74,8 @@ rows = [ ( '#' , 'Run' , 'Evt' , 'Mass' ) ] -for i, e in enumerate ( ds0 ) : - +for i, item in enumerate ( ds0 ) : + e , _ = item row = '%3d' % i , \ '%3d' % float ( e.Run ) , \ '%3d' % float ( e.Evt ) , \ @@ -93,12 +88,12 @@ dscl = dataset.emptyClone() for dups in dataset.duplicates ( event_tag ) : for i in dups : - dscl.add ( dataset[i] ) - + entry , weight = dataset [ i ] + dscl.add ( entry ) rows = [ ( '#' , 'Run' , 'Evt' , 'Mass' ) ] -for i, e in enumerate ( dscl ) : - +for i, item in enumerate ( dscl ) : + e, w = item row = '%3d' % i , \ '%3d' % float ( e.Run ) , \ '%3d' % float ( e.Evt ) , \ @@ -108,6 +103,14 @@ logger.info ( '%s:\n%s' % ( title , T.table ( rows ,title = title , prefix = '# ' ) ) ) +# ============================================================================= +## print first 20 rows +for row, weight in data.rows ( [ 'Evt' , 'Run' , + 'Mass' , 'Mass*2' , 'Mass/2' ] , + cuts = 'Mass<5' , last = 20 ) : + print ( row , weight ) + + # ============================================================================= ## The END diff --git a/ostap/trees/cuts.py b/ostap/trees/cuts.py index eca10987..503b46b9 100755 --- a/ostap/trees/cuts.py +++ b/ostap/trees/cuts.py @@ -50,12 +50,23 @@ def vars_and_cuts ( expressions , cuts ) : if isinstance ( expressions , expression_types ) : expressions = split_string ( str ( expressions ) , strip = True , respect_groups = True ) input_string = 1 == len ( expressions ) - + + assert expressions and all ( s and isinstance ( s , expression_types ) for s in expressions ) , \ + "Invalid expression(s) : %s" % str ( expressions ) + + full = [] + for subexpr in expressions : + subexpr = split_string ( str( subexpr ).strip() , strip = True , respect_groups = True ) + if subexpr : full += subexpr + expressions = tuple ( full ) + assert expressions and all ( s and isinstance ( s , expression_types ) for s in expressions ) , \ "Invalid expression(s) : %s" % str ( expressions ) exprs = tuple ( str(s).strip() for s in expressions ) - exprs = tuple ( s for s in expressions if s ) + + exprs = tuple ( s for s in exprs if s ) + assert exprs and all ( exprs ) , "Invalid expressions: %s" % str ( exprs ) assert isinstance ( cuts , expression_types ) or not cuts , \