Merge pull request #2 from UMN-CMS/dask_histograms

Dask histograms
UMN-CMS · May 17, 2024 · 9ec36aa
2 parents 1c9f958 + ca6ac15
commit 9ec36aa
Show file tree

Hide file tree

Showing 3 changed files with 32 additions and 26 deletions.
diff --git a/analyzer.py b/analyzer.py
@@ -5,14 +5,14 @@
 import copy
 import numpy as np
 import uproot
-from hist import Hist
+import hist
 import time
-
 from utils.file_output import save_histograms
 from utils.file_input import construct_fileset
 from utils.histos import config
 from utils.compute_variables import get_variables
-
+import hist.dask as hda
+import dask
 NanoAODSchema.warn_missing_crossrefs = False
 
 class WrAnalysis(processor.ProcessorABC):
@@ -25,7 +25,7 @@ def __init__(self):
                 self.hist_dict[mll][flavor] = {}
                 for i in range(len(config["histos"]["HISTO_NAMES"])):
                     self.hist_dict[mll][flavor][config["histos"]["HISTO_NAMES"][i]] =(
-                        Hist.new.Reg(bins=config["histos"]["N_BINS"][i],
+                        hda.Hist.new.Reg(bins=config["histos"]["N_BINS"][i],
                                           start=config["histos"]["BIN_LOW"][i],
                                           stop=config["histos"]["BIN_HIGH"][i],
                                           label=config["histos"]["HISTO_LABELS"][i])
@@ -42,7 +42,7 @@ def process(self, events):
         jets = events.Jet
 
         num_events = ak.num(elecs,axis=0).compute()
-        print(f"\nProcessing {num_events} events")
+        print(f"\nProcessing {num_events} events.")
 
         # Mask jets and leptons with their individual requirements
         good_elecs = elecs[(elecs.pt > 53) & (np.abs(elecs.eta) < 2.4) & (elecs.cutBased_HEEP)]
@@ -84,7 +84,7 @@ def process(self, events):
 
         num_selected = ak.num(passing_elecs,axis=0).compute()
 
-        print(f"\n{num_selected} events passed the selection ({num_selected/num_events*100:.2f}% efficiency)")
+        print(f"{num_selected} events passed the selection ({num_selected/num_events*100:.2f}% efficiency).\n")
 
         mll = (passing_leptons[:, 0] + passing_leptons[:, 1]).mass
 
@@ -108,15 +108,17 @@ def process(self, events):
              flavor_selection = selections.all(flavor)
              selected_leptons = passing_leptons[mll_selection & flavor_selection]
              selected_jets = passing_jets[mll_selection & flavor_selection]
-             print(f"\nCalculating kinematic variables and filling histograms for events with dilepton mass {mll} and flavor {flavor}")
+             print(f"Filling histograms for events with dilepton mass {mll} and flavor {flavor}.")
              variables = get_variables(selected_leptons, selected_jets) #This step takes forever (converting the dask arrays using .compute()).
              for i, variable in enumerate(variables):
-                 hist_dict[mll][flavor][config["histos"]["HISTO_NAMES"][i]].fill(variable)
+                hist_dict[mll][flavor][config["histos"]["HISTO_NAMES"][i]].fill(variable)
+
+        print("\nFinished processing events and filling histograms.\n")
 
         return hist_dict
 
     def postprocess(self, accumulator):
-        return accumulator
+        pass
 
 t0 = time.monotonic()
 
@@ -132,8 +134,11 @@ def postprocess(self, accumulator):
 p = WrAnalysis()
 out = p.process(events)
 
-print("\nSaving histograms")
-save_histograms(out, "example_histos.root")
+print("Computing histograms...")
+(computed,)=dask.compute(out)
+print("Histograms computed.\n")
+
+save_histograms(computed, "example_histos.root")
 
 exec_time = time.monotonic() - t0
 print(f"\nExecution took {exec_time:.2f} seconds")
diff --git a/utils/compute_variables.py b/utils/compute_variables.py
@@ -2,21 +2,21 @@
 
 def get_variables(leptons, jets):
 
-    leadlepton_pt = leptons[:,0].pt.compute()
-    subleadlepton_pt = leptons[:,1].pt.compute()
-    leadjet_pt = jets[:,0].pt.compute()
-    subleadjet_pt = jets[:,1].pt.compute()
-    leadlepton_eta = leptons[:,0].eta.compute()
-    subleadlepton_eta = leptons[:,1].eta.compute()
-    leadjet_eta = jets[:,0].eta.compute()
-    subleadjet_eta = jets[:,1].eta.compute()
-    leadlepton_phi = leptons[:,0].phi.compute()
-    subleadlepton_phi = leptons[:,1].phi.compute()
-    leadjet_phi = jets[:,0].phi.compute()
-    subleadjet_phi = jets[:,1].phi.compute()
-    dilepton_mass = (leptons[:, 0] + leptons[:, 1]).mass.compute()
-    dijet_mass = (jets[:, 0] + jets[:, 1]).mass.compute()
-    fourobject_mass = (leptons[:, 0] + leptons[:, 1] + jets[:, 0] + jets[:, 1]).mass.compute()
+    leadlepton_pt = leptons[:,0].pt
+    subleadlepton_pt = leptons[:,1].pt
+    leadjet_pt = jets[:,0].pt
+    subleadjet_pt = jets[:,1].pt
+    leadlepton_eta = leptons[:,0].eta
+    subleadlepton_eta = leptons[:,1].eta
+    leadjet_eta = jets[:,0].eta
+    subleadjet_eta = jets[:,1].eta
+    leadlepton_phi = leptons[:,0].phi
+    subleadlepton_phi = leptons[:,1].phi
+    leadjet_phi = jets[:,0].phi
+    subleadjet_phi = jets[:,1].phi
+    dilepton_mass = (leptons[:, 0] + leptons[:, 1]).mass
+    dijet_mass = (jets[:, 0] + jets[:, 1]).mass
+    fourobject_mass = (leptons[:, 0] + leptons[:, 1] + jets[:, 0] + jets[:, 1]).mass
 
     variables = [leadlepton_pt,
                 subleadlepton_pt,

diff --git a/utils/file_output.py b/utils/file_output.py
@@ -10,3 +10,4 @@ def save_histograms(hist_dict, filename):
             for flavor in hist_dict[mll].keys():
                 for histogram_name, histogram_data in hist_dict[mll][flavor].items():
                     f[f"{mll}/{flavor}/{histogram_name}"] = histogram_data
+    print(f"Histograms saved to {histograms_directory}/{filename}")