-
Notifications
You must be signed in to change notification settings - Fork 0
/
Coffea_NanoAOD_PP_schema.py
280 lines (253 loc) · 11.4 KB
/
Coffea_NanoAOD_PP_schema.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
import warnings
from coffea.nanoevents import transforms
from coffea.nanoevents.schemas.base import BaseSchema, zip_forms
class NanoAODPPSchema(BaseSchema):
    """NanoAOD schema builder

    The NanoAOD schema is built from all branches found in the supplied file, based on
    the naming pattern of the branches. The following additional arrays are constructed:

    - Any branches named ``n{name}`` are assumed to be counts branches and converted to offsets ``o{name}``
    - Any local index branches with names matching ``{source}_{target}Idx*`` are converted to global indexes for the event chunk (postfix ``G``)
    - Any `nested_items` are constructed, if the necessary branches are available
    - Any `nested_index_items` are constructed, if the necessary branches are available
    - Any `special_items` are constructed, if the necessary branches are available

    From those arrays, NanoAOD collections are formed as collections of branches grouped by name, where:

    - one branch exists named ``name`` and no branches start with ``name_``, interpreted as a single flat array;
    - one branch exists named ``name``, one named ``n{name}``, and no branches start with ``name_``, interpreted as a single jagged array;
    - no branch exists named ``{name}`` and many branches start with ``name_*``, interpreted as a flat table; or
    - one branch exists named ``n{name}`` and many branches start with ``name_*``, interpreted as a jagged table.

    Collections are assigned mixin types according to the `mixins` mapping.
    All collections are then zipped into one `base.NanoEvents` record and returned.

    There is a class-level variable ``warn_missing_crossrefs`` which will alter the behavior of
    NanoAODPPSchema. If warn_missing_crossrefs is true then when a missing global index cross-ref
    target is encountered a warning will be issued. Regardless, the cross-reference is dropped.
    """

    # When true, a RuntimeWarning is emitted for every cross-reference whose
    # index branch or target offsets are absent from the input.
    warn_missing_crossrefs = True

    mixins = {
        "CaloMET": "MissingET",
        "ChsMET": "MissingET",
        "GenMET": "MissingET",
        "MET": "MissingET",
        "METFixEE2017": "MissingET",
        "PuppiMET": "MissingET",
        "RawMET": "MissingET",
        "RawPuppiMET": "MissingET",
        "TkMET": "MissingET",
        # pseudo-lorentz: pt, eta, phi, mass=0
        "IsoTrack": "PtEtaPhiMCollection",
        "SoftActivityJet": "PtEtaPhiMCollection",
        "TrigObj": "PtEtaPhiMCollection",
        # True lorentz: pt, eta, phi, mass
        "FatJet": "FatJet",
        "GenDressedLepton": "PtEtaPhiMCollection",
        "GenIsolatedPhoton": "PtEtaPhiMCollection",
        "GenJet": "PtEtaPhiMCollection",
        "GenJetAK8": "PtEtaPhiMCollection",
        "Jet": "Jet",
        "LHEPart": "PtEtaPhiMCollection",
        "SubGenJetAK8": "PtEtaPhiMCollection",
        "SubJet": "PtEtaPhiMCollection",
        # Candidate: lorentz + charge
        "Electron": "Electron",
        "Muon": "Muon",
        "Photon": "Photon",
        "FsrPhoton": "FsrPhoton",
        "Tau": "Tau",
        "GenVisTau": "GenVisTau",
        # special
        "GenPart": "GenParticle",
        "PV": "Vertex",
        "SV": "SecondaryVertex",
    }
    """Default configuration for mixin types, based on the collection name.

    The types are implemented in the `coffea.nanoevents.methods.nanoaod` module.
    """

    all_cross_references = {
        "Electron_genPartIdx": "GenPart",
        "Electron_jetIdx": "Jet",
        "Electron_photonIdx": "Photon",
        "FatJet_genJetAK8Idx": "GenJetAK8",
        "FatJet_subJetIdx1": "SubJet",
        "FatJet_subJetIdx2": "SubJet",
        "FsrPhoton_muonIdx": "Muon",
        "GenPart_genPartIdxMother": "GenPart",
        "GenVisTau_genPartIdxMother": "GenPart",
        "Jet_electronIdx1": "Electron",
        "Jet_electronIdx2": "Electron",
        "Jet_genJetIdx": "GenJet",
        "Jet_muonIdx1": "Muon",
        "Jet_muonIdx2": "Muon",
        "Muon_fsrPhotonIdx": "FsrPhoton",
        "Muon_genPartIdx": "GenPart",
        "Muon_jetIdx": "Jet",
        "Photon_electronIdx": "Electron",
        "Photon_genPartIdx": "GenPart",
        "Photon_jetIdx": "Jet",
        "Tau_genPartIdx": "GenPart",
        "Tau_jetIdx": "Jet",
    }
    """Cross-references, where an index is to be interpreted with respect to another collection

    Each such cross-reference will be converted to a global indexer, so that arbitrarily sliced events
    can still resolve the indirection back to the parent events
    """

    nested_items = {
        "FatJet_subJetIdxG": ["FatJet_subJetIdx1G", "FatJet_subJetIdx2G"],
        "Jet_muonIdxG": ["Jet_muonIdx1G", "Jet_muonIdx2G"],
        "Jet_electronIdxG": ["Jet_electronIdx1G", "Jet_electronIdx2G"],
    }
    """Nested collections, where nesting is accomplished by a fixed-length set of indexers"""

    nested_index_items = {
        "Jet_pFCandsIdxG": ("Jet_nConstituents", "JetPFCands"),
        "FatJet_pFCandsIdxG": ("FatJet_nConstituents", "FatJetPFCands"),
        "GenJet_pFCandsIdxG": ("GenJet_nConstituents", "GenJetCands"),
        "GenFatJet_pFCandsIdxG": ("GenJetAK8_nConstituents", "GenFatJetCands"),
    }
    """Nested collections, where nesting is accomplished by assuming the target can be unflattened according to a source counts"""

    special_items = {
        "GenPart_distinctParentIdxG": (
            transforms.distinctParent_form,
            ("GenPart_genPartIdxMotherG", "GenPart_pdgId"),
        ),
        "GenPart_childrenIdxG": (
            transforms.children_form,
            (
                "oGenPart",
                "GenPart_genPartIdxMotherG",
            ),
        ),
        "GenPart_distinctChildrenIdxG": (
            transforms.children_form,
            (
                "oGenPart",
                "GenPart_distinctParentIdxG",
            ),
        ),
    }
    """Special arrays, where the callable and input arrays are specified in the value"""

    def __init__(self, base_form, version="latest"):
        """Build the schema form from ``base_form``.

        Args:
            base_form: form dictionary forwarded to ``BaseSchema.__init__``.
                The resulting ``self._form["contents"]`` is replaced by the
                collections built in `_build_collections`, and the version is
                recorded under ``parameters -> metadata -> version``.
            version: ``"latest"`` (default) to keep every entry of
                `all_cross_references`, or a value accepted by ``int()``.
                Versions below 7 drop ``FatJet_genJetAK8Idx``; versions below
                6 additionally drop the FSR-photon/muon cross-references.

        Raises:
            ValueError: if ``version`` is a string other than ``"latest"``
                that ``int()`` cannot parse.
        """
        super().__init__(base_form)
        self._version = version
        # Per-instance copy so version pruning never touches the class table.
        self.cross_references = dict(self.all_cross_references)
        if version != "latest":
            version_number = int(version)
            # ``pop`` with a default tolerates subclasses whose
            # ``all_cross_references`` does not define these keys.
            if version_number < 7:
                self.cross_references.pop("FatJet_genJetAK8Idx", None)
            if version_number < 6:
                self.cross_references.pop("FsrPhoton_muonIdx", None)
                self.cross_references.pop("Muon_fsrPhotonIdx", None)
        self._form["contents"] = self._build_collections(self._form["contents"])
        self._form["parameters"]["metadata"]["version"] = self._version

    @classmethod
    def v7(cls, base_form):
        """Build the NanoEvents assuming NanoAODv7

        For example, one can use ``NanoEventsFactory.from_root("file.root", schemaclass=NanoAODPPSchema.v7)``
        to ensure NanoAODv7 compatibility.
        """
        return cls(base_form, version="7")

    @classmethod
    def v6(cls, base_form):
        """Build the NanoEvents assuming NanoAODv6"""
        return cls(base_form, version="6")

    @classmethod
    def v5(cls, base_form):
        """Build the NanoEvents assuming NanoAODv5"""
        return cls(base_form, version="5")

    @staticmethod
    def _member_forms(branch_forms, name):
        """Return ``{field: form}`` for every branch named ``{name}_{field}``."""
        prefix = name + "_"
        return {
            key[len(prefix) :]: form
            for key, form in branch_forms.items()
            if key.startswith(prefix)
        }

    def _build_collections(self, branch_forms):
        """Group flat branch forms into one form per collection.

        Args:
            branch_forms: dict mapping branch name to its form dictionary.
                It is extended in place with the derived entries (``o{name}``
                offsets, ``...G`` global indexers, nested and special items).

        Returns:
            dict mapping each collection name to its form; every returned
            form carries ``parameters["collection_name"]``.
        """
        # parse into high-level records (collections, list collections, and singletons)
        collections = {key.split("_")[0] for key in branch_forms}
        # ``n{name}`` is the counts branch of ``{name}``, not a collection of its own
        collections -= {
            key
            for key in collections
            if key.startswith("n") and key[1:] in collections
        }
        # Without a GenPart collection the input is treated as data, and
        # cross-references into ``Gen*`` targets are skipped silently.
        is_data = "GenPart" not in collections

        # Create offsets virtual arrays
        for name in collections:
            if "n" + name in branch_forms:
                branch_forms["o" + name] = transforms.counts2offsets_form(
                    branch_forms["n" + name]
                )

        # Create global index virtual arrays for indirection
        for indexer, target in self.cross_references.items():
            if is_data and target.startswith("Gen"):
                continue
            if indexer not in branch_forms:
                missing = "index"
            elif "o" + target not in branch_forms:
                missing = "target"
            else:
                branch_forms[indexer + "G"] = transforms.local2global_form(
                    branch_forms[indexer], branch_forms["o" + target]
                )
                continue
            # The cross-reference is dropped either way; the warning is optional.
            if self.warn_missing_crossrefs:
                warnings.warn(
                    f"Missing cross-reference {missing} for {indexer} => {target}",
                    RuntimeWarning,
                )

        # Create nested indexer from Idx1, Idx2, ... arrays
        for name, indexers in self.nested_items.items():
            if all(idx in branch_forms for idx in indexers):
                branch_forms[name] = transforms.nestedindex_form(
                    [branch_forms[idx] for idx in indexers]
                )

        # Create nested indexer from n* counts arrays
        for name, (local_counts, target) in self.nested_index_items.items():
            if local_counts in branch_forms and "o" + target in branch_forms:
                branch_forms[name] = transforms.counts2nestedindex_form(
                    branch_forms[local_counts], branch_forms["o" + target]
                )

        # Create any special arrays; later entries may consume earlier ones
        # (e.g. distinctChildren uses distinctParent), so order matters.
        for name, (fcn, args) in self.special_items.items():
            if all(key in branch_forms for key in args):
                branch_forms[name] = fcn(*(branch_forms[key] for key in args))

        output = {}
        for name in collections:
            mixin = self.mixins.get(name, "NanoCollection")
            has_offsets = "o" + name in branch_forms
            has_branch = name in branch_forms
            if has_offsets and not has_branch:
                # list collection
                offsets = branch_forms["o" + name]
                output[name] = zip_forms(
                    self._member_forms(branch_forms, name),
                    name,
                    record_name=mixin,
                    offsets=offsets,
                )
                # Copy the counts branch's description when it has one; a
                # missing ``__doc__`` must not abort schema construction.
                record_parameters = {}
                offsets_parameters = offsets.get("parameters", {})
                if "__doc__" in offsets_parameters:
                    record_parameters["__doc__"] = offsets_parameters["__doc__"]
                record_parameters["collection_name"] = name
                output[name]["content"]["parameters"].update(record_parameters)
            elif has_offsets:
                # list singleton, can use branch's own offsets
                output[name] = branch_forms[name]
                output[name].setdefault("parameters", {}).update(
                    {"__array__": mixin, "collection_name": name}
                )
            elif has_branch:
                # singleton
                output[name] = branch_forms[name]
            else:
                # simple collection
                output[name] = zip_forms(
                    self._member_forms(branch_forms, name),
                    name,
                    record_name=mixin,
                )
            # Every collection form is tagged with its own name.
            output[name].setdefault("parameters", {})["collection_name"] = name
        return output

    @property
    def behavior(self):
        """Behaviors necessary to implement this schema"""
        from coffea.nanoevents.methods import nanoaod

        return nanoaod.behavior