diff --git a/coffea/nanoevents/__init__.py b/coffea/nanoevents/__init__.py index 86152428c..ce397eab4 100644 --- a/coffea/nanoevents/__init__.py +++ b/coffea/nanoevents/__init__.py @@ -2,11 +2,17 @@ """ from coffea.nanoevents.factory import NanoEventsFactory -from coffea.nanoevents.schemas import BaseSchema, NanoAODSchema, TreeMakerSchema +from coffea.nanoevents.schemas import ( + BaseSchema, + NanoAODSchema, + PFNanoAODSchema, + TreeMakerSchema, +) __all__ = [ "NanoEventsFactory", "BaseSchema", "NanoAODSchema", + "PFNanoAODSchema", "TreeMakerSchema", ] diff --git a/coffea/nanoevents/factory.py b/coffea/nanoevents/factory.py index bb4bdb555..2367e78c6 100644 --- a/coffea/nanoevents/factory.py +++ b/coffea/nanoevents/factory.py @@ -110,7 +110,9 @@ def from_root( ) uuidpfn = {partition_key[0]: tree.file.file_path} mapping = UprootSourceMapping( - TrivialUprootOpener(uuidpfn, uproot_options), cache={}, access_log=access_log + TrivialUprootOpener(uuidpfn, uproot_options), + cache={}, + access_log=access_log, ) mapping.preload_column_source(partition_key[0], partition_key[1], tree) @@ -208,9 +210,11 @@ def from_parquet( TrivialParquetOpener(uuidpfn, parquet_options), access_log=access_log ) - format_ = 'parquet' - if 'ceph_config_path' in rados_parquet_options: - format_ = ds.RadosParquetFileFormat(rados_parquet_options['ceph_config_path'].encode()) + format_ = "parquet" + if "ceph_config_path" in rados_parquet_options: + format_ = ds.RadosParquetFileFormat( + rados_parquet_options["ceph_config_path"].encode() + ) dataset = ds.dataset(file, schema=table_file.schema_arrow, format=format_) @@ -340,13 +344,15 @@ def _from_mapping( Arbitrary metadata to add to the `base.NanoEvents` object """ - if not issubclass(schemaclass, BaseSchema): - raise RuntimeError("Invalid schema type") if persistent_cache is not None: mapping = CachedMapping(persistent_cache, mapping) if metadata is not None: base_form["parameters"]["metadata"] = metadata + if not callable(schemaclass): + raise 
ValueError("Invalid schemaclass type") schema = schemaclass(base_form) + if not isinstance(schema, BaseSchema): + raise RuntimeError("Invalid schema type") return cls(schema, mapping, tuple_to_key(partition_key), cache=runtime_cache) def __len__(self): diff --git a/coffea/nanoevents/methods/base.py b/coffea/nanoevents/methods/base.py index 1acaf0d07..53217e19f 100644 --- a/coffea/nanoevents/methods/base.py +++ b/coffea/nanoevents/methods/base.py @@ -29,6 +29,10 @@ class NanoCollection: and other advanced mixin types. """ + def _collection_name(self): + """The name of the collection (i.e. the field under events where it is found)""" + return self.layout.purelist_parameter("collection_name") + def _getlistarray(self): """Do some digging to find the initial listarray""" diff --git a/coffea/nanoevents/methods/nanoaod.py b/coffea/nanoevents/methods/nanoaod.py index 3e958a097..21e6c3c97 100644 --- a/coffea/nanoevents/methods/nanoaod.py +++ b/coffea/nanoevents/methods/nanoaod.py @@ -298,6 +298,12 @@ def matched_muons(self): def matched_gen(self): return self._events().GenJet._apply_global_index(self.genJetIdxG) + @property + def constituents(self): + if "pFCandsIdxG" not in self.fields: + raise RuntimeError("PF candidates are only available for PFNano") + return self._events().JetPFCands._apply_global_index(self.pFCandsIdxG) + _set_repr_name("Jet") @@ -336,6 +342,12 @@ def subjets(self): def matched_gen(self): return self._events().GenJetAK8._apply_global_index(self.genJetAK8IdxG) + @property + def constituents(self): + if "pFCandsIdxG" not in self.fields: + raise RuntimeError("PF candidates are only available for PFNano") + return self._events().FatJetPFCands._apply_global_index(self.pFCandsIdxG) + _set_repr_name("FatJet") @@ -351,6 +363,99 @@ def r(self): _set_repr_name("MissingET") + +@awkward.mixin_class(behavior) +class Vertex(vector.ThreeVector, base.NanoCollection): + """NanoAOD vertex object""" + + pass + + +_set_repr_name("Vertex") + + 
+@awkward.mixin_class(behavior) +class SecondaryVertex(Vertex): + """NanoAOD secondary vertex object""" + + @property + def p4(self): + """4-momentum vector of tracks associated to this SV""" + return awkward.zip( + { + "pt": self["pt"], + "eta": self["eta"], + "phi": self["phi"], + "mass": self["mass"], + }, + with_name="PtEtaPhiMLorentzVector", + ) + + +_set_repr_name("SecondaryVertex") + + +@awkward.mixin_class(behavior) +class AssociatedPFCand(base.NanoCollection): + """PFNano PF candidate to jet association object""" + + collection_map = { + "JetPFCands": ("Jet", "PFCands"), + "FatJetPFCands": ("FatJet", "PFCands"), + "GenJetCands": ("GenJet", "GenCands"), + "GenFatJetCands": ("GenJetAK8", "GenCands"), + } + + @property + def jet(self): + collection = self._events()[self.collection_map[self._collection_name()][0]] + return collection._apply_global_index(self.jetIdxG) + + @property + def pf(self): + collection = self._events()[self.collection_map[self._collection_name()][1]] + return collection._apply_global_index(self.pFCandsIdxG) + + +_set_repr_name("AssociatedPFCand") + + +@awkward.mixin_class(behavior) +class AssociatedSV(base.NanoCollection): + """PFNano secondary vertex to jet association object""" + + collection_map = { + "JetSVs": ("Jet", "SV"), + "FatJetSVs": ("FatJet", "SV"), + # these two are unclear + "GenJetSVs": ("GenJet", "SV"), + "GenFatJetSVs": ("GenJetAK8", "SV"), + } + + @property + def jet(self): + collection = self._events()[self.collection_map[self._collection_name()][0]] + return collection._apply_global_index(self.jetIdxG) + + @property + def sv(self): + collection = self._events()[self.collection_map[self._collection_name()][1]] + return collection._apply_global_index(self.sVIdxG) + + +_set_repr_name("AssociatedSV") + + +@awkward.mixin_class(behavior) +class PFCand(candidate.PtEtaPhiMCandidate, base.NanoCollection): + """PFNano particle flow candidate object""" + + pass + + +_set_repr_name("PFCand") + + __all__ = [ 
"PtEtaPhiMCollection", "GenParticle", @@ -363,4 +468,9 @@ def r(self): "Jet", "FatJet", "MissingET", + "Vertex", + "SecondaryVertex", + "AssociatedPFCand", + "AssociatedSV", + "PFCand", ] diff --git a/coffea/nanoevents/schemas/__init__.py b/coffea/nanoevents/schemas/__init__.py index cd692b6a6..111acce04 100644 --- a/coffea/nanoevents/schemas/__init__.py +++ b/coffea/nanoevents/schemas/__init__.py @@ -1,5 +1,5 @@ from .base import BaseSchema -from .nanoaod import NanoAODSchema +from .nanoaod import NanoAODSchema, PFNanoAODSchema from .treemaker import TreeMakerSchema -__all__ = ["BaseSchema", "NanoAODSchema", "TreeMakerSchema"] +__all__ = ["BaseSchema", "NanoAODSchema", "PFNanoAODSchema", "TreeMakerSchema"] diff --git a/coffea/nanoevents/schemas/nanoaod.py b/coffea/nanoevents/schemas/nanoaod.py index 780a414f7..b08481f57 100644 --- a/coffea/nanoevents/schemas/nanoaod.py +++ b/coffea/nanoevents/schemas/nanoaod.py @@ -27,10 +27,10 @@ class NanoAODSchema(BaseSchema): There is a class-level variable ``warn_missing_crossrefs`` which will alter the behavior of NanoAODSchema. If warn_missing_crossrefs is true then when a missing global index cross-ref - target is encountered a warning will be issued instead of an exception. (Default: False) + target is encountered a warning will be issued. Regardless, the cross-reference is dropped. 
""" - warn_missing_crossrefs = False + warn_missing_crossrefs = True mixins = { "CaloMET": "MissingET", @@ -40,6 +40,7 @@ class NanoAODSchema(BaseSchema): "METFixEE2017": "MissingET", "PuppiMET": "MissingET", "RawMET": "MissingET", + "RawPuppiMET": "MissingET", "TkMET": "MissingET", # pseudo-lorentz: pt, eta, phi, mass=0 "IsoTrack": "PtEtaPhiMCollection", @@ -48,11 +49,11 @@ class NanoAODSchema(BaseSchema): # True lorentz: pt, eta, phi, mass "FatJet": "FatJet", "GenDressedLepton": "PtEtaPhiMCollection", + "GenIsolatedPhoton": "PtEtaPhiMCollection", "GenJet": "PtEtaPhiMCollection", - "GenJetAK8": "FatJet", + "GenJetAK8": "PtEtaPhiMCollection", "Jet": "Jet", "LHEPart": "PtEtaPhiMCollection", - "SV": "PtEtaPhiMCollection", "SubGenJetAK8": "PtEtaPhiMCollection", "SubJet": "PtEtaPhiMCollection", # Candidate: lorentz + charge @@ -64,17 +65,55 @@ class NanoAODSchema(BaseSchema): "GenVisTau": "GenVisTau", # special "GenPart": "GenParticle", + "PV": "Vertex", + "SV": "SecondaryVertex", } """Default configuration for mixin types, based on the collection name. The types are implemented in the `coffea.nanoevents.methods.nanoaod` module. 
""" + all_cross_references = { + "Electron_genPartIdx": "GenPart", + "Electron_jetIdx": "Jet", + "Electron_photonIdx": "Photon", + "FatJet_genJetAK8Idx": "GenJetAK8", + "FatJet_subJetIdx1": "SubJet", + "FatJet_subJetIdx2": "SubJet", + "FsrPhoton_muonIdx": "Muon", + "GenPart_genPartIdxMother": "GenPart", + "GenVisTau_genPartIdxMother": "GenPart", + "Jet_electronIdx1": "Electron", + "Jet_electronIdx2": "Electron", + "Jet_genJetIdx": "GenJet", + "Jet_muonIdx1": "Muon", + "Jet_muonIdx2": "Muon", + "Muon_fsrPhotonIdx": "FsrPhoton", + "Muon_genPartIdx": "GenPart", + "Muon_jetIdx": "Jet", + "Photon_electronIdx": "Electron", + "Photon_genPartIdx": "GenPart", + "Photon_jetIdx": "Jet", + "Tau_genPartIdx": "GenPart", + "Tau_jetIdx": "Jet", + } + """Cross-references, where an index is to be interpreted with respect to another collection + + Each such cross-reference will be converted to a global indexer, so that arbitrarily sliced events + can still resolve the indirection back to the parent events + """ nested_items = { "FatJet_subJetIdxG": ["FatJet_subJetIdx1G", "FatJet_subJetIdx2G"], "Jet_muonIdxG": ["Jet_muonIdx1G", "Jet_muonIdx2G"], "Jet_electronIdxG": ["Jet_electronIdx1G", "Jet_electronIdx2G"], } - """Default nested collections, where nesting is accomplished by a fixed-length set of indexers""" + """Nested collections, where nesting is accomplished by a fixed-length set of indexers""" + nested_index_items = { + "Jet_pFCandsIdxG": ("Jet_nConstituents", "JetPFCands"), + "FatJet_pFCandsIdxG": ("FatJet_nConstituents", "FatJetPFCands"), + "GenJet_pFCandsIdxG": ("GenJet_nConstituents", "GenJetCands"), + "GenFatJet_pFCandsIdxG": ("GenJetAK8_nConstituents", "GenFatJetCands"), + } + """Nested collections, where nesting is accomplished by assuming the target can be unflattened according to a source counts array""" special_items = { "GenPart_distinctParentIdxG": ( transforms.distinctParent_form, @@ -95,20 +134,49 @@ class NanoAODSchema(BaseSchema): ), ), } - """Default special arrays, where 
the callable and input arrays are specified in the value""" + """Special arrays, where the callable and input arrays are specified in the value""" - def __init__(self, base_form, version="6"): - self._version = version + def __init__(self, base_form, version="latest"): super().__init__(base_form) + self._version = version + self.cross_references = dict(self.all_cross_references) + if version == "latest": + pass + else: + if int(version) < 7: + del self.cross_references["FatJet_genJetAK8Idx"] + if int(version) < 6: + del self.cross_references["FsrPhoton_muonIdx"] + del self.cross_references["Muon_fsrPhotonIdx"] self._form["contents"] = self._build_collections(self._form["contents"]) self._form["parameters"]["metadata"]["version"] = self._version + @classmethod + def v7(cls, base_form): + """Build the NanoEvents assuming NanoAODv7 + + For example, one can use ``NanoEventsFactory.from_root("file.root", schemaclass=NanoAODSchema.v7)`` + to ensure NanoAODv7 compatibility. + """ + return cls(base_form, version="7") + + @classmethod + def v6(cls, base_form): + """Build the NanoEvents assuming NanoAODv6""" + return cls(base_form, version="6") + + @classmethod + def v5(cls, base_form): + """Build the NanoEvents assuming NanoAODv5""" + return cls(base_form, version="5") + def _build_collections(self, branch_forms): # parse into high-level records (collections, list collections, and singletons) collections = set(k.split("_")[0] for k in branch_forms) collections -= set( k for k in collections if k.startswith("n") and k[1:] in collections ) + isData = "GenPart" not in collections # Create offsets virtual arrays for name in collections: @@ -118,25 +186,26 @@ def _build_collections(self, branch_forms): ) # Create global index virtual arrays for indirection - idxbranches = [k for k in branch_forms if "Idx" in k] - for name in collections: - indexers = [k for k in idxbranches if k.startswith(name + "_")] - for k in indexers: - target = k[len(name) + 1 : k.find("Idx")] - target = 
target[0].upper() + target[1:] - if target not in collections: - problem = RuntimeError( - "Parsing indexer %s, expected to find collection %s but did not" - % (k, target) + for indexer, target in self.cross_references.items(): + if target.startswith("Gen") and isData: + continue + if indexer not in branch_forms: + if self.warn_missing_crossrefs: + warnings.warn( + f"Missing cross-reference index for {indexer} => {target}", + RuntimeWarning, ) - if self.__class__.warn_missing_crossrefs: - warnings.warn(str(problem), RuntimeWarning) - continue - else: - raise problem - branch_forms[k + "G"] = transforms.local2global_form( - branch_forms[k], branch_forms["o" + target] - ) + continue + if "o" + target not in branch_forms: + if self.warn_missing_crossrefs: + warnings.warn( + f"Missing cross-reference target for {indexer} => {target}", + RuntimeWarning, + ) + continue + branch_forms[indexer + "G"] = transforms.local2global_form( + branch_forms[indexer], branch_forms["o" + target] + ) # Create nested indexer from Idx1, Idx2, ... 
arrays for name, indexers in self.nested_items.items(): @@ -145,6 +214,13 @@ def _build_collections(self, branch_forms): [branch_forms[idx] for idx in indexers] ) + # Create nested indexer from n* counts arrays + for name, (local_counts, target) in self.nested_index_items.items(): + if local_counts in branch_forms and "o" + target in branch_forms: + branch_forms[name] = transforms.counts2nestedindex_form( + branch_forms[local_counts], branch_forms["o" + target] + ) + # Create any special arrays for name, (fcn, args) in self.special_items.items(): if all(k in branch_forms for k in args): @@ -200,3 +276,47 @@ def behavior(self): from coffea.nanoevents.methods import nanoaod return nanoaod.behavior + + +class PFNanoAODSchema(NanoAODSchema): + """PFNano schema builder + + PFNano is an extended NanoAOD format that includes PF candidates and secondary vertices. + More info at https://github.com/cms-jet/PFNano + """ + + mixins = { + **NanoAODSchema.mixins, + "JetSVs": "AssociatedSV", + "FatJetSVs": "AssociatedSV", + "GenJetSVs": "AssociatedSV", + "GenFatJetSVs": "AssociatedSV", + "JetPFCands": "AssociatedPFCand", + "FatJetPFCands": "AssociatedPFCand", + "GenJetCands": "AssociatedPFCand", + "GenFatJetCands": "AssociatedPFCand", + "PFCands": "PFCand", + "GenCands": "PFCand", + } + all_cross_references = { + **NanoAODSchema.all_cross_references, + "FatJetPFCands_jetIdx": "FatJet", # breaks pattern + "FatJetPFCands_pFCandsIdx": "PFCands", + "FatJetSVs_jetIdx": "FatJet", # breaks pattern + "FatJetSVs_sVIdx": "SV", + "FatJet_electronIdx3SJ": "Electron", + "FatJet_muonIdx3SJ": "Muon", + "GenFatJetCands_jetIdx": "GenJetAK8", # breaks pattern + "GenFatJetCands_pFCandsIdx": "GenCands", # breaks pattern + "GenFatJetSVs_jetIdx": "GenJetAK8", # breaks pattern + "GenFatJetSVs_sVIdx": "SV", + "GenJetCands_jetIdx": "GenJet", # breaks pattern + "GenJetCands_pFCandsIdx": "GenCands", # breaks pattern + "GenJetSVs_jetIdx": "GenJet", # breaks pattern + "GenJetSVs_sVIdx": "SV", + 
"JetPFCands_jetIdx": "Jet", + "JetPFCands_pFCandsIdx": "PFCands", + "JetSVs_jetIdx": "Jet", + "JetSVs_sVIdx": "SV", + "SubJet_subGenJetAK8Idx": "SubGenJetAK8", + } diff --git a/coffea/nanoevents/transforms.py b/coffea/nanoevents/transforms.py index d5d81cca8..5721e4111 100644 --- a/coffea/nanoevents/transforms.py +++ b/coffea/nanoevents/transforms.py @@ -125,6 +125,43 @@ def local2global(stack): stack.append(out) +def counts2nestedindex_form(local_counts, target_offsets): + if not local_counts["class"].startswith("ListOffset"): + raise RuntimeError + if not target_offsets["class"] == "NumpyArray": + raise RuntimeError + form = { + "class": "ListOffsetArray64", + "offsets": "i64", + "content": copy.deepcopy(local_counts), + } + form["content"]["content"]["itemsize"] = 8 + form["content"]["content"]["primitive"] = "int64" + form["content"]["content"]["parameters"] = {} + key = concat( + local_counts["form_key"], target_offsets["form_key"], "!counts2nestedindex" + ) + form["form_key"] = local_counts["form_key"] + form["content"]["form_key"] = key + form["content"]["content"]["form_key"] = concat(key, "!content") + return form + + +def counts2nestedindex(stack): + """Turn jagged local counts into doubly-jagged global index into a target + + Signature: local_counts,target_offsets,!counts2nestedindex + Outputs a jagged array with same axis-0 shape as counts axis-1 + """ + target_offsets = stack.pop() + local_counts = stack.pop() + out = awkward.unflatten( + numpy.arange(target_offsets[-1], dtype=numpy.int64), + awkward.flatten(local_counts), + ) + stack.append(out) + + @numba.njit def _distinctParent_kernel(allpart_parent, allpart_pdg): out = numpy.empty(len(allpart_pdg), dtype=numpy.int64) diff --git a/tests/test_nanoevents.py b/tests/test_nanoevents.py index c8bf9de5f..d04b6d45e 100644 --- a/tests/test_nanoevents.py +++ b/tests/test_nanoevents.py @@ -1,6 +1,6 @@ import os import awkward as ak -from coffea.nanoevents import NanoEventsFactory +from coffea.nanoevents 
import NanoEventsFactory, NanoAODSchema import pytest @@ -28,7 +28,9 @@ def crossref(events): @pytest.mark.parametrize("suffix", suffixes) def test_read_nanomc(suffix): path = os.path.abspath(f'tests/samples/nano_dy.{suffix}') - factory = getattr(NanoEventsFactory, f'from_{suffix}')(path) + # parquet files were converted from even older nanoaod + nanoversion = NanoAODSchema.v6 if suffix == "root" else NanoAODSchema.v5 + factory = getattr(NanoEventsFactory, f'from_{suffix}')(path, schemaclass=nanoversion) events = factory.events() # test after views first @@ -59,7 +61,9 @@ def test_read_nanomc(suffix): @pytest.mark.parametrize("suffix", suffixes) def test_read_nanodata(suffix): path = os.path.abspath(f'tests/samples/nano_dimuon.{suffix}') - factory =getattr(NanoEventsFactory, f'from_{suffix}')(path) + # parquet files were converted from even older nanoaod + nanoversion = NanoAODSchema.v6 if suffix == "root" else NanoAODSchema.v5 + factory =getattr(NanoEventsFactory, f'from_{suffix}')(path, schemaclass=nanoversion) events = factory.events() crossref(events)