diff --git a/machine_learning_hep/analysis/analyzer_jets.py b/machine_learning_hep/analysis/analyzer_jets.py index 3cd6b6bd4b..dbb4fb0cdf 100644 --- a/machine_learning_hep/analysis/analyzer_jets.py +++ b/machine_learning_hep/analysis/analyzer_jets.py @@ -22,7 +22,7 @@ from machine_learning_hep.analysis.analyzer import Analyzer from machine_learning_hep.fitting.roofitter import RooFitter from machine_learning_hep.utilities import folding, make_message_notfound -from machine_learning_hep.utils.hist import (bin_array, create_hist, +from machine_learning_hep.utils.hist import (bin_array, create_hist, norm_response, fold_hist, fill_hist_fast, get_axis, get_dim, get_bin_limits, get_nbins, project_hist, scale_bin, sum_hists, ensure_sumw2) @@ -936,24 +936,67 @@ def estimate_feeddown(self): continue # TODO: derive histogram - # TODO: change order of axes to be consistent - h3_fd_gen = create_hist('h3_feeddown_gen', - f';p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{var}', - bins_ptjet, self.bins_candpt, bins_obs[var]) - fill_hist_fast(h3_fd_gen, df[['pt_jet', 'pt_cand', f'{colname}']]) - self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f'fd/h_ptjet-{var}_feeddown_gen_noeffscaling.png') + h3_fd_gen_orig = create_hist('h3_feeddown_gen', + f';p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{var}', + bins_ptjet, self.bins_candpt, bins_obs[var]) + fill_hist_fast(h3_fd_gen_orig, df[['pt_jet', 'pt_cand', f'{colname}']]) + self._save_hist(project_hist(h3_fd_gen_orig, [0, 2], {}), f'fd/h_ptjet-{var}_feeddown_gen_noeffscaling.png') + + # new method + h3_fd_gen = h3_fd_gen_orig.Clone() + ensure_sumw2(h3_fd_gen) + self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_gen.png') + # apply np efficiency + for ipt in range(get_nbins(h3_fd_gen, 1)): + eff_np = self.hcandeff_gen['np'].GetBinContent(ipt+1) + for iptjet, ishape in itertools.product( + range(get_nbins(h3_fd_gen, 0)), range(get_nbins(h3_fd_gen, 2))): + scale_bin(h3_fd_gen, eff_np, iptjet+1, ipt+1, ishape+1) + self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_gen_geneff.png') + + # 3d folding incl. kinematic efficiencies + with TFile(self.n_fileeff) as rfile: + h_effkine_gen = self._build_effkine( + rfile.Get(f'h_effkine_fd_gen_nocuts_{var}'), + rfile.Get(f'h_effkine_fd_gen_cut_{var}')) + h_effkine_det = self._build_effkine( + rfile.Get(f'h_effkine_fd_det_nocuts_{var}'), + rfile.Get(f'h_effkine_fd_det_cut_{var}')) + h_response = rfile.Get(f'h_response_fd_{var}') + h_response_norm = norm_response(h_response, 3) + h3_fd_gen.Multiply(h_effkine_gen) + self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_gen_genkine.png') + h3_fd_det = fold_hist(h3_fd_gen, h_response_norm) + self._save_hist(project_hist(h3_fd_det, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_det.png') + h3_fd_det.Divide(h_effkine_det) + self._save_hist(project_hist(h3_fd_det, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_det_detkine.png') + + # undo prompt efficiency + for ipt in range(get_nbins(h3_fd_det, 1)): + eff_pr = self.h_effnew_pthf['pr'].GetBinContent(ipt+1) + if np.isclose(eff_pr, 0.): + self.logger.error('Efficiency zero for %s in pt bin %d, continuing', var, ipt) + continue # TODO: how should we handle this? + for iptjet, ishape in itertools.product( + range(get_nbins(h3_fd_det, 0)), range(get_nbins(h3_fd_det, 2))): + scale_bin(h3_fd_det, 1./eff_pr, iptjet+1, ipt+1, ishape+1) + self._save_hist(project_hist(h3_fd_det, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_det_deteff.png') + + # project to 2d (ptjet-shape) + h_fd_det = project_hist(h3_fd_det, [0, 2], {}) + # old method + h3_fd_gen = h3_fd_gen_orig.Clone() ensure_sumw2(h3_fd_gen) - for ipt in range(get_nbins(h3_fd_gen, axis=0)): + for ipt in range(get_nbins(h3_fd_gen, 1)): eff_pr = self.hcandeff['pr'].GetBinContent(ipt+1) eff_np = self.hcandeff['np'].GetBinContent(ipt+1) if np.isclose(eff_pr, 0.): self.logger.error('Efficiency zero for %s in pt bin %d, continuing', var, ipt) continue # TODO: how should we handle this? - for iptjet, ishape in itertools.product( - range(get_nbins(h3_fd_gen, axis=0)), range(get_nbins(h3_fd_gen, axis=2))): - scale_bin(h3_fd_gen, eff_np/eff_pr, ipt+1, iptjet+1, ishape+1) + range(get_nbins(h3_fd_gen, 0)), range(get_nbins(h3_fd_gen, 2))): + scale_bin(h3_fd_gen, eff_np/eff_pr, iptjet+1, ipt+1, ishape+1) h_fd_gen = project_hist(h3_fd_gen, [0, 2], {}) self._save_hist(h_fd_gen, f'fd/h_ptjet-{var}_feeddown_gen_effscaled.png') @@ -986,17 +1029,20 @@ def estimate_feeddown(self): hfeeddown_det.Divide(h_effkine_det) self._save_hist(hfeeddown_det, f'fd/h_ptjet-{var}_feeddown_det_kineeffscaled.png') - # TODO: check scaling - hfeeddown_det.Scale(powheg_xsection_scale_factor * self.cfg('branching_ratio')) - hfeeddown_det_mc = hfeeddown_det.Clone() - hfeeddown_det_mc.SetName(hfeeddown_det_mc.GetName() + '_mc') - hfeeddown_det.Scale(self.n_colls['data'] / self.cfg('xsection_inel')) - hfeeddown_det_mc.Scale(self.n_colls['mc'] / self.cfg('xsection_inel_mc')) - - self._save_hist(hfeeddown_det, f'fd/h_ptjet-{var}_feeddown_det_final.png') - self._save_hist(hfeeddown_det, f'fd/h_ptjet-{var}_feeddown_det_final_mc.png') - self.hfeeddown_det['data'][var] = hfeeddown_det - self.hfeeddown_det['mc'][var] = hfeeddown_det_mc + if self.cfg('fd_folding_method') == '3d': + self.logger.info('using 3d folding for feeddown estimation for %s', var) + hfeeddown_det = h_fd_det + # TODO: check scaling + hfeeddown_det.Scale(powheg_xsection_scale_factor * self.cfg('branching_ratio')) + hfeeddown_det_mc = hfeeddown_det.Clone() + hfeeddown_det_mc.SetName(hfeeddown_det_mc.GetName() + '_mc') + hfeeddown_det.Scale(self.n_colls['data'] / self.cfg('xsection_inel')) + hfeeddown_det_mc.Scale(self.n_colls['mc'] / self.cfg('xsection_inel_mc')) + + self._save_hist(hfeeddown_det, f'fd/h_ptjet-{var}_feeddown_det_final.png') + self._save_hist(hfeeddown_det, f'fd/h_ptjet-{var}_feeddown_det_final_mc.png') + self.hfeeddown_det['data'][var] = hfeeddown_det + self.hfeeddown_det['mc'][var] = hfeeddown_det_mc def _build_effkine(self, h_nocuts, h_cuts): diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_LcJet_pp.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_LcJet_pp.yml index f0d984acb0..aa54125cee 100644 --- a/machine_learning_hep/data/data_run3/database_ml_parameters_LcJet_pp.yml +++ b/machine_learning_hep/data/data_run3/database_ml_parameters_LcJet_pp.yml @@ -464,6 +464,7 @@ LcJet_pp: unfolding_iterations: 8 # used, maximum iteration unfolding_iterations_sel: 5 # used, selected iteration # systematics + fd_folding_method: 3d fd_root: '/data2/vkucera/powheg/trees_powheg_fd_F05_R05.root' fd_parquet: '/data2/jklein/powheg/Lc_powheg_fd_F05_R05.parquet' diff --git a/machine_learning_hep/processer_jet.py b/machine_learning_hep/processer_jet.py index 5605efcb98..2a998ef26e 100644 --- a/machine_learning_hep/processer_jet.py +++ b/machine_learning_hep/processer_jet.py @@ -269,6 +269,13 @@ def process_efficiency_single(self, index): self.binarray_pthf) for (cat, var) in itertools.product(cats, observables) if not '-' in var} + h_response_fd = {var: + create_hist( + f'h_response_fd_{var}', + f";p_{{T}}^{{jet}} (GeV/#it{{c}});{var};p_{{T}}^{{jet}} (GeV/#it{{c}});{var};p_{{T}} (GeV/#it{{c}})", + self.binarrays_ptjet['det'][var], self.binarrays_obs['det']['fPt'], self.binarrays_obs['det'][var], + self.binarrays_ptjet['gen'][var], self.binarrays_obs['gen']['fPt'], self.binarrays_obs['gen'][var]) + for var in self.cfg('observables', []) if not '-' in var} # TODO: derive bins from response histogram h_effkine = {(cat, level, cut, var): create_hist(f'h_effkine_{cat}_{level}_{cut}_{var}', @@ -277,6 +284,12 @@ def process_efficiency_single(self, index): for (var, var_spec), level, cat, cut in itertools.product(observables.items(), levels_effkine, cats, cuts) if not '-' in var} + h_effkine_fd = {(level, cut, var): create_hist(f'h_effkine_fd_{level}_{cut}_{var}', + f";p_{{T}}^{{jet}} (GeV/#it{{c}});{var_spec['label']}", + self.binarrays_ptjet[level][var], self.binarrays_obs[level]['fPt'], self.binarrays_obs[level][var]) + for (var, var_spec), level, cut + in itertools.product(self.cfg('observables', {}).items(), levels_effkine, cuts) + if not '-' in var} h_mctruth = { (cat, var): create_hist( f'h_ptjet-pthf-{var}_{cat}_gen', @@ -358,11 +371,13 @@ def process_efficiency_single(self, index): if cat in dfmatch and dfmatch[cat] is not None: self._prepare_response(dfmatch[cat], h_effkine, h_response, cat, var) + self._prepare_response_fd(dfmatch[cat], h_effkine_fd, h_response_fd, var) f = self.cfg('frac_mcana', .2) _, df_mccorr = self.split_df(dfmatch[cat], f if f < 1. else 0.) self._prepare_response(df_mccorr, h_effkine_frac, h_response_frac, cat, var) for name, obj in itertools.chain(h_eff.items(), h_effkine.items(), h_response.items(), + h_effkine_fd.items(), h_response_fd.items(), h_effkine_frac.items(), h_response_frac.items(), h_mctruth.items()): try: rfile.WriteObject(obj, obj.GetName()) @@ -393,3 +408,35 @@ def _prepare_response(self, dfi, h_effkine, h_response, cat, var): df = df.loc[(df.fJetPt >= axis_ptjet_det.GetXmin()) & (df.fJetPt < axis_ptjet_det.GetXmax()) & (df[f'{var}'] >= axis_var_det.GetXmin()) & (df[f'{var}'] < axis_var_det.GetXmax())] fill_hist(h_effkine[(cat, 'gen', 'cut', var)], df[['fJetPt_gen', f'{var}_gen']]) + + + def _prepare_response_fd(self, dfi, h_effkine, h_response, var): + axis_ptjet_det = get_axis(h_response[var], 0) + axis_pthf_det = get_axis(h_response[var], 1) + axis_var_det = get_axis(h_response[var], 2) + axis_ptjet_gen = get_axis(h_response[var], 3) + axis_pthf_gen = get_axis(h_response[var], 4) + axis_var_gen = get_axis(h_response[var], 5) + + df = dfi + # TODO: the first cut should be taken care of by under-/overflow bins, check their usage in analyzer + df = df.loc[(df.fJetPt >= axis_ptjet_det.GetXmin()) & (df.fJetPt < axis_ptjet_det.GetXmax()) & + (df.fPt >= axis_pthf_det.GetXmin()) & (df.fPt < axis_pthf_det.GetXmax()) & + (df[var] >= axis_var_det.GetXmin()) & (df[var] < axis_var_det.GetXmax())] + fill_hist(h_effkine[('det', 'nocuts', var)], df[['fJetPt', 'fPt', var]]) + df = df.loc[(df.fJetPt_gen >= axis_ptjet_gen.GetXmin()) & (df.fJetPt_gen < axis_ptjet_gen.GetXmax()) & + (df.fPt_gen >= axis_pthf_gen.GetXmin()) & (df.fPt_gen < axis_pthf_gen.GetXmax()) & + (df[f'{var}_gen'] >= axis_var_gen.GetXmin()) & (df[f'{var}_gen'] < axis_var_gen.GetXmax())] + fill_hist(h_effkine[('det', 'cut', var)], df[['fJetPt', 'fPt', var]]) + + fill_hist(h_response[var], df[['fJetPt', 'fPt', f'{var}', 'fJetPt_gen', 'fPt_gen', f'{var}_gen']]) + + df = dfi + df = df.loc[(df.fJetPt_gen >= axis_ptjet_gen.GetXmin()) & (df.fJetPt_gen < axis_ptjet_gen.GetXmax()) & + (df.fPt_gen >= axis_pthf_gen.GetXmin()) & (df.fPt_gen < axis_pthf_gen.GetXmax()) & + (df[f'{var}_gen'] >= axis_var_gen.GetXmin()) & (df[f'{var}_gen'] < axis_var_gen.GetXmax())] + fill_hist(h_effkine[('gen', 'nocuts', var)], df[['fJetPt_gen', 'fPt', f'{var}_gen']]) + df = df.loc[(df.fJetPt >= axis_ptjet_det.GetXmin()) & (df.fJetPt < axis_ptjet_det.GetXmax()) & + (df.fPt >= axis_pthf_det.GetXmin()) & (df.fPt < axis_pthf_det.GetXmax()) & + (df[f'{var}'] >= axis_var_det.GetXmin()) & (df[f'{var}'] < axis_var_det.GetXmax())] + fill_hist(h_effkine[('gen', 'cut', var)], df[['fJetPt_gen', 'fPt', f'{var}_gen']]) diff --git a/machine_learning_hep/utils/hist.py b/machine_learning_hep/utils/hist.py index 4d4dada39f..8b688da26c 100644 --- a/machine_learning_hep/utils/hist.py +++ b/machine_learning_hep/utils/hist.py @@ -1,6 +1,7 @@ from collections import deque import itertools +import math import numpy as np import pandas as pd import ROOT @@ -253,3 +254,68 @@ def sum_hists(hists, name = None): def ensure_sumw2(hist): if hist.GetSumw2N() < 1: hist.Sumw2() + + +def get_bin_val(hist, hbin): + if isinstance(hist, ROOT.TH1): + return hist.GetBinContent(*hbin) + if isinstance(hist, ROOT.THn): + return hist.GetBinContent(np.array(hbin, 'i')) + raise NotImplementedError + + +def get_bin_err(hist, hbin): + if isinstance(hist, ROOT.TH1): + return hist.GetBinError(*hbin) + if isinstance(hist, ROOT.THn): + return hist.GetBinError(np.array(hbin, 'i')) + raise NotImplementedError + + +def set_bin_val(hist, hbin, val): + if isinstance(hist, ROOT.TH1): + return hist.SetBinContent(*hbin, val) + if isinstance(hist, ROOT.THn): + return hist.SetBinContent(np.array(hbin, 'i'), val) + raise NotImplementedError + + +def set_bin_err(hist, hbin, val): + if isinstance(hist, ROOT.TH1): + return hist.SetBinError(*hbin, val) + if isinstance(hist, ROOT.THn): + return hist.SetBinError(np.array(hbin, 'i'), val) + raise NotImplementedError + + +def norm_response(response, dim_out): + response_norm = response.Clone() + for bin_in in itertools.product(*(range(1, get_nbins(response_norm, iaxis) + 1) + for iaxis in range(dim_out, get_dim(response_norm)))): + print(f'{bin_in=}', flush=True) + for iaxis, val in enumerate(bin_in, dim_out): + get_axis(response_norm, iaxis).SetRange(val, val) + norm = response_norm.Projection(0).Integral() + if np.isclose(norm, 0.): + continue + for bin_out in itertools.product(*(range(1, get_nbins(response_norm, i)+1) for i in range(dim_out))): + set_bin_val(response_norm, bin_out + bin_in, get_bin_val(response_norm, bin_out + bin_in) / norm) + set_bin_err(response_norm, bin_out + bin_in, get_bin_err(response_norm, bin_out + bin_in) / norm) + return response_norm + + +def fold_hist(hist, response): + """Fold hist with response""" + assert get_dim(response) > get_dim(hist) + dim_out = get_dim(response) - get_dim(hist) + axes_spec = list(np.array(get_axis(response, i).GetXbins(), 'd') for i in range(dim_out)) + hfold = create_hist('test', 'test', *axes_spec) + for bin_out in itertools.product(*(range(1, get_nbins(hfold, i)+1) for i in range(get_dim(hfold)))): + val = 0. + err = 0. + for bin_in in itertools.product(*(range(1, get_nbins(hist, i)+1) for i in range(get_dim(hist)))): + val += get_bin_val(hist, bin_in) * get_bin_val(response, bin_out + bin_in) + err += get_bin_err(hist, bin_in)**2 * get_bin_val(response, bin_out + bin_in)**2 + set_bin_val(hfold, bin_out, val) + set_bin_err(hfold, bin_out, math.sqrt(err)) + return hfold