Skip to content

Commit

Permalink
Add new feeddown folding
Browse files Browse the repository at this point in the history
  • Loading branch information
qgp committed Sep 4, 2024
1 parent aea07b3 commit 91b6f28
Show file tree
Hide file tree
Showing 4 changed files with 182 additions and 23 deletions.
92 changes: 69 additions & 23 deletions machine_learning_hep/analysis/analyzer_jets.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
from machine_learning_hep.analysis.analyzer import Analyzer
from machine_learning_hep.fitting.roofitter import RooFitter
from machine_learning_hep.utilities import folding, make_message_notfound
from machine_learning_hep.utils.hist import (bin_array, create_hist,
fill_hist_fast, get_axis, get_dim,
from machine_learning_hep.utils.hist import (bin_array, create_hist, norm_response,
fill_hist_fast, fold_hist, get_axis, get_dim,
get_nbins, project_hist,
scale_bin, sum_hists, ensure_sumw2)

Expand Down Expand Up @@ -918,24 +918,67 @@ def estimate_feeddown(self):
continue

# TODO: derive histogram
# TODO: change order of axes to be consistent
h3_fd_gen = create_hist('h3_feeddown_gen',
f';p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{var}',
bins_ptjet, self.bins_candpt, bins_obs[var])
fill_hist_fast(h3_fd_gen, df[['pt_jet', 'pt_cand', f'{colname}']])
self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f'fd/h_ptjet-{var}_feeddown_gen_noeffscaling.png')
h3_fd_gen_orig = create_hist('h3_feeddown_gen',
f';p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{var}',
bins_ptjet, self.bins_candpt, bins_obs[var])
fill_hist_fast(h3_fd_gen_orig, df[['pt_jet', 'pt_cand', f'{colname}']])
self._save_hist(project_hist(h3_fd_gen_orig, [0, 2], {}), f'fd/h_ptjet-{var}_feeddown_gen_noeffscaling.png')

# new method
h3_fd_gen = h3_fd_gen_orig.Clone()
ensure_sumw2(h3_fd_gen)
self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_gen.png')
# apply np efficiency
for ipt in range(get_nbins(h3_fd_gen, 1)):
eff_np = self.hcandeff_gen['np'].GetBinContent(ipt+1)
for iptjet, ishape in itertools.product(
range(get_nbins(h3_fd_gen, 0)), range(get_nbins(h3_fd_gen, 2))):
scale_bin(h3_fd_gen, eff_np, iptjet+1, ipt+1, ishape+1)
self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_gen_geneff.png')

# 3d folding incl. kinematic efficiencies
with TFile(self.n_fileeff) as rfile:
h_effkine_gen = self._build_effkine(
rfile.Get(f'h_effkine_fd_gen_nocuts_{var}'),
rfile.Get(f'h_effkine_fd_gen_cut_{var}'))
h_effkine_det = self._build_effkine(
rfile.Get(f'h_effkine_fd_det_nocuts_{var}'),
rfile.Get(f'h_effkine_fd_det_cut_{var}'))
h_response = rfile.Get(f'h_response_fd_{var}')
h_response_norm = norm_response(h_response, 3)
h3_fd_gen.Multiply(h_effkine_gen)
self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_gen_genkine.png')
h3_fd_det = fold_hist(h3_fd_gen, h_response_norm)
self._save_hist(project_hist(h3_fd_det, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_det.png')
h3_fd_det.Divide(h_effkine_det)
self._save_hist(project_hist(h3_fd_det, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_det_detkine.png')

# undo prompt efficiency
for ipt in range(get_nbins(h3_fd_det, 1)):
eff_pr = self.h_effnew_pthf['pr'].GetBinContent(ipt+1)
if np.isclose(eff_pr, 0.):
self.logger.error('Efficiency zero for %s in pt bin %d, continuing', var, ipt)
continue # TODO: how should we handle this?
for iptjet, ishape in itertools.product(
range(get_nbins(h3_fd_det, 0)), range(get_nbins(h3_fd_det, 2))):
scale_bin(h3_fd_det, 1./eff_pr, iptjet+1, ipt+1, ishape+1)
self._save_hist(project_hist(h3_fd_det, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_det_deteff.png')

# project to 2d (ptjet-shape)
h_fd_det = project_hist(h3_fd_det, [0, 2], {})

# old method
h3_fd_gen = h3_fd_gen_orig.Clone()
ensure_sumw2(h3_fd_gen)
for ipt in range(get_nbins(h3_fd_gen, axis=0)):
for ipt in range(get_nbins(h3_fd_gen, 1)):
eff_pr = self.hcandeff['pr'].GetBinContent(ipt+1)
eff_np = self.hcandeff['np'].GetBinContent(ipt+1)
if np.isclose(eff_pr, 0.):
self.logger.error('Efficiency zero for %s in pt bin %d, continuing', var, ipt)
continue # TODO: how should we handle this?

for iptjet, ishape in itertools.product(
range(get_nbins(h3_fd_gen, axis=0)), range(get_nbins(h3_fd_gen, axis=2))):
scale_bin(h3_fd_gen, eff_np/eff_pr, ipt+1, iptjet+1, ishape+1)
range(get_nbins(h3_fd_gen, 0)), range(get_nbins(h3_fd_gen, 2))):
scale_bin(h3_fd_gen, eff_np/eff_pr, iptjet+1, ipt+1, ishape+1)

h_fd_gen = project_hist(h3_fd_gen, [0, 2], {})
self._save_hist(h_fd_gen, f'fd/h_ptjet-{var}_feeddown_gen_effscaled.png')
Expand Down Expand Up @@ -968,17 +1011,20 @@ def estimate_feeddown(self):
hfeeddown_det.Divide(h_effkine_det)
self._save_hist(hfeeddown_det, f'fd/h_ptjet-{var}_feeddown_det_kineeffscaled.png')

# TODO: check scaling
hfeeddown_det.Scale(powheg_xsection_scale_factor * self.cfg('branching_ratio'))
hfeeddown_det_mc = hfeeddown_det.Clone()
hfeeddown_det_mc.SetName(hfeeddown_det_mc.GetName() + '_mc')
hfeeddown_det.Scale(self.n_colls['data'] / self.cfg('xsection_inel'))
hfeeddown_det_mc.Scale(self.n_colls['mc'] / self.cfg('xsection_inel_mc'))

self._save_hist(hfeeddown_det, f'fd/h_ptjet-{var}_feeddown_det_final.png')
self._save_hist(hfeeddown_det, f'fd/h_ptjet-{var}_feeddown_det_final_mc.png')
self.hfeeddown_det['data'][var] = hfeeddown_det
self.hfeeddown_det['mc'][var] = hfeeddown_det_mc
if self.cfg('fd_folding_method') == '3d':
self.logger.info('using 3d folding for feeddown estimation for %s', var)
hfeeddown_det = h_fd_det
# TODO: check scaling
hfeeddown_det.Scale(powheg_xsection_scale_factor * self.cfg('branching_ratio'))
hfeeddown_det_mc = hfeeddown_det.Clone()
hfeeddown_det_mc.SetName(hfeeddown_det_mc.GetName() + '_mc')
hfeeddown_det.Scale(self.n_colls['data'] / self.cfg('xsection_inel'))
hfeeddown_det_mc.Scale(self.n_colls['mc'] / self.cfg('xsection_inel_mc'))

self._save_hist(hfeeddown_det, f'fd/h_ptjet-{var}_feeddown_det_final.png')
self._save_hist(hfeeddown_det, f'fd/h_ptjet-{var}_feeddown_det_final_mc.png')
self.hfeeddown_det['data'][var] = hfeeddown_det
self.hfeeddown_det['mc'][var] = hfeeddown_det_mc


def _build_effkine(self, h_nocuts, h_cuts):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,7 @@ LcJet_pp:
unfolding_iterations: 8 # used, maximum iteration
unfolding_iterations_sel: 5 # used, selected iteration # systematics

fd_folding_method: 3d
fd_root: '/data2/vkucera/powheg/trees_powheg_fd_F05_R05.root'
fd_parquet: '/data2/jklein/powheg/Lc_powheg_fd_F05_R05.parquet'

Expand Down
47 changes: 47 additions & 0 deletions machine_learning_hep/processer_jet.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,13 @@ def process_efficiency_single(self, index):
self.binarray_pthf)
for (cat, var) in itertools.product(cats, observables)
if not '-' in var}
h_response_fd = {var:
create_hist(
f'h_response_fd_{var}',
f";p_{{T}}^{{jet}} (GeV/#it{{c}});{var};p_{{T}}^{{jet}} (GeV/#it{{c}});{var};p_{{T}} (GeV/#it{{c}})",
self.binarrays_ptjet['det'][var], self.binarrays_obs['det']['fPt'], self.binarrays_obs['det'][var],
self.binarrays_ptjet['gen'][var], self.binarrays_obs['gen']['fPt'], self.binarrays_obs['gen'][var])
for var in self.cfg('observables', []) if not '-' in var}
# TODO: derive bins from response histogram
h_effkine = {(cat, level, cut, var):
create_hist(f'h_effkine_{cat}_{level}_{cut}_{var}',
Expand All @@ -277,6 +284,12 @@ def process_efficiency_single(self, index):
for (var, var_spec), level, cat, cut
in itertools.product(observables.items(), levels_effkine, cats, cuts)
if not '-' in var}
h_effkine_fd = {(level, cut, var): create_hist(f'h_effkine_fd_{level}_{cut}_{var}',
f";p_{{T}}^{{jet}} (GeV/#it{{c}});{var_spec['label']}",
self.binarrays_ptjet[level][var], self.binarrays_obs[level]['fPt'], self.binarrays_obs[level][var])
for (var, var_spec), level, cut
in itertools.product(self.cfg('observables', {}).items(), levels_effkine, cuts)
if not '-' in var}
h_mctruth = {
(cat, var): create_hist(
f'h_ptjet-pthf-{var}_{cat}_gen',
Expand Down Expand Up @@ -358,11 +371,13 @@ def process_efficiency_single(self, index):

if cat in dfmatch and dfmatch[cat] is not None:
self._prepare_response(dfmatch[cat], h_effkine, h_response, cat, var)
self._prepare_response_fd(dfmatch[cat], h_effkine_fd, h_response_fd, var)
f = self.cfg('frac_mcana', .2)
_, df_mccorr = self.split_df(dfmatch[cat], f if f < 1. else 0.)
self._prepare_response(df_mccorr, h_effkine_frac, h_response_frac, cat, var)

for name, obj in itertools.chain(h_eff.items(), h_effkine.items(), h_response.items(),
h_effkine_fd.items(), h_response_fd.items(),
h_effkine_frac.items(), h_response_frac.items(), h_mctruth.items()):
try:
rfile.WriteObject(obj, obj.GetName())
Expand Down Expand Up @@ -393,3 +408,35 @@ def _prepare_response(self, dfi, h_effkine, h_response, cat, var):
df = df.loc[(df.fJetPt >= axis_ptjet_det.GetXmin()) & (df.fJetPt < axis_ptjet_det.GetXmax()) &
(df[f'{var}'] >= axis_var_det.GetXmin()) & (df[f'{var}'] < axis_var_det.GetXmax())]
fill_hist(h_effkine[(cat, 'gen', 'cut', var)], df[['fJetPt_gen', f'{var}_gen']])


def _prepare_response_fd(self, dfi, h_effkine, h_response, var):
axis_ptjet_det = get_axis(h_response[var], 0)
axis_pthf_det = get_axis(h_response[var], 1)
axis_var_det = get_axis(h_response[var], 2)
axis_ptjet_gen = get_axis(h_response[var], 3)
axis_pthf_gen = get_axis(h_response[var], 4)
axis_var_gen = get_axis(h_response[var], 5)

df = dfi
# TODO: the first cut should be taken care of by under-/overflow bins, check their usage in analyzer
df = df.loc[(df.fJetPt >= axis_ptjet_det.GetXmin()) & (df.fJetPt < axis_ptjet_det.GetXmax()) &
(df.fPt >= axis_pthf_det.GetXmin()) & (df.fPt < axis_pthf_det.GetXmax()) &
(df[var] >= axis_var_det.GetXmin()) & (df[var] < axis_var_det.GetXmax())]
fill_hist(h_effkine[('det', 'nocuts', var)], df[['fJetPt', 'fPt', var]])
df = df.loc[(df.fJetPt_gen >= axis_ptjet_gen.GetXmin()) & (df.fJetPt_gen < axis_ptjet_gen.GetXmax()) &
(df.fPt_gen >= axis_pthf_gen.GetXmin()) & (df.fPt_gen < axis_pthf_gen.GetXmax()) &
(df[f'{var}_gen'] >= axis_var_gen.GetXmin()) & (df[f'{var}_gen'] < axis_var_gen.GetXmax())]
fill_hist(h_effkine[('det', 'cut', var)], df[['fJetPt', 'fPt', var]])

fill_hist(h_response[var], df[['fJetPt', 'fPt', f'{var}', 'fJetPt_gen', 'fPt_gen', f'{var}_gen']])

df = dfi
df = df.loc[(df.fJetPt_gen >= axis_ptjet_gen.GetXmin()) & (df.fJetPt_gen < axis_ptjet_gen.GetXmax()) &
(df.fPt_gen >= axis_pthf_gen.GetXmin()) & (df.fPt_gen < axis_pthf_gen.GetXmax()) &
(df[f'{var}_gen'] >= axis_var_gen.GetXmin()) & (df[f'{var}_gen'] < axis_var_gen.GetXmax())]
fill_hist(h_effkine[('gen', 'nocuts', var)], df[['fJetPt_gen', 'fPt', f'{var}_gen']])
df = df.loc[(df.fJetPt >= axis_ptjet_det.GetXmin()) & (df.fJetPt < axis_ptjet_det.GetXmax()) &
(df.fPt >= axis_pthf_det.GetXmin()) & (df.fPt < axis_pthf_det.GetXmax()) &
(df[f'{var}'] >= axis_var_det.GetXmin()) & (df[f'{var}'] < axis_var_det.GetXmax())]
fill_hist(h_effkine[('gen', 'cut', var)], df[['fJetPt_gen', 'fPt', f'{var}_gen']])
65 changes: 65 additions & 0 deletions machine_learning_hep/utils/hist.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from collections import deque
import itertools

import math
import numpy as np
import pandas as pd
import ROOT
Expand Down Expand Up @@ -249,3 +250,67 @@ def sum_hists(hists, name = None):
def ensure_sumw2(hist):
if hist.GetSumw2N() < 1:
hist.Sumw2()


def get_bin_val(hist, hbin):
if isinstance(hist, ROOT.TH1):
return hist.GetBinContent(*hbin)
if isinstance(hist, ROOT.THn):
return hist.GetBinContent(np.array(hbin, 'i'))
raise NotImplementedError


def get_bin_err(hist, hbin):
if isinstance(hist, ROOT.TH1):
return hist.GetBinError(*hbin)
if isinstance(hist, ROOT.THn):
return hist.GetBinError(np.array(hbin, 'i'))
raise NotImplementedError


def set_bin_val(hist, hbin, val):
if isinstance(hist, ROOT.TH1):
return hist.SetBinContent(*hbin, val)
if isinstance(hist, ROOT.THn):
return hist.SetBinContent(np.array(hbin, 'i'), val)
raise NotImplementedError


def set_bin_err(hist, hbin, val):
if isinstance(hist, ROOT.TH1):
return hist.SetBinError(*hbin, val)
if isinstance(hist, ROOT.THn):
return hist.SetBinError(np.array(hbin, 'i'), val)
raise NotImplementedError


def norm_response(response, dim_out):
response_norm = response.Clone()
for bin_in in itertools.product(*(range(1, get_nbins(response_norm, i)+1)
for i in range(get_dim(response_norm)-dim_out))):
for iaxis, val in enumerate(bin_in):
get_axis(response_norm, iaxis + dim_out).SetRange(val, val)
norm = response_norm.Projection(0).Integral()
if np.isclose(norm, 0.):
continue
for bin_out in itertools.product(*(range(1, get_nbins(response_norm, i)+1) for i in range(dim_out))):
set_bin_val(response_norm, bin_out + bin_in, get_bin_val(response_norm, bin_out + bin_in) / norm)
set_bin_err(response_norm, bin_out + bin_in, get_bin_err(response_norm, bin_out + bin_in) / norm)
return response_norm


def fold_hist(hist, response):
"""Fold hist with response"""
assert get_dim(response) > get_dim(hist)
dim_out = get_dim(response) - get_dim(hist)
axes_spec = list(np.array(get_axis(response, i).GetXbins(), 'd') for i in range(dim_out))
hfold = create_hist('test', 'test', *axes_spec)
for bin_out in itertools.product(*(range(1, get_nbins(hfold, i)+1) for i in range(get_dim(hfold)))):
val = 0
err = 0
for bin_in in itertools.product(*(range(1, get_nbins(hist, i)+1) for i in range(get_dim(hist)))):
val += get_bin_val(hist, bin_in) * get_bin_val(response, bin_out + bin_in)
err += get_bin_err(hist, bin_in)**2 * get_bin_val(response, bin_out + bin_in)**2
set_bin_val(hfold, bin_out, val)
set_bin_err(hfold, bin_out, math.sqrt(err))
return hfold

0 comments on commit 91b6f28

Please sign in to comment.