Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HP approval version #951

Merged
merged 6 commits into from
Sep 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions machine_learning_hep/analysis/analyzer_jets.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,7 @@ def fit(self):
h_invmass = project_hist(hsel, [0], cuts_proj)
if h_invmass.GetEntries() < 100: # TODO: reconsider criterion
self.logger.error('Not enough entries to fit %s iptjet %s ipt %d',
level, iptjet, ipt)
level, iptjet, ipt)
continue
roows = self.roows.get(ipt) if iptjet is None else self.roows_ptjet.get((iptjet, ipt))
if roows is None and level != self.fit_levels[0]:
Expand Down Expand Up @@ -724,7 +724,7 @@ def _analyze(self, method = 'sidesub'):
rfilename = self.n_filemass_mc if mcordata == "mc" else self.n_filemass
with TFile(rfilename) as rfile:
for var in [None] + self.observables['all']:
self.logger.info('Running analysis for %s using %s', var, method)
self.logger.info('Running analysis for obs. %s, %s using %s', var, mcordata, method)
label = f'-{var}' if var else ''
self.logger.debug('looking for %s', f'h_mass-ptjet-pthf{label}')
if fh := rfile.Get(f'h_mass-ptjet-pthf{label}'): # TODO: add sanity check
Expand All @@ -736,6 +736,8 @@ def _analyze(self, method = 'sidesub'):
h_in = project_hist(fh, axes_proj, {2: (ipt+1, ipt+1)})
ensure_sumw2(h_in)
# Signal extraction
self.logger.info("Signal extraction (method %s): obs. %s, %s, ipt %d",
method, var, mcordata, ipt)
if method == 'sidesub':
h = self._subtract_sideband(h_in, var, mcordata, ipt)
elif method == 'sigextr':
Expand All @@ -757,6 +759,8 @@ def _analyze(self, method = 'sidesub'):
h = h_proj
# Efficiency correction
if mcordata == 'data' or not self.cfg('closure.use_matched'):
self.logger.info("Efficiency correction: obs. %s, %s, ipt %d",
var, mcordata, ipt)
self.logger.info('correcting efficiency')
self._correct_efficiency(h, ipt)
fh_sub.append(h)
Expand All @@ -778,6 +782,7 @@ def _analyze(self, method = 'sidesub'):

fh_sum_fdsub = fh_sum.Clone()
# Feed-down subtraction
self.logger.info("Feed-down subtraction: obs. %s, %s", var, mcordata)
if mcordata == 'data' or not self.cfg('closure.exclude_feeddown_det'):
self._subtract_feeddown(fh_sum_fdsub, var, mcordata)
self._clip_neg(fh_sum_fdsub)
Expand Down Expand Up @@ -819,6 +824,7 @@ def _analyze(self, method = 'sidesub'):
self._save_hist(
hproj, f'uf/h_{var}_{method}_{mcordata}_{string_range_ptjet(range_ptjet)}.png')
# Unfolding
self.logger.info("Unfolding: obs. %s, %s", var, mcordata)
fh_unfolded = self._unfold(fh_sum_fdsub, var, mcordata)
for i, h in enumerate(fh_unfolded):
self._save_hist(h, f'h_ptjet-{var}_{method}_unfolded_{mcordata}_{i}.png')
Expand Down Expand Up @@ -858,6 +864,7 @@ def _analyze(self, method = 'sidesub'):
self._save_canvas(c,
f'uf/h_{var}_{method}_convergence_{mcordata}_' +
f'{string_range_ptjet(range_ptjet)}.png')
self.logger.info("Analysis complete: obs. %s, %s", var, mcordata)


def analyze_with_sidesub(self):
Expand Down
33 changes: 29 additions & 4 deletions machine_learning_hep/analysis/do_systematics.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@
from machine_learning_hep.logger import get_logger

# HF specific imports
from machine_learning_hep.utilities import ( # make_plot,
from machine_learning_hep.utilities import (
make_plot,
combine_graphs,
draw_latex,
get_colour,
Expand Down Expand Up @@ -545,11 +546,12 @@ def do_jet_systematics(self, var: str):
)
input_histograms_sys[iptjet][sys_cat][sys_var].Draw("same")
nsys = nsys + 1
latex_text = "%g #leq %s < %g GeV/#it{c}" % (self.edges_ptjet_gen_min[iptjet],
self.latex_ptjet, self.edges_ptjet_gen_max[iptjet])
latex = TLatex(
0.15,
0.82,
"%g #leq %s < %g GeV/#it{c}"
% (self.edges_ptjet_gen_min[iptjet], self.latex_ptjet, self.edges_ptjet_gen_max[iptjet]),
latex_text,
)
draw_latex(latex)
leg_sysvar_each.Draw("same")
Expand All @@ -566,19 +568,28 @@ def do_jet_systematics(self, var: str):
leg_sysvar_ratio = TLegend(0.77, 0.2, 0.95, 0.85, self.systematic_catlabels[sys_cat]) # Rg
setup_legend(leg_sysvar_ratio)
histo_ratio = []

n_bins = input_histograms_default[iptjet].GetNbinsX()
# Make the histograms for the distribution of var/default values per bin of observable.
list_his_cat_vars = [TH1F(f"his_cat_vars_{var}_{suffix}_{suffix2}_{ibin + 1}",
f"{self.systematic_catlabels[sys_cat]} distribution, bin {ibin + 1};"
"var/def;counts", 6, 0., 2.) for ibin in range(n_bins)]

for sys_var in range(self.systematic_variations[sys_cat]):
default_his = input_histograms_default[iptjet].Clone("default_his")
var_his = input_histograms_sys[iptjet][sys_cat][sys_var].Clone("var_his")
var_his.Divide(default_his)
histo_ratio.append(var_his)
# Fill the histogram for the distribution of var/default values.
for ibin in range(n_bins):
list_his_cat_vars[ibin].Fill(var_his.GetBinContent(ibin + 1))
l_his_all = []
for his_var in histo_ratio:
if his_var.Integral() != 0:
l_his_all.append(his_var)
y_min, y_max = get_y_window_his(l_his_all, False)
y_margin_up = 0.15
y_margin_down = 0.05

for sys_var in range(self.systematic_variations[sys_cat]):
if sys_var == 0:
histo_ratio[sys_var].GetYaxis().SetRangeUser(
Expand All @@ -605,6 +616,20 @@ def do_jet_systematics(self, var: str):
leg_sysvar_ratio.Draw("same")
self.save_canvas(csysvar_ratio, f"sys_var_{var}_{suffix}_{suffix2}_ratio")

# print([[h.GetBinContent(i + 1) for i in range(h.GetNbinsX())] for h in list_his_cat_vars])
axis_x = var_his.GetXaxis()
can_dist, _ = make_plot(f"sys_var_{var}_{suffix}_{suffix2}_ratio_dist",
list_obj=list_his_cat_vars, labels_obj=[f"{axis_x.GetBinLowEdge(ibin + 1)}-"
f"{axis_x.GetBinUpEdge(ibin + 1)}"
for ibin in range(n_bins)],
opt_leg_g=self.opt_leg_g, opt_plot_g=self.opt_plot_g, opt_plot_h="p l",
offsets_xy=self.offsets_axes,
leg_pos=[0.7, 0.7, 0.8, 0.85],
margins_y=[0.05, 0.05], margins_c=self.margins_can,
title=f"{latex_obs} {latex_text} {self.systematic_catlabels[sys_cat]};"
"var/default;counts")
self.save_canvas(can_dist, f"sys_var_{var}_{suffix}_{suffix2}_ratio_dist")

# Plot efficiency variations

csysvar_eff = TCanvas(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -250,13 +250,13 @@ LcJet_pp:
seedmerge: [12] #list of periods
period: [LHC22o] #list of periods
select_period: [1]
prefix_dir: /data2/MLhep/real/
unmerged_tree_dir: [train_257594/alice/] #list of periods
pkl: [vkucera/train_257594/lcjet/pkl] #list of periods
pkl_skimmed: [vkucera/train_257594/lcjet/pklsk] #list of periods
pkl_skimmed_merge_for_ml: ['vkucera/train_257594/lcjet/pklskml'] #list of periods
pkl_skimmed_merge_for_ml_all: 'vkucera/train_257594/lcjet/pp_data_mltot'
pkl_evtcounter_all: 'vkucera/train_257594/lcjet/pp_data_evttot'
prefix_dir: /data2/MLhep/real/train_257594/
unmerged_tree_dir: [alice] #list of periods
pkl: ['${USER}/lcjet/pkl'] #list of periods
pkl_skimmed: ['${USER}/lcjet/pklsk'] #list of periods
pkl_skimmed_merge_for_ml: ['${USER}/lcjet/pklskml'] #list of periods
pkl_skimmed_merge_for_ml_all: '${USER}/lcjet/pp_data_mltot'
pkl_evtcounter_all: '${USER}/lcjet/pp_data_evttot'
mcreweights: [../Analyses] #list of periods
mc:
nprocessesparallel: 80
Expand All @@ -267,13 +267,13 @@ LcJet_pp:
seedmerge: [12] #list of periods
period: [LHC24d3b] #list of periods
select_period: [1]
prefix_dir: /data2/MLhep/sim/
unmerged_tree_dir: [train_257383/alice]
pkl: [vkucera/train_257383/lcjet/pkl] #list of periods
pkl_skimmed: [vkucera/train_257383/lcjet/pklsk] #list of periods
pkl_skimmed_merge_for_ml: ['vkucera/train_257383/lcjet/pklskml'] #list of periods
pkl_skimmed_merge_for_ml_all: 'vkucera/lcjet/pp_mc_prod_mltot'
pkl_evtcounter_all: 'vkucera/lcjet/pp_mc_prod_evttot'
prefix_dir: /data2/MLhep/sim/train_257383/
unmerged_tree_dir: [alice]
pkl: ['${USER}/lcjet/pkl'] #list of periods
pkl_skimmed: ['${USER}/lcjet/pklsk'] #list of periods
pkl_skimmed_merge_for_ml: ['${USER}/lcjet/pklskml'] #list of periods
pkl_skimmed_merge_for_ml_all: '${USER}/lcjet/pp_mc_prod_mltot'
pkl_evtcounter_all: '${USER}/lcjet/pp_mc_prod_evttot'
mcreweights: [../Analyses] #list of periods

ml:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -529,24 +529,27 @@ categories:
rms_both_sides: true
variations:
ml:
activate: [no, no, no, yes, no, no, yes]
activate: [no, no, no, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes]
label:
- "default" # working point (for tests, should be same as the default result)
- "null" # no cuts (for tests, whatever was applied on Hyperloop)
- "loosest" # same cuts as Hyperloop (for tests, should be same as null)
- "loose" # to increase efficiency by 20 %
- "tight 2"
- "tight 4"
- "tight" # to decrease efficiency by 20 %
# default: working point (for tests, should be same as the default result)
# null: no cuts (for tests, whatever was applied on Hyperloop)
# loosest: same cuts as Hyperloop (for tests, should be same as null)
["default", "null", "loosest", "loose 5", "loose 4", "loose 3", "loose 2", "loose 1", "tight 1", "tight 2", "tight 3", "tight 4", "tight 5"]
diffs:
analysis:
jet_obs:
use_cuts: [True, True, True, True, True, True, True]
use_cuts: [True, True, True, True, True, True, True, True, True, True, True, True, True]
cuts:
- ["mlPromptScore > 0.97", "mlPromptScore > 0.9", "mlPromptScore > 0.9", "mlPromptScore > 0.85", "mlPromptScore > 0.85", "mlPromptScore > 0.8", "mlPromptScore > 0.8", "mlPromptScore > 0.6", "mlPromptScore > 0.6"] # default
- [null,null,null,null,null,null,null,null,null,null]
- ["mlPromptScore > 0.85", "mlPromptScore > 0.6", "mlPromptScore > 0.6", "mlPromptScore > 0.4", "mlPromptScore > 0.4", "mlPromptScore > 0.4", "mlPromptScore > 0.4", "mlPromptScore > 0.15", "mlPromptScore > 0.15"] # loosest
- ["mlPromptScore > 0.961", "mlPromptScore > 0.83", "mlPromptScore > 0.84", "mlPromptScore > 0.74", "mlPromptScore > 0.74", "mlPromptScore > 0.62", "mlPromptScore > 0.63", "mlPromptScore > 0.15", "mlPromptScore > 0.15"] # loose
- ["mlPromptScore > 0.98", "mlPromptScore > 0.9", "mlPromptScore > 0.9", "mlPromptScore > 0.85", "mlPromptScore > 0.85", "mlPromptScore > 0.8", "mlPromptScore > 0.8", "mlPromptScore > 0.6", "mlPromptScore > 0.6"] # tight 2
- ["mlPromptScore > 0.97", "mlPromptScore > 0.9", "mlPromptScore > 0.9", "mlPromptScore > 0.85", "mlPromptScore > 0.85", "mlPromptScore > 0.8", "mlPromptScore > 0.8", "mlPromptScore > 0.6", "mlPromptScore > 0.6"] # tight 4
- ["mlPromptScore > 0.978", "mlPromptScore > 0.94", "mlPromptScore > 0.937", "mlPromptScore > 0.915", "mlPromptScore > 0.91", "mlPromptScore > 0.89", "mlPromptScore > 0.88", "mlPromptScore > 0.85", "mlPromptScore > 0.85"] # tight
- ["mlPromptScore > 0.961", "mlPromptScore > 0.83", "mlPromptScore > 0.84", "mlPromptScore > 0.74", "mlPromptScore > 0.74", "mlPromptScore > 0.62", "mlPromptScore > 0.63", "mlPromptScore > 0.15", "mlPromptScore > 0.15"] # loose 5
- ["mlPromptScore > 0.9628", "mlPromptScore > 0.844", "mlPromptScore > 0.852", "mlPromptScore > 0.762", "mlPromptScore > 0.762", "mlPromptScore > 0.656", "mlPromptScore > 0.664", "mlPromptScore > 0.24", "mlPromptScore > 0.24"] # loose 4
- ["mlPromptScore > 0.9646", "mlPromptScore > 0.858", "mlPromptScore > 0.864", "mlPromptScore > 0.784", "mlPromptScore > 0.784", "mlPromptScore > 0.692", "mlPromptScore > 0.698", "mlPromptScore > 0.33", "mlPromptScore > 0.33"] # loose 3
- ["mlPromptScore > 0.9664", "mlPromptScore > 0.872", "mlPromptScore > 0.876", "mlPromptScore > 0.806", "mlPromptScore > 0.806", "mlPromptScore > 0.728", "mlPromptScore > 0.732", "mlPromptScore > 0.42", "mlPromptScore > 0.42"] # loose 2
- ["mlPromptScore > 0.9682", "mlPromptScore > 0.886", "mlPromptScore > 0.888", "mlPromptScore > 0.828", "mlPromptScore > 0.828", "mlPromptScore > 0.764", "mlPromptScore > 0.766", "mlPromptScore > 0.51", "mlPromptScore > 0.51"] # loose 1
- ["mlPromptScore > 0.9716", "mlPromptScore > 0.908", "mlPromptScore > 0.9074", "mlPromptScore > 0.863", "mlPromptScore > 0.862", "mlPromptScore > 0.818", "mlPromptScore > 0.816", "mlPromptScore > 0.65", "mlPromptScore > 0.65"] # tight 1
- ["mlPromptScore > 0.9732", "mlPromptScore > 0.916", "mlPromptScore > 0.9148", "mlPromptScore > 0.876", "mlPromptScore > 0.874", "mlPromptScore > 0.836", "mlPromptScore > 0.832", "mlPromptScore > 0.7", "mlPromptScore > 0.7"] # tight 2
- ["mlPromptScore > 0.9748", "mlPromptScore > 0.924", "mlPromptScore > 0.9222", "mlPromptScore > 0.889", "mlPromptScore > 0.886", "mlPromptScore > 0.854", "mlPromptScore > 0.848", "mlPromptScore > 0.75", "mlPromptScore > 0.75"] # tight 3
- ["mlPromptScore > 0.9764", "mlPromptScore > 0.932", "mlPromptScore > 0.9296", "mlPromptScore > 0.902", "mlPromptScore > 0.898", "mlPromptScore > 0.872", "mlPromptScore > 0.864", "mlPromptScore > 0.8", "mlPromptScore > 0.8"] # tight 4
- ["mlPromptScore > 0.978", "mlPromptScore > 0.94", "mlPromptScore > 0.937", "mlPromptScore > 0.915", "mlPromptScore > 0.91", "mlPromptScore > 0.89", "mlPromptScore > 0.88", "mlPromptScore > 0.85", "mlPromptScore > 0.85"] # tight 5
Loading