From 5ad6f98376feac009e4363ecc52f5b83993ccb0c Mon Sep 17 00:00:00 2001 From: Jochen Klein Date: Mon, 27 Nov 2023 16:42:29 +0100 Subject: [PATCH] Prepare ML training for HF jets --- .../database_ml_parameters_D0pp_jet.yml | 70 ++++++++++--------- machine_learning_hep/optimiser.py | 21 +++--- machine_learning_hep/processer_jet.py | 1 - machine_learning_hep/utilities.py | 3 +- 4 files changed, 47 insertions(+), 48 deletions(-) diff --git a/machine_learning_hep/data/data_run3/database_ml_parameters_D0pp_jet.yml b/machine_learning_hep/data/data_run3/database_ml_parameters_D0pp_jet.yml index 80520cef1e..c20dd2274c 100644 --- a/machine_learning_hep/data/data_run3/database_ml_parameters_D0pp_jet.yml +++ b/machine_learning_hep/data/data_run3/database_ml_parameters_D0pp_jet.yml @@ -58,6 +58,7 @@ D0pp_jet: fErrorDecayLength, fErrorDecayLengthXY, fChi2PCA, fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fImpactParameterNormalised0, fPtProng0, fImpactParameterNormalised1, fPtProng1, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, + fNSigTpcPi0, fNSigTpcKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi0, fNSigTofKa0, fNSigTofPi1, fNSigTofKa1, fIndexHfCand2Prong_0] var_jet_data: [fIndexCollisions, fIndexD0ChargedJets, fIndexHfCand2Prong_0, fJetPt, fJetEta, fJetPhi, fJetNConstituents] var_jet_det: [fIndexCollisions, fIndexD0ChargedMCDetectorLevelJets, fIndexHfCand2Prong_0, fJetPt, fJetEta, fJetPhi, fJetNConstituents] @@ -78,7 +79,7 @@ D0pp_jet: var_gen: [fIndexCollisions, fPt, fY, fFlagMc, fOriginMcGen, fIndexHfCand2Prong_0] var_evt_match: [df, fIndexCollisions] var_evt_match_mc: [df, fIndexCollisions] - var_training: [[fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fImpactParameterProduct, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1],[fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fImpactParameterProduct, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1],[fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fImpactParameterProduct, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1],[fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fImpactParameterProduct, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1],[fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fImpactParameterProduct, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1],[fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fImpactParameterProduct, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1],[fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fImpactParameterProduct, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1]] + var_training: [[fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1], [fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1], [fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1], [fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1], [fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1], [fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1], [fDecayLength, fDecayLengthXY, fDecayLengthNormalised, fDecayLengthXYNormalised, fCpa, fCpaXY, fImpactParameter0, fImpactParameter1, fErrorImpactParameter0, fErrorImpactParameter1, fNSigTpcPi0, fNSigTpcKa0, fNSigTofPi0, fNSigTofKa0, fNSigTpcPi1, fNSigTpcKa1, fNSigTofPi1, fNSigTofKa1]] #TODO: add new variables for dca, max_norm_d0d0exp # sel_skim_binmin bins var_boundaries: [fCosThetaStar, fPtProng] @@ -141,38 +142,38 @@ D0pp_jet: multi: data: - nprocessesparallel: 1 - maxfiles: [5] #list of periods + nprocessesparallel: 20 + maxfiles: [-1] #list of periods chunksizeunp: [100] #list of periods chunksizeskim: [100] #list of periods fracmerge: [0.08] #list of periods seedmerge: [12] #list of periods - period: [LHC22o_pass4] #list of periods + period: [LHC22o] #list of periods select_period: [1] - prefix_dir: /home/jklein/ - unmerged_tree_dir: [data/alice/cern.ch/user/a/alihyperloop/jobs/0024/hy_240062/] #list of periods - pkl: [data/test/d0jet/pkl] #list of periods - pkl_skimmed: [data/test/d0jet/pklsk] #list of periods - pkl_skimmed_merge_for_ml: [data/test/d0jet/pklskml] #list of periods - pkl_skimmed_merge_for_ml_all: data/test/d0jet/pp_data_mltot - pkl_evtcounter_all: data/test/d0jet/pp_data_evttot + prefix_dir: /data2/MLhep/real/train_131050/ + unmerged_tree_dir: [alice/cern.ch/user/a/alihyperloop/jobs/0024/] #list of periods + pkl: [d0jet/pkl] #list of periods + pkl_skimmed: [d0jet/pklsk] #list of periods + pkl_skimmed_merge_for_ml: [d0jet/pklskml] #list of periods + pkl_skimmed_merge_for_ml_all: d0jet/pp_data_mltot + pkl_evtcounter_all: d0jet/pp_data_evttot mcreweights: [../Analyses] #list of periods mc: - nprocessesparallel: 40 - maxfiles: [5] #list of periods + nprocessesparallel: 20 + maxfiles: [-1] #list of periods chunksizeunp: [100] #list of periods chunksizeskim: [1000] #list of periods fracmerge: [1.0] #list of periods seedmerge: [12] #list of periods - period: [mctest] #list of periods + period: [LHC22b1b] #list of periods select_period: [1] - prefix_dir: /home/jklein/ - unmerged_tree_dir: [data/alice/cern.ch/user/a/alihyperloop/jobs/0024/hy_240092] #list of periods - pkl: [data/mctest/d0jet/pkl] #list of periods - pkl_skimmed: [data/mctest/d0jet/pklsk] #list of periods - pkl_skimmed_merge_for_ml: [data/mctest/d0jet/pklskml] #list of periods - pkl_skimmed_merge_for_ml_all: data/mctest/d0jet/pp_mc_prod_mltot - pkl_evtcounter_all: data/mctest/d0jet/pp_mc_prod_evttot + prefix_dir: /data2/MLhep/sim/train_131049/ + unmerged_tree_dir: [alice/cern.ch/user/a/alihyperloop/jobs/0024] + pkl: [d0jet/pkl] #list of periods + pkl_skimmed: [d0jet/pklsk] #list of periods + pkl_skimmed_merge_for_ml: [d0jet/pklskml] #list of periods + pkl_skimmed_merge_for_ml_all: d0jet/pp_mc_prod_mltot + pkl_evtcounter_all: d0jet/pp_mc_prod_evttot mcreweights: [../Analyses] #list of periods ml: @@ -195,8 +196,9 @@ D0pp_jet: binmax: [2,4,6,8,12,24,48] # must be equal to sel_skim_binmax (sel_skim_binmin bins) mltype: BinaryClassification ncorescrossval: 10 - mlplot: /home/jklein/data/mlplot # to be removed - mlout: /home/jklein/data/mlout # to be removed + prefix_dir_ml: /data2/jklein/MLhep + mlplot: mlplot + mlout: mlout opt: isFONLLfromROOT: true @@ -213,15 +215,17 @@ D0pp_jet: bkg_function: pol2 # fit function for bkg (among TH1 predefined fit functions, e.g. expo, pol1, pol2, ...) save_fit: True # save bkg fits with the various cuts on ML output raahp: [1,1,1,1,1,1,1] # sel_skim_binmin bins - presel_gen_eff: "abs(y_cand) < 0.5 and abs(z_vtx_gen) < 10" + presel_gen_eff: "abs(fY) < 0.5 and abs(fPosZ) < 10" mlapplication: data: - pkl_skimmed_dec: [/home/jklein/data/test/d0jet/pklskdec] #list of periods - pkl_skimmed_decmerged: [/home/jklein/data/test/d0jet/pklskdecmerged] #list of periods + prefix_dir_res: /data2/jklein/ + pkl_skimmed_dec: [LHC22pp/MLapplication/prod_LHC22o/skpkldecdata] #list of periods + pkl_skimmed_decmerged: [LHC22pp/MLapplication/prod_LHC22o/skpkldecdatamerged] #list of periods mc: - pkl_skimmed_dec: [/home/jklein/data/mctest/d0jet/pklskdec] #list of periods - pkl_skimmed_decmerged: [/home/jklein/mctest/d0jet/pklskdecmerged] #list of periods + prefix_dir_res: /data2/jklein/ + pkl_skimmed_dec: [LHC22pp_mc/MLapplication/prod_LHC22b1b/skpkldecmc] #list of periods + pkl_skimmed_decmerged: [LHC22pp_mc/MLapplication/prod_LHC22b1b/skpkldecmcmerged] #list of periods modelname: xgboost modelsperptbin: [xgboost_classifierD0pp_FF_dfselection_pt_cand_1.0_2.0.sav, xgboost_classifierD0pp_FF_dfselection_pt_cand_2.0_4.0.sav, @@ -241,7 +245,7 @@ D0pp_jet: cctype: 1 #kpp7 sigmav0: 57.8e-3 #NB: multiplied by 1e12 before giving to HFPtSpectrum! inputfonllpred: data/fonll/D0DplusDstarPredictions_13TeV_y05_all_300416_BDShapeCorrected.root - dir_general_plots: /home/jklein/data/analysis_plots + dir_general_plots: /data2/jklein/data/analysis_plots jet_zg: &jet_default proc_type: Jets @@ -284,12 +288,12 @@ D0pp_jet: mc: null data: &data_out_default runselection: [null] #FIXME - results: [/home/jklein/data/test/d0jet/resultsMBjetvspt] #list of periods - resultsallp: /home/jklein/data/test/d0jet/resultsMBjetvspt_all + results: [/data2/jklein/data/test/d0jet/resultsMBjetvspt] #list of periods + resultsallp: /data2/jklein/data/test/d0jet/resultsMBjetvspt_all mc: &mc_out_default runselection: [null] #FIXME - results: [/home/jklein/data/mctest/d0jet/resultsMBjetvspt] #list of periods - resultsallp: /home/jklein/data/mctest/d0jet/resultsMBjetvspt_all + results: [/data2/jklein/data/mctest/d0jet/resultsMBjetvspt] #list of periods + resultsallp: /data2/jklein/data/mctest/d0jet/resultsMBjetvspt_all data_proc: # alternative processor output used as the analyzer input <<: *data_out_default mc_proc: # alternative processor output used as the analyzer input diff --git a/machine_learning_hep/optimiser.py b/machine_learning_hep/optimiser.py index fbac5b9ba9..76c9d8589f 100644 --- a/machine_learning_hep/optimiser.py +++ b/machine_learning_hep/optimiser.py @@ -54,10 +54,11 @@ def __init__(self, data_param, case, typean, model_config, binmin, self.logger = get_logger() - dirprefix = data_param["multi"]["data"].get("prefix_dir", "") + dirprefixdata = data_param["multi"]["data"].get("prefix_dir", "") + dirprefixmc = data_param["multi"]["mc"].get("prefix_dir", "") dirprefix_ml = data_param["ml"].get("prefix_dir_ml", "") - dirmcml = dirprefix + data_param["multi"]["mc"]["pkl_skimmed_merge_for_ml_all"] - dirdataml = dirprefix + data_param["multi"]["data"]["pkl_skimmed_merge_for_ml_all"] + dirmcml = dirprefixmc + data_param["multi"]["mc"]["pkl_skimmed_merge_for_ml_all"] + dirdataml = dirprefixdata + data_param["multi"]["data"]["pkl_skimmed_merge_for_ml_all"] self.v_bin = data_param["var_binning"] #directory self.dirmlout = dirprefix_ml + data_param["ml"]["mlout"] @@ -75,6 +76,8 @@ def __init__(self, data_param, case, typean, model_config, binmin, print(f"rm -r {self.dirmlplot}") self.logger.fatal("Please remove above directories as indicated above first and " \ "run again") + if self.steps_done == None: + self.steps_done = [] #ml file names self.n_reco = data_param["files_names"]["namefile_reco"] @@ -205,7 +208,7 @@ def __init__(self, data_param, case, typean, model_config, binmin, self.f_mltest_applied = f"{self.dirmlout}/testsample_{self.s_suffix}_mldecision.pkl" self.df_mltest_applied = None - print(training_var) + self.logger.info('training variables: %s', training_var) def create_suffix(self): string_selection = createstringselection(self.v_bin, self.p_binmin, self.p_binmax) @@ -320,16 +323,10 @@ def preparesample(self): self.step_done("preparemlsamples") def step_done(self, step): - if self.steps_done is None: - self.steps_done = [] - step_name = f"{step}_{self.p_binmin}_{self.p_binmax}" if step_name in self.steps_done: - print("\n\n") self.logger.warning("Done ML step %s already. It's skipped now. Remove the step " \ - "from the list in the following file", step_name) - print(self.file_steps_done) - print("\n\n") + "from the list in %s", step_name, self.file_steps_done) return True # Add this steps and update the corresponsing file @@ -445,7 +442,7 @@ def do_test(self): self.df_mltest, self.v_train, self.v_sig) df_ml_test_to_root = self.dirmlout+"/testsample_%s_mldecision.root" % (self.s_suffix) pickle.dump(self.df_mltest_applied, openfile(self.f_mltest_applied, "wb"), protocol=4) - write_tree(df_ml_test_to_root, self.n_treetest, self.df_mltest_applied) + # write_tree(df_ml_test_to_root, self.n_treetest, self.df_mltest_applied) def do_apply(self): diff --git a/machine_learning_hep/processer_jet.py b/machine_learning_hep/processer_jet.py index 471a63c741..cece44f635 100644 --- a/machine_learning_hep/processer_jet.py +++ b/machine_learning_hep/processer_jet.py @@ -69,5 +69,4 @@ def process_histomass_single(self, index): f'hmass_{ipt}', "", self.p_num_bins, self.p_mass_fit_lim[0], self.p_mass_fit_lim[1]) fill_hist(h_invmass_all, df.fM) - h_invmass_all.Print() h_invmass_all.Write() diff --git a/machine_learning_hep/utilities.py b/machine_learning_hep/utilities.py index 33766d167e..70077c9b6a 100644 --- a/machine_learning_hep/utilities.py +++ b/machine_learning_hep/utilities.py @@ -74,8 +74,7 @@ def openfile(filename, attr): if filename.lower().endswith('.lz4'): return lz4.frame.open(filename, attr) if filename.lower().endswith('.pkl'): - return open(filename, attr, encoding='utf-8') - return open(filename, attr) + return open(filename, attr, encoding='utf-8' if 'b' not in attr else None) def mask_df(df_to_mask, mask_config): """