Skip to content

Commit

Permalink
Merge branch 'main' into pre-commit-ci-update-config
Browse files Browse the repository at this point in the history
  • Loading branch information
cmantill authored Oct 3, 2024
2 parents 020f662 + 54ee5fc commit 81e9e3e
Show file tree
Hide file tree
Showing 261 changed files with 35,193 additions and 58 deletions.
2,103 changes: 2,103 additions & 0 deletions src/HH4b/boosted/FitB_0.95.ipynb

Large diffs are not rendered by default.

585 changes: 585 additions & 0 deletions src/HH4b/boosted/FitB_0.95_PoissonCDF_separate.ipynb

Large diffs are not rendered by default.

628 changes: 628 additions & 0 deletions src/HH4b/boosted/Optimize_WP_with_SmoothB.ipynb

Large diffs are not rendered by default.

272 changes: 272 additions & 0 deletions src/HH4b/boosted/PicoAODBDTInference.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,272 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import xgboost as xgb\n",
"from coffea import nanoevents\n",
"from coffea.nanoevents.methods.base import NanoEventsArray\n",
"import awkward as ak\n",
"import vector\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import mplhep as hep\n",
"import uproot\n",
"\n",
"plt.style.use(hep.style.CMS)\n",
"\n",
"\n",
"def to_np_array(ak_array, max_n=2, pad=0):\n",
" return ak.fill_none(ak.pad_none(ak_array, max_n, clip=True, axis=-1), pad).to_numpy()\n",
"\n",
"\n",
"file_name = \"/ceph/cms/store/user/woodson/boosted/GluGluToHHTo4B_cHHH1_UL16_preVFP/picoAOD.root\"\n",
"# file_name = \"/ceph/cms/store/user/woodson/boosted/GluGluToHHTo4B_cHHH1_UL16_postVFP/picoAOD.root\"\n",
"# file_name = \"/ceph/cms/store/user/woodson/boosted/GluGluToHHTo4B_cHHH1_UL17/picoAOD.chunk0.root\"\n",
"# file_name = \"/ceph/cms/store/user/woodson/boosted/GluGluToHHTo4B_cHHH1_UL17/picoAOD.chunk1.root\"\n",
"# file_name = \"/ceph/cms/store/user/woodson/boosted/GluGluToHHTo4B_cHHH1_UL18/picoAOD.chunk0.root\"\n",
"# file_name = \"/ceph/cms/store/user/woodson/boosted/GluGluToHHTo4B_cHHH1_UL18/picoAOD.chunk1.root\"\n",
"model_fname = \"src/HH4b/boosted/bdt_trainings_run2/model_xgboost_training_weights_qcd_and_ttbar_Run2_bdt_enhanced_v8p2/trained_bdt.model\"\n",
"bdt_model = xgb.XGBClassifier()\n",
"bdt_model.load_model(fname=model_fname)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"events = nanoevents.NanoEventsFactory.from_root(\n",
" file_name,\n",
" schemaclass=nanoevents.NanoAODSchema,\n",
").events()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sorted_by_bbtag = ak.argsort(\n",
" events[\"FatJet\"][\"particleNetMD_Xbb\"]\n",
" / (events[\"FatJet\"][\"particleNetMD_Xbb\"] + events[\"FatJet\"][\"particleNetMD_QCD\"]),\n",
" ascending=False,\n",
" axis=-1,\n",
")\n",
"fatjets_sorted = events[\"FatJet\"][sorted_by_bbtag]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fatjet_xbb = to_np_array(fatjets_sorted[\"particleNetMD_Xbb\"], max_n=2, pad=0)\n",
"fatjet_qcd = to_np_array(fatjets_sorted[\"particleNetMD_QCD\"], max_n=2, pad=0)\n",
"fatjet_txbb = fatjet_xbb / (fatjet_xbb + fatjet_qcd)\n",
"fatjet_qcdb = to_np_array(fatjets_sorted[\"particleNetMD_QCDb\"], max_n=2, pad=0)\n",
"fatjet_qcdbb = to_np_array(fatjets_sorted[\"particleNetMD_QCDbb\"], max_n=2, pad=0)\n",
"fatjet_qcdothers = to_np_array(fatjets_sorted[\"particleNetMD_QCDothers\"], max_n=2, pad=0)\n",
"fatjet_pnetmass = to_np_array(fatjets_sorted[\"particleNet_mass\"], max_n=2, pad=0)\n",
"\n",
"fatjet_pt = to_np_array(fatjets_sorted[\"pt\"], max_n=2, pad=0)\n",
"fatjet_eta = to_np_array(fatjets_sorted[\"eta\"], max_n=2, pad=0)\n",
"fatjet_phi = to_np_array(fatjets_sorted[\"phi\"], max_n=2, pad=0)\n",
"fatjet_msd = to_np_array(fatjets_sorted[\"msoftdrop\"], max_n=2, pad=0)\n",
"fatjet_tau2 = to_np_array(fatjets_sorted[\"tau2\"], max_n=2, pad=0)\n",
"fatjet_tau3 = to_np_array(fatjets_sorted[\"tau3\"], max_n=2, pad=0)\n",
"\n",
"mask = (\n",
" (fatjet_pt[:, 0] > 300)\n",
" & (fatjet_pt[:, 1] > 300)\n",
" & (fatjet_msd[:, 0] > 40)\n",
" & (fatjet_pnetmass[:, 1] > 50)\n",
" & (fatjet_txbb[:, 0] > 0.8)\n",
" & (np.abs(fatjet_eta[:, 0]) < 2.4)\n",
" & (np.abs(fatjet_eta[:, 1]) < 2.4)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"h1 = vector.array(\n",
" {\"pt\": fatjet_pt[:, 0], \"phi\": fatjet_phi[:, 0], \"eta\": fatjet_eta[:, 0], \"M\": fatjet_msd[:, 0]}\n",
")\n",
"h2 = vector.array(\n",
" {\"pt\": fatjet_pt[:, 1], \"phi\": fatjet_phi[:, 1], \"eta\": fatjet_eta[:, 1], \"M\": fatjet_msd[:, 1]}\n",
")\n",
"hh = h1 + h2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_events = pd.DataFrame(\n",
" {\n",
" # dihiggs system\n",
" \"HHPt\": hh.pt,\n",
" \"HHeta\": hh.eta,\n",
" \"HHmass\": hh.mass,\n",
" # met in the event\n",
" \"MET\": events[\"PuppiMET\"][\"pt\"].to_numpy(),\n",
" # fatjet tau32\n",
" \"H1T32\": fatjet_tau3[:, 0] / fatjet_tau2[:, 0],\n",
" \"H2T32\": fatjet_tau3[:, 1] / fatjet_tau2[:, 1],\n",
" # fatjet mass\n",
" \"H1Mass\": fatjet_msd[:, 0],\n",
" # fatjet kinematics\n",
" \"H1Pt\": fatjet_pt[:, 0],\n",
" \"H1eta\": fatjet_eta[:, 0],\n",
" # xbb\n",
" \"H1Xbb\": fatjet_txbb[:, 0],\n",
" \"H1QCDb\": fatjet_qcdb[:, 0],\n",
" \"H1QCDbb\": fatjet_qcdbb[:, 0],\n",
" \"H1QCDothers\": fatjet_qcdothers[:, 0],\n",
" \"H2Pt\": fatjet_pt[:, 1],\n",
" # ratios\n",
" \"H1Pt_HHmass\": fatjet_pt[:, 0] / hh.mass,\n",
" \"H2Pt_HHmass\": fatjet_pt[:, 1] / hh.mass,\n",
" \"H2Pt/H1Pt\": fatjet_pt[:, 1] / fatjet_pt[:, 0],\n",
" }\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_events[\"bdt_score\"] = bdt_model.predict_proba(df_events)[:, 1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bdt_fail = 0.03\n",
"bdt_bin1 = 0.43\n",
"bdt_bin2 = 0.11\n",
"xbb_bin1 = 0.98\n",
"xbb_bin2 = 0.95\n",
"\n",
"mask_bin1 = mask & (fatjet_txbb[:, 1] > xbb_bin1) & (df_events[\"bdt_score\"] > bdt_bin1)\n",
"mask_bin2 = (\n",
" mask\n",
" & ~mask_bin1\n",
" & (\n",
" ((fatjet_txbb[:, 1] > xbb_bin1) & (df_events[\"bdt_score\"] > bdt_bin2))\n",
" | ((fatjet_txbb[:, 1] > xbb_bin2) & (df_events[\"bdt_score\"] > bdt_bin1))\n",
" )\n",
")\n",
"mask_bin3 = (\n",
" mask\n",
" & ~mask_bin1\n",
" & ~mask_bin2\n",
" & (fatjet_txbb[:, 1] > xbb_bin2)\n",
" & (df_events[\"bdt_score\"] > bdt_fail)\n",
")\n",
"mask_fail = mask & ~mask_bin1 & ~mask_bin2 & ~mask_bin3 & (df_events[\"bdt_score\"] > bdt_fail)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_events[\"bdt_bin\"] = np.zeros(len(df_events))\n",
"df_events.loc[mask_bin1, \"bdt_bin\"] = 1\n",
"df_events.loc[mask_bin2, \"bdt_bin\"] = 2\n",
"df_events.loc[mask_bin3, \"bdt_bin\"] = 3\n",
"df_events.loc[mask_fail, \"bdt_bin\"] = 0\n",
"df_events.loc[~mask, \"bdt_bin\"] = -1\n",
"df_events.loc[~mask, \"bdt_score\"] = -1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# make 18 subfigures\n",
"plt.figure()\n",
"fig, axs = plt.subplots(3, 6, figsize=(40, 20), sharey=True)\n",
"for i, col in enumerate(df_events.columns):\n",
" if i > 17:\n",
" continue\n",
" ax = axs[i // 6, i % 6]\n",
" ax.hist(df_events[col][mask], bins=50, histtype=\"step\")\n",
" ax.set_xlabel(col)\n",
" if i % 6 == 0:\n",
" ax.set_ylabel(\"Events\")\n",
" ax.set_yscale(\"log\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with uproot.open(file_name) as f:\n",
" arrays = f[\"Events\"].arrays()\n",
" with uproot.recreate(file_name.replace(\".root\", \".withBDT.root\")) as f_out:\n",
" f_out[\"Events\"] = {field: arrays[field] for field in arrays.fields} | {\n",
" \"bdt_score\": df_events[\"bdt_score\"].to_numpy()\n",
" }"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.figure()\n",
"plt.scatter(df_events[\"bdt_score\"], fatjet_txbb[:, 1], c=df_events[\"bdt_bin\"])\n",
"plt.xlim(0, 1)\n",
"plt.ylim(0.93, 1)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
| | Importance |
|---:|:-----------------------------------|
| 0 | ('H1Mass', 0.19536893) |
| 1 | ('H1Xbb', 0.14744073) |
| 2 | ('VBFjjMass', 0.09980717) |
| 3 | ('HHmass', 0.07135053) |
| 4 | ('H1Pt', 0.06352392) |
| 5 | ('H1QCDb', 0.05620533) |
| 6 | ('VBFjjDeltaEta', 0.05532601) |
| 7 | ('H1QCDbb', 0.052580874) |
| 8 | ('H1Pt_HHmass', 0.03716159) |
| 9 | ('H2AK4JetAway2mass', 0.034708854) |
| 10 | ('H1AK4JetAway1mass', 0.03399577) |
| 11 | ('MET', 0.021699335) |
| 12 | ('H1AK4JetAway1dR', 0.018593973) |
| 13 | ('H2AK4JetAway2dR', 0.018401144) |
| 14 | ('H1T32', 0.018363358) |
| 15 | ('H2Pt_HHmass', 0.016176052) |
| 16 | ('H2T32', 0.014278396) |
| 17 | ('HHPt', 0.014068713) |
| 18 | ('H2Pt', 0.013303636) |
| 19 | ('H1QCDothers', 0.005493068) |
| 20 | ('H1Pt/H2Pt', 0.0049682693) |
| 21 | ('HHeta', 0.0036979003) |
| 22 | ('H1eta', 0.0034863807) |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading

0 comments on commit 81e9e3e

Please sign in to comment.