diff --git a/src/HH4b/boosted/PicoAODBDTInference.ipynb b/src/HH4b/boosted/PicoAODBDTInference.ipynb new file mode 100644 index 00000000..e1b2f600 --- /dev/null +++ b/src/HH4b/boosted/PicoAODBDTInference.ipynb @@ -0,0 +1,272 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import xgboost as xgb\n", + "from coffea import nanoevents\n", + "from coffea.nanoevents.methods.base import NanoEventsArray\n", + "import awkward as ak\n", + "import vector\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import mplhep as hep\n", + "import uproot\n", + "\n", + "plt.style.use(hep.style.CMS)\n", + "\n", + "\n", + "def to_np_array(ak_array, max_n=2, pad=0):\n", + " return ak.fill_none(ak.pad_none(ak_array, max_n, clip=True, axis=-1), pad).to_numpy()\n", + "\n", + "\n", + "file_name = \"/ceph/cms/store/user/woodson/boosted/GluGluToHHTo4B_cHHH1_UL16_preVFP/picoAOD.root\"\n", + "# file_name = \"/ceph/cms/store/user/woodson/boosted/GluGluToHHTo4B_cHHH1_UL16_postVFP/picoAOD.root\"\n", + "# file_name = \"/ceph/cms/store/user/woodson/boosted/GluGluToHHTo4B_cHHH1_UL17/picoAOD.chunk0.root\"\n", + "# file_name = \"/ceph/cms/store/user/woodson/boosted/GluGluToHHTo4B_cHHH1_UL17/picoAOD.chunk1.root\"\n", + "# file_name = \"/ceph/cms/store/user/woodson/boosted/GluGluToHHTo4B_cHHH1_UL18/picoAOD.chunk0.root\"\n", + "# file_name = \"/ceph/cms/store/user/woodson/boosted/GluGluToHHTo4B_cHHH1_UL18/picoAOD.chunk1.root\"\n", + "model_fname = \"src/HH4b/boosted/bdt_trainings_run2/model_xgboost_training_weights_qcd_and_ttbar_Run2_bdt_enhanced_v8p2/trained_bdt.model\"\n", + "bdt_model = xgb.XGBClassifier()\n", + "bdt_model.load_model(fname=model_fname)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "events = nanoevents.NanoEventsFactory.from_root(\n", + " file_name,\n", + " schemaclass=nanoevents.NanoAODSchema,\n", + ").events()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sorted_by_bbtag = ak.argsort(\n", + " events[\"FatJet\"][\"particleNetMD_Xbb\"]\n", + " / (events[\"FatJet\"][\"particleNetMD_Xbb\"] + events[\"FatJet\"][\"particleNetMD_QCD\"]),\n", + " ascending=False,\n", + " axis=-1,\n", + ")\n", + "fatjets_sorted = events[\"FatJet\"][sorted_by_bbtag]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fatjet_xbb = to_np_array(fatjets_sorted[\"particleNetMD_Xbb\"], max_n=2, pad=0)\n", + "fatjet_qcd = to_np_array(fatjets_sorted[\"particleNetMD_QCD\"], max_n=2, pad=0)\n", + "fatjet_txbb = fatjet_xbb / (fatjet_xbb + fatjet_qcd)\n", + "fatjet_qcdb = to_np_array(fatjets_sorted[\"particleNetMD_QCDb\"], max_n=2, pad=0)\n", + "fatjet_qcdbb = to_np_array(fatjets_sorted[\"particleNetMD_QCDbb\"], max_n=2, pad=0)\n", + "fatjet_qcdothers = to_np_array(fatjets_sorted[\"particleNetMD_QCDothers\"], max_n=2, pad=0)\n", + "fatjet_pnetmass = to_np_array(fatjets_sorted[\"particleNet_mass\"], max_n=2, pad=0)\n", + "\n", + "fatjet_pt = to_np_array(fatjets_sorted[\"pt\"], max_n=2, pad=0)\n", + "fatjet_eta = to_np_array(fatjets_sorted[\"eta\"], max_n=2, pad=0)\n", + "fatjet_phi = to_np_array(fatjets_sorted[\"phi\"], max_n=2, pad=0)\n", + "fatjet_msd = to_np_array(fatjets_sorted[\"msoftdrop\"], max_n=2, pad=0)\n", + "fatjet_tau2 = to_np_array(fatjets_sorted[\"tau2\"], max_n=2, pad=0)\n", + "fatjet_tau3 = to_np_array(fatjets_sorted[\"tau3\"], max_n=2, pad=0)\n", + "\n", + "mask = (\n", + " (fatjet_pt[:, 0] > 300)\n", + " & (fatjet_pt[:, 1] > 300)\n", + " & (fatjet_msd[:, 0] > 40)\n", + " & (fatjet_pnetmass[:, 1] > 50)\n", + " & (fatjet_txbb[:, 0] > 0.8)\n", + " & (np.abs(fatjet_eta[:, 0]) < 2.4)\n", + " & (np.abs(fatjet_eta[:, 1]) < 2.4)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "h1 = vector.array(\n", + " {\"pt\": fatjet_pt[:, 0], \"phi\": fatjet_phi[:, 0], \"eta\": fatjet_eta[:, 0], \"M\": fatjet_msd[:, 0]}\n", + ")\n", + "h2 = vector.array(\n", + " {\"pt\": fatjet_pt[:, 1], \"phi\": fatjet_phi[:, 1], \"eta\": fatjet_eta[:, 1], \"M\": fatjet_msd[:, 1]}\n", + ")\n", + "hh = h1 + h2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_events = pd.DataFrame(\n", + " {\n", + " # dihiggs system\n", + " \"HHPt\": hh.pt,\n", + " \"HHeta\": hh.eta,\n", + " \"HHmass\": hh.mass,\n", + " # met in the event\n", + " \"MET\": events[\"PuppiMET\"][\"pt\"].to_numpy(),\n", + " # fatjet tau32\n", + " \"H1T32\": fatjet_tau3[:, 0] / fatjet_tau2[:, 0],\n", + " \"H2T32\": fatjet_tau3[:, 1] / fatjet_tau2[:, 1],\n", + " # fatjet mass\n", + " \"H1Mass\": fatjet_msd[:, 0],\n", + " # fatjet kinematics\n", + " \"H1Pt\": fatjet_pt[:, 0],\n", + " \"H1eta\": fatjet_eta[:, 0],\n", + " # xbb\n", + " \"H1Xbb\": fatjet_txbb[:, 0],\n", + " \"H1QCDb\": fatjet_qcdb[:, 0],\n", + " \"H1QCDbb\": fatjet_qcdbb[:, 0],\n", + " \"H1QCDothers\": fatjet_qcdothers[:, 0],\n", + " \"H2Pt\": fatjet_pt[:, 1],\n", + " # ratios\n", + " \"H1Pt_HHmass\": fatjet_pt[:, 0] / hh.mass,\n", + " \"H2Pt_HHmass\": fatjet_pt[:, 1] / hh.mass,\n", + " \"H2Pt/H1Pt\": fatjet_pt[:, 1] / fatjet_pt[:, 0],\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_events[\"bdt_score\"] = bdt_model.predict_proba(df_events)[:, 1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bdt_fail = 0.03\n", + "bdt_bin1 = 0.43\n", + "bdt_bin2 = 0.11\n", + "xbb_bin1 = 0.98\n", + "xbb_bin2 = 0.95\n", + "\n", + "mask_bin1 = mask & (fatjet_txbb[:, 1] > xbb_bin1) & (df_events[\"bdt_score\"] > bdt_bin1)\n", + "mask_bin2 = (\n", + " mask\n", + " & ~mask_bin1\n", + " & (\n", + " ((fatjet_txbb[:, 1] > xbb_bin1) & (df_events[\"bdt_score\"] > bdt_bin2))\n", + " | ((fatjet_txbb[:, 1] > xbb_bin2) & (df_events[\"bdt_score\"] > bdt_bin1))\n", + " )\n", + ")\n", + "mask_bin3 = (\n", + " mask\n", + " & ~mask_bin1\n", + " & ~mask_bin2\n", + " & (fatjet_txbb[:, 1] > xbb_bin2)\n", + " & (df_events[\"bdt_score\"] > bdt_fail)\n", + ")\n", + "mask_fail = mask & ~mask_bin1 & ~mask_bin2 & ~mask_bin3 & (df_events[\"bdt_score\"] > bdt_fail)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_events[\"bdt_bin\"] = np.zeros(len(df_events))\n", + "df_events.loc[mask_bin1, \"bdt_bin\"] = 1\n", + "df_events.loc[mask_bin2, \"bdt_bin\"] = 2\n", + "df_events.loc[mask_bin3, \"bdt_bin\"] = 3\n", + "df_events.loc[mask_fail, \"bdt_bin\"] = 0\n", + "df_events.loc[~mask, \"bdt_bin\"] = -1\n", + "df_events.loc[~mask, \"bdt_score\"] = -1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# make 18 subfigures\n", + "plt.figure()\n", + "fig, axs = plt.subplots(3, 6, figsize=(40, 20), sharey=True)\n", + "for i, col in enumerate(df_events.columns):\n", + " if i > 17:\n", + " continue\n", + " ax = axs[i // 6, i % 6]\n", + " ax.hist(df_events[col][mask], bins=50, histtype=\"step\")\n", + " ax.set_xlabel(col)\n", + " if i % 6 == 0:\n", + " ax.set_ylabel(\"Events\")\n", + " ax.set_yscale(\"log\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with uproot.open(file_name) as f:\n", + " arrays = f[\"Events\"].arrays()\n", + " with uproot.recreate(file_name.replace(\".root\", \".withBDT.root\")) as f_out:\n", + " f_out[\"Events\"] = {field: arrays[field] for field in arrays.fields} | {\n", + " \"bdt_score\": df_events[\"bdt_score\"].to_numpy()\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure()\n", + "plt.scatter(df_events[\"bdt_score\"], fatjet_txbb[:, 1], c=df_events[\"bdt_bin\"])\n", + "plt.xlim(0, 1)\n", + "plt.ylim(0.93, 1)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}