Skip to content

Commit

Permalink
Added eventlist.py (#208)
Browse files Browse the repository at this point in the history
* Added notebook to retrieve eventlist

* style: pre-commit fixes

* better merge

* style: pre-commit fixes

* update

* style: pre-commit fixes

* saving loaded events in root, wip

* NB loads file, selects desired columns from hh4b, saves to root

* root files saved into a separate folder

* started making eventlist.py, made note of potential bug in EventList.ipynb

* minor touch ups

* make eventlist.py

* added README.md

* added README.md

* tested eventlist script, ready for merge

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Javier Duarte <[email protected]>
  • Loading branch information
3 people authored Aug 5, 2024
1 parent e1eeca7 commit 68aba52
Show file tree
Hide file tree
Showing 6 changed files with 484 additions and 4 deletions.
314 changes: 314 additions & 0 deletions src/HH4b/overlap/EventList.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,314 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# automatically reloads imported files on edits\n",
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from argparse import Namespace\n",
"from HH4b.postprocessing.PostProcess import load_process_run3_samples"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
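"# FIXME: args.years is only [\"2022\"], while a later cell loops over all four years and passes year= explicitly; check that every year is really loaded\n",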
"data_folder = \"24May24_v12_private_signal\"\n",
"args = Namespace(\n",
" templates_tag=\"24June27\",\n",
" data_dir=\"/ceph/cms/store/user/cmantill/bbbb/skimmer/\",\n",
" tag=\"24May24_v12_private_signal\",\n",
" years=[\"2022\"], # TODO: this line may be redundant, overridden later\n",
" training_years=None,\n",
" mass=\"H2PNetMass\",\n",
" bdt_model=\"24May31_lr_0p02_md_8_AK4Away\",\n",
" bdt_config=\"24May31_lr_0p02_md_8_AK4Away\",\n",
" txbb_wps=[0.975, 0.92],\n",
" bdt_wps=[0.98, 0.88, 0.03],\n",
" method=\"sideband\",\n",
" vbf_txbb_wp=0.95,\n",
" vbf_bdt_wp=0.98,\n",
" sig_keys=[\"hh4b\", \"vbfhh4b\"],\n",
" pt_first=300,\n",
" pt_second=250,\n",
" bdt_roc=False,\n",
" control_plots=False,\n",
" fom_scan=False,\n",
" fom_scan_bin1=True,\n",
" fom_scan_bin2=True,\n",
" fom_scan_vbf=False,\n",
" templates=False,\n",
" legacy=True,\n",
" vbf=True,\n",
" vbf_priority=False,\n",
" weight_ttbar_bdt=1,\n",
" blind=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bdt_training_keys = [\"qcd\", \"vbfhh4b-k2v0\", \"hh4b\", \"ttbar\"]\n",
"mass_window = np.array([105, 150])\n",
"years = [\"2022\", \"2022EE\", \"2023\", \"2023BPix\"]\n",
"\n",
"ev_dicts = []\n",
"for year in years:\n",
" ev_dict, _ = load_process_run3_samples(\n",
" args,\n",
" year=year,\n",
" bdt_training_keys=bdt_training_keys,\n",
" control_plots=False,\n",
" plot_dir=\"plot_dir\",\n",
" mass_window=mass_window,\n",
" )\n",
" ev_dicts.append((year, ev_dict))\n",
"\n",
"\"\"\"\n",
"python3 PostProcess.py --templates-tag 24June27 --tag 24May24_v12_private_signal --mass H2PNetMass --legacy --bdt-config 24May31_lr_0p02_md_8_AK4Away --bdt-model 24May31_lr_0p02_md_8_AK4Away --txbb-wps 0.975 0.92 --bdt-wps 0.98 0.88 0.03 --vbf-txbb-wp 0.95 --vbf-bdt-wp 0.98 --no-bdt-roc --no-fom-scan --no-fom-scan-bin2 --no-fom-scan-bin1 --data-dir /ceph/cms/store/user/cmantill/bbbb/skimmer/ --method abcd --no-vbf-priority --vbf --no-fom-scan-vbf --pt-second 250 --templates --years 2022 --sig-keys hh4b vbfhh4b\n",
"\"\"\"\n",
"# make an array with the event number and signal category\n",
"# make a mask from the signal-category cutoffs and apply it to the array\n",
"\n",
"# save as a ROOT file"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import uproot\n",
"import pandas as pd\n",
"\n",
"# select columns to extract for eventlist\n",
"eventlist_dict = [\"event\", \"bdt_score\", \"bdt_score_vbf\", \"H2TXbb\", \"H2Msd\", \"run\", \"H2PNetMass\"]\n",
"\n",
"# Loop over all years and save event lists for each year in separate root files\n",
"for year, ev_dict in ev_dicts:\n",
"\n",
" hh4b_df = ev_dict[\"hh4b\"]\n",
" event_list = hh4b_df[eventlist_dict]\n",
" eventlist_folder = \"eventlist_files\"\n",
" array_to_save = {col: event_list[col].to_numpy() for col in event_list.columns}\n",
"\n",
" with uproot.recreate(f\"{eventlist_folder}/eventlist_boostedHH4b_{year}.root\") as file:\n",
" file[\"tree\"] = array_to_save"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" event bdt_score bdt_score_vbf H2TXbb H2Msd run \\\n",
"0 933278.0 -0.190691 -0.287247 -0.877972 10.437500 0.0 \n",
"1 933279.0 0.387595 0.172054 0.929077 -16.750000 0.0 \n",
"2 985809.0 -0.030619 -0.176729 -0.309961 -67.968750 0.0 \n",
"3 976389.0 -0.267515 0.785077 -0.206275 8.812500 0.0 \n",
"4 976378.0 -0.890878 -0.761103 -0.188850 4.888672 0.0 \n",
"... ... ... ... ... ... ... \n",
"123020 NaN NaN NaN NaN NaN NaN \n",
"123021 NaN NaN NaN NaN NaN NaN \n",
"123022 NaN NaN NaN NaN NaN NaN \n",
"123023 NaN NaN NaN NaN NaN NaN \n",
"123024 NaN NaN NaN NaN NaN NaN \n",
"\n",
" H2PNetMass \n",
"0 29.488575 \n",
"1 -5.171450 \n",
"2 -39.118672 \n",
"3 -10.132099 \n",
"4 -68.518494 \n",
"... ... \n",
"123020 NaN \n",
"123021 NaN \n",
"123022 NaN \n",
"123023 NaN \n",
"123024 NaN \n",
"\n",
"[123025 rows x 7 columns]\n",
"DataFrame for year 2022:\n",
" event bdt_score bdt_score_vbf H2TXbb H2Msd run H2PNetMass\n",
"0 1450699 0.777012 0.648450 0.000226 127.18750 1 126.742600\n",
"1 1450708 0.829893 0.179832 0.985870 119.00000 1 143.878378\n",
"2 1503251 0.860970 0.192123 0.662430 46.03125 1 83.081488\n",
"3 1503311 0.609028 0.843063 0.547774 104.56250 1 104.581630\n",
"4 1503323 0.069489 0.003767 0.794878 7.68750 1 90.861472\n",
"... ... ... ... ... ... ... ...\n",
"38743 1644555 0.756790 0.281042 0.956689 118.87500 1 108.395992\n",
"38744 1644579 0.974344 0.938412 0.993592 146.75000 1 134.922604\n",
"38745 1644584 0.965913 0.652996 0.977377 138.50000 1 141.811446\n",
"38746 2011738 0.877202 0.202041 0.964582 97.75000 1 110.795429\n",
"38747 2011784 0.937218 0.469333 0.981580 165.12500 1 175.498111\n",
"\n",
"[38748 rows x 7 columns]\n",
"\n",
"DataFrame for year 2022EE:\n",
" event bdt_score bdt_score_vbf H2TXbb H2Msd run \\\n",
"0 517421 0.967703 0.935697 0.878198 116.750000 1 \n",
"1 517429 0.442298 0.007778 0.056793 135.750000 1 \n",
"2 517442 0.891588 0.368852 0.972391 114.000000 1 \n",
"3 526922 0.876543 0.057986 0.754049 95.750000 1 \n",
"4 526945 0.960367 0.764870 0.983729 2.798828 1 \n",
"... ... ... ... ... ... ... \n",
"123020 7216879 0.951921 0.806762 0.912221 149.375000 1 \n",
"123021 5581281 0.719542 0.800304 0.798522 103.375000 1 \n",
"123022 5583347 0.953829 0.229519 0.588638 5.394531 1 \n",
"123023 5583502 0.265112 0.024427 0.979906 123.375000 1 \n",
"123024 5583601 0.979797 0.629910 0.981377 129.875000 1 \n",
"\n",
" H2PNetMass \n",
"0 97.254025 \n",
"1 149.049828 \n",
"2 122.200160 \n",
"3 114.713729 \n",
"4 159.379966 \n",
"... ... \n",
"123020 129.033750 \n",
"123021 93.161409 \n",
"123022 67.227980 \n",
"123023 123.595933 \n",
"123024 114.603574 \n",
"\n",
"[123025 rows x 7 columns]\n",
"\n",
"DataFrame for year 2023:\n",
" event bdt_score bdt_score_vbf H2TXbb H2Msd run \\\n",
"0 48406 0.871542 0.407782 0.355691 71.250000 1 \n",
"1 321346 0.598415 0.384864 0.828598 138.250000 1 \n",
"2 321348 0.940107 0.717888 0.239016 152.125000 1 \n",
"3 321384 0.578239 0.353820 0.877495 79.187500 1 \n",
"4 345023 0.969481 0.622445 0.917526 119.312500 1 \n",
"... ... ... ... ... ... ... \n",
"105782 5694280 0.962341 0.907263 0.382345 172.375000 1 \n",
"105783 5694500 0.914343 0.514028 0.939019 128.125000 1 \n",
"105784 5694846 0.833464 0.060283 0.558651 113.750000 1 \n",
"105785 5696413 0.989071 0.970044 0.981745 107.312500 1 \n",
"105786 5696417 0.035577 0.011297 0.949736 3.310547 1 \n",
"\n",
" H2PNetMass \n",
"0 84.481631 \n",
"1 121.689927 \n",
"2 141.587560 \n",
"3 104.512258 \n",
"4 144.622505 \n",
"... ... \n",
"105782 168.836048 \n",
"105783 157.558402 \n",
"105784 138.170798 \n",
"105785 126.471277 \n",
"105786 133.258726 \n",
"\n",
"[105787 rows x 7 columns]\n",
"\n",
"DataFrame for year 2023BPix:\n",
" event bdt_score bdt_score_vbf H2TXbb H2Msd run \\\n",
"0 61386 0.863008 0.475046 0.973397 126.562500 1 \n",
"1 331113 0.363661 0.265023 0.981445 114.250000 1 \n",
"2 331158 0.977542 0.573044 0.551880 60.843750 1 \n",
"3 331214 0.739434 0.237233 0.972184 171.875000 1 \n",
"4 405989 0.742482 0.059563 0.880698 59.937500 1 \n",
"... ... ... ... ... ... ... \n",
"45946 2346930 0.050010 0.019253 0.127778 4.527344 1 \n",
"45947 2936366 0.994227 0.997100 0.986877 130.375000 1 \n",
"45948 2936382 0.993308 0.990033 0.685347 108.000000 1 \n",
"45949 2958401 0.997014 0.659537 0.008160 109.125000 1 \n",
"45950 2958405 0.993123 0.860510 0.877880 93.187500 1 \n",
"\n",
" H2PNetMass \n",
"0 146.063271 \n",
"1 115.517843 \n",
"2 64.902849 \n",
"3 151.484867 \n",
"4 124.328030 \n",
"... ... \n",
"45946 65.574143 \n",
"45947 129.728316 \n",
"45948 104.398266 \n",
"45949 107.690576 \n",
"45950 105.548541 \n",
"\n",
"[45951 rows x 7 columns]\n",
"\n"
]
}
],
"source": [
"# Double-check file contents\n",
"years = [\"2022\", \"2022EE\", \"2023\", \"2023BPix\"]\n",
"dfs_from_root = {}\n",
"\n",
"for year in years:\n",
" filename = f\"eventlist_files/eventlist_boostedHH4b_{year}.root\"\n",
" with uproot.open(filename) as file:\n",
" tree = file[\"tree\"]\n",
" arrays = tree.arrays(library=\"np\")\n",
" df = pd.DataFrame(arrays)\n",
"\n",
" # Make dict\n",
" dfs_from_root[year] = df\n",
"\n",
"test_df = dfs_from_root[\"2022\"] - dfs_from_root[\"2022EE\"]\n",
"print(test_df)\n",
"# Display df\n",
"for year, df in dfs_from_root.items():\n",
" print(f\"DataFrame for year {year}:\\n{df}\\n\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
6 changes: 6 additions & 0 deletions src/HH4b/overlap/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Eventlist and Overlap studies

Run as follows:
```bash
python eventlist.py --data-dir /home/users/dprimosc/data --tag 24May24_v12_private_signal --mass H2PNetMass --year 2022 --sig-keys hh4b vbfhh4b
```
Loading

0 comments on commit 68aba52

Please sign in to comment.