From 4c4773fea4fdd5cf3c70ba221f4668bb70b8a0bf Mon Sep 17 00:00:00 2001
From: Javier Duarte <jduarte@ucsd.edu>
Date: Thu, 23 May 2024 06:55:32 -0700
Subject: [PATCH] subtracted plots and mc closure

---
 src/HH4b/plotting.py                      | 241 +++++++++++++++++
 src/HH4b/postprocessing/CreateDatacard.py |   7 +-
 src/HH4b/postprocessing/PlotFits.ipynb    | 299 ----------------------
 3 files changed, 247 insertions(+), 300 deletions(-)
 delete mode 100644 src/HH4b/postprocessing/PlotFits.ipynb

diff --git a/src/HH4b/plotting.py b/src/HH4b/plotting.py
index 9d1059c8..958a673c 100644
--- a/src/HH4b/plotting.py
+++ b/src/HH4b/plotting.py
@@ -403,8 +403,10 @@ def ratioHistPlot(
         name (str): name of file to save plot
         sig_scale_dict (Dict[str, float]): if scaling signals in the plot, dictionary of factors
           by which to scale each signal
+        xlim (optional): x-limit on plot
         xlim_low (optional): x-limit low on plot
         ylim (optional): y-limit on plot
+        ylim_low (optional): y-limit low on plot
         show (bool): show plots or not
         syst (Tuple): Tuple of (wshift: name of systematic e.g. pileup,  wsamples: list of samples which are affected by this),
           to plot variations of this systematic.
@@ -871,6 +873,245 @@ def get_variances(bg_hist):
         plt.close()
 
 
+def subtractedHistPlot(
+    hists: Hist,
+    hists_fail: Hist,
+    year: str,
+    bg_keys: list[str],
+    bg_err: ArrayLike = None,
+    sortyield: bool = False,
+    title: str | None = None,
+    name: str = "",
+    xlim: int | None = None,
+    xlim_low: int | None = None,
+    ylim: int | None = None,
+    ylim_low: int | None = None,
+    show: bool = True,
+    bg_err_type: str = "shaded",
+    plot_data: bool = True,
+    bg_order=None,
+    log: bool = False,
+    logx: bool = False,
+    ratio_ylims: list[float] | None = None,
+    energy: str = "13.6",
+):
+    """
+    Makes and saves subtracted histogram plot, to show QCD transfer factor
+    with a data/mc ratio plot below
+
+    Args:
+        hists (Hist): input histograms per sample in pass region to plot
+        hists_fail (Hist): input histograms per sample in fail region to plot
+        year (str): datataking year
+        bg_keys (List[str]): background keys
+        title (str, optional): plot title. Defaults to None.
+        name (str): name of file to save plot
+        sig_scale_dict (Dict[str, float]): if scaling signals in the plot, dictionary of factors
+          by which to scale each signal
+        xlim_low (optional): x-limit low on plot
+        ylim (optional): y-limit on plot
+        show (bool): show plots or not
+        bg_order (List[str]): order in which to plot backgrounds
+        ratio_ylims (List[float]): y limits on the ratio plots
+        plot_significance (bool): plot Asimov significance below ratio plot
+    """
+
+
+    # copy hists and bg_keys so input objects are not changed
+    hists, bg_keys = deepcopy(hists), deepcopy(bg_keys)
+
+    if bg_order is None:
+        bg_order = bg_order_default
+
+    bg_keys, bg_colours, bg_labels, _, _, _ = _process_samples(
+        [], bg_keys, {}, None, None, bg_order
+    )
+
+    fig, (ax, rax) = plt.subplots(
+        2,
+        1,
+        figsize=(12, 12),
+        gridspec_kw={"height_ratios": [3.5, 1], "hspace": 0.18},
+        sharex=True,
+    )
+
+    # only use integers
+    ax.yaxis.set_major_locator(MaxNLocator(integer=True))
+
+    plt.rcParams.update({"font.size": 30})
+
+    # plot histograms
+    ax.set_ylabel("Pass / Multijet in Fail")
+
+    # background samples
+    hep.histplot(
+        hists["qcd", :] / hists_fail["qcd", :],
+        ax=ax,
+        histtype="fill",
+        sort="yield" if sortyield else None,
+        stack=True,
+        edgecolor="black",
+        linewidth=2,
+        label="Multijet",
+        color=bg_colours[-1],
+    )
+
+    if bg_err is not None:
+        bg_tot = hists["qcd", :] / hists_fail["qcd", :]
+        if len(np.array(bg_err).shape) == 1:
+            bg_errs = [bg_tot - bg_err / hists_fail["qcd", :], 
+                       bg_tot + bg_err / hists_fail["qcd", :]]
+
+        if bg_err_type == "shaded":
+            ax.fill_between(
+                np.repeat(hists.axes[1].edges, 2)[1:-1],
+                np.repeat(bg_errs[0].values(), 2),
+                np.repeat(bg_errs[1].values(), 2),
+                color="black",
+                alpha=0.2,
+                hatch="//",
+                linewidth=0,
+                label="Multijet Unc.",
+            )
+        else:
+            ax.stairs(
+                bg_tot.values(),
+                hists.axes[1].edges,
+                color="black",
+                linewidth=3,
+                label="Bkg. Total",
+                baseline=bg_tot.values(),
+            )
+
+            ax.stairs(
+                bg_errs[0],
+                hists.axes[1].edges,
+                color="red",
+                linewidth=3,
+                label="Bkg. Down",
+                baseline=bg_errs[0],
+            )
+
+            ax.stairs(
+                bg_err[1],
+                hists.axes[1].edges,
+                color="#7F2CCB",
+                linewidth=3,
+                label="Bkg. Up",
+                baseline=bg_err[1],
+            )
+
+    # plot data
+    if plot_data:
+        data_val = hists[data_key, :].values()
+        qcd_fail_val = hists_fail["qcd", :].values()
+        yerr = ratio_uncertainty(data_val, qcd_fail_val, "poisson")
+        all_mc = sum(hists[bg_key, :] for bg_key in bg_keys if bg_key != "qcd")
+        yvalue = (hists[data_key, :] - all_mc) / hists_fail["qcd", :]
+        hep.histplot(
+            yvalue,
+            ax=ax,
+            yerr=yerr,
+            histtype="errorbar",
+            label="Data - Other Bkg.",
+            markersize=20,
+            color="black",
+        )
+
+    if log:
+        ax.set_yscale("log")
+    if logx:
+        ax.set_xscale("log")
+
+    handles, labels = ax.get_legend_handles_labels()
+    ax.legend(handles, labels, bbox_to_anchor=(1.03, 1), loc="upper left")
+
+    if xlim_low is not None:
+        if xlim is not None:
+            ax.set_xlim(xlim_low, xlim)
+        else:
+            ax.set_xlim(xlim_low, None)
+
+    y_lowlim = ylim_low if ylim_low is not None else 0 if not log else 0.001
+
+    if ylim is not None:
+        ax.set_ylim([y_lowlim, ylim])
+    else:
+        ax.set_ylim(y_lowlim)
+
+    ax.set_xlabel("")
+
+    # plot ratio below
+    if plot_data and len(bg_keys) > 0:
+        qcd_val = hists["qcd", :].values()
+        hep.histplot(
+            yvalue / (qcd_val / qcd_fail_val),
+            yerr=ratio_uncertainty(data_val, qcd_val, "poisson"),
+            ax=rax,
+            histtype="errorbar",
+            markersize=20,
+            color="black",
+            capsize=0,
+        )
+        rax.set_xlabel(hists.axes[1].label)
+
+        # fill error band of background
+        if bg_err is not None:
+            # (bkg + err) / bkg
+            rax.fill_between(
+                np.repeat(hists.axes[1].edges, 2)[1:-1],
+                np.repeat((bg_errs[0].values()) / (qcd_val / qcd_fail_val), 2),
+                np.repeat((bg_errs[1].values()) / (qcd_val / qcd_fail_val), 2),
+                color="black",
+                alpha=0.1,
+                hatch="//",
+                linewidth=0,
+            )
+    else:
+        rax.set_xlabel(hists.axes[1].label)
+
+    rax.set_ylabel("Ratio")
+    rax.set_ylim(ratio_ylims)
+    minor_locator = mticker.AutoMinorLocator(2)
+    rax.yaxis.set_minor_locator(minor_locator)
+    rax.grid(axis="y", linestyle="-", linewidth=2, which="both")
+
+    if title is not None:
+        ax.set_title(title, y=1.08)
+
+    if year == "all":
+        hep.cms.label(
+            "Work in Progress",
+            data=True,
+            lumi=f"{np.sum(list(LUMI.values())) / 1e3:.0f}",
+            year=None,
+            ax=ax,
+            com=energy,
+        )
+    else:
+        hep.cms.label(
+            "Work in Progress",
+            fontsize=24,
+            data=True,
+            lumi=f"{LUMI[year] / 1e3:.0f}",
+            year=year,
+            ax=ax,
+            com=energy,
+        )
+
+    if len(name):
+        if not name.endswith((".pdf", ".png")):
+            plt.savefig(f"{name}.pdf", bbox_inches="tight")
+            plt.savefig(f"{name}.png")
+        else:
+            plt.savefig(name, bbox_inches="tight")
+
+    if show:
+        plt.show()
+    else:
+        plt.close()
+
+
 def mesh2d(
     xbins: ArrayLike,
     ybins: ArrayLike,
diff --git a/src/HH4b/postprocessing/CreateDatacard.py b/src/HH4b/postprocessing/CreateDatacard.py
index 27e30118..a5b44a05 100644
--- a/src/HH4b/postprocessing/CreateDatacard.py
+++ b/src/HH4b/postprocessing/CreateDatacard.py
@@ -101,6 +101,7 @@
 add_bool_arg(parser, "vbf-region", "Add VBF region", default=False)
 add_bool_arg(parser, "unblinded", "unblinded so skip blinded parts", default=False)
 add_bool_arg(parser, "ttbar-rate-param", "Add freely floating ttbar rate param", default=False)
+add_bool_arg(parser, "mc-closure", "Perform MC closure test (fill data_obs with sum of MC bkg.", default=False)
 args = parser.parse_args()
 
 
@@ -540,7 +541,11 @@ def fill_regions(
             )
 
         # data observed
-        ch.setObservation(region_templates[data_key, :])
+        if args.mc_closure:
+            all_bg = sum([region_templates[bg_key, :] for bg_key in bg_keys + ["qcd"])
+            ch.setObservation(all_bg)
+        else:
+           ch.setObservation(region_templates[data_key, :])
 
 
 def alphabet_fit(
diff --git a/src/HH4b/postprocessing/PlotFits.ipynb b/src/HH4b/postprocessing/PlotFits.ipynb
deleted file mode 100644
index 81b5d39d..00000000
--- a/src/HH4b/postprocessing/PlotFits.ipynb
+++ /dev/null
@@ -1,299 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import argparse\n",
-    "import os\n",
-    "from pathlib import Path\n",
-    "from collections import OrderedDict\n",
-    "\n",
-    "import hist\n",
-    "import numpy as np\n",
-    "import uproot\n",
-    "\n",
-    "from HH4b import plotting\n",
-    "from HH4b.utils import ShapeVar\n",
-    "from HH4b.hh_vars import data_key"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%load_ext autoreload\n",
-    "%autoreload 2"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "MAIN_DIR = Path(\"../../../\")\n",
-    "nTF = 1\n",
-    "\n",
-    "vbf = False\n",
-    "# k2v0sig = True\n",
-    "mreg = True\n",
-    "\n",
-    "plot_dir = MAIN_DIR / \"plots/PostFit/24Apr21_legacy_bdt_ggf_tighter\"\n",
-    "plot_dir.mkdir(exist_ok=True, parents=True)\n",
-    "\n",
-    "regions = \"all\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "cards_dir = \"24Apr21_legacy_bdt_ggf_tighter\"\n",
-    "file = uproot.open(\n",
-    "    f\"/uscms/home/rkansal/hhcombine/hh4b/cards/{cards_dir}/FitShapes.root\"\n",
-    "    # f\"/uscms/home/rkansal/eos/bbVV/cards/{cards_dir}/FitShapes.root\"\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# (name in templates -> name in cards)\n",
-    "hist_label_map_inverse = OrderedDict(\n",
-    "    [\n",
-    "        (\"qcd\", \"CMS_bbbb_hadronic_qcd_datadriven\"),\n",
-    "        (\"others\", \"others\"),\n",
-    "        (\"ttbar\", \"ttbar\"),\n",
-    "        (\"vhtobb\", \"VH_hbb\"),\n",
-    "        (\"tthtobb\", \"ttH_hbb\"),\n",
-    "        (\"data\", \"data_obs\"),\n",
-    "    ]\n",
-    ")\n",
-    "\n",
-    "if vbf:\n",
-    "    hist_label_map_inverse[\"vbfhh4b-k2v0\"] = \"vbfhh4b-k2v0\"\n",
-    "else:\n",
-    "    hist_label_map_inverse[\"hh4b\"] = \"hh4b\"\n",
-    "\n",
-    "hist_label_map = {val: key for key, val in hist_label_map_inverse.items()}\n",
-    "samples = list(hist_label_map.values())\n",
-    "\n",
-    "fit_shape_var_msd = ShapeVar(\n",
-    "    \"H2Msd\",\n",
-    "    r\"$m^{j2}_\\mathrm{SD}$ (GeV)\",\n",
-    "    [16, 60, 220],\n",
-    "    reg=True,\n",
-    "    blind_window=[110, 140],\n",
-    ")\n",
-    "\n",
-    "fit_shape_var_mreg = ShapeVar(\n",
-    "    \"H2PNetMass\",\n",
-    "    r\"$m^{j2}_\\mathrm{reg}$ (GeV)\",\n",
-    "    [16, 60, 220],\n",
-    "    reg=True,\n",
-    "    blind_window=[110, 140],\n",
-    ")\n",
-    "shape_vars = [fit_shape_var_msd] if not mreg else [fit_shape_var_mreg]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "shapes = {\n",
-    "    \"prefit\": \"Pre-Fit\",\n",
-    "    # \"postfit\": \"S+B Post-Fit\",\n",
-    "    \"postfit\": \"B-only Post-Fit\",\n",
-    "}\n",
-    "\n",
-    "selection_regions_labels = {\n",
-    "    \"passbin1\": \"Pass Bin1\",\n",
-    "    \"passbin2\": \"Pass Bin2\",\n",
-    "    \"passbin3\": \"Pass Bin3\",\n",
-    "    \"fail\": \"Fail\",\n",
-    "}\n",
-    "\n",
-    "if vbf:\n",
-    "    selection_regions_labels[\"passvbf\"] = \"Pass VBF\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "if regions == \"all\":\n",
-    "    signal_regions = [\"passbin1\", \"passbin2\", \"passbin3\"]\n",
-    "    if vbf:\n",
-    "        signal_regions = [\"passvbf\"] + signal_regions\n",
-    "else:\n",
-    "    signal_regions = [regions]\n",
-    "\n",
-    "bins = [*signal_regions, \"fail\"]\n",
-    "selection_regions = {key: selection_regions_labels[key] for key in bins}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "hists = {}\n",
-    "for shape in shapes:\n",
-    "    hists[shape] = {\n",
-    "        region: hist.Hist(\n",
-    "            hist.axis.StrCategory(samples, name=\"Sample\"),\n",
-    "            *[shape_var.axis for shape_var in shape_vars],\n",
-    "            storage=\"double\",\n",
-    "        )\n",
-    "        for region in selection_regions\n",
-    "    }\n",
-    "\n",
-    "    for region in selection_regions:\n",
-    "        h = hists[shape][region]\n",
-    "        templates = file[f\"{region}_{shape}\"]\n",
-    "        # print(templates)\n",
-    "        for key, file_key in hist_label_map_inverse.items():\n",
-    "            if key != data_key:\n",
-    "                if file_key not in templates:\n",
-    "                    print(f\"No {key} in {region}\")\n",
-    "                    continue\n",
-    "\n",
-    "                data_key_index = np.where(np.array(list(h.axes[0])) == key)[0][0]\n",
-    "                h.view(flow=False)[data_key_index, :] = templates[file_key].values()\n",
-    "\n",
-    "        data_key_index = np.where(np.array(list(h.axes[0])) == data_key)[0][0]\n",
-    "        h.view(flow=False)[data_key_index, :] = np.nan_to_num(\n",
-    "            templates[hist_label_map_inverse[data_key]].values()\n",
-    "        )"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\"Signal in mass window:\", np.sum(hists[\"postfit\"][\"passbin1\"][\"hh4b\", 5:8].values()))\n",
-    "\n",
-    "bg_tot = np.sum(\n",
-    "    [\n",
-    "        np.sum(hists[\"postfit\"][\"passbin1\"][key, 5:8].values())\n",
-    "        for key in hist_label_map_inverse\n",
-    "        if key not in [\"hh4b\", \"vbfhh4b-k2v0\", \"data\"]\n",
-    "    ]\n",
-    ")\n",
-    "print(\"BG in mass window:\", bg_tot)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print([key for key in hist_label_map_inverse if key not in [\"hh4b\", \"vbfhh4b-k2v0\", \"data\"]])\n",
-    "{\n",
-    "    key: np.sum(hists[\"postfit\"][\"passbin1\"][key, 5:8].values())\n",
-    "    for key in hist_label_map_inverse\n",
-    "    if key not in [\"hh4b\", \"vbfhh4b-k2v0\", \"data\"]\n",
-    "}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "year = \"2022-2023\"\n",
-    "pass_ratio_ylims = [0, 2]\n",
-    "fail_ratio_ylims = [0, 2]\n",
-    "signal_scale = 5.0\n",
-    "\n",
-    "ylims = {\n",
-    "    \"passvbf\": 15,\n",
-    "    \"passbin1\": 10,\n",
-    "    \"passbin2\": 50,\n",
-    "    \"passbin3\": 800,\n",
-    "    \"fail\": 100000,\n",
-    "}\n",
-    "\n",
-    "for shape, shape_label in shapes.items():\n",
-    "    for region, region_label in selection_regions.items():\n",
-    "        pass_region = region.startswith(\"pass\")\n",
-    "        for shape_var in shape_vars:\n",
-    "            # print(hists[shape][region])\n",
-    "            plot_params = {\n",
-    "                \"hists\": hists[shape][region],\n",
-    "                \"sig_keys\": [\"hh4b\"] if not vbf else [\"vbfhh4b-k2v0\"],\n",
-    "                \"sig_scale_dict\": (\n",
-    "                    {\"hh4b\": signal_scale if pass_region else 1.0} if not vbf else None\n",
-    "                ),\n",
-    "                \"bg_keys\": [\"qcd\", \"ttbar\", \"vhtobb\", \"tthtobb\", \"others\"],\n",
-    "                \"show\": True,\n",
-    "                \"year\": year,\n",
-    "                \"ylim\": ylims[region],\n",
-    "                \"xlim\": 220,\n",
-    "                # \"xlim_low\": 50,\n",
-    "                \"xlim_low\": 60,\n",
-    "                \"ratio_ylims\": pass_ratio_ylims if pass_region else fail_ratio_ylims,\n",
-    "                \"title\": f\"{shape_label} {region_label} Region\",\n",
-    "                \"name\": f\"{plot_dir}/{shape}_{region}_{shape_var.var}.pdf\",\n",
-    "                \"bg_order\": [\"diboson\", \"vjets\", \"vhtobb\", \"ttbar\", \"qcd\"],\n",
-    "                \"energy\": 13.6,\n",
-    "            }\n",
-    "\n",
-    "            plotting.ratioHistPlot(**plot_params)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "bbVV",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.15"
-  },
-  "orig_nbformat": 4,
-  "vscode": {
-   "interpreter": {
-    "hash": "5b9eab485576227e6cf1b964bb8855c46cbdf15c3e77cecdb2bb309145d3e8d8"
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}