Add private school student calibration (#37)

* Increase epochs per year to 10k
* Update data urls
* Add calibration improvements
* Add private school student count calibration
* Versioning
PolicyEngine · Oct 21, 2024 · e315fc8 · e315fc8
1 parent b8d68f3
commit e315fc8
Show file tree

Hide file tree

Showing 11 changed files with 5,200 additions and 405 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.7.0] - 2024-10-21 17:03:50
+
+### Added
+
+- Calibration for private school students.
+
 ## [1.6.0] - 2024-10-18 16:05:10
 
 ### Added
@@ -73,6 +79,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 
 
+[1.7.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.6.0...1.7.0
 [1.6.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.5.0...1.6.0
 [1.5.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.4.0...1.5.0
 [1.4.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.3.0...1.4.0

diff --git a/Makefile b/Makefile
@@ -21,6 +21,7 @@ docker:
 
 documentation:
 	jb clean docs && jb build docs
+	python docs/add_plotly_to_book.py docs/book
 
 data:
 	python policyengine_uk_data/datasets/frs/dwp_frs.py

diff --git a/changelog.yaml b/changelog.yaml
@@ -60,3 +60,8 @@
     - Future year income targeting.
     - Random takeup variable values.
   date: 2024-10-18 16:05:10
+- bump: minor
+  changes:
+    added:
+    - Calibration for private school students.
+  date: 2024-10-21 17:03:50
diff --git a/docs/add_plotly_to_book.py b/docs/add_plotly_to_book.py
@@ -0,0 +1,27 @@
+import argparse
+from pathlib import Path
+
+# Command-line tool that enables Plotly charts to show in the HTML files of
+# the Jupyter Book documentation. It finds every HTML file under the given
+# book folder and inserts a require.js <script> tag at the start of <head>.
+
+# Script tag inserted immediately after the opening <head> tag.
+REQUIRE_JS_TAG = (
+    '<script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/'
+    'require.min.js"></script>'
+)
+
+
+def add_require_js(html: str) -> str:
+    """Return ``html`` with the require.js script tag at the start of <head>.
+
+    The input is returned unchanged when the tag is already present (so the
+    tool can be re-run on the same build without stacking duplicates) or
+    when there is no literal ``<head>`` tag to attach it to. Only the first
+    ``<head>`` occurrence is modified.
+    """
+    if REQUIRE_JS_TAG in html:
+        return html
+    return html.replace("<head>", "<head>" + REQUIRE_JS_TAG, 1)
+
+
+def main() -> None:
+    """Parse the book path from the command line and patch its HTML files."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("book_path", help="Path to the Jupyter Book.")
+    args = parser.parse_args()
+
+    book_folder = Path(args.book_path)
+
+    for html_file in book_folder.glob("**/*.html"):
+        # Read and write as UTF-8 explicitly rather than relying on the
+        # platform's default encoding.
+        original = html_file.read_text(encoding="utf-8")
+        updated = add_require_js(original)
+        # Leave files that need no change untouched.
+        if updated != original:
+            html_file.write_text(updated, encoding="utf-8")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/methodology.ipynb b/docs/methodology.ipynb
diff --git a/docs/validation.ipynb b/docs/validation.ipynb
diff --git a/policyengine_uk_data/datasets/frs/enhanced_frs.py b/policyengine_uk_data/datasets/frs/enhanced_frs.py
@@ -25,6 +25,10 @@ def generate(self):
 
         impute_cg_to_dataset(self)
 
+        self.save_dataset(data)
+
+        self.add_random_variables(data)
+
         # Reweighting
 
         data = self.load_dataset()
@@ -38,6 +42,29 @@ def generate(self):
 
         self.save_dataset(data)
 
+    def add_random_variables(self, data: dict):
+        """Store calculated values of the random variables in ``data`` and save.
+
+        For each variable named below, the simulation's stored arrays are
+        deleted, the variable is calculated for ``self.time_period``, and the
+        result is written into ``data`` under that period and each of the nine
+        following periods. The dataset is then saved.
+        """
+        from policyengine_uk import Microsimulation
+
+        sim = Microsimulation(dataset=self)
+        random_variables = (
+            "would_evade_tv_licence_fee",
+            "would_claim_pc",
+            "would_claim_uc",
+            "would_claim_child_benefit",
+            "main_residential_property_purchased_is_first_home",
+            "household_owns_tv",
+            "is_higher_earner",
+            "attends_private_school",
+        )
+        base_period = self.time_period
+        periods = range(base_period, base_period + 10)
+
+        # First pass: delete the stored arrays for every variable before any
+        # of them is calculated (two separate passes, as in the original).
+        for name in random_variables:
+            sim.get_holder(name).delete_arrays()
+
+        # Second pass: calculate once for the base period and reuse that same
+        # array for every input period.
+        for name in random_variables:
+            values = sim.calculate(name, base_period).values
+            data[name] = dict.fromkeys(periods, values)
+
+        self.save_dataset(data)
+
 
 class ReweightedFRS_2022_23(EnhancedFRS):
     name = "reweighted_frs_2022_23"
@@ -60,6 +87,68 @@ class EnhancedFRS_2022_23(EnhancedFRS):
     url = "release://PolicyEngine/ukda/1.5.0/enhanced_frs_2022_23.h5"
 
 
+def reweight(
+    original_weights,
+    loss_matrix,
+    targets_array,
+    dropout_rate=0.05,
+    epochs=10_000,
+):
+    """Calibrate weights so the weighted column totals approach the targets.
+
+    The weights are optimised in log space with Adam and exponentiated before
+    use. The loss is the mean squared relative error between
+    ``weights @ loss_matrix`` and ``targets_array``.
+
+    Args:
+        original_weights: Starting weights, one per row of ``loss_matrix``.
+            Their logarithm is taken, so they are expected to be positive.
+        loss_matrix: Object exposing ``.values`` (e.g. a DataFrame) with one
+            row per weight and one column per target.
+        targets_array: Target values, one per column of ``loss_matrix``.
+        dropout_rate: Probability with which each log-weight is replaced,
+            on each iteration, by the mean of the log-weights that were not
+            replaced. ``0`` disables dropout.
+        epochs: Number of optimiser steps to run.
+
+    Returns:
+        A NumPy array holding the calibrated weights.
+
+    Raises:
+        ValueError: If NaNs appear in the loss matrix, the weights, the
+            estimates or the relative errors.
+    """
+    loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32)
+    targets_array = torch.tensor(targets_array, dtype=torch.float32)
+    log_weights = torch.tensor(
+        np.log(original_weights), requires_grad=True, dtype=torch.float32
+    )
+
+    # The loss matrix never changes, so validate it once instead of on every
+    # optimiser step.
+    if torch.isnan(loss_matrix).any():
+        raise ValueError("Loss matrix contains NaNs")
+
+    # TODO: replace this with a call to the python reweight.py package.
+    def loss(weights):
+        if torch.isnan(weights).any():
+            raise ValueError("Weights contain NaNs")
+        estimate = weights @ loss_matrix
+        if torch.isnan(estimate).any():
+            raise ValueError("Estimate contains NaNs")
+        # Relative error, with +1 in the denominator so zero-valued targets
+        # do not divide by zero. The numerator must be the plain difference
+        # so that the loss is zero exactly when estimate == target.
+        rel_error = ((estimate - targets_array) / (targets_array + 1)) ** 2
+        if torch.isnan(rel_error).any():
+            raise ValueError("Relative error contains NaNs")
+        return rel_error.mean()
+
+    def dropout_weights(weights, p):
+        if p == 0:
+            return weights
+        # Replace each element with probability p by the mean of the
+        # elements that were not selected.
+        mask = torch.rand_like(weights) < p
+        if mask.all():
+            # Nothing is left to average over; the mean would be NaN.
+            return weights
+        mean = weights[~mask].mean()
+        masked_weights = weights.clone()
+        masked_weights[mask] = mean
+        return masked_weights
+
+    optimizer = torch.optim.Adam([log_weights], lr=1e-1)
+    from tqdm import trange
+
+    start_loss = None
+
+    iterator = trange(epochs)
+    for _ in iterator:
+        optimizer.zero_grad()
+        dropped = dropout_weights(log_weights, dropout_rate)
+        loss_value = loss(torch.exp(dropped))
+        current_loss = loss_value.item()
+        if start_loss is None:
+            start_loss = current_loss
+        # Guard against a starting loss of exactly zero.
+        loss_rel_change = (
+            (current_loss - start_loss) / start_loss if start_loss else 0.0
+        )
+        loss_value.backward()
+        iterator.set_postfix(
+            {"loss": current_loss, "loss_rel_change": loss_rel_change}
+        )
+        optimizer.step()
+
+    return torch.exp(log_weights).detach().numpy()
+
+
 if __name__ == "__main__":
     ReweightedFRS_2022_23().generate()
     EnhancedFRS_2022_23().generate()
diff --git a/policyengine_uk_data/datasets/frs/frs.py b/policyengine_uk_data/datasets/frs/frs.py
@@ -99,14 +99,14 @@ def add_random_variables(self, frs: dict):
 
         simulation = Microsimulation(dataset=self)
         RANDOM_VARIABLES = [
-            "attends_private_school",
             "would_evade_tv_licence_fee",
             "would_claim_pc",
             "would_claim_uc",
             "would_claim_child_benefit",
             "main_residential_property_purchased_is_first_home",
             "household_owns_tv",
             "is_higher_earner",
+            "attends_private_school",
         ]
         INPUT_PERIODS = list(range(self.time_period, self.time_period + 10))
         for variable in RANDOM_VARIABLES:
@@ -138,7 +138,7 @@ class FRS_2022_23(FRS):
     label = "FRS (2022-23)"
     file_path = STORAGE_FOLDER / "frs_2022_23.h5"
     time_period = 2022
-    url = "release://PolicyEngine/ukda/1.5.0/frs_2022_23.h5"
+    url = "release://PolicyEngine/ukda/1.6.0/frs_2022_23.h5"
 
 
 def add_id_variables(frs: h5py.File, person: DataFrame, household: DataFrame):

diff --git a/policyengine_uk_data/storage/tax_benefit.csv b/policyengine_uk_data/storage/tax_benefit.csv
@@ -41,3 +41,4 @@ universal_credit_jobseekers,gbp-bn,obr_march_2024_efo,,,,,9.2,11.1,17.4,19.3,19.
 universal_credit_non_jobseekers,gbp-bn,obr_march_2024_efo,,,,,33.4,40.7,49.5,55.0,57.9,60.8,67.1,
 vat,gbp-bn,obr_march_2024_efo,,,,,159.7,169.6,173.9,180.3,188.0,195.9,204.1,
 winter_fuel_allowance,gbp-bn,obr_march_2024_efo,,,,,2.0,2.0,1.9,1.9,1.9,1.9,2.0,
+private_school_students,person-k,obr_march_2024_efo,557,557,557,557,557,557,557,557,557,557,557,
diff --git a/policyengine_uk_data/utils/loss.py b/policyengine_uk_data/utils/loss.py
@@ -128,6 +128,9 @@ def pe_count(*variables):
     df["obr/vat"] = pe("vat")
     df["obr/winter_fuel_allowance"] = pe("winter_fuel_allowance")
 
+    # Not strictly an OBR figure: it comes from the 2024 Independent Schools Council census, which the OBR will be using.
+    df["obr/private_school_students"] = pe("attends_private_school")
+
     # Population statistics from the ONS.
 
     region = sim.calculate("region", map_to="person")
@@ -239,15 +242,20 @@ def pe_count(*variables):
     return df, combined_targets.value
 
 
-def get_loss_results(dataset, time_period, reform=None):
+def get_loss_results(
+    dataset, time_period, reform=None, household_weights=None
+):
     matrix, targets = create_target_matrix(dataset, time_period, reform)
     from policyengine_uk import Microsimulation
 
-    weights = (
-        Microsimulation(dataset=dataset, reform=reform)
-        .calculate("household_weight", time_period)
-        .values
-    )
+    if household_weights is None:
+        weights = (
+            Microsimulation(dataset=dataset, reform=reform)
+            .calculate("household_weight", time_period)
+            .values
+        )
+    else:
+        weights = household_weights
     estimates = weights @ matrix
     df = pd.DataFrame(
         {

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "policyengine_uk_data"
-version = "1.6.0"
+version = "1.7.0"
 description = "A package to create representative microdata for the UK."
 readme = "README.md"
 authors = [