Skip to content

Commit

Permalink
Add private school student calibration (#37)
Browse files Browse the repository at this point in the history
* Increase epochs per year to 10k

* Update data urls

* Add calibration improvements

* Add private school student count calibration

* Versioning
  • Loading branch information
nikhilwoodruff authored Oct 21, 2024
1 parent b8d68f3 commit e315fc8
Show file tree
Hide file tree
Showing 11 changed files with 5,200 additions and 405 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [1.7.0] - 2024-10-21 17:03:50

### Added

- Calibration for private school students.

## [1.6.0] - 2024-10-18 16:05:10

### Added
Expand Down Expand Up @@ -73,6 +79,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0



[1.7.0]: https://github.com/PolicyEngine/policyengine-uk-data/compare/1.6.0...1.7.0
[1.6.0]: https://github.com/PolicyEngine/policyengine-uk-data/compare/1.5.0...1.6.0
[1.5.0]: https://github.com/PolicyEngine/policyengine-uk-data/compare/1.4.0...1.5.0
[1.4.0]: https://github.com/PolicyEngine/policyengine-uk-data/compare/1.3.0...1.4.0
Expand Down
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ docker:

documentation:
jb clean docs && jb build docs
python docs/add_plotly_to_book.py docs/book

data:
python policyengine_uk_data/datasets/frs/dwp_frs.py
Expand Down
5 changes: 5 additions & 0 deletions changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,8 @@
- Future year income targeting.
- Random takeup variable values.
date: 2024-10-18 16:05:10
- bump: minor
changes:
added:
- Calibration for private school students.
date: 2024-10-21 17:03:50
27 changes: 27 additions & 0 deletions docs/add_plotly_to_book.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import argparse
from pathlib import Path

# This command-line tool enables Plotly charts to show in the HTML files for
# the Jupyter Book documentation by loading require.js in every page.

# Script tag inserted at the start of the <head> tag of each HTML file.
REQUIRE_JS_TAG = (
    '<script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/'
    'require.min.js"></script>'
)


def add_plotly_to_book(book_path):
    """Insert the require.js script tag into every HTML file under a folder.

    The folder is searched recursively for ``*.html`` files. A file that
    already contains the tag, or that has no ``<head>`` tag, is left
    untouched, so running the tool repeatedly does not stack duplicates.

    Args:
        book_path: Path to the Jupyter Book folder.

    Returns:
        The number of files that were rewritten.
    """
    modified = 0
    for html_file in Path(book_path).glob("**/*.html"):
        # Read and write explicitly as UTF-8 so the result does not depend
        # on the platform's default encoding.
        html = html_file.read_text(encoding="utf-8")
        if REQUIRE_JS_TAG in html:
            continue

        # Only the first <head> is the real document head; later literal
        # occurrences (e.g. inside embedded scripts) must not be touched.
        patched = html.replace("<head>", "<head>" + REQUIRE_JS_TAG, 1)
        if patched == html:
            continue

        html_file.write_text(patched, encoding="utf-8")
        modified += 1
    return modified


def main(argv=None):
    """Parse the command line and patch the book it points to.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("book_path", help="Path to the Jupyter Book.")
    args = parser.parse_args(argv)
    add_plotly_to_book(args.book_path)


if __name__ == "__main__":
    main()
5,345 changes: 5,001 additions & 344 deletions docs/methodology.ipynb

Large diffs are not rendered by default.

104 changes: 52 additions & 52 deletions docs/validation.ipynb

Large diffs are not rendered by default.

89 changes: 89 additions & 0 deletions policyengine_uk_data/datasets/frs/enhanced_frs.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ def generate(self):

impute_cg_to_dataset(self)

self.save_dataset(data)

self.add_random_variables(data)

# Reweighting

data = self.load_dataset()
Expand All @@ -38,6 +42,29 @@ def generate(self):

self.save_dataset(data)

def add_random_variables(self, data: dict):
    """Recompute the listed variables and store them in ``data`` as inputs.

    Any stored arrays for the variables are deleted from a simulation built
    on this dataset, each variable is then calculated for
    ``self.time_period``, and the resulting values are written into ``data``
    for ten consecutive periods starting at ``self.time_period``. The
    updated ``data`` is saved through ``self.save_dataset``.

    Args:
        data: Mapping of variable name to ``{period: values}``; it is
            modified in place.
    """
    from policyengine_uk import Microsimulation

    sim = Microsimulation(dataset=self)
    variable_names = (
        "would_evade_tv_licence_fee",
        "would_claim_pc",
        "would_claim_uc",
        "would_claim_child_benefit",
        "main_residential_property_purchased_is_first_home",
        "household_owns_tv",
        "is_higher_earner",
        "attends_private_school",
    )
    first_period = self.time_period
    periods = list(range(first_period, first_period + 10))

    # Clear every holder before calculating anything, so that no variable
    # is computed while another still has stored arrays.
    for name in variable_names:
        holder = sim.get_holder(name)
        holder.delete_arrays()

    for name in variable_names:
        calculated = sim.calculate(name, self.time_period).values
        # The same array is shared by all ten periods.
        data[name] = dict.fromkeys(periods, calculated)

    self.save_dataset(data)


class ReweightedFRS_2022_23(EnhancedFRS):
name = "reweighted_frs_2022_23"
Expand All @@ -60,6 +87,68 @@ class EnhancedFRS_2022_23(EnhancedFRS):
url = "release://PolicyEngine/ukda/1.5.0/enhanced_frs_2022_23.h5"


def reweight(
    original_weights,
    loss_matrix,
    targets_array,
    dropout_rate=0.05,
    epochs=10_000,
    learning_rate=1e-1,
):
    """Adjust weights so that weighted column totals approach the targets.

    The weights are optimised in log space with Adam, so the returned
    weights are always positive. On each epoch a random share of the
    log-weights is replaced by the mean of the remaining ones (dropout)
    before the loss is evaluated.

    Args:
        original_weights: Starting weights, one per row of ``loss_matrix``.
            They are passed through ``np.log``, so non-positive values give
            NaN or infinite starting points.
        loss_matrix: Table exposing ``.values``, with one row per weight and
            one column per target.
        targets_array: Target value for each column of ``loss_matrix``.
        dropout_rate: Probability that each log-weight is replaced by the
            mean of the others on a given epoch; ``0`` disables dropout.
        epochs: Number of optimiser steps to run.
        learning_rate: Learning rate passed to Adam.

    Returns:
        A numpy array holding the adjusted weights.

    Raises:
        ValueError: If NaNs are found in the loss matrix, the weights, the
            estimates or the relative errors.
    """
    from tqdm import trange

    loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32)
    targets_array = torch.tensor(targets_array, dtype=torch.float32)
    weights = torch.tensor(
        np.log(original_weights), requires_grad=True, dtype=torch.float32
    )

    # The loss matrix never changes during optimisation, so it is validated
    # once here rather than on every epoch.
    if torch.isnan(loss_matrix).any():
        raise ValueError("Loss matrix contains NaNs")

    # TODO: replace this with a call to the python reweight.py package.
    def loss(weights):
        if torch.isnan(weights).any():
            raise ValueError("Weights contain NaNs")
        estimate = weights @ loss_matrix
        if torch.isnan(estimate).any():
            raise ValueError("Estimate contains NaNs")
        # Squared relative error. The +1 in the denominator keeps zero
        # targets from dividing by zero; the numerator is the plain
        # difference so the loss is exactly zero when a target is hit.
        rel_error = ((estimate - targets_array) / (targets_array + 1)) ** 2
        if torch.isnan(rel_error).any():
            raise ValueError("Relative error contains NaNs")
        return rel_error.mean()

    def dropout_weights(weights, p):
        if p == 0:
            return weights
        # Replace p% of the weights with the mean value of the rest of them
        mask = torch.rand_like(weights) < p
        mean = weights[~mask].mean()
        masked_weights = weights.clone()
        masked_weights[mask] = mean
        return masked_weights

    optimizer = torch.optim.Adam([weights], lr=learning_rate)
    start_loss = None

    iterator = trange(epochs)
    for _ in iterator:
        optimizer.zero_grad()
        weights_ = dropout_weights(weights, dropout_rate)
        loss_value = loss(torch.exp(weights_))
        current_loss = loss_value.item()
        if start_loss is None:
            start_loss = current_loss
        # A perfect initial fit has zero loss; avoid dividing by it.
        loss_rel_change = (
            (current_loss - start_loss) / start_loss if start_loss else 0.0
        )
        loss_value.backward()
        iterator.set_postfix(
            {"loss": current_loss, "loss_rel_change": loss_rel_change}
        )
        optimizer.step()

    return torch.exp(weights).detach().numpy()


# When this file is run as a script, call generate() on the reweighted
# dataset and then on the enhanced dataset.
if __name__ == "__main__":
    ReweightedFRS_2022_23().generate()
EnhancedFRS_2022_23().generate()
4 changes: 2 additions & 2 deletions policyengine_uk_data/datasets/frs/frs.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,14 +99,14 @@ def add_random_variables(self, frs: dict):

simulation = Microsimulation(dataset=self)
RANDOM_VARIABLES = [
"attends_private_school",
"would_evade_tv_licence_fee",
"would_claim_pc",
"would_claim_uc",
"would_claim_child_benefit",
"main_residential_property_purchased_is_first_home",
"household_owns_tv",
"is_higher_earner",
"attends_private_school",
]
INPUT_PERIODS = list(range(self.time_period, self.time_period + 10))
for variable in RANDOM_VARIABLES:
Expand Down Expand Up @@ -138,7 +138,7 @@ class FRS_2022_23(FRS):
label = "FRS (2022-23)"
file_path = STORAGE_FOLDER / "frs_2022_23.h5"
time_period = 2022
url = "release://PolicyEngine/ukda/1.5.0/frs_2022_23.h5"
url = "release://PolicyEngine/ukda/1.6.0/frs_2022_23.h5"


def add_id_variables(frs: h5py.File, person: DataFrame, household: DataFrame):
Expand Down
1 change: 1 addition & 0 deletions policyengine_uk_data/storage/tax_benefit.csv
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,4 @@ universal_credit_jobseekers,gbp-bn,obr_march_2024_efo,,,,,9.2,11.1,17.4,19.3,19.
universal_credit_non_jobseekers,gbp-bn,obr_march_2024_efo,,,,,33.4,40.7,49.5,55.0,57.9,60.8,67.1,
vat,gbp-bn,obr_march_2024_efo,,,,,159.7,169.6,173.9,180.3,188.0,195.9,204.1,
winter_fuel_allowance,gbp-bn,obr_march_2024_efo,,,,,2.0,2.0,1.9,1.9,1.9,1.9,2.0,
private_school_students,person-k,obr_march_2024_efo,557,557,557,557,557,557,557,557,557,557,557,
20 changes: 14 additions & 6 deletions policyengine_uk_data/utils/loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,9 @@ def pe_count(*variables):
df["obr/vat"] = pe("vat")
df["obr/winter_fuel_allowance"] = pe("winter_fuel_allowance")

# Not strictly from the OBR but from the 2024 Independent Schools Council census. OBR will be using that.
df["obr/private_school_students"] = pe("attends_private_school")

# Population statistics from the ONS.

region = sim.calculate("region", map_to="person")
Expand Down Expand Up @@ -239,15 +242,20 @@ def pe_count(*variables):
return df, combined_targets.value


def get_loss_results(dataset, time_period, reform=None):
def get_loss_results(
dataset, time_period, reform=None, household_weights=None
):
matrix, targets = create_target_matrix(dataset, time_period, reform)
from policyengine_uk import Microsimulation

weights = (
Microsimulation(dataset=dataset, reform=reform)
.calculate("household_weight", time_period)
.values
)
if household_weights is None:
weights = (
Microsimulation(dataset=dataset, reform=reform)
.calculate("household_weight", time_period)
.values
)
else:
weights = household_weights
estimates = weights @ matrix
df = pd.DataFrame(
{
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "policyengine_uk_data"
version = "1.6.0"
version = "1.7.0"
description = "A package to create representative microdata for the UK."
readme = "README.md"
authors = [
Expand Down

0 comments on commit e315fc8

Please sign in to comment.