From dc7e289f2aa3321d69483ec72e96aafb845b8d2c Mon Sep 17 00:00:00 2001 From: = Date: Fri, 2 Aug 2024 12:13:31 -0400 Subject: [PATCH 01/22] Added data processing workflow and (unimplemented) processing script --- .github/workflows/schedule.yaml | 24 ++++++++++++++++++++++++ reweight/logic/process_data.py | 1 + 2 files changed, 25 insertions(+) create mode 100644 .github/workflows/schedule.yaml create mode 100644 reweight/logic/process_data.py diff --git a/.github/workflows/schedule.yaml b/.github/workflows/schedule.yaml new file mode 100644 index 0000000..d57cb6f --- /dev/null +++ b/.github/workflows/schedule.yaml @@ -0,0 +1,24 @@ +name: Scheduled Data Processing + +on: + schedule: + - cron: "0 0 1 * *" # Runs at 00:00 on the first day of every month + push: + branches: [main] # Runs on pushes to the main branch + pull_request: + branches: [main] # Runs on pull requests to the main branch + +jobs: + process_data: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Install dependencies + run: make install + - name: Run data processing script + run: python reweight/logic/process_data.py diff --git a/reweight/logic/process_data.py b/reweight/logic/process_data.py new file mode 100644 index 0000000..e843899 --- /dev/null +++ b/reweight/logic/process_data.py @@ -0,0 +1 @@ +raise NotImplementedError("Data processing function still in development") \ No newline at end of file From 2bd2a5421eb52f6c99ddd85ac7c8e74cc80ccbf7 Mon Sep 17 00:00:00 2001 From: = Date: Fri, 2 Aug 2024 18:10:09 -0400 Subject: [PATCH 02/22] Fixed setup.py installation issues with torch --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index db55313..8bdd301 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ install_requires=[ "numpy<2.0", "pandas", - "torch+cpu", + "torch", "tensorboard", "jupyter-book", "pytest", From 
6367ec8aca81c81072689b52c4871b07aa9ec4b3 Mon Sep 17 00:00:00 2001 From: = Date: Wed, 7 Aug 2024 11:26:18 -0400 Subject: [PATCH 03/22] Added a gitignore to exclude items in root starting with the string test_ --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 712df78..1cf11d0 100644 --- a/.gitignore +++ b/.gitignore @@ -20,4 +20,5 @@ docs/_build # Testing notebooks # ##################### -/*.ipynb \ No newline at end of file +/*.ipynb +/test_* \ No newline at end of file From 8b6e3e73255de7f1248f49dbe77463b0c60eeda3 Mon Sep 17 00:00:00 2001 From: = Date: Wed, 7 Aug 2024 11:27:15 -0400 Subject: [PATCH 04/22] Now ignores CSV files in root --- .gitignore | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 1cf11d0..9bbaef0 100644 --- a/.gitignore +++ b/.gitignore @@ -21,4 +21,8 @@ docs/_build # Testing notebooks # ##################### /*.ipynb -/test_* \ No newline at end of file +/test_* + +# Temporary CSV files # +####################### +/*.csv \ No newline at end of file From e310f491f1015bac81dcfe9b97958bec8f9452f7 Mon Sep 17 00:00:00 2001 From: = Date: Wed, 7 Aug 2024 11:35:53 -0400 Subject: [PATCH 05/22] Wrote a script to process data and post it to the reweight repo --- reweight/logic/process_data.py | 105 ++++++++++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 1 deletion(-) diff --git a/reweight/logic/process_data.py b/reweight/logic/process_data.py index e843899..90f4443 100644 --- a/reweight/logic/process_data.py +++ b/reweight/logic/process_data.py @@ -1 +1,104 @@ -raise NotImplementedError("Data processing function still in development") \ No newline at end of file +import pandas as pd +import numpy as np +import torch +from torch.utils.tensorboard import SummaryWriter +import os +import requests +import base64 + +import policyengine_uk +from policyengine_uk.data import RawFRS_2021_22 +from 
policyengine_uk.data.datasets.frs.calibration.calibrate import generate_model_variables + +from reweight import reweight + +#UK dataframe generation. + +RawFRS_2021_22().download() + +uk_weights_df = pd.DataFrame() + +for year in range(2024, 2029): + ( + household_weights, + weight_adjustment, + values_df, + targets, + targets_array, + equivalisation_factors_array + ) = generate_model_variables("frs_2021", year) + sim_matrix = torch.tensor(values_df.to_numpy(), dtype=torch.float32) + uk_final_weights = reweight(household_weights, sim_matrix, targets, targets_array, epochs=1_000) + uk_weight_series = pd.Series(uk_final_weights.numpy()) + uk_weights_df[str(year)] = uk_weight_series + + +csv_filename = "updated_uk_weights.csv" +uk_weights_df.to_csv(csv_filename) + + +#US dataframe generation. + +import policyengine_us +from policyengine_us.data.datasets.cps.enhanced_cps.loss import generate_model_variables + +us_weights_df = pd.DataFrame() + +for year in range(2024, 2029): + ( + household_weights, + weight_adjustment, + values_df, + targets, + targets_array, + equivalisation_factors_array + ) = generate_model_variables("cps_2021", year) + sim_matrix = torch.tensor(values_df.to_numpy(), dtype=torch.float32) + initial_weights = torch.tensor(household_weights, dtype=torch.float32) + targets_tensor = torch.tensor(targets_array, dtype=torch.float32) + us_final_weights = reweight(initial_weights, sim_matrix, targets, targets_tensor, epochs=1_000) + us_weight_series = pd.Series(us_final_weights.numpy()) + us_weights_df[str(year)] = us_weight_series + +#Now, for testing, save these dataframes as CSV. 
+ +csv_filename = "updated_us_weights.csv" +us_weights_df.to_csv(csv_filename) + +#Now, create a GitHub release + +api_url = 'https://api.github.com/repos/PolicyEngine/reweight/releases' + +owner = 'pmberg' +repo = 'reweight' +token = os.environ.get('GITHUB_TOKEN') + +# Create release +headers = { + 'Authorization': f'token {token}', + 'Accept': 'application/vnd.github.v3+json' +} +release_data = { + 'tag_name': f'v{pd.Timestamp.now().strftime("%Y.%m.%d.%H.%M.%S")}', + 'name': f'Data Release {pd.Timestamp.now().strftime("%Y.%m.%d.%H.%M.%S")}', + 'body': 'Automated data release with updated weights' +} +response = requests.post(api_url.format(owner=owner, repo=repo), headers=headers, json=release_data) +release = response.json() + +# Upload assets +upload_url = release['upload_url'].split('{')[0] + +def upload_file(file_name): + with open(file_name, 'rb') as file: + content = file.read() + headers['Content-Type'] = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' + params = {'name': os.path.basename(file_name)} + response = requests.post(upload_url, headers=headers, params=params, data=content) + if response.status_code == 201: + print(f"File added successfully: {release['html_url']}") + else: + print(f"Failed to add file: {response.content}") + +for file_name in ["updated_uk_weights.csv", "updated_us_weights.csv"]: + upload_file(file_name) \ No newline at end of file From 1849351dfed9e700284ffbafcb6454a9cd541d93 Mon Sep 17 00:00:00 2001 From: = Date: Wed, 7 Aug 2024 11:41:07 -0400 Subject: [PATCH 06/22] Reformatted code --- reweight/logic/process_data.py | 68 +++++++++++++++++++++------------- reweight/logic/reweight.py | 2 +- 2 files changed, 43 insertions(+), 27 deletions(-) diff --git a/reweight/logic/process_data.py b/reweight/logic/process_data.py index 90f4443..57e3151 100644 --- a/reweight/logic/process_data.py +++ b/reweight/logic/process_data.py @@ -8,11 +8,13 @@ import policyengine_uk from policyengine_uk.data import RawFRS_2021_22 
-from policyengine_uk.data.datasets.frs.calibration.calibrate import generate_model_variables +from policyengine_uk.data.datasets.frs.calibration.calibrate import ( + generate_model_variables, +) from reweight import reweight -#UK dataframe generation. +# UK dataframe generation. RawFRS_2021_22().download() @@ -25,10 +27,12 @@ values_df, targets, targets_array, - equivalisation_factors_array + equivalisation_factors_array, ) = generate_model_variables("frs_2021", year) sim_matrix = torch.tensor(values_df.to_numpy(), dtype=torch.float32) - uk_final_weights = reweight(household_weights, sim_matrix, targets, targets_array, epochs=1_000) + uk_final_weights = reweight( + household_weights, sim_matrix, targets, targets_array, epochs=1_000 + ) uk_weight_series = pd.Series(uk_final_weights.numpy()) uk_weights_df[str(year)] = uk_weight_series @@ -37,10 +41,12 @@ uk_weights_df.to_csv(csv_filename) -#US dataframe generation. +# US dataframe generation. import policyengine_us -from policyengine_us.data.datasets.cps.enhanced_cps.loss import generate_model_variables +from policyengine_us.data.datasets.cps.enhanced_cps.loss import ( + generate_model_variables, +) us_weights_df = pd.DataFrame() @@ -51,54 +57,64 @@ values_df, targets, targets_array, - equivalisation_factors_array + equivalisation_factors_array, ) = generate_model_variables("cps_2021", year) sim_matrix = torch.tensor(values_df.to_numpy(), dtype=torch.float32) initial_weights = torch.tensor(household_weights, dtype=torch.float32) targets_tensor = torch.tensor(targets_array, dtype=torch.float32) - us_final_weights = reweight(initial_weights, sim_matrix, targets, targets_tensor, epochs=1_000) + us_final_weights = reweight( + initial_weights, sim_matrix, targets, targets_tensor, epochs=1_000 + ) us_weight_series = pd.Series(us_final_weights.numpy()) us_weights_df[str(year)] = us_weight_series -#Now, for testing, save these dataframes as CSV. +# Now, for testing, save these dataframes as CSV. 
csv_filename = "updated_us_weights.csv" us_weights_df.to_csv(csv_filename) -#Now, create a GitHub release +# Now, create a GitHub release -api_url = 'https://api.github.com/repos/PolicyEngine/reweight/releases' +api_url = "https://api.github.com/repos/PolicyEngine/reweight/releases" -owner = 'pmberg' -repo = 'reweight' -token = os.environ.get('GITHUB_TOKEN') +owner = "pmberg" +repo = "reweight" +token = os.environ.get("GITHUB_TOKEN") # Create release headers = { - 'Authorization': f'token {token}', - 'Accept': 'application/vnd.github.v3+json' + "Authorization": f"token {token}", + "Accept": "application/vnd.github.v3+json", } release_data = { - 'tag_name': f'v{pd.Timestamp.now().strftime("%Y.%m.%d.%H.%M.%S")}', - 'name': f'Data Release {pd.Timestamp.now().strftime("%Y.%m.%d.%H.%M.%S")}', - 'body': 'Automated data release with updated weights' + "tag_name": f'v{pd.Timestamp.now().strftime("%Y.%m.%d.%H.%M.%S")}', + "name": f'Data Release {pd.Timestamp.now().strftime("%Y.%m.%d.%H.%M.%S")}', + "body": "Automated data release with updated weights", } -response = requests.post(api_url.format(owner=owner, repo=repo), headers=headers, json=release_data) +response = requests.post( + api_url.format(owner=owner, repo=repo), headers=headers, json=release_data +) release = response.json() # Upload assets -upload_url = release['upload_url'].split('{')[0] +upload_url = release["upload_url"].split("{")[0] + def upload_file(file_name): - with open(file_name, 'rb') as file: + with open(file_name, "rb") as file: content = file.read() - headers['Content-Type'] = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' - params = {'name': os.path.basename(file_name)} - response = requests.post(upload_url, headers=headers, params=params, data=content) + headers["Content-Type"] = ( + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ) + params = {"name": os.path.basename(file_name)} + response = requests.post( + upload_url, headers=headers, params=params, 
data=content + ) if response.status_code == 201: print(f"File added successfully: {release['html_url']}") else: print(f"Failed to add file: {response.content}") + for file_name in ["updated_uk_weights.csv", "updated_us_weights.csv"]: - upload_file(file_name) \ No newline at end of file + upload_file(file_name) diff --git a/reweight/logic/reweight.py b/reweight/logic/reweight.py index 4227c29..1e48f66 100644 --- a/reweight/logic/reweight.py +++ b/reweight/logic/reweight.py @@ -43,7 +43,7 @@ def reweight( optimizer = torch.optim.Adam([log_weights]) - #Report the initial loss: + # Report the initial loss: targets_estimate = torch.exp(log_weights) @ estimate_matrix # Calculate the loss loss = torch.mean( From 5513fbe487bbde84f26fdddd8dadb5feb3b81984 Mon Sep 17 00:00:00 2001 From: = Date: Wed, 7 Aug 2024 11:50:05 -0400 Subject: [PATCH 07/22] Added Microsimulation lines to process_data --- reweight/logic/process_data.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/reweight/logic/process_data.py b/reweight/logic/process_data.py index 57e3151..57fb7f4 100644 --- a/reweight/logic/process_data.py +++ b/reweight/logic/process_data.py @@ -7,6 +7,7 @@ import base64 import policyengine_uk +from policyengine_uk import Microsimulation from policyengine_uk.data import RawFRS_2021_22 from policyengine_uk.data.datasets.frs.calibration.calibrate import ( generate_model_variables, @@ -15,6 +16,7 @@ from reweight import reweight # UK dataframe generation. 
+sim = Microsimulation() RawFRS_2021_22().download() From 6bfab0220c9a24d52e542cd2558c63402e730fdd Mon Sep 17 00:00:00 2001 From: = Date: Wed, 7 Aug 2024 12:02:02 -0400 Subject: [PATCH 08/22] Reworked env in YAML file --- .github/workflows/schedule.yaml | 4 ++++ reweight/logic/process_data.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/schedule.yaml b/.github/workflows/schedule.yaml index d57cb6f..49bd335 100644 --- a/.github/workflows/schedule.yaml +++ b/.github/workflows/schedule.yaml @@ -22,3 +22,7 @@ jobs: run: make install - name: Run data processing script run: python reweight/logic/process_data.py + env: + POVERTYTRACKER_RAW_URL: ${{ secrets.POVERTYTRACKER_RAW_URL }} + POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN: ${{ secrets.POLICYENGINE_GITHUB_MICRODATA_AUTH_TOKEN}} + API_GITHUB_TOKEN: ${{ secrets.API_GITHUB_TOKEN }} diff --git a/reweight/logic/process_data.py b/reweight/logic/process_data.py index 57fb7f4..5226a46 100644 --- a/reweight/logic/process_data.py +++ b/reweight/logic/process_data.py @@ -81,7 +81,7 @@ owner = "pmberg" repo = "reweight" -token = os.environ.get("GITHUB_TOKEN") +token = os.environ.get("API_GITHUB_TOKEN") # Create release headers = { From bb40b673bfd5b27046c849197f8fcc9b37e2eedf Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Tue, 13 Aug 2024 14:09:21 +0100 Subject: [PATCH 09/22] Add sketch of condensed code --- reweight/logic/process_data.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/reweight/logic/process_data.py b/reweight/logic/process_data.py index 5226a46..9369ab4 100644 --- a/reweight/logic/process_data.py +++ b/reweight/logic/process_data.py @@ -13,6 +13,18 @@ generate_model_variables, ) +def calibrate_country_weights( + household_weights, loss_matrix, target_labels, target_values, epochs +) -> pd.DataFrame: + pass + + +uk_inputs = ... +us_inputs = ... 
+ +calibrate_country_weights(*uk_inputs) +calibrate_country_weights(*us_inputs) + from reweight import reweight # UK dataframe generation. From a96ac6a0797ff51f362d7fa2038b6d3c78562336 Mon Sep 17 00:00:00 2001 From: = Date: Tue, 13 Aug 2024 10:40:55 -0400 Subject: [PATCH 10/22] Refactored process_data, splitting repeated code into two functions. --- reweight/logic/process_data.py | 98 +++++++++++++--------------------- 1 file changed, 38 insertions(+), 60 deletions(-) diff --git a/reweight/logic/process_data.py b/reweight/logic/process_data.py index 9369ab4..3a463f0 100644 --- a/reweight/logic/process_data.py +++ b/reweight/logic/process_data.py @@ -9,32 +9,23 @@ import policyengine_uk from policyengine_uk import Microsimulation from policyengine_uk.data import RawFRS_2021_22 -from policyengine_uk.data.datasets.frs.calibration.calibrate import ( - generate_model_variables, -) - -def calibrate_country_weights( - household_weights, loss_matrix, target_labels, target_values, epochs -) -> pd.DataFrame: - pass - - -uk_inputs = ... -us_inputs = ... +from policyengine_uk.data.datasets.frs.calibration.calibrate import generate_model_variables as uk_generate -calibrate_country_weights(*uk_inputs) -calibrate_country_weights(*us_inputs) +import policyengine_us +from policyengine_us.data.datasets.cps.enhanced_cps.loss import generate_model_variables as us_generate from reweight import reweight -# UK dataframe generation. -sim = Microsimulation() - -RawFRS_2021_22().download() - -uk_weights_df = pd.DataFrame() +def generate_country_weights(year, data_source, generate_func): + """ + Parameters: + year (int): The year for which these country values are generated. + data_source (str): The name of the data source for that country. + generate_func (function): The function used to generate the initial values. -for year in range(2024, 2029): + Returns: + final_weights (torch.Tensor): a PyTorch tensor of final reweighted weights. 
+ """ ( household_weights, weight_adjustment, @@ -42,50 +33,37 @@ def calibrate_country_weights( targets, targets_array, equivalisation_factors_array, - ) = generate_model_variables("frs_2021", year) - sim_matrix = torch.tensor(values_df.to_numpy(), dtype=torch.float32) - uk_final_weights = reweight( - household_weights, sim_matrix, targets, targets_array, epochs=1_000 - ) - uk_weight_series = pd.Series(uk_final_weights.numpy()) - uk_weights_df[str(year)] = uk_weight_series - - -csv_filename = "updated_uk_weights.csv" -uk_weights_df.to_csv(csv_filename) - - -# US dataframe generation. - -import policyengine_us -from policyengine_us.data.datasets.cps.enhanced_cps.loss import ( - generate_model_variables, -) - -us_weights_df = pd.DataFrame() - -for year in range(2024, 2029): - ( - household_weights, - weight_adjustment, - values_df, - targets, - targets_array, - equivalisation_factors_array, - ) = generate_model_variables("cps_2021", year) + ) = generate_func(data_source, year) sim_matrix = torch.tensor(values_df.to_numpy(), dtype=torch.float32) initial_weights = torch.tensor(household_weights, dtype=torch.float32) targets_tensor = torch.tensor(targets_array, dtype=torch.float32) - us_final_weights = reweight( + final_weights = reweight( initial_weights, sim_matrix, targets, targets_tensor, epochs=1_000 ) - us_weight_series = pd.Series(us_final_weights.numpy()) - us_weights_df[str(year)] = us_weight_series + return final_weights + +def generate_country_csv(start_year, end_year, data_source, generate_func, csv_filename): + """ + Parameters: + start_year (int): The year for which these country values start generating (inclusive). + end_year (int): The year for which these country values stop generating (non-inclusive). + data_source (str): The name of the data source for that country. + generate_func (function): The function used to generate the initial values. + csv_filename (str): The name of the file which the generated data are saved under. + + Returns: + None. 
Generates and saves a CSV file of reweighted weights. + """ + weights_df = pd.DataFrame() + for year in range(start_year, end_year): + final_weights = generate_country_weights(year, data_source, generate_func) + weight_series = pd.Series(final_weights.numpy()) + weights_df[str(year)] = weight_series + weights_df.to_csv(csv_filename) -# Now, for testing, save these dataframes as CSV. - -csv_filename = "updated_us_weights.csv" -us_weights_df.to_csv(csv_filename) +RawFRS_2021_22().download() +generate_country_csv(2024, 2029, "frs_2021", uk_generate, "updated_uk_weights.csv") +generate_country_csv(2024, 2029, "cps_2021", us_generate, "updated_us_weights.csv") # Now, create a GitHub release @@ -109,7 +87,7 @@ def calibrate_country_weights( api_url.format(owner=owner, repo=repo), headers=headers, json=release_data ) release = response.json() - +print(release) # Upload assets upload_url = release["upload_url"].split("{")[0] From 78be5c37b39f3c0b65ae84200985e468810d37bf Mon Sep 17 00:00:00 2001 From: = Date: Tue, 13 Aug 2024 11:19:51 -0400 Subject: [PATCH 11/22] Reformatted process_data --- reweight/logic/process_data.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/reweight/logic/process_data.py b/reweight/logic/process_data.py index 3a463f0..ec71c5e 100644 --- a/reweight/logic/process_data.py +++ b/reweight/logic/process_data.py @@ -9,13 +9,18 @@ import policyengine_uk from policyengine_uk import Microsimulation from policyengine_uk.data import RawFRS_2021_22 -from policyengine_uk.data.datasets.frs.calibration.calibrate import generate_model_variables as uk_generate +from policyengine_uk.data.datasets.frs.calibration.calibrate import ( + generate_model_variables as uk_generate, +) import policyengine_us -from policyengine_us.data.datasets.cps.enhanced_cps.loss import generate_model_variables as us_generate +from policyengine_us.data.datasets.cps.enhanced_cps.loss import ( + generate_model_variables as us_generate, +) from 
reweight import reweight + def generate_country_weights(year, data_source, generate_func): """ Parameters: @@ -42,7 +47,10 @@ def generate_country_weights(year, data_source, generate_func): ) return final_weights -def generate_country_csv(start_year, end_year, data_source, generate_func, csv_filename): + +def generate_country_csv( + start_year, end_year, data_source, generate_func, csv_filename +): """ Parameters: start_year (int): The year for which these country values start generating (inclusive). @@ -56,14 +64,21 @@ def generate_country_csv(start_year, end_year, data_source, generate_func, csv_f """ weights_df = pd.DataFrame() for year in range(start_year, end_year): - final_weights = generate_country_weights(year, data_source, generate_func) + final_weights = generate_country_weights( + year, data_source, generate_func + ) weight_series = pd.Series(final_weights.numpy()) weights_df[str(year)] = weight_series weights_df.to_csv(csv_filename) + RawFRS_2021_22().download() -generate_country_csv(2024, 2029, "frs_2021", uk_generate, "updated_uk_weights.csv") -generate_country_csv(2024, 2029, "cps_2021", us_generate, "updated_us_weights.csv") +generate_country_csv( + 2024, 2029, "frs_2021", uk_generate, "updated_uk_weights.csv" +) +generate_country_csv( + 2024, 2029, "cps_2021", us_generate, "updated_us_weights.csv" +) # Now, create a GitHub release From 9e1120d9a0ea61cb0893d630ee80afe5f7b2f763 Mon Sep 17 00:00:00 2001 From: = Date: Tue, 13 Aug 2024 14:00:18 -0400 Subject: [PATCH 12/22] Update reweight From ea5169fb269fa5d6456af33b456b60c913b5cd3e Mon Sep 17 00:00:00 2001 From: = Date: Wed, 14 Aug 2024 11:03:51 -0400 Subject: [PATCH 13/22] Added scripts for PyPI publication. 
--- .github/workflows/publish.yaml | 26 ++++++++++++++++++++++++++ .github/workflows/push.yaml | 24 ------------------------ pyproject.toml | 23 +++++++++++++++++++++++ setup.py | 2 ++ 4 files changed, 51 insertions(+), 24 deletions(-) create mode 100644 .github/workflows/publish.yaml create mode 100644 pyproject.toml diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml new file mode 100644 index 0000000..b7065d9 --- /dev/null +++ b/.github/workflows/publish.yaml @@ -0,0 +1,26 @@ +name: Publish to PyPI.org +on: + release: + types: [published] +jobs: + pypi: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v3 + with: + fetch-depth: 0 + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Install package + run: make install + - name: Build package + run: make + - name: Publish a Python distribution to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + user: __token__ + password: ${{ secrets.PYPI }} + skip-existing: true diff --git a/.github/workflows/push.yaml b/.github/workflows/push.yaml index 2a83ac8..6c1b3f8 100755 --- a/.github/workflows/push.yaml +++ b/.github/workflows/push.yaml @@ -47,27 +47,3 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} BRANCH: gh-pages FOLDER: docs/_build/html - Publish: - runs-on: ubuntu-latest - if: | - (github.repository == 'PolicyEngine/reweight') - && (github.event.head_commit.message == 'Update reweight') - steps: - - name: Checkout repo - uses: actions/checkout@v3 - - name: Setup Python - uses: actions/setup-python@v4 - with: - python-version: 3.9 - - name: Publish a git tag - run: ".github/publish-git-tag.sh || true" - - name: Install package - run: make install - - name: Build package - run: make - - name: Publish a Python distribution to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - user: __token__ - password: ${{ secrets.PYPI }} - skip-existing: true diff --git a/pyproject.toml b/pyproject.toml new 
file mode 100644 index 0000000..a9cabbd --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,23 @@ +[project] +name = "policyengine-reweight" +authors = [ + { name="Peter Berggren", email="berggrenpeterm@gmail.com" }, +] +description = "Reweighting package for survey weights in PolicyEngine" +readme = "README.md" +requires-python = ">=3.9" +classifiers = [ + "Programming Language :: Python :: 3", + License :: OSI Approved :: GNU Affero General Public License v3 + "Operating System :: OS Independent", +] + +[project.urls] +Homepage = "https://github.com/PolicyEngine/reweight" +Issues = "https://github.com/PolicyEngine/reweight/issues" + +[build-system] +requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"] +build-backend = "setuptools.build_meta" + +[tool.setuptools_scm] \ No newline at end of file diff --git a/setup.py b/setup.py index 8bdd301..adea0ec 100644 --- a/setup.py +++ b/setup.py @@ -41,6 +41,8 @@ "dev": [ "black", "yaml-changelog", + "setuptools", + "setuptools_scm", ], }, # Windows CI requires Python 3.9. 
From 3b6f0ab342ed63f0b494bb5cec481d96cce9e69e Mon Sep 17 00:00:00 2001 From: = Date: Wed, 14 Aug 2024 11:07:22 -0400 Subject: [PATCH 14/22] Fixed pyproject.toml typo --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a9cabbd..bd04cb5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ readme = "README.md" requires-python = ">=3.9" classifiers = [ "Programming Language :: Python :: 3", - License :: OSI Approved :: GNU Affero General Public License v3 + "License :: OSI Approved :: GNU Affero General Public License v3", "Operating System :: OS Independent", ] From dfc69af1a754b7b6d9266b1ce59999154967457b Mon Sep 17 00:00:00 2001 From: = Date: Wed, 14 Aug 2024 11:11:27 -0400 Subject: [PATCH 15/22] Removed excess information from pyproject.toml --- pyproject.toml | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bd04cb5..76b7319 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,21 +1,3 @@ -[project] -name = "policyengine-reweight" -authors = [ - { name="Peter Berggren", email="berggrenpeterm@gmail.com" }, -] -description = "Reweighting package for survey weights in PolicyEngine" -readme = "README.md" -requires-python = ">=3.9" -classifiers = [ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: GNU Affero General Public License v3", - "Operating System :: OS Independent", -] - -[project.urls] -Homepage = "https://github.com/PolicyEngine/reweight" -Issues = "https://github.com/PolicyEngine/reweight/issues" - [build-system] requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"] build-backend = "setuptools.build_meta" From 44c57a6f85baa05d2a7dbb185f93c4b476f48877 Mon Sep 17 00:00:00 2001 From: = Date: Wed, 14 Aug 2024 11:13:00 -0400 Subject: [PATCH 16/22] Added manual activation to the publish.yaml action --- .github/workflows/publish.yaml | 1 + 1 file changed, 1 insertion(+) diff --git 
a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index b7065d9..8ccc488 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -2,6 +2,7 @@ name: Publish to PyPI.org on: release: types: [published] + workflow-dispatch: jobs: pypi: runs-on: ubuntu-latest From 6afdc4848af8f378286454f09d8dda888e8a8d0b Mon Sep 17 00:00:00 2001 From: = Date: Wed, 14 Aug 2024 11:15:12 -0400 Subject: [PATCH 17/22] Fixed workflow-dispatch --- .github/workflows/publish.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 8ccc488..39604b4 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -3,6 +3,11 @@ on: release: types: [published] workflow-dispatch: + inputs: + reason: + description: "Reason for manual trigger" + required: true + default: "Testing workflow" jobs: pypi: runs-on: ubuntu-latest From d444456659451ab6421be367d0b2eef7084b874d Mon Sep 17 00:00:00 2001 From: = Date: Wed, 14 Aug 2024 11:16:20 -0400 Subject: [PATCH 18/22] Fixed workflow_dispatch --- .github/workflows/publish.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 39604b4..e641b44 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -2,7 +2,7 @@ name: Publish to PyPI.org on: release: types: [published] - workflow-dispatch: + workflow_dispatch: inputs: reason: description: "Reason for manual trigger" From b4e6f937c1a705061be9a799fa606844eada73a0 Mon Sep 17 00:00:00 2001 From: = Date: Thu, 15 Aug 2024 13:43:19 -0400 Subject: [PATCH 19/22] Added new notebooks to documentation --- docs/_toc.yml | 2 + docs/features/process_data.ipynb | 38 +++++ docs/features/reweight.ipynb | 233 +++++++++++++++++++++++++++++++ docs/index.md | 2 +- 4 files changed, 274 insertions(+), 1 deletion(-) create mode 100644 docs/features/process_data.ipynb create mode 100644 
docs/features/reweight.ipynb diff --git a/docs/_toc.yml b/docs/_toc.yml index 551492a..75f2401 100644 --- a/docs/_toc.yml +++ b/docs/_toc.yml @@ -2,5 +2,7 @@ format: jb-book root: index.md chapters: - file: current-features + - file: features/reweight + - file: features/process_data - file: testing_notebooks/us-notebook - file: testing_notebooks/uk-notebook diff --git a/docs/features/process_data.ipynb b/docs/features/process_data.ipynb new file mode 100644 index 0000000..87d9646 --- /dev/null +++ b/docs/features/process_data.ipynb @@ -0,0 +1,38 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# The process_data file\n", + "\n", + "The process_data file in this repo is used to process, once monthly, survey data from PolicyEngine UK and PolicyEngine US, using the reweight function.\n", + "\n", + "## generate_country_weights\n", + "\n", + "This is a helper function that uses reweight to generate optimized weights for a specific country and year.\n", + "\n", + "## generate_country_csv\n", + "\n", + "This is a helper function that generates optimized weights for a country over multiple years, and then saves these weights as a CSV file.\n", + "\n", + "## Main body of code\n", + "\n", + "First, `generate_country_csv` is used to generate weights files for both the UK and the US. Then, a GitHub release is generated on the reweight repo, to which the two CSV files are uploaded with a simple helper function called `upload_file`.\n", + "\n", + "## Notes\n", + "\n", + "If you're developing on this, replace \"pmberg\" with your username, and make an environment variable titled API_GITHUB_TOKEN containing an appropriate GitHub API token.\n", + "\n", + "Also, the UK data sources are not publicly available, so if you're developing on this, you need authorization to get an API key that works with them. If you lack the necessary permissions at any stage, the code will not run." 
+ ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/features/reweight.ipynb b/docs/features/reweight.ipynb new file mode 100644 index 0000000..5c31447 --- /dev/null +++ b/docs/features/reweight.ipynb @@ -0,0 +1,233 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Reweighting Function Documentation\n", + "\n", + "## Purpose\n", + "\n", + "This notebook documents a Python function `reweight` that adjusts a set of initial weights to better match target statistics. It's particularly useful for calibrating survey data weights in microsimulation models, such as those used in PolicyEngine UK.\n", + "\n", + "## Import Required Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import torch\n", + "from torch.utils.tensorboard import SummaryWriter" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Function Definition and Overview\n", + "\n", + "The reweight function uses an optimization process to adjust initial weights so that the weighted sum of estimates more closely matches a set of target values.\n", + "### Parameters\n", + "\n", + "`initial_weights (torch.Tensor):` Initial weights for survey data.\n", + "\n", + "`estimate_matrix (torch.Tensor):` Matrix of estimates from a microsimulation model.\n", + "\n", + "`target_names (iterable):` Names of target statistics (not used in the function body).\n", + "\n", + "`target_values (torch.Tensor):` Values of target statistics to match.\n", + "\n", + "`epochs (int, optional):` Number of optimization iterations. Default is 1000.\n", + "\n", + "`epoch_step (int, optional):` Interval for printing loss during optimization. Default is 100.\n", + "\n", + "### Returns\n", + "\n", + "`final_weights (torch.Tensor):` Adjusted weights after optimization." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def reweight(\n", + " initial_weights,\n", + " estimate_matrix,\n", + " target_names,\n", + " target_values,\n", + " epochs=1000,\n", + " epoch_step=100,\n", + "):\n", + " \"\"\"\n", + " Main reweighting function, suitable for PolicyEngine UK use (PolicyEngine US use and testing TK)\n", + "\n", + " To avoid the need for equivalisation factors, use relative error:\n", + " |predicted - actual|/actual\n", + "\n", + " Parameters:\n", + " initial_weights (torch.Tensor): The initial weights given to survey data, which are to be\n", + " adjusted by this function.\n", + " estimate_matrix (torch.Tensor): A large matrix of estimates, obtained from e.g. a PolicyEngine\n", + " Microsimulation instance.\n", + " target_names (iterable): The names of a set of target statistics treated as ground truth.\n", + " target_values (torch.Tensor): The values of these target statistics.\n", + " epochs: The number of iterations that the optimization loop should run for.\n", + " epoch_step: The interval at which to print the loss during the optimization loop.\n", + "\n", + " Returns:\n", + " final_weights: a reweighted set of household weights, obtained through an optimization process\n", + " over mean squared errors with respect to the target values.\n", + " \"\"\"\n", + " # Initialize a TensorBoard writer\n", + " writer = SummaryWriter()\n", + "\n", + " # Create a Torch tensor of log weights\n", + " log_weights = torch.log(initial_weights)\n", + " log_weights.requires_grad_()\n", + "\n", + " # estimate_matrix (cross) exp(log_weights) = target_values\n", + "\n", + " optimizer = torch.optim.Adam([log_weights])\n", + "\n", + " # Report the initial loss:\n", + " targets_estimate = torch.exp(log_weights) @ estimate_matrix\n", + " # Calculate the loss\n", + " loss = torch.mean(\n", + " ((targets_estimate - target_values) / target_values) ** 2\n", + " )\n", + " print(f\"Initial loss: 
{loss.item()}\")\n", + "\n", + " # Training loop\n", + " for epoch in range(epochs):\n", + "\n", + " # Estimate the targets\n", + " targets_estimate = torch.exp(log_weights) @ estimate_matrix\n", + " # Calculate the loss\n", + " loss = torch.mean(\n", + " ((targets_estimate - target_values) / target_values) ** 2\n", + " )\n", + "\n", + " writer.add_scalar(\"Loss/train\", loss, epoch)\n", + "\n", + " optimizer.zero_grad()\n", + "\n", + " # Perform backpropagation\n", + " loss.backward()\n", + "\n", + " # Update weights\n", + " optimizer.step()\n", + "\n", + " # Print loss whenever the epoch number, when one-indexed, is divisible by epoch_step\n", + " if (epoch + 1) % epoch_step == 0:\n", + " print(f\"Epoch {epoch+1}, Loss: {loss.item()}\")\n", + "\n", + " writer.flush()\n", + "\n", + " return torch.exp(log_weights.detach())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Usage Example\n", + "Here's how you might use the reweight function:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial loss: 0.14120370149612427\n", + "Epoch 100, Loss: 0.06793717294931412\n", + "Epoch 200, Loss: 0.03280560299754143\n", + "Epoch 300, Loss: 0.016901666298508644\n", + "Epoch 400, Loss: 0.010035503655672073\n", + "Epoch 500, Loss: 0.007239286322146654\n", + "Epoch 600, Loss: 0.0061649903655052185\n", + "Epoch 700, Loss: 0.005761378910392523\n", + "Epoch 800, Loss: 0.0055924332700669765\n", + "Epoch 900, Loss: 0.005493843927979469\n", + "Epoch 1000, Loss: 0.005410326179116964\n", + "Final weights: tensor([0.7894, 0.7471, 0.7306, 0.7218, 0.7163])\n" + ] + } + ], + "source": [ + "# Prepare your data as PyTorch tensors\n", + "initial_weights = torch.tensor([1.0, 1.0, 1.0, 1.0, 1.0])\n", + "estimate_matrix = torch.tensor([\n", + " [1.0, 2.0, 3.0],\n", + " [2.0, 3.0, 4.0],\n", + " [3.0, 4.0, 5.0],\n", + " [4.0, 5.0, 6.0],\n", + " [5.0, 6.0, 
7.0]\n", + "])\n", + "target_names = [\"Stat1\", \"Stat2\", \"Stat3\"]\n", + "target_values = torch.tensor([10.0, 15.0, 20.0])\n", + "\n", + "# Call the function\n", + "final_weights = reweight(\n", + " initial_weights,\n", + " estimate_matrix,\n", + " target_names,\n", + " target_values,\n", + " epochs=1000,\n", + " epoch_step=100\n", + ")\n", + "\n", + "print(\"Final weights:\", final_weights)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Important Notes\n", + "\n", + "* The function uses relative error (|predicted - actual|/actual) for optimization, avoiding the need for equivalisation factors.\n", + "\n", + "* It utilizes TensorBoard for logging the loss during training.\n", + "\n", + "* The optimization process uses the Adam optimizer and performs gradient descent on the log of the weights.\n", + "\n", + "## Warning\n", + "\n", + "This function expects input data in the form of PyTorch tensors. Using data in any other format (e.g., NumPy arrays, Pandas DataFrames) without converting to PyTorch tensors first will result in errors. Make sure to convert your input data to PyTorch tensors before passing them to the function." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "policyengine", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/index.md b/docs/index.md index e7a2981..89f3a2b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,4 +2,4 @@ The PolicyEngine reweight library is a library intended to reweight survey data based on known ground truth statistics, to adjust for sampling biases. This library is designed for use with the [PolicyEngine](https://policyengine.org) software packages. 
-Currently, this library is still very much a work in progress, and lacks e.g. systematic functions for the reweighting code, and the ability to reweight any survey data not already converted to PyTorch tensors. +Currently, this library is still very much a work in progress, and lacks e.g. a coherent versioning system, and the ability to reweight any survey data outside PolicyEngine UK or PolicyEngine US. From fce6e817874db68e2640cab0dd327f4f8d711ae9 Mon Sep 17 00:00:00 2001 From: = Date: Thu, 15 Aug 2024 13:47:22 -0400 Subject: [PATCH 20/22] Updated README files --- README.md | 2 +- reweight/README.md | 2 +- reweight/data/README.md | 2 +- reweight/logic/README.md | 2 +- reweight/tests/README.md | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 4594dda..d2da5ec 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,3 @@ # reweight -This library will contain logic for consistently reweighting survey data across the PolicyEngine simulation sofware. \ No newline at end of file +This library is used for consistently reweighting survey data across the PolicyEngine simulation software, and includes both a function called `reweight` and a script called `process_data.py` that is used to run `reweight` on PolicyEngine data. diff --git a/reweight/README.md b/reweight/README.md index 8aacc51..d1b38f9 100644 --- a/reweight/README.md +++ b/reweight/README.md @@ -1,3 +1,3 @@ # Main codebase -This directory will contain the main codebase for the reweight library for PolicyEngine. \ No newline at end of file +This directory contains the main codebase for the reweight library for PolicyEngine, including logic, testing, and any datasets that will be used in the repo in the future. 
diff --git a/reweight/data/README.md b/reweight/data/README.md index 52ee343..ced9db0 100644 --- a/reweight/data/README.md +++ b/reweight/data/README.md @@ -1,3 +1,3 @@ # Data -This directory will contain datasets used in testing and implementing the reweight library for PolicyEngine. \ No newline at end of file +This directory will contain any datasets that will be used in testing and implementing the reweight library for PolicyEngine. diff --git a/reweight/logic/README.md b/reweight/logic/README.md index e5eb334..1162128 100644 --- a/reweight/logic/README.md +++ b/reweight/logic/README.md @@ -1,3 +1,3 @@ # Main codebase -This directory will contain logic used by the reweight library for PolicyEngine. \ No newline at end of file +This directory contains logic used by the reweight library for PolicyEngine, including the `reweight.py` reweighting function and the `process_data.py` automated data generation script. diff --git a/reweight/tests/README.md b/reweight/tests/README.md index 8aacc51..48e93e2 100644 --- a/reweight/tests/README.md +++ b/reweight/tests/README.md @@ -1,3 +1,3 @@ -# Main codebase +# Tests -This directory will contain the main codebase for the reweight library for PolicyEngine. \ No newline at end of file +This directory contains tests for the reweight library for PolicyEngine, including tests of both helper functions and installation sequences. 
From 06d481b9d071c2d486c6c08c8a9327f22078c982 Mon Sep 17 00:00:00 2001 From: = Date: Fri, 16 Aug 2024 14:35:21 -0400 Subject: [PATCH 21/22] Reworked setup scripts to match PolicyEngine format --- pyproject.toml | 5 ----- setup.py | 6 +++--- 2 files changed, 3 insertions(+), 8 deletions(-) delete mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 76b7319..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,5 +0,0 @@ -[build-system] -requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"] -build-backend = "setuptools.build_meta" - -[tool.setuptools_scm] \ No newline at end of file diff --git a/setup.py b/setup.py index adea0ec..d671a8c 100644 --- a/setup.py +++ b/setup.py @@ -6,14 +6,14 @@ readme = readme_file.read() setup( - name="reweight", - version="0.3.0", + name="policyengine_reweight", + version="0.4.0", author="PolicyEngine", author_email="hello@policyengine.org", long_description=readme, long_description_content_type="text/markdown", classifiers=[ - "Development Status :: 1 - Planning", + "Development Status :: 4 - Beta", "License :: OSI Approved :: GNU Affero General Public License v3", "Operating System :: POSIX", "Programming Language :: Python", From ce4d7dba73a9f4af5af0416df081069a2faaa0cf Mon Sep 17 00:00:00 2001 From: = Date: Fri, 16 Aug 2024 14:38:08 -0400 Subject: [PATCH 22/22] Fixed a typo in setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d671a8c..8b6e3bc 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ readme = readme_file.read() setup( - name="policyengine_reweight", + name="policyengine-reweight", version="0.4.0", author="PolicyEngine", author_email="hello@policyengine.org",