diff --git a/CHANGELOG.md b/CHANGELOG.md index bb5738e..6828c19 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +# 2.2.5 (11/7/2024) +- fix: hot fixes for the extrapolation step + using the presidential margins to infer a ticket splitting estimate in each house / senate race [#140](https://github.com/washingtonpost/elex-live-model/pull/140) + # 2.2.4 (11/5/2024) - fix: truncation can fail catastrophically when % reporting is too low [#138](https://github.com/washingtonpost/elex-live-model/pull/138) diff --git a/setup.py b/setup.py index dc51a7e..7c4fc8b 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ LONG_DESCRIPTION = f.read() # The full version, including alpha/beta/rc tags -RELEASE = "2.2.4" +RELEASE = "2.2.5" # The short X.Y version VERSION = ".".join(RELEASE.split(".")[:2]) diff --git a/src/elexmodel/client.py b/src/elexmodel/client.py index 94d9243..7895abe 100644 --- a/src/elexmodel/client.py +++ b/src/elexmodel/client.py @@ -1,4 +1,5 @@ from collections import defaultdict +from io import StringIO import numpy as np import pandas as pd @@ -339,6 +340,31 @@ def get_estimates( versioned_data_handler = None else: versioned_data_handler = None + + if model_parameters.get("correct_from_presidential", False): + s3_client = s3.S3CsvUtil(TARGET_BUCKET) + baseline_path = f"{S3_FILE_PATH}/{self.election_id}/data/P/data_county.csv" + results_path = f"{S3_FILE_PATH}/{self.election_id}/results/P/county/current.csv" + predictions_path = f"{S3_FILE_PATH}/{self.election_id}/predictions/P/county/unit_data/current.csv" + pres_baseline = pd.read_csv(StringIO(s3_client.get(baseline_path)), dtype={"geographic_unit_fips": str}) + pres_baseline["baseline_normalized_margin"] = (pres_baseline.baseline_dem - pres_baseline.baseline_gop) / ( + pres_baseline.baseline_dem + pres_baseline.baseline_gop + ) + pres_results = pd.read_csv(StringIO(s3_client.get(results_path)), dtype={"geographic_unit_fips": str}) + pres_predictions = pd.read_csv( + StringIO(s3_client.get(predictions_path)), dtype={"geographic_unit_fips": str} + ) + pres_predictions = pres_predictions.merge( + pres_results[["geographic_unit_fips", "results_weights"]], on="geographic_unit_fips", how="left" + ) + pres_predictions = pres_predictions.merge( + pres_baseline[["geographic_unit_fips", "baseline_normalized_margin"]], + on="geographic_unit_fips", + how="left", + ) + else: + pres_predictions = None + LOG.info("Running model for %s", self.election_id) LOG.info( "Model parameters: \n prediction intervals: %s, percent reporting threshold: %s, \ @@ -359,7 +385,9 @@ def get_estimates( self.model = GaussianElectionModel(model_settings=model_settings) elif pi_method == "bootstrap": self.model = BootstrapElectionModel( - model_settings=model_settings, versioned_data_handler=versioned_data_handler + model_settings=model_settings, + versioned_data_handler=versioned_data_handler, + pres_predictions=pres_predictions, ) minimum_reporting_units_max = 0 diff --git a/src/elexmodel/handlers/data/VersionedData.py b/src/elexmodel/handlers/data/VersionedData.py index 231bfc2..b76a55d 100644 --- a/src/elexmodel/handlers/data/VersionedData.py +++ b/src/elexmodel/handlers/data/VersionedData.py @@ -117,7 +117,11 @@ def compute_estimated_margin(df): # because the AP adjusted its model after the fact. We correct for this here. # we recompute the percent_expected_vote using the last reported value as the max perc_expected_vote_corr = np.divide( - results_turnout, results_turnout[-1], out=np.zeros_like(results_turnout), where=results_turnout[-1] != 0 + results_turnout, + results_turnout[-1], + out=np.zeros_like(results_turnout), + where=results_turnout[-1] != 0, + casting="unsafe", ) # check if perc_expected_vote_corr is monotone increasing (if not, give up and don't try to estimate a margin) @@ -190,7 +194,7 @@ def compute_estimated_margin(df): est_margins = observed_norm_margin * observed_vote + observed_batch_margin * (percs - observed_vote) est_margins = np.divide( - est_margins, percs, where=percs != 0, out=np.zeros_like(est_margins) + est_margins, percs, where=percs != 0, out=np.zeros_like(est_margins), casting="unsafe" ) # Handle div-by-zero # Return a DataFrame with the multi-index (geographic_unit_fips, perc) diff --git a/src/elexmodel/models/BootstrapElectionModel.py b/src/elexmodel/models/BootstrapElectionModel.py index 00ac36a..59f3e1f 100644 --- a/src/elexmodel/models/BootstrapElectionModel.py +++ b/src/elexmodel/models/BootstrapElectionModel.py @@ -52,7 +52,7 @@ class BootstrapElectionModel(BaseElectionModel): and the epsilons are contest (state/district) level random effects. """ - def __init__(self, model_settings={}, versioned_data_handler=None): + def __init__(self, model_settings={}, versioned_data_handler=None, pres_predictions=None): super().__init__(model_settings) self.B = model_settings.get("B", 500) # number of bootstrap samples self.strata = model_settings.get("strata", ["county_classification"]) # columns to stratify the data by @@ -61,6 +61,7 @@ def __init__(self, model_settings={}, versioned_data_handler=None): "agg_model_hard_threshold", True ) # use sigmoid or hard thresold when calculating agg model self.district_election = model_settings.get("district_election", False) + self.lambda_ = model_settings.get("lambda_", None) # regularization parameter for OLS # save versioned data for later use @@ -70,6 +71,10 @@ def __init__(self, model_settings={}, versioned_data_handler=None): self.extrapolate_std_method = model_settings.get("extrapolate_std_method", "std") self.max_dist_to_observed = model_settings.get("max_dist_to_observed", 5) + # save presidenial predictions for later use + self.pres_predictions = pres_predictions + self.correct_from_presidential = model_settings.get("correct_from_presidential", False) + # upper and lower bounds for the quantile regression which define the strata distributions # these make sure that we can control the worst cases for the distributions in case we # haven't seen enough data ayet @@ -1283,6 +1288,49 @@ def compute_bootstrap_errors( extrap_filter ] + if self.correct_from_presidential: + nonreporting_units["geographic_unit_fips_p"] = nonreporting_units.geographic_unit_fips.apply( + lambda x: x.split("_")[1] + ) + nonreporting_units = nonreporting_units.merge( + self.pres_predictions, + left_on="geographic_unit_fips_p", + right_on="geographic_unit_fips", + how="left", + suffixes=("", "_pres"), + ) + + # adjust results_normalized_margin_pres to account for split counties + + nonreporting_units["margin_adj"] = ( + nonreporting_units.baseline_normalized_margin - nonreporting_units.baseline_normalized_margin_pres + ) + + nonreporting_units["results_normalized_margin_pres"] = ( + nonreporting_units.results_margin_pres / nonreporting_units.results_weights_pres + + nonreporting_units.margin_adj + ) + nonreporting_units["pred_normalized_margin_pres"] = ( + nonreporting_units.pred_margin / nonreporting_units.pred_turnout + nonreporting_units.margin_adj + ) + + nonreporting_units["pred_normalized_margin"] = np.mean( + y_test_pred_B.clip(min=y_partial_reporting_lower, max=y_partial_reporting_upper), axis=1 + ) + + nonreporting_units["margin_gap"] = ( + nonreporting_units.results_normalized_margin - nonreporting_units.results_normalized_margin_pres + ) + + nonreporting_units["pred_normalized_margin_new"] = ( + nonreporting_units.pred_normalized_margin_pres + nonreporting_units.margin_gap + ) + adjustment = ( + nonreporting_units["pred_normalized_margin_new"].values + - nonreporting_units["pred_normalized_margin"].values + ) + y_test_pred_B[~np.isnan(adjustment)] += adjustment[~np.isnan(adjustment)].reshape(-1, 1) + y_test_pred_B = y_test_pred_B.clip(min=y_partial_reporting_lower, max=y_partial_reporting_upper) # \tilde{y_i}^{b} * \tilde{z_i}^{b}