Merge pull request #12 from washingtonpost/release/2.0.0
Release/2.0.0
Showing 12 changed files with 652 additions and 189 deletions.
GitHub Actions pre-commit workflow: bump the CI job's Python version from 3.9 to 3.10.

```diff
@@ -10,5 +10,5 @@ jobs:
       - uses: actions/checkout@v2
       - uses: actions/setup-python@v2
         with:
-          python-version: '3.9'
+          python-version: '3.10'
      - uses: pre-commit/action@<version>  # action tag obscured in the page capture
```

The bump is consistent with the new modules below, whose PEP 604 union annotations (`np.ndarray | None`) require Python 3.10 at runtime.
elexsolver/LinearSolver.py (new file, +79 lines):

```python
import logging
import warnings
from abc import ABC

import numpy as np

from elexsolver.logging import initialize_logging

initialize_logging()

LOG = logging.getLogger(__name__)


class LinearSolverException(Exception):
    pass


class IllConditionedMatrixException(LinearSolverException):
    pass


class LinearSolver(ABC):
    """
    An abstract base class for a linear solver
    """

    CONDITION_WARNING_MIN = 50  # arbitrary
    CONDITION_ERROR_MIN = 1e8  # based on scipy

    def __init__(self):
        self.coefficients = None

    def fit(self, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = None, lambda_: float = 0.0, **kwargs):
        """
        Fits model
        """
        raise NotImplementedError

    def predict(self, x: np.ndarray) -> np.ndarray:
        """
        Use coefficients to predict
        """
        self._check_any_element_nan_or_inf(x)

        return x @ self.coefficients

    def get_coefficients(self) -> np.ndarray:
        """
        Returns model coefficients
        """
        return self.coefficients

    def _check_matrix_condition(self, x):
        """
        Check the condition number of the design matrix as a guard against multicollinearity.
        The condition number is the ratio between the largest and smallest singular values of the design matrix.
        """
        condition_number = np.linalg.cond(x)
        if condition_number >= self.CONDITION_ERROR_MIN:
            raise IllConditionedMatrixException(
                f"Ill-conditioned matrix detected. Matrix condition number >= {self.CONDITION_ERROR_MIN}"
            )
        elif condition_number >= self.CONDITION_WARNING_MIN:
            warnings.warn("Warning: Ill-conditioned matrix detected. Result is not guaranteed to be accurate")

    def _check_any_element_nan_or_inf(self, x):
        """
        Check whether any element in a matrix or vector is NaN or infinity
        """
        if np.any(np.isnan(x)) or np.any(np.isinf(x)):
            raise ValueError("Array contains NaN or Infinity")

    def _check_intercept(self, x):
        """
        Check whether the first column is all 1s (normal intercept); otherwise raise a warning.
        """
        if not np.all(x[:, 0] == 1):
            warnings.warn("Warning: fit_intercept=True and not all elements of the first column are 1s")
```
elexsolver/OLSRegressionSolver.py (new file, +143 lines):

```python
import logging

import numpy as np

from elexsolver.LinearSolver import LinearSolver
from elexsolver.logging import initialize_logging

initialize_logging()

LOG = logging.getLogger(__name__)


class OLSRegressionSolver(LinearSolver):
    """
    A class for Ordinary Least Squares Regression optimized for the bootstrap
    """

    # OLS setup:
    #     X \beta = y
    # Since X might not be square, we multiply both sides of the equation by X^T to generate
    # X^T X, which is square:
    #     X^T X \beta = X^T y
    # Since X^T X is square (and invertible when X has full column rank) we can invert it:
    #     \beta = (X^T X)^{-1} X^T y
    # Since our version of the model bootstraps y but keeps X constant, we can pre-compute
    # (X^T X)^{-1} X^T and then re-use it to compute \beta_b for every bootstrap sample.

    def __init__(self):
        super().__init__()
        self.normal_eqs = None
        self.hat_vals = None

    def _get_regularizer(
        self, lambda_: float, dim: int, fit_intercept: bool, regularize_intercept: bool, n_feat_ignore_reg: int
    ) -> np.ndarray:
        """
        Returns the regularization matrix
        """
        # lambda_I is the matrix for regularization, which needs to be the same shape as R and
        # have the regularization constant lambda_ along the diagonal
        lambda_I = lambda_ * np.eye(dim)

        # We don't want to regularize the intercept coefficient, and we may also want to exempt
        # the first n_feat_ignore_reg features from regularization, so set the regularization
        # constant to zero for the intercept and for those features.
        for i in range(fit_intercept + n_feat_ignore_reg):
            # if we are fitting an intercept and *do* want to regularize it, leave
            # lambda_I[0, 0] alone rather than zeroing it out
            if fit_intercept and i == 0 and regularize_intercept:
                continue
            lambda_I[i, i] = 0

        return lambda_I

    def _compute_normal_equations(
        self,
        x: np.ndarray,
        L: np.ndarray,
        lambda_: float,
        fit_intercept: bool,
        regularize_intercept: bool,
        n_feat_ignore_reg: int,
    ) -> np.ndarray:
        """
        Computes the normal equations for OLS: (X^T X)^{-1} X^T
        """
        # Inverting X^T X directly is computationally expensive and numerically unstable, so we use a QR
        # factorization, which factors the weighted design matrix L @ x into the product of an orthogonal
        # matrix Q and an upper triangular matrix R. L is a diagonal matrix of weights.
        Q, R = np.linalg.qr(L @ x)

        # get regularization matrix
        lambda_I = self._get_regularizer(lambda_, R.shape[0], fit_intercept, regularize_intercept, n_feat_ignore_reg)

        # Substitute X = QR into the normal equations to get
        #     R^T Q^T Q R \beta = R^T Q^T y
        #     R^T R \beta = R^T Q^T y
        #     \beta = (R^T R)^{-1} R^T Q^T y
        # Since R is upper triangular it is easier to invert.
        # lambda_I is the regularization matrix.
        return np.linalg.inv(R.T @ R + lambda_I) @ R.T @ Q.T

    def fit(
        self,
        x: np.ndarray,
        y: np.ndarray,
        weights: np.ndarray | None = None,
        lambda_: float = 0.0,
        normal_eqs: np.ndarray | None = None,
        fit_intercept: bool = True,
        regularize_intercept: bool = False,
        n_feat_ignore_reg: int = 0,
    ):
        self._check_any_element_nan_or_inf(x)
        self._check_any_element_nan_or_inf(y)

        if fit_intercept:
            self._check_intercept(x)

        # if weights are None, all rows should be weighted equally
        if weights is None:
            weights = np.ones((y.shape[0],))

        # normalize the weights and turn them into a diagonal matrix;
        # take the square root because the weights get squared when R^T R is formed later
        L = np.diag(np.sqrt(weights.flatten() / weights.sum()))

        # If normal equations are provided, use those; otherwise compute them.
        # In the bootstrap setting we can pass in the pre-computed normal equations
        # and save the time of re-computing them.
        if normal_eqs is None:
            self.normal_eqs = self._compute_normal_equations(
                x, L, lambda_, fit_intercept, regularize_intercept, n_feat_ignore_reg
            )
        else:
            self.normal_eqs = normal_eqs

        # compute the diagonal of the hat matrix: X (X^T X)^{-1} X^T
        self.hat_vals = np.diag(x @ self.normal_eqs @ L)

        # compute coefficients: (X^T X)^{-1} X^T y
        self.coefficients = self.normal_eqs @ L @ y

    def residuals(self, y: np.ndarray, y_hat: np.ndarray, loo: bool = True, center: bool = True) -> np.ndarray:
        """
        Computes residuals for the model
        """
        # compute standard residuals
        residuals = y - y_hat

        # If leave-one-out is True, inflate the residuals by dividing by (1 - P).
        # In the OLS setting, inflating by (1 - P) is the same as computing the leave-one-out residuals:
        # the un-inflated training residuals are too small, since the training covariates were observed during fitting.
        if loo:
            residuals /= (1 - self.hat_vals).reshape(-1, 1)

        # centering removes the column mean
        if center:
            residuals -= np.mean(residuals, axis=0)

        return residuals
```
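
The bootstrap optimization the comments describe is easiest to see in a usage sketch (illustrative code, not part of this commit; the data and loop are invented): fit once to cache `(X^T X)^{-1} X^T`, then pass it back via `normal_eqs=` for every resampled response, so each replicate skips the QR factorization.

```python
import numpy as np

from elexsolver.OLSRegressionSolver import OLSRegressionSolver

rng = np.random.default_rng(0)

# synthetic data: the first column of x is the intercept
n = 200
x = np.hstack([np.ones((n, 1)), rng.normal(size=(n, 3))])
beta = np.array([[1.0], [2.0], [-1.0], [0.5]])
y = x @ beta + rng.normal(scale=0.1, size=(n, 1))

# the initial fit computes and caches the normal equations via QR
solver = OLSRegressionSolver()
solver.fit(x, y)
y_hat = solver.predict(x)
resid = solver.residuals(y, y_hat, loo=True, center=True)

# residual bootstrap: X stays fixed, so re-use solver.normal_eqs in every replicate
boot_solver = OLSRegressionSolver()
boot_betas = []
for _ in range(1000):
    y_b = y_hat + resid[rng.integers(0, n, size=n)]
    boot_solver.fit(x, y_b, normal_eqs=solver.normal_eqs)  # skips the QR step
    boot_betas.append(boot_solver.get_coefficients())

se = np.std(np.hstack(boot_betas), axis=1)  # bootstrap standard error per coefficient
```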