diff --git a/dask_glm/algorithms.py b/dask_glm/algorithms.py
index 7cf270d..4a583b4 100644
--- a/dask_glm/algorithms.py
+++ b/dask_glm/algorithms.py
@@ -140,22 +140,17 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
     return beta
 
 
-def _choose_step_sgd(initial, k):
-    return initial / (k + 1)
-
-
 @normalize
-def sgd(X, y, max_iter=1e3, tol=1e-2, family=Logistic, batch_size=64,
-        initial_step=1.0, **kwargs):
+def sgd(X, y, epochs=100, tol=1e-3, family=Logistic, batch_size=64,
+        initial_step=1e-4, callback=None, average=True):
     """Stochastic Gradient Descent.
 
     Parameters
     ----------
     X : array-like, shape (n_samples, n_features)
     y : array-like, shape (n_samples,)
-    max_iter : int, float
-        maximum number of iterations to attempt before declaring
-        failure to converge
+    epochs : int
+        maximum number of passes through the dataset
     tol : float
         Maximum allowed change from prior iteration required to
         declare convergence
@@ -163,13 +158,19 @@ def sgd(X, y, max_iter=1e3, tol=1e-2, family=Logistic, batch_size=64,
         The batch size used to approximate the gradient. Larger batch
         sizes will approximate the gradient better.
     initial_step : float
-        Initial step size used in the optimization. The step size decays like
-        initial_step/(1 + iter_count).
+        The initial step size. The step size decays like 1/sqrt(k).
+    callback : callable
+        A callback called every iteration with keyword arguments
+        `X`, `y`, `beta`, `grad`, `nit` (number of iterations) and `family`
+    average : bool
+        Whether to average the parameters found. See [1]_.
     family : Family
 
     Returns
     -------
     beta : array-like, shape (n_features,)
+
+    .. [1] https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Averaging
     """
     gradient = family.gradient
     n, p = X.shape
@@ -180,24 +181,42 @@ def sgd(X, y, max_iter=1e3, tol=1e-2, family=Logistic, batch_size=64,
                          '`dask.array.from_array ')
 
     beta = np.zeros(p)
-
-    iter_count = 0
-    converged = False
-
-    while not converged:
-        beta_old = beta.copy()
-        iter_count += 1
-
-        i = np.random.choice(n, size=(batch_size,))
-        Xbeta = dot(X[i], beta)
-
-        grad = gradient(Xbeta, X[i], y[i]).compute()
-
-        beta -= _choose_step_sgd(initial_step, iter_count) * grad / batch_size
-
-        rel_error = LA.norm(beta_old - beta) / LA.norm(beta)
-        converged = (rel_error < tol) or (iter_count > max_iter)
-
+    # Running sum of iterates for the averaged estimate
+    if average:
+        beta_sum = np.zeros(p)
+
+    nit = 0
+    for epoch in range(epochs):
+        # Shuffle once per epoch, then take contiguous minibatches
+        j = np.random.permutation(n)
+        X = X[j]
+        y = y[j]
+        for k in range(n // batch_size):
+            beta_old = beta.copy()
+            nit += 1
+
+            i = slice(batch_size * k, batch_size * (k + 1))
+            Xbeta = dot(X[i], beta)
+            grad = gradient(Xbeta, X[i], y[i]).compute()
+
+            # step_size = O(1/sqrt(k)) from "Non-asymptotic analysis of
+            # stochastic approximation algorithms for machine learning" by
+            # Moulines, Eric and Bach, Francis R.
+            step_size = initial_step / np.sqrt(nit + 1)
+            beta -= step_size * (n / batch_size) * grad
+            if average:
+                beta_sum += beta
+            if callback:
+                callback(X=X[i], y=y[i], grad=grad, nit=nit, family=family,
+                         beta=beta if not average else beta_sum / nit)
+
+            rel_error = LA.norm(beta_old - beta) / LA.norm(beta)
+            converged = rel_error < tol
+            if converged:
+                # Return directly on convergence to exit both loops
+                return beta_sum / nit if average else beta
+    if average:
+        return beta_sum / nit
     return beta
 
 
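
For reference, below is a minimal usage sketch of the `sgd` added by this patch. It assumes the patch is applied so that `sgd` is importable from `dask_glm.algorithms` and uses the existing `dask_glm.families.Logistic`; the synthetic data, chunking, and the `log_progress` callback are illustrative only and not part of the patch.

    # Minimal sketch, assuming this patch is applied. The synthetic data and
    # the log_progress callback are illustrative, not part of dask_glm.
    import numpy as np
    import dask.array as da

    from dask_glm.algorithms import sgd
    from dask_glm.families import Logistic

    rng = np.random.RandomState(0)
    n, p = 1000, 5
    X_np = rng.normal(size=(n, p))
    beta_star = rng.normal(size=p)
    y_np = (X_np.dot(beta_star) > 0).astype(float)

    # sgd() needs known shapes, so wrap computed arrays with
    # dask.array.from_array as the error message in the patch suggests.
    X = da.from_array(X_np, chunks=(n, p))
    y = da.from_array(y_np, chunks=(n,))


    def log_progress(X=None, y=None, beta=None, grad=None, nit=None, family=None):
        # Receives the keyword arguments documented in the new docstring.
        if nit % 50 == 0:
            print(nit, np.linalg.norm(grad))


    beta = sgd(X, y, epochs=5, batch_size=64, family=Logistic,
               callback=log_progress, average=True)

With `average=True` the returned coefficients are the running average of the iterates (the estimate the callback also receives), which is what the Wikipedia reference in the docstring describes.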