ENH: clean up SGD implementation
Scott Sievert committed Mar 22, 2018
1 parent 876897f commit b6963af
Showing 1 changed file with 45 additions and 29 deletions.
74 changes: 45 additions & 29 deletions dask_glm/algorithms.py
@@ -140,36 +140,37 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
    return beta


def _choose_step_sgd(initial, k):
    return initial / (k + 1)


@normalize
def sgd(X, y, max_iter=1e3, tol=1e-2, family=Logistic, batch_size=64,
        initial_step=1.0, **kwargs):
def sgd(X, y, epochs=100, tol=1e-3, family=Logistic, batch_size=64,
        initial_step=1e-4, callback=None, average=True):
"""Stochastic Gradient Descent.
Parameters
----------
X : array-like, shape (n_samples, n_features)
y : array-like, shape (n_samples,)
max_iter : int, float
maximum number of iterations to attempt before declaring
failure to converge
epochs : int, float
maximum number of passes through the dataset
tol : float
Maximum allowed change from prior iteration required to
declare convergence
batch_size : int
The batch size used to approximate the gradient. Larger batch sizes
will approximate the gradient better.
initial_step : float
Initial step size used in the optimization. The step size decays like
initial_step/(1 + iter_count).
The initial step size. The step size is decays like 1/k.
callback : callable
A callback to call every iteration that accepts keyword arguments
`X`, `y`, `beta`, `grad`, `nit` (number of iterations) and `family`
average : bool
To average the parameters found or not. See [1]_.
family : Family
Returns
-------
beta : array-like, shape (n_features,)
.. _1: https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Averaging
"""
    gradient = family.gradient
    n, p = X.shape
@@ -180,24 +181,39 @@ def sgd(X, y, max_iter=1e3, tol=1e-2, family=Logistic, batch_size=64,
'`dask.array.from_array ')

    beta = np.zeros(p)

    iter_count = 0
    converged = False

    while not converged:
        beta_old = beta.copy()
        iter_count += 1

        i = np.random.choice(n, size=(batch_size,))
        Xbeta = dot(X[i], beta)

        grad = gradient(Xbeta, X[i], y[i]).compute()

        beta -= _choose_step_sgd(initial_step, iter_count) * grad / batch_size

        rel_error = LA.norm(beta_old - beta) / LA.norm(beta)
        converged = (rel_error < tol) or (iter_count > max_iter)

    if average:
        beta_sum = np.zeros(p)

    nit = 0
    for epoch in range(epochs):
        j = np.random.permutation(n)
        X = X[j]
        y = y[j]
        for k in range(n // batch_size):
            beta_old = beta.copy()
            nit += 1

            i = slice(batch_size * k, batch_size * (k + 1))
            Xbeta = dot(X[i], beta)
            grad = gradient(Xbeta, X[i], y[i]).compute()

            # step_size = O(1/sqrt(k)) from "Non-asymptotic analysis of
            # stochastic approximation algorithms for machine learning" by
            # Moulines, Eric and Bach, Francis R.
            step_size = initial_step / np.sqrt(nit + 1)
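            # The gradient above is a sum over the batch_size sampled rows
            # (assuming, as for the families in this module, that
            # family.gradient sums over samples), so the n / batch_size
            # factor below rescales it to the magnitude of the full-data
            # gradient.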
            beta -= step_size * (n / batch_size) * grad
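            # Accumulate the iterates so the averaged estimate
            # beta_sum / nit can be returned (see the averaging link in the
            # docstring above).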
            if average:
                beta_sum += beta
            if callback:
                callback(X=X[i], y=y[i], grad=grad, nit=nit, family=family,
                         beta=beta if not average else beta_sum / nit)

            rel_error = LA.norm(beta_old - beta) / LA.norm(beta)
            converged = (rel_error < tol) or (nit / n > epochs)
            if converged:
                break
    if average:
        return beta_sum / nit
    return beta
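
A minimal usage sketch of the `sgd` signature introduced above (not part of the diff). It assumes this branch of dask_glm is importable, that `X` and `y` are dask arrays built with `dask.array.from_array` (as the truncated error message in the hunk above suggests), and that `y` holds 0/1 labels for the default `Logistic` family; the `log_progress` callback name and the synthetic data are illustrative only.

import numpy as np
import dask.array as da
from dask_glm.algorithms import sgd

# Synthetic binary-classification data.
rng = np.random.RandomState(0)
Xn = rng.normal(size=(1000, 10))
beta_star = rng.normal(size=10)
yn = (Xn.dot(beta_star) + rng.normal(size=1000) > 0).astype(float)

# sgd works on dask arrays, so wrap the NumPy data in chunks.
X = da.from_array(Xn, chunks=(100, 10))
y = da.from_array(yn, chunks=(100,))

def log_progress(nit=None, beta=None, **kwargs):
    # Receives X, y, beta, grad, nit and family as keyword arguments
    # once per minibatch update.
    if nit % 10 == 0:
        print(nit, np.linalg.norm(beta))

beta = sgd(X, y, epochs=5, batch_size=64, callback=log_progress, average=True)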


