MAINT: allow user to pass n in kwargs and warn
stsievert committed Mar 26, 2018
1 parent b6963af commit 03bff77
Showing 1 changed file with 38 additions and 26 deletions: dask_glm/algorithms.py
@@ -2,6 +2,7 @@
"""

from __future__ import absolute_import, division, print_function
from warnings import warn

from dask import delayed, persist, compute, set_options
import functools
@@ -142,53 +143,68 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):

@normalize
def sgd(X, y, epochs=100, tol=1e-3, family=Logistic, batch_size=64,
-        initial_step=1e-4, callback=None, average=True):
+        initial_step=1e-4, callback=None, average=True, maxiter=np.inf, **kwargs):
"""Stochastic Gradient Descent.
Parameters
----------
-    X : array-like, shape (n_samples, n_features)
-    y : array-like, shape (n_samples,)
-    epochs : int, float
+    X: array - like, shape(n_samples, n_features)
+    y: array - like, shape(n_samples,)
+    epochs: int, float
        maximum number of passes through the dataset
-    tol : float
+    tol: float
        Maximum allowed change from prior iteration required to
        declare convergence
-    batch_size : int
+    batch_size: int
        The batch size used to approximate the gradient. Larger batch sizes
        will approximate the gradient better.
-    initial_step : float
-        The initial step size. The step size is decays like 1/k.
-    callback : callable
+    initial_step: float
+        The initial step size. The step size decays like 1 / k.
+    callback: callable
        A callback to call every iteration that accepts keyword arguments
        `X`, `y`, `beta`, `grad`, `nit` (number of iterations) and `family`
-    average : bool
-        To average the parameters found or not. See [1]_.
-    family : Family
+    average: bool
+        To average the parameters found or not. See[1]_.
+    family: Family
    Returns
    -------
    beta : array-like, shape (n_features,)
    Notes
    -----
    The dataset is not shuffled beforehand. It is assumed that when
    approximating the gradient, each row accessed from the dataset (which is
    accessed sequentially) gives an independent and identically distributed
    (iid) approximation to the gradient. Shuffling the array beforehand should
    ensure this.
    .. _1: https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Averaging
    """
    gradient = family.gradient
    n, p = X.shape
-    if np.isnan(n):
-        raise ValueError('SGD needs shape information to allow indexing. '
-                         'Possible by passing a computed array in (`X.compute()` '
-                         'or `X.values.compute()`), then doing using '
-                         '`dask.array.from_array ')
+    n = kwargs.get('n', np.nan)
+    if np.isnan(n):
+        warn('Computing X to find the number of examples. Pass the number '
+             'of examples in as a keyword argument `n` to avoid this, i.e., '
+             '`sgd(..., n=num_examples)` or `sgd(..., n=len(X.compute()))`.')
+        n = len(X.compute())

    beta = np.zeros(p)
    if average:
        beta_sum = np.zeros(p)

    nit = 0
-    for epoch in range(epochs):
-        j = np.random.permutation(n)
-        X = X[j]
-        y = y[j]

+    # step_size = O(1/sqrt(k)) from "Non-asymptotic analysis of
+    # stochastic approximation algorithms for machine learning" by
+    # Moulines, Eric and Bach, Francis R.
+    # but, this may require many iterations. Using
+    # step_size = lambda init, nit, decay: init * decay**(nit//n)
+    # is used in practice but not tested now
+    step_size = lambda init, nit: init / np.sqrt(nit + 1)
+    while True:
        for k in range(n // batch_size):
            beta_old = beta.copy()
            nit += 1
@@ -197,19 +213,15 @@ def sgd(X, y, epochs=100, tol=1e-3, family=Logistic, batch_size=64,
            Xbeta = dot(X[i], beta)
            grad = gradient(Xbeta, X[i], y[i]).compute()

-            # step_size = O(1/sqrt(k)) from "Non-asymptotic analysis of
-            # stochastic approximation algorithms for machine learning" by
-            # Moulines, Eric and Bach, Francis R.
-            step_size = initial_step / np.sqrt(nit + 1)
-            beta -= step_size * (n / batch_size) * grad
+            beta -= step_size(initial_step, nit) * (n / batch_size) * grad
            if average:
                beta_sum += beta
            if callback:
                callback(X=X[i], y=y[i], grad=grad, nit=nit, family=family,
                         beta=beta if not average else beta_sum / nit)

            rel_error = LA.norm(beta_old - beta) / LA.norm(beta)
-            converged = (rel_error < tol) or (nit / n > epochs)
+            converged = (rel_error < tol) or (nit / n > epochs) or (nit > maxiter)
            if converged:
                break
        if average:
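A minimal usage sketch (editor's illustration, not part of the commit) of the new `n` keyword, assuming this branch of dask_glm is importable and that `X` and `y` are dask arrays; `num_examples` and the synthetic data are invented for the example. Per the error message this commit removes, the motivating case is an array whose row count is not known ahead of time (for example one built from a dask dataframe's `.values`), so the length cannot be read from `X.shape`.

import dask.array as da
import numpy as np
from dask_glm.algorithms import sgd

rng = np.random.RandomState(0)
num_examples, num_features = 1000, 5
X = da.from_array(rng.normal(size=(num_examples, num_features)),
                  chunks=(250, num_features))
y = da.from_array((rng.uniform(size=num_examples) > 0.5).astype(float),
                  chunks=250)

# Preferred after this commit: pass the number of examples so sgd never has
# to call X.compute() just to learn how many rows there are.
beta = sgd(X, y, epochs=5, batch_size=64, n=num_examples)

# If `n` is omitted, sgd now emits a UserWarning and falls back to
# len(X.compute()) instead of raising ValueError as it did before.
beta = sgd(X, y, epochs=5, batch_size=64)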

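The inline comments above contrast the 1/sqrt(k) decay the code actually uses with a geometric `init * decay**(nit // n)` schedule described as common in practice. A small self-contained sketch of the two; `initial_step`, `decay`, and `n` below are illustrative values only, not taken from the commit.

import numpy as np

initial_step, decay, n = 1e-4, 0.5, 1000

def sqrt_decay(init, nit):
    # schedule used by sgd(): O(1 / sqrt(k))
    return init / np.sqrt(nit + 1)

def geometric(init, nit):
    # schedule mentioned in the comment: shrink by `decay` once per pass
    return init * decay ** (nit // n)

for nit in (0, 10, 100, 1000, 10000):
    print(nit, sqrt_decay(initial_step, nit), geometric(initial_step, nit))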
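The `average` flag and reference [1]_ in the docstring point to iterate averaging (Polyak-Ruppert): the estimate reported to the callback, and presumably returned, is the running mean `beta_sum / nit` rather than the last iterate. A toy numpy-only sketch of why that helps; the true coefficients and noise level are invented for the illustration and have nothing to do with dask_glm itself.

import numpy as np

rng = np.random.RandomState(0)
true_beta = np.array([1.0, -2.0, 0.5])

# Stand-ins for late SGD iterates: the solution plus noise that a slowly
# decaying step size never fully removes.
iterates = true_beta + 0.3 * rng.normal(size=(500, 3))

last_iterate = iterates[-1]
averaged = iterates.mean(axis=0)   # what accumulating beta_sum / nit computes

print(np.linalg.norm(last_iterate - true_beta))  # roughly the noise scale
print(np.linalg.norm(averaged - true_beta))      # much smaller on average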