MAINT: allow user to pass n in kwargs and warn
stsievert committed Mar 27, 2018
1 parent b6963af commit 7439977
Showing 1 changed file with 56 additions and 28 deletions.
84 changes: 56 additions & 28 deletions dask_glm/algorithms.py
@@ -2,6 +2,8 @@
"""

from __future__ import absolute_import, division, print_function
import time
from warnings import warn

from dask import delayed, persist, compute, set_options
import functools
@@ -140,55 +142,85 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
    return beta


def _get_n(n, y, kwargs):
    if np.isnan(n):
        if 'n' in kwargs:
            return kwargs.get('n')
        raise ValueError('Could not get the number of examples `n`. Pass the '
                         'number of examples in as a keyword argument: '
                         '`sgd(..., n=n)`. If using a distributed dataframe, '
                         '`sgd(..., n=len(ddf))` works')
    return n
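# Illustrative sketch (not part of this commit): a Dask array built from a
# distributed dataframe has an unknown row count (`X.shape[0]` is NaN), so
# `n` must be supplied by the caller through `**kwargs`, as the error
# message above suggests:
#
#     import pandas as pd
#     import dask.dataframe as dd
#     ddf = dd.from_pandas(pd.DataFrame({'x': range(100), 'y': 100 * [0]}),
#                          npartitions=4)
#     X = ddf[['x']].values         # X.shape == (nan, 1)
#     y = ddf['y'].values
#     beta = sgd(X, y, n=len(ddf))  # _get_n picks `n` up from kwargs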


def _shuffle_blocks(x, seed=42):
    # Permute the rows of a single block with a seeded RNG so that X and y
    # blocks shuffled with the same seed stay aligned.
    rng = np.random.RandomState(seed)
    i = rng.permutation(x.shape[0]).astype(int)
    return x[i]
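# Illustrative sketch (not part of this commit): because `X` and `y` are
# shuffled block-wise with the *same* seed, rows and labels stay aligned as
# long as their chunks along axis 0 match:
#
#     import dask.array as da
#     X = da.random.random((8, 3), chunks=(4, 3))
#     y = da.arange(8, chunks=4)
#     Xs = X.map_blocks(_shuffle_blocks, seed=0, dtype=X.dtype)
#     ys = y.map_blocks(_shuffle_blocks, seed=0, dtype=y.dtype)
#     # each 4-row block of Xs is permuted identically to the matching
#     # block of ys; rows are shuffled within blocks, not across them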


@normalize
def sgd(X, y, epochs=100, tol=1e-3, family=Logistic, batch_size=64,
        initial_step=1e-4, callback=None, average=True, maxiter=np.inf,
        **kwargs):
    r"""Stochastic Gradient Descent.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    y : array-like, shape (n_samples,)
    epochs : int, float
        Maximum number of passes through the dataset.
    tol : float
        Maximum allowed change from the prior iteration required to
        declare convergence.
    batch_size : int
        The batch size used to approximate the gradient. Larger batch
        sizes approximate the gradient better.
    initial_step : float
        The initial step size. The step size decays like 1/sqrt(k).
    callback : callable
        A callback called every iteration that accepts keyword arguments
        ``X``, ``y``, ``beta``, ``grad``, ``nit`` (number of iterations)
        and ``family``.
    average : bool
        Whether to average the parameters found. See [1]_.
    maxiter : int, float
        Maximum number of gradient updates before stopping.
    family : Family
    **kwargs
        May contain ``n``, the number of examples; required when
        ``X.shape[0]`` is unknown (NaN), e.g. with distributed dataframes.

    Returns
    -------
    beta : array-like, shape (n_features,)

    Notes
    -----
    The current implementation assumes that the dataset is "well
    shuffled", i.e., that each block is a representative sample of the
    gradient. More formally, we assume the gradient approximation is
    unbiased regardless of which block is sampled.

    .. _1: https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Averaging
    """
    gradient = family.gradient
    n, p = X.shape
    n = _get_n(n, y, kwargs)

    beta = np.zeros(p)
    if average:
        beta_sum = np.zeros(p)

    nit = 0
    # step_size = O(1/sqrt(k)) from "Non-asymptotic analysis of stochastic
    # approximation algorithms for machine learning" by Moulines, Eric and
    # Bach, Francis. This may require many iterations;
    # step_size = lambda init, nit, decay: init * decay**(nit // n)
    # is used in practice but is not tested here. (A worked decay example
    # follows the diff below.)
    step_size = lambda init, nit: init / np.sqrt(nit + 1)
    while True:
        seed = int(time.time() * 1000) % 2**32  # millisecond timings
        X = X.map_blocks(_shuffle_blocks, seed=seed, dtype=X.dtype)
        y = y.map_blocks(_shuffle_blocks, seed=seed, dtype=y.dtype)
        for k in range(n // batch_size):
            beta_old = beta.copy()
            nit += 1
@@ -197,19 +229,15 @@ def sgd(X, y, epochs=100, tol=1e-3, family=Logistic, batch_size=64,
            Xbeta = dot(X[i], beta)
            grad = gradient(Xbeta, X[i], y[i]).compute()

            beta -= step_size(initial_step, nit) * (n / batch_size) * grad
            if average:
                beta_sum += beta
            if callback:
                callback(X=X[i], y=y[i], grad=grad, nit=nit, family=family,
                         beta=beta if not average else beta_sum / nit)

            rel_error = LA.norm(beta_old - beta) / LA.norm(beta)
            converged = ((rel_error < tol) or (nit / n > epochs)
                         or (nit > maxiter))
            if converged:
                break
        if average:
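The step-size comment in the loop above refers to the classic O(1/sqrt(k)) decay of Moulines and Bach. For concreteness, a quick sketch (not part of the commit) of how the schedule `initial_step / np.sqrt(nit + 1)` decays:

    import numpy as np

    initial_step = 1e-4
    for nit in [0, 9, 99, 999]:
        print(nit, initial_step / np.sqrt(nit + 1))
    # nit = 0 -> 1.0e-4, 9 -> 3.16e-5, 99 -> 1.0e-5, 999 -> 3.16e-6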

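Finally, an end-to-end usage sketch under stated assumptions: `Logistic` is assumed importable from `dask_glm.families` (it is the default `family` above), the data is a row-chunked Dask array with known length, and the `log_progress` callback is hypothetical, following the keyword-argument signature described in the docstring:

    import dask.array as da
    import numpy as np
    from dask_glm.families import Logistic

    # Synthetic logistic-regression data, chunked along the rows.
    X = da.random.random((1000, 5), chunks=(250, 5))
    z = X.dot(np.ones(5))
    y = (da.random.random(1000, chunks=250) < 1 / (1 + da.exp(-z))).astype(float)

    def log_progress(nit=None, beta=None, **kwargs):
        # Print progress every 100 mini-batch updates.
        if nit % 100 == 0:
            print(nit, beta)

    beta = sgd(X, y, family=Logistic, batch_size=64, callback=log_progress)

With `average=True` (the default), the returned coefficients are the running average of the iterates, the variance-reduction technique referenced as [1]_ in the docstring.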