diff --git a/dask_glm/algorithms.py b/dask_glm/algorithms.py
index 9052c57..fe58437 100644
--- a/dask_glm/algorithms.py
+++ b/dask_glm/algorithms.py
@@ -145,7 +145,7 @@ def _choose_step_sgd(initial, k):
 
 @normalize
 def sgd(X, y, max_iter=1e3, tol=1e-8, family=Logistic, batch_size=64,
-        initial_step=10.0, n=None, **kwargs):
+        initial_step=1.0, **kwargs):
     """Stochastic Gradient Descent.
 
     Parameters
@@ -164,34 +164,33 @@ def sgd(X, y, max_iter=1e3, tol=1e-8, family=Logistic, batch_size=64,
     initial_step : float
         Initial step size used in the optimization. The step size decays like
         initial_step/(1 + iter_count).
-    n : int
-        The number of examples, or the first dimension of the matrix X. This argument will only be used if X.shape[1] is NaN.
     family : Family
 
     Returns
     -------
     beta : array-like, shape (n_features,)
     """
-    gradient, hessian = family.gradient, family.hessian
-    n_examples, p = X.shape
-    if not np.isnan(n_examples):
-        n = n_examples
-    if n is None:
-        raise ValueError('Pass number of examples in with kwarg `n`')
-    beta = np.zeros(p)  # always init to zeros?
+    gradient = family.gradient
+    n, p = X.shape
+    if np.isnan(n):
+        raise ValueError('SGD needs shape information to allow indexing. '
+                         'This is possible by passing in a computed array '
+                         '(`X.compute()` or `X.values.compute()`) and then '
+                         'rebuilding it with `dask.array.from_array`.')
+
+    beta = np.zeros(p)
 
     iter_count = 0
     converged = False
 
     while not converged:
-        beta_old = beta
+        beta_old = beta.copy()
         iter_count += 1
 
         i = np.random.choice(n, size=(batch_size,))
         Xbeta = dot(X[i], beta)
-        grad = gradient(Xbeta, X[i], y[i])
-        (grad,) = compute((grad,))
+        grad = gradient(Xbeta, X[i], y[i]).compute()
 
         beta -= _choose_step_sgd(initial_step, iter_count) * grad / batch_size
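
A minimal usage sketch of the revised signature (no `n` kwarg, default `initial_step=1.0`), assuming `dask_glm.algorithms.sgd` and `dask_glm.families.Logistic` as referenced in the patched module; the data, shapes, and chunk sizes below are illustrative only. The arrays are built with `dask.array.from_array` so that `X.shape[0]` is a concrete integer, which the new `ValueError` check requires for the random batch indexing.

```python
# Illustrative sketch only: the random data and chunk sizes are made up.
import dask.array as da
import numpy as np

from dask_glm.algorithms import sgd
from dask_glm.families import Logistic

rng = np.random.RandomState(0)
X_np = rng.normal(size=(1000, 10))
y_np = (X_np.dot(rng.normal(size=10)) > 0).astype(float)

# from_array keeps shape information concrete, so X.shape[0] is an int
# (not NaN) and np.random.choice(n, ...) can draw batch indices from it.
X = da.from_array(X_np, chunks=(250, 10))
y = da.from_array(y_np, chunks=250)

beta = sgd(X, y, family=Logistic, batch_size=64, initial_step=1.0)
```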