diff --git a/dask_glm/algorithms.py b/dask_glm/algorithms.py
index b7bdbd0..9f89f2c 100644
--- a/dask_glm/algorithms.py
+++ b/dask_glm/algorithms.py
@@ -64,8 +64,17 @@ def compute_stepsize_dask(beta, step, Xbeta, Xstep, y, curr_val,
     return stepSize, beta, Xbeta, func
 
 
+def _compute_batch(chunk, batch_size=1, seed=42):
+    n_block = chunk.shape[0]
+    rnd = np.random.RandomState(seed)
+    i = rnd.permutation(n_block)
+    return chunk[i[:batch_size]]
+
+
 @normalize
-def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
+def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic,
+                     approx_grad=False, initial_batch_size=None, armijoMult=1e-1,
+                     seed=42, **kwargs):
     """
     Michael Grant's implementation of Gradient Descent.
 
@@ -80,24 +89,57 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
         Maximum allowed change from prior iteration required to
         declare convergence
     family : Family
+    approx_grad : bool
+        If True (default False), approximate the gradient with a subset of
+        examples in X and y. When the number of examples is very large this can
+        result in speed gains, and is described in :cite:`friedlander2012hybrid`.
+    initial_batch_size : int
+        The initial batch size when approximating the gradient. Only used when
+        `approx_grad == True`. Defaults to `min(n // n_chunks, n // 100)`
 
     Returns
     -------
     beta : array-like, shape (n_features,)
+
+    References
+    ----------
+    @article{friedlander2012hybrid,
+        Author = {Friedlander, Michael P and Schmidt, Mark},
+        Journal = {SIAM Journal on Scientific Computing},
+        Number = {3},
+        Pages = {A1380--A1405},
+        Title = {Hybrid deterministic-stochastic methods for data fitting},
+        Volume = {34},
+        Year = {2012}}
+
     """
     loglike, gradient = family.loglike, family.gradient
     n, p = X.shape
     firstBacktrackMult = 0.1
     nextBacktrackMult = 0.5
-    armijoMult = 0.1
+    armijoMult = 1e-1
     stepGrowth = 1.25
     stepSize = 1.0
     recalcRate = 10
     backtrackMult = firstBacktrackMult
     beta = np.zeros(p)
 
+    if approx_grad:
+        n_chunks = max(len(c) for c in X.chunks)
+        batch_size = n_chunks + 1 \
+            if initial_batch_size is None else initial_batch_size
+        keep = {'X': X, 'y': y}
     for k in range(max_iter):
+        if approx_grad:
+            block_batch_size = batch_size // n_chunks
+            kwargs = {'batch_size': block_batch_size, 'seed': k + seed}
+            X = keep['X'].map_blocks(_compute_batch, **kwargs)
+            y = keep['y'].map_blocks(_compute_batch, **kwargs)
+            # Xbeta = X.dot(beta)
+            # func = loglike(Xbeta, y)
         # how necessary is this recalculation?
         if k % recalcRate == 0:
             Xbeta = X.dot(beta)
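
For context, the core of the change is the per-block subsampling that `approx_grad` turns on: `_compute_batch` keeps a random `batch_size` rows of each block, so mapping it over a row-chunked array yields a small sample whose gradient stands in for the full-data gradient. Below is a minimal, self-contained sketch of that mechanism, separate from the PR itself; the array sizes, chunk sizes, and the commented `gradient_descent` call are illustrative assumptions, not part of the diff.

```python
import numpy as np
import dask.array as da


def _compute_batch(chunk, batch_size=1, seed=42):
    # Keep ``batch_size`` randomly chosen rows of this block (no replacement).
    n_block = chunk.shape[0]
    rnd = np.random.RandomState(seed)
    i = rnd.permutation(n_block)
    return chunk[i[:batch_size]]


# Toy data: 10,000 rows split into 4 row-blocks (shapes are arbitrary).
X = da.random.random((10_000, 5), chunks=(2_500, 5))
block_batch_size = 100  # rows kept from each block

# Passing ``chunks`` keeps dask's metadata in line with the smaller blocks
# the function actually returns (the diff above leaves it out).
X_batch = X.map_blocks(
    _compute_batch,
    batch_size=block_batch_size,
    seed=0,
    chunks=(block_batch_size, X.shape[1]),
)

print(X_batch.compute().shape)  # (400, 5): 100 rows from each of 4 blocks

# With this branch installed, the new keyword might then be used roughly as:
#     from dask_glm.algorithms import gradient_descent
#     beta = gradient_descent(X, y, approx_grad=True, initial_batch_size=400)
```

In the diff this sampling is redone once per iteration with `seed=k + seed`, so each gradient step sees a fresh batch while the run stays reproducible for a fixed `seed`.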