Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support dask, cupy, and numpy array types #105

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,5 @@ jobs:
- name: Run pytest
shell: bash -l {0}
run: |
pip install pytest
pytest dask_glm
pip install pytest pytest-xdist
pytest dask_glm -n auto
7 changes: 4 additions & 3 deletions dask_glm/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
maybe_to_cupy,
normalize,
scatter_array,
zeros,
)


Expand Down Expand Up @@ -115,7 +116,7 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
stepSize = 1.0
recalcRate = 10
backtrackMult = firstBacktrackMult
beta = np.zeros_like(X._meta, shape=p)
beta = zeros(shape=p, arr=getattr(X, "_meta", None))

for k in range(max_iter):
# how necessary is this recalculation?
Expand Down Expand Up @@ -188,7 +189,7 @@ def newton(X, y, max_iter=50, tol=1e-8, family=Logistic, **kwargs):
"""
gradient, hessian = family.gradient, family.hessian
n, p = X.shape
beta = np.zeros_like(X._meta, shape=p)
beta = zeros(shape=p, arr=getattr(X, "_meta", None))
Xbeta = dot(X, beta)

iter_count = 0
Expand Down Expand Up @@ -457,7 +458,7 @@ def proximal_grad(
stepSize = 1.0
recalcRate = 10
backtrackMult = firstBacktrackMult
beta = np.zeros_like(X._meta, shape=p)
beta = zeros(shape=p, arr=getattr(X, "_meta", None))
regularizer = Regularizer.get(regularizer)

for k in range(max_iter):
Expand Down
66 changes: 37 additions & 29 deletions dask_glm/tests/test_algos_families.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,22 +37,34 @@ def make_intercept_data(N, p, seed=20009):
return X, y


def make_array_type(type, *arrays):
if type == "cupy":
cupy = pytest.importorskip("cupy")
return to_dask_cupy_array_xy(*arrays, cupy)
elif type == "numpy":
return tuple(a.compute() if hasattr(a, "compute") else a for a in arrays)
else:
return arrays


def maybe_compute(expr):
return expr.compute() if hasattr(expr, "compute") else expr


@pytest.mark.parametrize("opt", [lbfgs, newton, gradient_descent])
@pytest.mark.parametrize(
"N, p, seed,", [(100, 2, 20009), (250, 12, 90210), (95, 6, 70605)]
)
@pytest.mark.parametrize("is_cupy", [True, False])
def test_methods(N, p, seed, opt, is_cupy):
@pytest.mark.parametrize("array_type", ["dask", "numpy", "cupy"])
def test_methods(N, p, seed, opt, array_type):
X, y = make_intercept_data(N, p, seed=seed)

if is_cupy:
cupy = pytest.importorskip("cupy")
X, y = to_dask_cupy_array_xy(X, y, cupy)
X, y = make_array_type(array_type, X, y)

coefs = opt(X, y)
p = sigmoid(X.dot(coefs).compute())
dot = maybe_compute(X.dot(coefs))
p = sigmoid(dot)

y_sum = y.compute().sum()
y_sum = maybe_compute(y).sum()
p_sum = p.sum()
assert np.isclose(y_sum, p_sum, atol=1e-1)

Expand All @@ -68,25 +80,24 @@ def test_methods(N, p, seed, opt, is_cupy):
@pytest.mark.parametrize("N", [1000])
@pytest.mark.parametrize("nchunks", [1, 10])
@pytest.mark.parametrize("family", [Logistic, Normal, Poisson])
@pytest.mark.parametrize("is_cupy", [True, False])
def test_basic_unreg_descent(func, kwargs, N, nchunks, family, is_cupy):
@pytest.mark.parametrize("array_type", ["dask", "numpy", "cupy"])
def test_basic_unreg_descent(func, kwargs, N, nchunks, family, array_type):
beta = np.random.normal(size=2)
M = len(beta)
X = da.random.random((N, M), chunks=(N // nchunks, M))
y = make_y(X, beta=np.array(beta), chunks=(N // nchunks,))

if is_cupy:
cupy = pytest.importorskip("cupy")
X, y = to_dask_cupy_array_xy(X, y, cupy)
X, y = make_array_type(array_type, X, y)

X, y = persist(X, y)
if array_type != "numpy":
X, y = persist(X, y)

result = func(X, y, family=family, **kwargs)
test_vec = np.random.normal(size=2)
test_vec = maybe_to_cupy(test_vec, X)

opt = family.pointwise_loss(result, X, y).compute()
test_val = family.pointwise_loss(test_vec, X, y).compute()
opt = maybe_compute(family.pointwise_loss(result, X, y))
test_val = maybe_compute(family.pointwise_loss(test_vec, X, y))

assert opt < test_val

Expand All @@ -103,27 +114,26 @@ def test_basic_unreg_descent(func, kwargs, N, nchunks, family, is_cupy):
@pytest.mark.parametrize("family", [Logistic, Normal, Poisson])
@pytest.mark.parametrize("lam", [0.01, 1.2, 4.05])
@pytest.mark.parametrize("reg", [r() for r in Regularizer.__subclasses__()])
@pytest.mark.parametrize("is_cupy", [True, False])
def test_basic_reg_descent(func, kwargs, N, nchunks, family, lam, reg, is_cupy):
@pytest.mark.parametrize("array_type", ["dask", "numpy", "cupy"])
def test_basic_reg_descent(func, kwargs, N, nchunks, family, lam, reg, array_type):
beta = np.random.normal(size=2)
M = len(beta)
X = da.random.random((N, M), chunks=(N // nchunks, M))
y = make_y(X, beta=np.array(beta), chunks=(N // nchunks,))

if is_cupy:
cupy = pytest.importorskip("cupy")
X, y = to_dask_cupy_array_xy(X, y, cupy)
X, y = make_array_type(array_type, X, y)

X, y = persist(X, y)
if array_type != "numpy":
X, y = persist(X, y)

result = func(X, y, family=family, lamduh=lam, regularizer=reg, **kwargs)
test_vec = np.random.normal(size=2)
test_vec = maybe_to_cupy(test_vec, X)

f = reg.add_reg_f(family.pointwise_loss, lam)

opt = f(result, X, y).compute()
test_val = f(test_vec, X, y).compute()
opt = maybe_compute(f(result, X, y))
test_val = maybe_compute(f(test_vec, X, y))

assert opt < test_val

Expand All @@ -138,12 +148,10 @@ def test_basic_reg_descent(func, kwargs, N, nchunks, family, lam, reg, is_cupy):
],
)
@pytest.mark.parametrize("scheduler", ["synchronous", "threading", "multiprocessing"])
@pytest.mark.parametrize("is_cupy", [True, False])
def test_determinism(func, kwargs, scheduler, is_cupy):
@pytest.mark.parametrize("array_type", ["dask", "numpy", "cupy"])
def test_determinism(func, kwargs, scheduler, array_type):
X, y = make_intercept_data(1000, 10)
if is_cupy:
cupy = pytest.importorskip("cupy")
X, y = to_dask_cupy_array_xy(X, y, cupy)
X, y = make_array_type(array_type, X, y)

with dask.config.set(scheduler=scheduler):
a = func(X, y, **kwargs)
Expand Down
9 changes: 8 additions & 1 deletion dask_glm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def normalize_inputs(X, y, *args, **kwargs):
mean = (
mean
if len(intercept_idx[0])
else np.zeros_like(X._meta, shape=mean.shape)
else zeros(mean.shape, getattr(X, "_meta", None))
)
Xn = (X - mean) / std
out = algo(Xn, y, *args, **kwargs).copy()
Expand All @@ -39,6 +39,13 @@ def normalize_inputs(X, y, *args, **kwargs):
return normalize_inputs


def zeros(shape, arr=None, **kwargs):
if arr:
return np.zeros_like(arr, shape=shape, **kwargs)
else:
return np.zeros(shape=shape, **kwargs)


def sigmoid(x):
"""Sigmoid function of x."""
return 1 / (1 + exp(-x))
Expand Down
Loading