Introduce a center param to PCA (dask#734)
hristog committed Mar 20, 2021
1 parent 027c978 commit 468862b
Showing 4 changed files with 382 additions and 27 deletions.
7 changes: 7 additions & 0 deletions dask_ml/decomposition/incremental_pca.py
@@ -127,14 +127,21 @@ def __init__(
         self,
         n_components=None,
         whiten=False,
+        center=True,
         copy=True,
         batch_size=None,
         svd_solver="auto",
         iterated_power=0,
         random_state=None,
     ):
+        if center is False:
+            raise NotImplementedError(
+                "IncrementalPCA with center=False is not supported."
+            )
+
         self.n_components = n_components
         self.whiten = whiten
+        self.center = center
         self.copy = copy
         self.batch_size = batch_size
         self.svd_solver = svd_solver
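As a quick illustration of the new guard (a minimal sketch, assuming a dask-ml build that contains this commit), constructing ``IncrementalPCA`` with ``center=False`` now fails fast:

    from dask_ml.decomposition import IncrementalPCA

    # center=True, the default, behaves exactly as before this commit.
    IncrementalPCA(n_components=2)

    # center=False is rejected up front; the incremental algorithm tracks a
    # running mean across batches, so skipping centering is not straightforward.
    try:
        IncrementalPCA(n_components=2, center=False)
    except NotImplementedError as err:
        print(err)  # IncrementalPCA with center=False is not supported.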
93 changes: 72 additions & 21 deletions dask_ml/decomposition/pca.py
@@ -1,4 +1,5 @@
 import numbers
+import warnings

 import dask
 import dask.array as da
@@ -60,6 +61,21 @@ class PCA(sklearn.decomposition.PCA):
         improve the predictive accuracy of the downstream estimators by
         making their data respect some hard-wired assumptions.

+    center : bool, optional (default True)
+        When True (the default), the underlying data is centered at zero
+        by subtracting the mean of the data from the data itself.
+        PCA is performed on centered data because it is, in effect, a
+        regression model without an intercept; as such, its principal
+        components originate at the origin of the transformed space.
+        ``center=False`` may be used when performing PCA on data that has
+        already been centered.
+        Since centering is a required step of whitening, combining
+        ``center=False`` with ``whiten=True`` may result in unexpected
+        behavior if the data has not previously been centered.
+
     svd_solver : string {'auto', 'full', 'tsqr', 'randomized'}
         auto :
             the solver is selected by a default policy based on `X.shape` and
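To make the docstring's caveat concrete, here is a small illustration (not part of the diff; it assumes only dask and numpy) of why uncentered data misleads an SVD-based PCA:

    import dask.array as da
    import numpy as np

    rng = np.random.RandomState(0)
    # Two features with very different variances, offset far from the origin.
    X = rng.normal(loc=5.0, scale=[3.0, 0.5], size=(1000, 2))
    dX = da.from_array(X, chunks=(250, 2))

    # SVD of the raw data: the leading singular vector chases the offset
    # (roughly the direction of the mean), not the variance.
    _, _, Vt_raw = da.linalg.svd(dX)

    # SVD of the centered data: the leading singular vector recovers the
    # high-variance axis, which is what PCA is meant to find.
    _, _, Vt_centered = da.linalg.svd(dX - dX.mean(axis=0))

    print(Vt_raw[0].compute())       # approximately [0.71, 0.71], up to sign
    print(Vt_centered[0].compute())  # approximately [1.0, 0.0], up to sign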
@@ -149,21 +165,28 @@ class PCA(sklearn.decomposition.PCA):
     >>> dX = da.from_array(X, chunks=X.shape)
     >>> pca = PCA(n_components=2)
     >>> pca.fit(dX)
-    PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
-        svd_solver='auto', tol=0.0, whiten=False)
+    PCA(n_components=2)
     >>> print(pca.explained_variance_ratio_)  # doctest: +ELLIPSIS
-    [ 0.99244...  0.00755...]
+    [0.99244289 0.00755711]
     >>> print(pca.singular_values_)  # doctest: +ELLIPSIS
-    [ 6.30061...  0.54980...]
+    [6.30061232 0.54980396]

     >>> pca = PCA(n_components=2, svd_solver='full')
     >>> pca.fit(dX)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
-    PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
-        svd_solver='full', tol=0.0, whiten=False)
+    PCA(n_components=2, svd_solver='full')
     >>> print(pca.explained_variance_ratio_)  # doctest: +ELLIPSIS
-    [ 0.99244...  0.00755...]
+    [0.99244289 0.00755711]
     >>> print(pca.singular_values_)  # doctest: +ELLIPSIS
-    [ 6.30061...  0.54980...]
+    [6.30061232 0.54980396]
+
+    >>> dX_mean_0 = dX - dX.mean(axis=0)
+    >>> pca = PCA(n_components=2, svd_solver='full', center=False)
+    >>> pca.fit(dX_mean_0)
+    PCA(center=False, n_components=2, svd_solver='full')
+    >>> print(pca.explained_variance_ratio_)  # doctest: +ELLIPSIS
+    [0.99244289 0.00755711]
+    >>> print(pca.singular_values_)  # doctest: +ELLIPSIS
+    [6.30061232 0.54980396]

     Notes
     -----
@@ -175,13 +198,16 @@ class PCA(sklearn.decomposition.PCA):
       ``dask.linalg.svd_compressed``.
     * n_components : ``n_components='mle'`` is not allowed.
       Fractional ``n_components`` between 0 and 1 is not allowed.
+    * center : defaults to ``True`` and controls whether the data is
+      implicitly centered as part of the PCA model.
     """

     def __init__(
         self,
         n_components=None,
         copy=True,
         whiten=False,
+        center=True,
         svd_solver="auto",
         tol=0.0,
         iterated_power=0,
@@ -190,11 +216,19 @@ def __init__(
         self.n_components = n_components
         self.copy = copy
         self.whiten = whiten
+        self.center = center
         self.svd_solver = svd_solver
         self.tol = tol
         self.iterated_power = iterated_power
         self.random_state = random_state

+        if whiten and not center:
+            warnings.warn(
+                "Whitening requires centering. Please ensure that your data "
+                "is already centered in order to avoid unexpected behavior.",
+                RuntimeWarning,
+            )
+
     def fit(self, X, y=None):
         if not dask.is_dask_collection(X):
             raise TypeError(_TYPE_MSG.format(type(X)))
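Note that the warning is raised at construction time, not at fit time. A minimal sketch of the new warning path (a hypothetical session, assuming this commit is installed):

    import warnings

    from dask_ml.decomposition import PCA

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        PCA(n_components=2, whiten=True, center=False)

    # One RuntimeWarning: "Whitening requires centering. ..."
    print(caught[0].category.__name__, "-", caught[0].message)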
@@ -266,8 +300,10 @@ def _fit(self, X):

         solver = self._get_solver(X, n_components)

-        self.mean_ = X.mean(0)
-        X -= self.mean_
+        self.mean_ = X.mean(axis=0)
+
+        if self.center:
+            X -= self.mean_

         if solver in {"full", "tsqr"}:
             U, S, V = da.linalg.svd(X)
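For readers skimming the hunk, a self-contained restatement of the full-solver path in ``_fit`` (an illustrative sketch, not the library's code; note that ``mean_`` is still computed even when centering is skipped, so downstream methods can reference it):

    import dask.array as da

    def pca_via_svd(X, n_components, center=True):
        # PCA reduces to an SVD of the (optionally centered) data matrix.
        mean = X.mean(axis=0)  # computed unconditionally, like mean_ above
        if center:
            X = X - mean
        U, S, Vt = da.linalg.svd(X)
        components = Vt[:n_components]
        # Sample variance explained by each retained component.
        explained_variance = (S[:n_components] ** 2) / (X.shape[0] - 1)
        return components, explained_variance, mean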
@@ -370,14 +406,20 @@ def transform(self, X):
         X_new : array-like, shape (n_samples, n_components)
         """
-        check_is_fitted(self, ["mean_", "components_"])
+        check_is_fitted(self, "components_")
+
+        if self.whiten:
+            check_is_fitted(self, "explained_variance_")
+
+        if self.center:
+            check_is_fitted(self, "mean_")
+            if self.mean_ is not None:
+                X -= self.mean_

-        # X = check_array(X)
-        if self.mean_ is not None:
-            X = X - self.mean_
         X_transformed = da.dot(X, self.components_.T)
         if self.whiten:
             X_transformed /= np.sqrt(self.explained_variance_)

         return X_transformed

     def fit_transform(self, X, y=None):
@@ -396,7 +438,6 @@ def fit_transform(self, X, y=None):
         X_new : array-like, shape (n_samples, n_components)
         """
-        # X = check_array(X)
         if not dask.is_dask_collection(X):
             raise TypeError(_TYPE_MSG.format(type(X)))
         U, S, V = self._fit(X)
@@ -431,18 +472,25 @@ def inverse_transform(self, X):
         If whitening is enabled, inverse_transform does not compute the
         exact inverse operation of transform.
         """
-        check_is_fitted(self, "mean_")
+        check_is_fitted(self, "components_")
+
+        if self.center:
+            check_is_fitted(self, "mean_")
+            offset = self.mean_
+        else:
+            offset = 0

         if self.whiten:
             check_is_fitted(self, "explained_variance_")
             return (
                 da.dot(
                     X,
                     np.sqrt(self.explained_variance_[:, np.newaxis]) * self.components_,
                 )
-                + self.mean_
+                + offset
             )
-        else:
-            return da.dot(X, self.components_) + self.mean_
+
+        return da.dot(X, self.components_) + offset

     def score_samples(self, X):
         """Return the log-likelihood of each sample.
@@ -463,8 +511,11 @@ def score_samples(self, X):
         """
         check_is_fitted(self, "mean_")

-        # X = check_array(X)
-        Xr = X - self.mean_
+        if self.center:
+            Xr = X - self.mean_
+        else:
+            Xr = X
+
         n_features = X.shape[1]
         precision = self.get_precision()  # [n_features, n_features]
         log_like = -0.5 * (Xr * (da.dot(Xr, precision))).sum(axis=1)
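For reference, the visible lines compute the quadratic, data-dependent term of the standard probabilistic-PCA log-likelihood; the constant terms are presumably added in the lines truncated from this hunk, as in the scikit-learn implementation this mirrors. With precision matrix ``P = self.get_precision()``, ``d = n_features``, and the mean subtracted only when ``center=True``, each sample ``x`` scores:

    log p(x) = -0.5 * (d * log(2 * pi) - log det(P) + (x - mean)' P (x - mean))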
5 changes: 5 additions & 0 deletions tests/test_incremental_pca.py
@@ -475,3 +475,8 @@ def test_incremental_pca_partial_fit_float_division():
     np.testing.assert_allclose(
         singular_vals_float_samples_seen, singular_vals_int_samples_seen
     )
+
+
+def test_incremental_pca_no_centering_not_supported():
+    with pytest.raises(NotImplementedError, match="not supported"):
+        IncrementalPCA(n_components=2, center=False)
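A companion test for the ``PCA`` warning path would look similar (a hypothetical sketch; the diff of the fourth changed file is not shown on this page, so this is not taken from the commit):

    import pytest

    from dask_ml.decomposition import PCA


    def test_pca_whiten_without_center_warns():
        with pytest.warns(RuntimeWarning, match="Whitening requires centering"):
            PCA(n_components=2, whiten=True, center=False)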