Repository URL to install this package:
|
Version:
0.17.1 ▾
|
""" Non-negative matrix factorization
"""
# Author: Vlad Niculae
# Lars Buitinck <L.J.Buitinck@uva.nl>
# Mathieu Blondel <mathieu@mblondel.org>
# Tom Dupre la Tour
# Author: Chih-Jen Lin, National Taiwan University (original projected gradient
# NMF implementation)
# Author: Anthony Di Franco (Projected gradient, Python and NumPy port)
# License: BSD 3 clause
from __future__ import division
from math import sqrt
import warnings
import numbers
import numpy as np
import scipy.sparse as sp
from ..externals import six
from ..base import BaseEstimator, TransformerMixin
from ..utils import check_random_state, check_array
from ..utils.extmath import randomized_svd, safe_sparse_dot, squared_norm
from ..utils.extmath import fast_dot
from ..utils.validation import check_is_fitted, check_non_negative
from ..utils import deprecated
from ..utils import ConvergenceWarning
from .cdnmf_fast import _update_cdnmf_fast
def safe_vstack(Xs):
if any(sp.issparse(X) for X in Xs):
return sp.vstack(Xs)
else:
return np.vstack(Xs)
def norm(x):
"""Dot product-based Euclidean norm implementation
See: http://fseoane.net/blog/2011/computing-the-vector-norm/
"""
return sqrt(squared_norm(x))
def trace_dot(X, Y):
"""Trace of np.dot(X, Y.T)."""
return np.dot(X.ravel(), Y.ravel())
def _sparseness(x):
"""Hoyer's measure of sparsity for a vector"""
sqrt_n = np.sqrt(len(x))
return (sqrt_n - np.linalg.norm(x, 1) / norm(x)) / (sqrt_n - 1)
def _check_init(A, shape, whom):
A = check_array(A)
if np.shape(A) != shape:
raise ValueError('Array with wrong shape passed to %s. Expected %s, '
'but got %s ' % (whom, shape, np.shape(A)))
check_non_negative(A, whom)
if np.max(A) == 0:
raise ValueError('Array passed to %s is full of zeros.' % whom)
def _safe_compute_error(X, W, H):
"""Frobenius norm between X and WH, safe for sparse array"""
if not sp.issparse(X):
error = norm(X - np.dot(W, H))
else:
norm_X = np.dot(X.data, X.data)
norm_WH = trace_dot(np.dot(np.dot(W.T, W), H), H)
cross_prod = trace_dot((X * H.T), W)
error = sqrt(norm_X + norm_WH - 2. * cross_prod)
return error
def _check_string_param(sparseness, solver):
allowed_sparseness = (None, 'data', 'components')
if sparseness not in allowed_sparseness:
raise ValueError(
'Invalid sparseness parameter: got %r instead of one of %r' %
(sparseness, allowed_sparseness))
allowed_solver = ('pg', 'cd')
if solver not in allowed_solver:
raise ValueError(
'Invalid solver parameter: got %r instead of one of %r' %
(solver, allowed_solver))
def _initialize_nmf(X, n_components, init=None, eps=1e-6,
random_state=None):
"""Algorithms for NMF initialization.
Computes an initial guess for the non-negative
rank k matrix approximation for X: X = WH
Parameters
----------
X : array-like, shape (n_samples, n_features)
The data matrix to be decomposed.
n_components : integer
The number of components desired in the approximation.
init : None | 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar'
Method used to initialize the procedure.
Default: 'nndsvdar' if n_components < n_features, otherwise 'random'.
Valid options:
- 'random': non-negative random matrices, scaled with:
sqrt(X.mean() / n_components)
- 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)
initialization (better for sparseness)
- 'nndsvda': NNDSVD with zeros filled with the average of X
(better when sparsity is not desired)
- 'nndsvdar': NNDSVD with zeros filled with small random values
(generally faster, less accurate alternative to NNDSVDa
for when sparsity is not desired)
- 'custom': use custom matrices W and H
eps : float
Truncate all values less then this in output to zero.
random_state : int seed, RandomState instance, or None (default)
Random number generator seed control, used in 'nndsvdar' and
'random' modes.
Returns
-------
W : array-like, shape (n_samples, n_components)
Initial guesses for solving X ~= WH
H : array-like, shape (n_components, n_features)
Initial guesses for solving X ~= WH
References
----------
C. Boutsidis, E. Gallopoulos: SVD based initialization: A head start for
nonnegative matrix factorization - Pattern Recognition, 2008
http://tinyurl.com/nndsvd
"""
check_non_negative(X, "NMF initialization")
n_samples, n_features = X.shape
if init is None:
if n_components < n_features:
init = 'nndsvd'
else:
init = 'random'
# Random initialization
if init == 'random':
avg = np.sqrt(X.mean() / n_components)
rng = check_random_state(random_state)
H = avg * rng.randn(n_components, n_features)
W = avg * rng.randn(n_samples, n_components)
# we do not write np.abs(H, out=H) to stay compatible with
# numpy 1.5 and earlier where the 'out' keyword is not
# supported as a kwarg on ufuncs
np.abs(H, H)
np.abs(W, W)
return W, H
# NNDSVD initialization
U, S, V = randomized_svd(X, n_components, random_state=random_state)
W, H = np.zeros(U.shape), np.zeros(V.shape)
# The leading singular triplet is non-negative
# so it can be used as is for initialization.
W[:, 0] = np.sqrt(S[0]) * np.abs(U[:, 0])
H[0, :] = np.sqrt(S[0]) * np.abs(V[0, :])
for j in range(1, n_components):
x, y = U[:, j], V[j, :]
# extract positive and negative parts of column vectors
x_p, y_p = np.maximum(x, 0), np.maximum(y, 0)
x_n, y_n = np.abs(np.minimum(x, 0)), np.abs(np.minimum(y, 0))
# and their norms
x_p_nrm, y_p_nrm = norm(x_p), norm(y_p)
x_n_nrm, y_n_nrm = norm(x_n), norm(y_n)
m_p, m_n = x_p_nrm * y_p_nrm, x_n_nrm * y_n_nrm
# choose update
if m_p > m_n:
u = x_p / x_p_nrm
v = y_p / y_p_nrm
sigma = m_p
else:
u = x_n / x_n_nrm
v = y_n / y_n_nrm
sigma = m_n
lbd = np.sqrt(S[j] * sigma)
W[:, j] = lbd * u
H[j, :] = lbd * v
W[W < eps] = 0
H[H < eps] = 0
if init == "nndsvd":
pass
elif init == "nndsvda":
avg = X.mean()
W[W == 0] = avg
H[H == 0] = avg
elif init == "nndsvdar":
rng = check_random_state(random_state)
avg = X.mean()
W[W == 0] = abs(avg * rng.randn(len(W[W == 0])) / 100)
H[H == 0] = abs(avg * rng.randn(len(H[H == 0])) / 100)
else:
raise ValueError(
'Invalid init parameter: got %r instead of one of %r' %
(init, (None, 'random', 'nndsvd', 'nndsvda', 'nndsvdar')))
return W, H
def _nls_subproblem(V, W, H, tol, max_iter, alpha=0., l1_ratio=0.,
sigma=0.01, beta=0.1):
"""Non-negative least square solver
Solves a non-negative least squares subproblem using the projected
gradient descent algorithm.
Parameters
----------
V : array-like, shape (n_samples, n_features)
Constant matrix.
W : array-like, shape (n_samples, n_components)
Constant matrix.
H : array-like, shape (n_components, n_features)
Initial guess for the solution.
tol : float
Tolerance of the stopping condition.
max_iter : int
Maximum number of iterations before timing out.
alpha : double, default: 0.
Constant that multiplies the regularization terms. Set it to zero to
have no regularization.
l1_ratio : double, default: 0.
The regularization mixing parameter, with 0 <= l1_ratio <= 1.
For l1_ratio = 0 the penalty is an L2 penalty.
For l1_ratio = 1 it is an L1 penalty.
For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.
sigma : float
Constant used in the sufficient decrease condition checked by the line
search. Smaller values lead to a looser sufficient decrease condition,
thus reducing the time taken by the line search, but potentially
increasing the number of iterations of the projected gradient
procedure. 0.01 is a commonly used value in the optimization
literature.
beta : float
Factor by which the step size is decreased (resp. increased) until
(resp. as long as) the sufficient decrease condition is satisfied.
Larger values allow to find a better step size but lead to longer line
search. 0.1 is a commonly used value in the optimization literature.
Returns
-------
H : array-like, shape (n_components, n_features)
Solution to the non-negative least squares problem.
grad : array-like, shape (n_components, n_features)
The gradient.
n_iter : int
The number of iterations done by the algorithm.
References
----------
C.-J. Lin. Projected gradient methods for non-negative matrix
factorization. Neural Computation, 19(2007), 2756-2779.
http://www.csie.ntu.edu.tw/~cjlin/nmf/
"""
WtV = safe_sparse_dot(W.T, V)
WtW = fast_dot(W.T, W)
# values justified in the paper (alpha is renamed gamma)
gamma = 1
for n_iter in range(1, max_iter + 1):
grad = np.dot(WtW, H) - WtV
if alpha > 0 and l1_ratio == 1.:
grad += alpha
elif alpha > 0:
grad += alpha * (l1_ratio + (1 - l1_ratio) * H)
# The following multiplication with a boolean array is more than twice
# as fast as indexing into grad.
if norm(grad * np.logical_or(grad < 0, H > 0)) < tol:
break
Hp = H
for inner_iter in range(20):
# Gradient step.
Hn = H - gamma * grad
# Projection step.
Hn *= Hn > 0
d = Hn - H
gradd = np.dot(grad.ravel(), d.ravel())
dQd = np.dot(np.dot(WtW, d).ravel(), d.ravel())
suff_decr = (1 - sigma) * gradd + 0.5 * dQd < 0
if inner_iter == 0:
decr_gamma = not suff_decr
if decr_gamma:
if suff_decr:
H = Hn
break
else:
gamma *= beta
elif not suff_decr or (Hp == Hn).all():
H = Hp
break
else:
gamma /= beta
Hp = Hn
if n_iter == max_iter:
warnings.warn("Iteration limit reached in nls subproblem.")
return H, grad, n_iter
def _update_projected_gradient_w(X, W, H, tolW, nls_max_iter, alpha, l1_ratio,
sparseness, beta, eta):
"""Helper function for _fit_projected_gradient"""
n_samples, n_features = X.shape
n_components_ = H.shape[0]
if sparseness is None:
Wt, gradW, iterW = _nls_subproblem(X.T, H.T, W.T, tolW, nls_max_iter,
alpha=alpha, l1_ratio=l1_ratio)
elif sparseness == 'data':
Wt, gradW, iterW = _nls_subproblem(
safe_vstack([X.T, np.zeros((1, n_samples))]),
safe_vstack([H.T, np.sqrt(beta) * np.ones((1,
n_components_))]),
W.T, tolW, nls_max_iter, alpha=alpha, l1_ratio=l1_ratio)
elif sparseness == 'components':
Wt, gradW, iterW = _nls_subproblem(
safe_vstack([X.T,
np.zeros((n_components_, n_samples))]),
safe_vstack([H.T,
np.sqrt(eta) * np.eye(n_components_)]),
W.T, tolW, nls_max_iter, alpha=alpha, l1_ratio=l1_ratio)
return Wt.T, gradW.T, iterW
def _update_projected_gradient_h(X, W, H, tolH, nls_max_iter, alpha, l1_ratio,
sparseness, beta, eta):
"""Helper function for _fit_projected_gradient"""
n_samples, n_features = X.shape
n_components_ = W.shape[1]
if sparseness is None:
H, gradH, iterH = _nls_subproblem(X, W, H, tolH, nls_max_iter,
alpha=alpha, l1_ratio=l1_ratio)
elif sparseness == 'data':
H, gradH, iterH = _nls_subproblem(
safe_vstack([X, np.zeros((n_components_, n_features))]),
safe_vstack([W,
np.sqrt(eta) * np.eye(n_components_)]),
H, tolH, nls_max_iter, alpha=alpha, l1_ratio=l1_ratio)
elif sparseness == 'components':
H, gradH, iterH = _nls_subproblem(
safe_vstack([X, np.zeros((1, n_features))]),
safe_vstack([W,
np.sqrt(beta)
* np.ones((1, n_components_))]),
H, tolH, nls_max_iter, alpha=alpha, l1_ratio=l1_ratio)
return H, gradH, iterH
def _fit_projected_gradient(X, W, H, tol, max_iter,
nls_max_iter, alpha, l1_ratio,
sparseness, beta, eta):
"""Compute Non-negative Matrix Factorization (NMF) with Projected Gradient
References
----------
C.-J. Lin. Projected gradient methods for non-negative matrix
factorization. Neural Computation, 19(2007), 2756-2779.
http://www.csie.ntu.edu.tw/~cjlin/nmf/
P. Hoyer. Non-negative Matrix Factorization with Sparseness Constraints.
Journal of Machine Learning Research 2004.
"""
gradW = (np.dot(W, np.dot(H, H.T))
- safe_sparse_dot(X, H.T, dense_output=True))
gradH = (np.dot(np.dot(W.T, W), H)
- safe_sparse_dot(W.T, X, dense_output=True))
init_grad = squared_norm(gradW) + squared_norm(gradH.T)
# max(0.001, tol) to force alternating minimizations of W and H
tolW = max(0.001, tol) * np.sqrt(init_grad)
tolH = tolW
for n_iter in range(1, max_iter + 1):
# stopping condition
# as discussed in paper
proj_grad_W = squared_norm(gradW * np.logical_or(gradW < 0, W > 0))
proj_grad_H = squared_norm(gradH * np.logical_or(gradH < 0, H > 0))
if (proj_grad_W + proj_grad_H) / init_grad < tol ** 2:
break
# update W
W, gradW, iterW = _update_projected_gradient_w(X, W, H, tolW,
nls_max_iter,
alpha, l1_ratio,
sparseness, beta, eta)
if iterW == 1:
tolW = 0.1 * tolW
# update H
H, gradH, iterH = _update_projected_gradient_h(X, W, H, tolH,
nls_max_iter,
alpha, l1_ratio,
sparseness, beta, eta)
if iterH == 1:
tolH = 0.1 * tolH
H[H == 0] = 0 # fix up negative zeros
if n_iter == max_iter:
W, _, _ = _update_projected_gradient_w(X, W, H, tol, nls_max_iter,
alpha, l1_ratio, sparseness,
beta, eta)
return W, H, n_iter
def _update_coordinate_descent(X, W, Ht, l1_reg, l2_reg, shuffle,
random_state):
"""Helper function for _fit_coordinate_descent
Update W to minimize the objective function, iterating once over all
coordinates. By symmetry, to update H, one can call
_update_coordinate_descent(X.T, Ht, W, ...)
"""
n_components = Ht.shape[1]
HHt = fast_dot(Ht.T, Ht)
XHt = safe_sparse_dot(X, Ht)
# L2 regularization corresponds to increase the diagonal of HHt
if l2_reg != 0.:
# adds l2_reg only on the diagonal
HHt.flat[::n_components + 1] += l2_reg
# L1 regularization correponds to decrease each element of XHt
if l1_reg != 0.:
XHt -= l1_reg
if shuffle:
permutation = random_state.permutation(n_components)
else:
permutation = np.arange(n_components)
# The following seems to be required on 64-bit Windows w/ Python 3.5.
permutation = np.asarray(permutation, dtype=np.intp)
return _update_cdnmf_fast(W, HHt, XHt, permutation)
def _fit_coordinate_descent(X, W, H, tol=1e-4, max_iter=200, alpha=0.001,
l1_ratio=0., regularization=None, update_H=True,
verbose=0, shuffle=False, random_state=None):
"""Compute Non-negative Matrix Factorization (NMF) with Coordinate Descent
The objective function is minimized with an alternating minimization of W
and H. Each minimization is done with a cyclic (up to a permutation of the
features) Coordinate Descent.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Constant matrix.
W : array-like, shape (n_samples, n_components)
Initial guess for the solution.
H : array-like, shape (n_components, n_features)
Initial guess for the solution.
tol : float, default: 1e-4
Tolerance of the stopping condition.
max_iter : integer, default: 200
Maximum number of iterations before timing out.
alpha : double, default: 0.
Constant that multiplies the regularization terms.
l1_ratio : double, default: 0.
The regularization mixing parameter, with 0 <= l1_ratio <= 1.
For l1_ratio = 0 the penalty is an L2 penalty.
For l1_ratio = 1 it is an L1 penalty.
For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.
regularization : 'both' | 'components' | 'transformation' | None
Select whether the regularization affects the components (H), the
transformation (W), both or none of them.
update_H : boolean, default: True
Set to True, both W and H will be estimated from initial guesses.
Set to False, only W will be estimated.
verbose : integer, default: 0
The verbosity level.
shuffle : boolean, default: False
If true, randomize the order of coordinates in the CD solver.
random_state : integer seed, RandomState instance, or None (default)
Random number generator seed control.
Returns
-------
W : array-like, shape (n_samples, n_components)
Solution to the non-negative least squares problem.
H : array-like, shape (n_components, n_features)
Solution to the non-negative least squares problem.
n_iter : int
The number of iterations done by the algorithm.
References
----------
Cichocki, Andrzej, and P. H. A. N. Anh-Huy. "Fast local algorithms for
large scale nonnegative matrix and tensor factorizations."
IEICE transactions on fundamentals of electronics, communications and
computer sciences 92.3: 708-721, 2009.
"""
# so W and Ht are both in C order in memory
Ht = check_array(H.T, order='C')
X = check_array(X, accept_sparse='csr')
# L1 and L2 regularization
l1_H, l2_H, l1_W, l2_W = 0, 0, 0, 0
if regularization in ('both', 'components'):
alpha = float(alpha)
l1_H = l1_ratio * alpha
l2_H = (1. - l1_ratio) * alpha
if regularization in ('both', 'transformation'):
alpha = float(alpha)
l1_W = l1_ratio * alpha
l2_W = (1. - l1_ratio) * alpha
rng = check_random_state(random_state)
for n_iter in range(max_iter):
violation = 0.
# Update W
violation += _update_coordinate_descent(X, W, Ht, l1_W, l2_W,
shuffle, rng)
# Update H
if update_H:
violation += _update_coordinate_descent(X.T, Ht, W, l1_H, l2_H,
shuffle, rng)
if n_iter == 0:
violation_init = violation
if violation_init == 0:
break
if verbose:
print("violation:", violation / violation_init)
if violation / violation_init <= tol:
if verbose:
print("Converged at iteration", n_iter + 1)
break
return W, Ht.T, n_iter
def non_negative_factorization(X, W=None, H=None, n_components=None,
init='random', update_H=True, solver='cd',
tol=1e-4, max_iter=200, alpha=0., l1_ratio=0.,
regularization=None, random_state=None,
verbose=0, shuffle=False, nls_max_iter=2000,
sparseness=None, beta=1, eta=0.1):
"""Compute Non-negative Matrix Factorization (NMF)
Find two non-negative matrices (W, H) whose product approximates the non-
negative matrix X. This factorization can be used for example for
dimensionality reduction, source separation or topic extraction.
The objective function is::
0.5 * ||X - WH||_Fro^2
+ alpha * l1_ratio * ||vec(W)||_1
+ alpha * l1_ratio * ||vec(H)||_1
+ 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2
+ 0.5 * alpha * (1 - l1_ratio) * ||H||_Fro^2
Where::
||A||_Fro^2 = \sum_{i,j} A_{ij}^2 (Frobenius norm)
||vec(A)||_1 = \sum_{i,j} abs(A_{ij}) (Elementwise L1 norm)
The objective function is minimized with an alternating minimization of W
and H. If H is given and update_H=False, it solves for W only.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Constant matrix.
W : array-like, shape (n_samples, n_components)
If init='custom', it is used as initial guess for the solution.
H : array-like, shape (n_components, n_features)
If init='custom', it is used as initial guess for the solution.
If update_H=False, it is used as a constant, to solve for W only.
n_components : integer
Number of components, if n_components is not set all features
are kept.
init : None | 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' | 'custom'
Method used to initialize the procedure.
Default: 'nndsvd' if n_components < n_features, otherwise random.
Valid options:
- 'random': non-negative random matrices, scaled with:
sqrt(X.mean() / n_components)
- 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)
initialization (better for sparseness)
- 'nndsvda': NNDSVD with zeros filled with the average of X
(better when sparsity is not desired)
- 'nndsvdar': NNDSVD with zeros filled with small random values
(generally faster, less accurate alternative to NNDSVDa
for when sparsity is not desired)
- 'custom': use custom matrices W and H
update_H : boolean, default: True
Set to True, both W and H will be estimated from initial guesses.
Set to False, only W will be estimated.
solver : 'pg' | 'cd'
Numerical solver to use:
'pg' is a (deprecated) Projected Gradient solver.
'cd' is a Coordinate Descent solver.
tol : float, default: 1e-4
Tolerance of the stopping condition.
max_iter : integer, default: 200
Maximum number of iterations before timing out.
alpha : double, default: 0.
Constant that multiplies the regularization terms.
l1_ratio : double, default: 0.
The regularization mixing parameter, with 0 <= l1_ratio <= 1.
For l1_ratio = 0 the penalty is an elementwise L2 penalty
(aka Frobenius Norm).
For l1_ratio = 1 it is an elementwise L1 penalty.
For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.
regularization : 'both' | 'components' | 'transformation' | None
Select whether the regularization affects the components (H), the
transformation (W), both or none of them.
random_state : integer seed, RandomState instance, or None (default)
Random number generator seed control.
verbose : integer, default: 0
The verbosity level.
shuffle : boolean, default: False
If true, randomize the order of coordinates in the CD solver.
nls_max_iter : integer, default: 2000
Number of iterations in NLS subproblem.
Used only in the deprecated 'pg' solver.
sparseness : 'data' | 'components' | None, default: None
Where to enforce sparsity in the model.
Used only in the deprecated 'pg' solver.
beta : double, default: 1
Degree of sparseness, if sparseness is not None. Larger values mean
more sparseness. Used only in the deprecated 'pg' solver.
eta : double, default: 0.1
Degree of correctness to maintain, if sparsity is not None. Smaller
values mean larger error. Used only in the deprecated 'pg' solver.
Returns
-------
W : array-like, shape (n_samples, n_components)
Solution to the non-negative least squares problem.
H : array-like, shape (n_components, n_features)
Solution to the non-negative least squares problem.
n_iter : int
Actual number of iterations.
References
----------
C.-J. Lin. Projected gradient methods for non-negative matrix
factorization. Neural Computation, 19(2007), 2756-2779.
http://www.csie.ntu.edu.tw/~cjlin/nmf/
Cichocki, Andrzej, and P. H. A. N. Anh-Huy. "Fast local algorithms for
large scale nonnegative matrix and tensor factorizations."
IEICE transactions on fundamentals of electronics, communications and
computer sciences 92.3: 708-721, 2009.
"""
X = check_array(X, accept_sparse=('csr', 'csc'))
check_non_negative(X, "NMF (input X)")
_check_string_param(sparseness, solver)
n_samples, n_features = X.shape
if n_components is None:
n_components = n_features
if not isinstance(n_components, six.integer_types) or n_components <= 0:
raise ValueError("Number of components must be positive;"
" got (n_components=%r)" % n_components)
if not isinstance(max_iter, numbers.Number) or max_iter < 0:
raise ValueError("Maximum number of iteration must be positive;"
" got (max_iter=%r)" % max_iter)
if not isinstance(tol, numbers.Number) or tol < 0:
raise ValueError("Tolerance for stopping criteria must be "
"positive; got (tol=%r)" % tol)
# check W and H, or initialize them
if init == 'custom':
_check_init(H, (n_components, n_features), "NMF (input H)")
_check_init(W, (n_samples, n_components), "NMF (input W)")
elif not update_H:
_check_init(H, (n_components, n_features), "NMF (input H)")
W = np.zeros((n_samples, n_components))
else:
W, H = _initialize_nmf(X, n_components, init=init,
random_state=random_state)
if solver == 'pg':
warnings.warn("'pg' solver will be removed in release 0.19."
" Use 'cd' solver instead.", DeprecationWarning)
if update_H: # fit_transform
W, H, n_iter = _fit_projected_gradient(X, W, H, tol,
max_iter,
nls_max_iter,
alpha, l1_ratio,
sparseness,
beta, eta)
else: # transform
W, H, n_iter = _update_projected_gradient_w(X, W, H,
tol, nls_max_iter,
alpha, l1_ratio,
sparseness, beta,
eta)
elif solver == 'cd':
W, H, n_iter = _fit_coordinate_descent(X, W, H, tol,
max_iter,
alpha, l1_ratio,
regularization,
update_H=update_H,
verbose=verbose,
shuffle=shuffle,
random_state=random_state)
else:
raise ValueError("Invalid solver parameter '%s'." % solver)
if n_iter == max_iter:
warnings.warn("Maximum number of iteration %d reached. Increase it to"
" improve convergence." % max_iter, ConvergenceWarning)
return W, H, n_iter
class NMF(BaseEstimator, TransformerMixin):
"""Non-Negative Matrix Factorization (NMF)
Find two non-negative matrices (W, H) whose product approximates the non-
negative matrix X. This factorization can be used for example for
dimensionality reduction, source separation or topic extraction.
The objective function is::
0.5 * ||X - WH||_Fro^2
+ alpha * l1_ratio * ||vec(W)||_1
+ alpha * l1_ratio * ||vec(H)||_1
+ 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2
+ 0.5 * alpha * (1 - l1_ratio) * ||H||_Fro^2
Where::
||A||_Fro^2 = \sum_{i,j} A_{ij}^2 (Frobenius norm)
||vec(A)||_1 = \sum_{i,j} abs(A_{ij}) (Elementwise L1 norm)
The objective function is minimized with an alternating minimization of W
and H.
Read more in the :ref:`User Guide <NMF>`.
Parameters
----------
n_components : int or None
Number of components, if n_components is not set all features
are kept.
init : 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' | 'custom'
Method used to initialize the procedure.
Default: 'nndsvdar' if n_components < n_features, otherwise random.
Valid options:
- 'random': non-negative random matrices, scaled with:
sqrt(X.mean() / n_components)
- 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)
initialization (better for sparseness)
- 'nndsvda': NNDSVD with zeros filled with the average of X
(better when sparsity is not desired)
- 'nndsvdar': NNDSVD with zeros filled with small random values
(generally faster, less accurate alternative to NNDSVDa
for when sparsity is not desired)
- 'custom': use custom matrices W and H
solver : 'pg' | 'cd'
Numerical solver to use:
'pg' is a Projected Gradient solver (deprecated).
'cd' is a Coordinate Descent solver (recommended).
.. versionadded:: 0.17
Coordinate Descent solver.
.. versionchanged:: 0.17
Deprecated Projected Gradient solver.
tol : double, default: 1e-4
Tolerance value used in stopping conditions.
max_iter : integer, default: 200
Number of iterations to compute.
random_state : integer seed, RandomState instance, or None (default)
Random number generator seed control.
alpha : double, default: 0.
Constant that multiplies the regularization terms. Set it to zero to
have no regularization.
.. versionadded:: 0.17
*alpha* used in the Coordinate Descent solver.
l1_ratio : double, default: 0.
The regularization mixing parameter, with 0 <= l1_ratio <= 1.
For l1_ratio = 0 the penalty is an elementwise L2 penalty
(aka Frobenius Norm).
For l1_ratio = 1 it is an elementwise L1 penalty.
For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.
.. versionadded:: 0.17
Regularization parameter *l1_ratio* used in the Coordinate Descent solver.
shuffle : boolean, default: False
If true, randomize the order of coordinates in the CD solver.
.. versionadded:: 0.17
*shuffle* parameter used in the Coordinate Descent solver.
nls_max_iter : integer, default: 2000
Number of iterations in NLS subproblem.
Used only in the deprecated 'pg' solver.
.. versionchanged:: 0.17
Deprecated Projected Gradient solver. Use Coordinate Descent solver
instead.
sparseness : 'data' | 'components' | None, default: None
Where to enforce sparsity in the model.
Used only in the deprecated 'pg' solver.
.. versionchanged:: 0.17
Deprecated Projected Gradient solver. Use Coordinate Descent solver
instead.
beta : double, default: 1
Degree of sparseness, if sparseness is not None. Larger values mean
more sparseness. Used only in the deprecated 'pg' solver.
.. versionchanged:: 0.17
Deprecated Projected Gradient solver. Use Coordinate Descent solver
instead.
eta : double, default: 0.1
Degree of correctness to maintain, if sparsity is not None. Smaller
values mean larger error. Used only in the deprecated 'pg' solver.
.. versionchanged:: 0.17
Deprecated Projected Gradient solver. Use Coordinate Descent solver
instead.
Attributes
----------
components_ : array, [n_components, n_features]
Non-negative components of the data.
reconstruction_err_ : number
Frobenius norm of the matrix difference between
the training data and the reconstructed data from
the fit produced by the model. ``|| X - WH ||_2``
n_iter_ : int
Actual number of iterations.
Examples
--------
>>> import numpy as np
>>> X = np.array([[1,1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])
>>> from sklearn.decomposition import NMF
>>> model = NMF(n_components=2, init='random', random_state=0)
>>> model.fit(X) #doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
NMF(alpha=0.0, beta=1, eta=0.1, init='random', l1_ratio=0.0, max_iter=200,
n_components=2, nls_max_iter=2000, random_state=0, shuffle=False,
solver='cd', sparseness=None, tol=0.0001, verbose=0)
>>> model.components_
array([[ 2.09783018, 0.30560234],
[ 2.13443044, 2.13171694]])
>>> model.reconstruction_err_ #doctest: +ELLIPSIS
0.00115993...
References
----------
C.-J. Lin. Projected gradient methods for non-negative matrix
factorization. Neural Computation, 19(2007), 2756-2779.
http://www.csie.ntu.edu.tw/~cjlin/nmf/
Cichocki, Andrzej, and P. H. A. N. Anh-Huy. "Fast local algorithms for
large scale nonnegative matrix and tensor factorizations."
IEICE transactions on fundamentals of electronics, communications and
computer sciences 92.3: 708-721, 2009.
"""
def __init__(self, n_components=None, init=None, solver='cd',
tol=1e-4, max_iter=200, random_state=None,
alpha=0., l1_ratio=0., verbose=0, shuffle=False,
nls_max_iter=2000, sparseness=None, beta=1, eta=0.1):
self.n_components = n_components
self.init = init
self.solver = solver
self.tol = tol
self.max_iter = max_iter
self.random_state = random_state
self.alpha = alpha
self.l1_ratio = l1_ratio
self.verbose = verbose
self.shuffle = shuffle
if sparseness is not None:
warnings.warn("Controlling regularization through the sparseness,"
" beta and eta arguments is only available"
" for 'pg' solver, which will be removed"
" in release 0.19. Use another solver with L1 or L2"
" regularization instead.", DeprecationWarning)
self.nls_max_iter = nls_max_iter
self.sparseness = sparseness
self.beta = beta
self.eta = eta
def fit_transform(self, X, y=None, W=None, H=None):
"""Learn a NMF model for the data X and returns the transformed data.
This is more efficient than calling fit followed by transform.
Parameters
----------
X: {array-like, sparse matrix}, shape (n_samples, n_features)
Data matrix to be decomposed
W : array-like, shape (n_samples, n_components)
If init='custom', it is used as initial guess for the solution.
H : array-like, shape (n_components, n_features)
If init='custom', it is used as initial guess for the solution.
Attributes
----------
components_ : array-like, shape (n_components, n_features)
Factorization matrix, sometimes called 'dictionary'.
n_iter_ : int
Actual number of iterations for the transform.
Returns
-------
W: array, shape (n_samples, n_components)
Transformed data.
"""
X = check_array(X, accept_sparse=('csr', 'csc'))
W, H, n_iter_ = non_negative_factorization(
X=X, W=W, H=H, n_components=self.n_components,
init=self.init, update_H=True, solver=self.solver,
tol=self.tol, max_iter=self.max_iter, alpha=self.alpha,
l1_ratio=self.l1_ratio, regularization='both',
random_state=self.random_state, verbose=self.verbose,
shuffle=self.shuffle,
nls_max_iter=self.nls_max_iter, sparseness=self.sparseness,
beta=self.beta, eta=self.eta)
if self.solver == 'pg':
self.comp_sparseness_ = _sparseness(H.ravel())
self.data_sparseness_ = _sparseness(W.ravel())
self.reconstruction_err_ = _safe_compute_error(X, W, H)
self.n_components_ = H.shape[0]
self.components_ = H
self.n_iter_ = n_iter_
return W
def fit(self, X, y=None, **params):
"""Learn a NMF model for the data X.
Parameters
----------
X: {array-like, sparse matrix}, shape (n_samples, n_features)
Data matrix to be decomposed
Attributes
----------
components_ : array-like, shape (n_components, n_features)
Factorization matrix, sometimes called 'dictionary'.
n_iter_ : int
Actual number of iterations for the transform.
Returns
-------
self
"""
self.fit_transform(X, **params)
return self
def transform(self, X):
"""Transform the data X according to the fitted NMF model
Parameters
----------
X: {array-like, sparse matrix}, shape (n_samples, n_features)
Data matrix to be transformed by the model
Attributes
----------
n_iter_ : int
Actual number of iterations for the transform.
Returns
-------
W: array, shape (n_samples, n_components)
Transformed data
"""
check_is_fitted(self, 'n_components_')
W, _, n_iter_ = non_negative_factorization(
X=X, W=None, H=self.components_, n_components=self.n_components_,
init=self.init, update_H=False, solver=self.solver,
tol=self.tol, max_iter=self.max_iter, alpha=self.alpha,
l1_ratio=self.l1_ratio, regularization='both',
random_state=self.random_state, verbose=self.verbose,
shuffle=self.shuffle,
nls_max_iter=self.nls_max_iter, sparseness=self.sparseness,
beta=self.beta, eta=self.eta)
self.n_iter_ = n_iter_
return W
@deprecated("It will be removed in release 0.19. Use NMF instead."
"'pg' solver is still available until release 0.19.")
class ProjectedGradientNMF(NMF):
"""Non-Negative Matrix Factorization (NMF)
Find two non-negative matrices (W, H) whose product approximates the non-
negative matrix X. This factorization can be used for example for
dimensionality reduction, source separation or topic extraction.
The objective function is::
0.5 * ||X - WH||_Fro^2
+ alpha * l1_ratio * ||vec(W)||_1
+ alpha * l1_ratio * ||vec(H)||_1
+ 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2
+ 0.5 * alpha * (1 - l1_ratio) * ||H||_Fro^2
Where::
||A||_Fro^2 = \sum_{i,j} A_{ij}^2 (Frobenius norm)
||vec(A)||_1 = \sum_{i,j} abs(A_{ij}) (Elementwise L1 norm)
The objective function is minimized with an alternating minimization of W
and H.
Read more in the :ref:`User Guide <NMF>`.
Parameters
----------
n_components : int or None
Number of components, if n_components is not set all features
are kept.
init : 'random' | 'nndsvd' | 'nndsvda' | 'nndsvdar' | 'custom'
Method used to initialize the procedure.
Default: 'nndsvdar' if n_components < n_features, otherwise random.
Valid options:
- 'random': non-negative random matrices, scaled with:
sqrt(X.mean() / n_components)
- 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)
initialization (better for sparseness)
- 'nndsvda': NNDSVD with zeros filled with the average of X
(better when sparsity is not desired)
- 'nndsvdar': NNDSVD with zeros filled with small random values
(generally faster, less accurate alternative to NNDSVDa
for when sparsity is not desired)
- 'custom': use custom matrices W and H
solver : 'pg' | 'cd'
Numerical solver to use:
'pg' is a Projected Gradient solver (deprecated).
'cd' is a Coordinate Descent solver (recommended).
.. versionadded:: 0.17
Coordinate Descent solver.
.. versionchanged:: 0.17
Deprecated Projected Gradient solver.
tol : double, default: 1e-4
Tolerance value used in stopping conditions.
max_iter : integer, default: 200
Number of iterations to compute.
random_state : integer seed, RandomState instance, or None (default)
Random number generator seed control.
alpha : double, default: 0.
Constant that multiplies the regularization terms. Set it to zero to
have no regularization.
.. versionadded:: 0.17
*alpha* used in the Coordinate Descent solver.
l1_ratio : double, default: 0.
The regularization mixing parameter, with 0 <= l1_ratio <= 1.
For l1_ratio = 0 the penalty is an elementwise L2 penalty
(aka Frobenius Norm).
For l1_ratio = 1 it is an elementwise L1 penalty.
For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.
.. versionadded:: 0.17
Regularization parameter *l1_ratio* used in the Coordinate Descent solver.
shuffle : boolean, default: False
If true, randomize the order of coordinates in the CD solver.
.. versionadded:: 0.17
*shuffle* parameter used in the Coordinate Descent solver.
nls_max_iter : integer, default: 2000
Number of iterations in NLS subproblem.
Used only in the deprecated 'pg' solver.
.. versionchanged:: 0.17
Deprecated Projected Gradient solver. Use Coordinate Descent solver
instead.
sparseness : 'data' | 'components' | None, default: None
Where to enforce sparsity in the model.
Used only in the deprecated 'pg' solver.
.. versionchanged:: 0.17
Deprecated Projected Gradient solver. Use Coordinate Descent solver
instead.
beta : double, default: 1
Degree of sparseness, if sparseness is not None. Larger values mean
more sparseness. Used only in the deprecated 'pg' solver.
.. versionchanged:: 0.17
Deprecated Projected Gradient solver. Use Coordinate Descent solver
instead.
eta : double, default: 0.1
Degree of correctness to maintain, if sparsity is not None. Smaller
values mean larger error. Used only in the deprecated 'pg' solver.
.. versionchanged:: 0.17
Deprecated Projected Gradient solver. Use Coordinate Descent solver
instead.
Attributes
----------
components_ : array, [n_components, n_features]
Non-negative components of the data.
reconstruction_err_ : number
Frobenius norm of the matrix difference between
the training data and the reconstructed data from
the fit produced by the model. ``|| X - WH ||_2``
n_iter_ : int
Actual number of iterations.
Examples
--------
>>> import numpy as np
>>> X = np.array([[1,1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])
>>> from sklearn.decomposition import NMF
>>> model = NMF(n_components=2, init='random', random_state=0)
>>> model.fit(X) #doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
NMF(alpha=0.0, beta=1, eta=0.1, init='random', l1_ratio=0.0, max_iter=200,
n_components=2, nls_max_iter=2000, random_state=0, shuffle=False,
solver='cd', sparseness=None, tol=0.0001, verbose=0)
>>> model.components_
array([[ 2.09783018, 0.30560234],
[ 2.13443044, 2.13171694]])
>>> model.reconstruction_err_ #doctest: +ELLIPSIS
0.00115993...
References
----------
C.-J. Lin. Projected gradient methods for non-negative matrix
factorization. Neural Computation, 19(2007), 2756-2779.
http://www.csie.ntu.edu.tw/~cjlin/nmf/
Cichocki, Andrzej, and P. H. A. N. Anh-Huy. "Fast local algorithms for
large scale nonnegative matrix and tensor factorizations."
IEICE transactions on fundamentals of electronics, communications and
computer sciences 92.3: 708-721, 2009.
"""
def __init__(self, n_components=None, solver='pg', init=None,
tol=1e-4, max_iter=200, random_state=None,
alpha=0., l1_ratio=0., verbose=0,
nls_max_iter=2000, sparseness=None, beta=1, eta=0.1):
super(ProjectedGradientNMF, self).__init__(
n_components=n_components, init=init, solver='pg', tol=tol,
max_iter=max_iter, random_state=random_state, alpha=alpha,
l1_ratio=l1_ratio, verbose=verbose, nls_max_iter=nls_max_iter,
sparseness=sparseness, beta=beta, eta=eta)