"""K-means clustering"""
# Authors: Gael Varoquaux <gael.varoquaux@normalesup.org>
# Thomas Rueckstiess <ruecksti@in.tum.de>
# James Bergstra <james.bergstra@umontreal.ca>
# Jan Schlueter <scikit-learn@jan-schlueter.de>
# Nelle Varoquaux
# Peter Prettenhofer <peter.prettenhofer@gmail.com>
# Olivier Grisel <olivier.grisel@ensta.org>
# Mathieu Blondel <mathieu@mblondel.org>
# Robert Layton <robertlayton@gmail.com>
# License: BSD 3 clause
import warnings
import numpy as np
import scipy.sparse as sp
from threadpoolctl import threadpool_limits
from ..base import BaseEstimator, ClusterMixin, TransformerMixin
from ..metrics.pairwise import euclidean_distances
from ..utils.extmath import row_norms, stable_cumsum
from ..utils.sparsefuncs_fast import assign_rows_csr
from ..utils.sparsefuncs import mean_variance_axis
from ..utils.validation import _num_samples, _deprecate_positional_args
from ..utils import check_array
from ..utils import gen_batches
from ..utils import check_random_state
from ..utils.validation import check_is_fitted, _check_sample_weight
from ..utils._openmp_helpers import _openmp_effective_n_threads
from ..exceptions import ConvergenceWarning
from ._k_means_fast import _inertia_dense
from ._k_means_fast import _inertia_sparse
from ._k_means_fast import _mini_batch_update_csr
from ._k_means_lloyd import lloyd_iter_chunked_dense
from ._k_means_lloyd import lloyd_iter_chunked_sparse
from ._k_means_elkan import init_bounds_dense
from ._k_means_elkan import init_bounds_sparse
from ._k_means_elkan import elkan_iter_chunked_dense
from ._k_means_elkan import elkan_iter_chunked_sparse
###############################################################################
# Initialization heuristic
def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
"""Init n_clusters seeds according to k-means++
Parameters
----------
X : {ndarray, sparse matrix} of shape (n_samples, n_features)
The data to pick seeds for. To avoid memory copy, the input data
should be double precision (dtype=np.float64).
n_clusters : int
The number of seeds to choose
x_squared_norms : ndarray of shape (n_samples,)
Squared Euclidean norm of each data point.
random_state : RandomState instance
The generator used to initialize the centers.
See :term:`Glossary <random_state>`.
n_local_trials : int, default=None
The number of seeding trials for each center (except the first),
of which the one reducing inertia the most is greedily chosen.
Set to None to make the number of trials depend logarithmically
on the number of seeds (2+log(k)); this is the default.
Notes
-----
Selects initial cluster centers for k-mean clustering in a smart way
to speed up convergence. see: Arthur, D. and Vassilvitskii, S.
"k-means++: the advantages of careful seeding". ACM-SIAM symposium
on Discrete algorithms. 2007
Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip,
which is the implementation used in the aforementioned paper.
"""
n_samples, n_features = X.shape
centers = np.empty((n_clusters, n_features), dtype=X.dtype)
assert x_squared_norms is not None, 'x_squared_norms None in _k_init'
# Set the number of local seeding trials if none is given
if n_local_trials is None:
# This is what Arthur/Vassilvitskii tried, but did not report
# specific results for other than mentioning in the conclusion
# that it helped.
n_local_trials = 2 + int(np.log(n_clusters))
# Pick first center randomly
center_id = random_state.randint(n_samples)
if sp.issparse(X):
centers[0] = X[center_id].toarray()
else:
centers[0] = X[center_id]
# Initialize list of closest distances and calculate current potential
closest_dist_sq = euclidean_distances(
centers[0, np.newaxis], X, Y_norm_squared=x_squared_norms,
squared=True)
current_pot = closest_dist_sq.sum()
# Pick the remaining n_clusters-1 points
for c in range(1, n_clusters):
# Choose center candidates by sampling with probability proportional
# to the squared distance to the closest existing center
rand_vals = random_state.random_sample(n_local_trials) * current_pot
candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq),
rand_vals)
# XXX: numerical imprecision can result in a candidate_id out of range
np.clip(candidate_ids, None, closest_dist_sq.size - 1,
out=candidate_ids)
# Compute distances to center candidates
distance_to_candidates = euclidean_distances(
X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True)
# update closest distances squared and potential for each candidate
np.minimum(closest_dist_sq, distance_to_candidates,
out=distance_to_candidates)
candidates_pot = distance_to_candidates.sum(axis=1)
# Decide which candidate is the best
best_candidate = np.argmin(candidates_pot)
current_pot = candidates_pot[best_candidate]
closest_dist_sq = distance_to_candidates[best_candidate]
best_candidate = candidate_ids[best_candidate]
# Permanently add best center candidate found in local tries
if sp.issparse(X):
centers[c] = X[best_candidate].toarray()
else:
centers[c] = X[best_candidate]
return centers
###############################################################################
# K-means batch estimation by EM (expectation maximization)
def _validate_center_shape(X, n_centers, centers):
"""Check if centers is compatible with X and n_centers"""
if len(centers) != n_centers:
raise ValueError('The shape of the initial centers (%s) '
'does not match the number of clusters %i'
% (centers.shape, n_centers))
if centers.shape[1] != X.shape[1]:
raise ValueError(
"The number of features of the initial centers %s "
"does not match the number of features of the data %s."
% (centers.shape[1], X.shape[1]))
def _tolerance(X, tol):
"""Return a tolerance which is independent of the dataset"""
if tol == 0:
return 0
if sp.issparse(X):
variances = mean_variance_axis(X, axis=0)[1]
else:
variances = np.var(X, axis=0)
return np.mean(variances) * tol
def _check_normalize_sample_weight(sample_weight, X):
"""Set sample_weight if None, and check for correct dtype"""
sample_weight_was_none = sample_weight is None
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
if not sample_weight_was_none:
# normalize the weights to sum up to n_samples
# an array of 1 (i.e. samples_weight is None) is already normalized
n_samples = len(sample_weight)
scale = n_samples / sample_weight.sum()
sample_weight = sample_weight * scale
return sample_weight
@_deprecate_positional_args
def k_means(X, n_clusters, *, sample_weight=None, init='k-means++',
precompute_distances='deprecated', n_init=10, max_iter=300,
verbose=False, tol=1e-4, random_state=None, copy_x=True,
n_jobs='deprecated', algorithm="auto", return_n_iter=False):
"""K-means clustering algorithm.
Read more in the :ref:`User Guide <k_means>`.
Parameters
----------
X : {array-like, sparse} matrix of shape (n_samples, n_features)
The observations to cluster. It must be noted that the data
will be converted to C ordering, which will cause a memory copy
if the given data is not C-contiguous.
n_clusters : int
The number of clusters to form as well as the number of
centroids to generate.
sample_weight : array-like of shape (n_samples,), default=None
The weights for each observation in X. If None, all observations
are assigned equal weight
init : {'k-means++', 'random', ndarray, callable}, default='k-means++'
Method for initialization:
'k-means++' : selects initial cluster centers for k-mean
clustering in a smart way to speed up convergence. See section
Notes in k_init for more details.
'random': choose `n_clusters` observations (rows) at random from data
for the initial centroids.
If an ndarray is passed, it should be of shape (n_clusters, n_features)
and gives the initial centers.
If a callable is passed, it should take arguments X, n_clusters and a
random state and return an initialization.
precompute_distances : {'auto', True, False}
Precompute distances (faster but takes more memory).
'auto' : do not precompute distances if n_samples * n_clusters > 12
million. This corresponds to about 100MB overhead per job using
double precision.
True : always precompute distances
False : never precompute distances
.. deprecated:: 0.23
'precompute_distances' was deprecated in version 0.23 and will be
removed in 0.25. It has no effect.
n_init : int, default=10
Number of time the k-means algorithm will be run with different
centroid seeds. The final results will be the best output of
n_init consecutive runs in terms of inertia.
max_iter : int, default=300
Maximum number of iterations of the k-means algorithm to run.
verbose : bool, default=False
Verbosity mode.
tol : float, default=1e-4
Relative tolerance with regards to Frobenius norm of the difference
in the cluster centers of two consecutive iterations to declare
convergence.
It's not advised to set `tol=0` since convergence might never be
declared due to rounding errors. Use a very small number instead.
random_state : int, RandomState instance, default=None
Determines random number generation for centroid initialization. Use
an int to make the randomness deterministic.
See :term:`Glossary <random_state>`.
copy_x : bool, default=True
When pre-computing distances it is more numerically accurate to center
the data first. If copy_x is True (default), then the original data is
not modified. If False, the original data is modified, and put back
before the function returns, but small numerical differences may be
introduced by subtracting and then adding the data mean. Note that if
the original data is not C-contiguous, a copy will be made even if
copy_x is False. If the original data is sparse, but not in CSR format,
a copy will be made even if copy_x is False.
n_jobs : int, default=None
The number of OpenMP threads to use for the computation. Parallelism is
sample-wise on the main cython loop which assigns each sample to its
closest center.
``None`` or ``-1`` means using all processors.
.. deprecated:: 0.23
``n_jobs`` was deprecated in version 0.23 and will be removed in
0.25.
algorithm : {"auto", "full", "elkan"}, default="auto"
K-means algorithm to use. The classical EM-style algorithm is "full".
The "elkan" variation is more efficient on data with well-defined
clusters, by using the triangle inequality. However it's more memory
intensive due to the allocation of an extra array of shape
(n_samples, n_clusters).
For now "auto" (kept for backward compatibiliy) chooses "elkan" but it
might change in the future for a better heuristic.
return_n_iter : bool, default=False
Whether or not to return the number of iterations.
Returns
-------
centroid : ndarray of shape (n_clusters, n_features)
Centroids found at the last iteration of k-means.
label : ndarray of shape (n_samples,)
label[i] is the code or index of the centroid the
i'th observation is closest to.
inertia : float
The final value of the inertia criterion (sum of squared distances to
the closest centroid for all observations in the training set).
best_n_iter : int
Number of iterations corresponding to the best results.
Returned only if `return_n_iter` is set to True.
"""
est = KMeans(
n_clusters=n_clusters, init=init, n_init=n_init, max_iter=max_iter,
verbose=verbose, precompute_distances=precompute_distances, tol=tol,
random_state=random_state, copy_x=copy_x, n_jobs=n_jobs,
algorithm=algorithm
).fit(X, sample_weight=sample_weight)
if return_n_iter:
return est.cluster_centers_, est.labels_, est.inertia_, est.n_iter_
else:
return est.cluster_centers_, est.labels_, est.inertia_
def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300,
init='k-means++', verbose=False, x_squared_norms=None,
random_state=None, tol=1e-4, n_threads=1):
"""A single run of k-means lloyd, assumes preparation completed prior.
Parameters
----------
X : {ndarray, sparse matrix} of shape (n_samples, n_features)
The observations to cluster. If sparse matrix, must be in CSR format.
sample_weight : array-like of shape (n_samples,)
The weights for each observation in X.
n_clusters : int
The number of clusters to form as well as the number of
centroids to generate.
max_iter : int, default=300
Maximum number of iterations of the k-means algorithm to run.
Loading ...