"""
K-means clustering and vector quantization (:mod:`scipy.cluster.vq`)
====================================================================
Provides routines for k-means clustering, generating code books
from k-means models, and quantizing vectors by comparing them with
centroids in a code book.
.. autosummary::
:toctree: generated/
whiten -- Normalize a group of observations so each feature has unit variance
vq -- Calculate code book membership of a set of observation vectors
kmeans -- Performs k-means on a set of observation vectors forming k clusters
kmeans2 -- A different implementation of k-means with more methods
-- for initializing centroids
Background information
----------------------
The k-means algorithm takes as input the number of clusters to
generate, k, and a set of observation vectors to cluster. It
returns a set of centroids, one for each of the k clusters. An
observation vector is classified with the cluster number or
centroid index of the centroid closest to it.
A vector v belongs to cluster i if it is closer to centroid i than
any other centroids. If v belongs to i, we say centroid i is the
dominating centroid of v. The k-means algorithm tries to
minimize distortion, which is defined as the sum of the squared distances
between each observation vector and its dominating centroid.
The minimization is achieved by iteratively reclassifying
the observations into clusters and recalculating the centroids until
a configuration is reached in which the centroids are stable. One can
also define a maximum number of iterations.
Since vector quantization is a natural application for k-means,
information theory terminology is often used. The centroid index
or cluster index is also referred to as a "code" and the table
mapping codes to centroids and vice versa is often referred as a
"code book". The result of k-means, a set of centroids, can be
used to quantize vectors. Quantization aims to find an encoding of
vectors that reduces the expected distortion.
All routines expect obs to be a M by N array where the rows are
the observation vectors. The codebook is a k by N array where the
i'th row is the centroid of code word i. The observation vectors
and centroids have the same feature dimension.
As an example, suppose we wish to compress a 24-bit color image
(each pixel is represented by one byte for red, one for blue, and
one for green) before sending it over the web. By using a smaller
8-bit encoding, we can reduce the amount of data by two
thirds. Ideally, the colors for each of the 256 possible 8-bit
encoding values should be chosen to minimize distortion of the
color. Running k-means with k=256 generates a code book of 256
codes, which fills up all possible 8-bit sequences. Instead of
sending a 3-byte value for each pixel, the 8-bit centroid index
(or code word) of the dominating centroid is transmitted. The code
book is also sent over the wire so each 8-bit code can be
translated back to a 24-bit pixel value representation. If the
image of interest was of an ocean, we would expect many 24-bit
blues to be represented by 8-bit codes. If it was an image of a
human face, more flesh tone colors would be represented in the
code book.
"""
from __future__ import division, print_function, absolute_import
import warnings
import numpy as np
from collections import deque
from scipy._lib._util import _asarray_validated
from scipy._lib.six import xrange
from scipy.spatial.distance import cdist
from . import _vq
__docformat__ = 'restructuredtext'
__all__ = ['whiten', 'vq', 'kmeans', 'kmeans2']
class ClusterError(Exception):
pass
def whiten(obs, check_finite=True):
"""
Normalize a group of observations on a per feature basis.
Before running k-means, it is beneficial to rescale each feature
dimension of the observation set with whitening. Each feature is
divided by its standard deviation across all observations to give
it unit variance.
Parameters
----------
obs : ndarray
Each row of the array is an observation. The
columns are the features seen during each observation.
>>> # f0 f1 f2
>>> obs = [[ 1., 1., 1.], #o0
... [ 2., 2., 2.], #o1
... [ 3., 3., 3.], #o2
... [ 4., 4., 4.]] #o3
check_finite : bool, optional
Whether to check that the input matrices contain only finite numbers.
Disabling may give a performance gain, but may result in problems
(crashes, non-termination) if the inputs do contain infinities or NaNs.
Default: True
Returns
-------
result : ndarray
Contains the values in `obs` scaled by the standard deviation
of each column.
Examples
--------
>>> from scipy.cluster.vq import whiten
>>> features = np.array([[1.9, 2.3, 1.7],
... [1.5, 2.5, 2.2],
... [0.8, 0.6, 1.7,]])
>>> whiten(features)
array([[ 4.17944278, 2.69811351, 7.21248917],
[ 3.29956009, 2.93273208, 9.33380951],
[ 1.75976538, 0.7038557 , 7.21248917]])
"""
obs = _asarray_validated(obs, check_finite=check_finite)
std_dev = obs.std(axis=0)
zero_std_mask = std_dev == 0
if zero_std_mask.any():
std_dev[zero_std_mask] = 1.0
warnings.warn("Some columns have standard deviation zero. "
"The values of these columns will not change.",
RuntimeWarning)
return obs / std_dev
def vq(obs, code_book, check_finite=True):
"""
Assign codes from a code book to observations.
Assigns a code from a code book to each observation. Each
observation vector in the 'M' by 'N' `obs` array is compared with the
centroids in the code book and assigned the code of the closest
centroid.
The features in `obs` should have unit variance, which can be
achieved by passing them through the whiten function. The code
book can be created with the k-means algorithm or a different
encoding algorithm.
Parameters
----------
obs : ndarray
Each row of the 'M' x 'N' array is an observation. The columns are
the "features" seen during each observation. The features must be
whitened first using the whiten function or something equivalent.
code_book : ndarray
The code book is usually generated using the k-means algorithm.
Each row of the array holds a different code, and the columns are
the features of the code.
>>> # f0 f1 f2 f3
>>> code_book = [
... [ 1., 2., 3., 4.], #c0
... [ 1., 2., 3., 4.], #c1
... [ 1., 2., 3., 4.]] #c2
check_finite : bool, optional
Whether to check that the input matrices contain only finite numbers.
Disabling may give a performance gain, but may result in problems
(crashes, non-termination) if the inputs do contain infinities or NaNs.
Default: True
Returns
-------
code : ndarray
A length M array holding the code book index for each observation.
dist : ndarray
The distortion (distance) between the observation and its nearest
code.
Examples
--------
>>> from numpy import array
>>> from scipy.cluster.vq import vq
>>> code_book = array([[1.,1.,1.],
... [2.,2.,2.]])
>>> features = array([[ 1.9,2.3,1.7],
... [ 1.5,2.5,2.2],
... [ 0.8,0.6,1.7]])
>>> vq(features,code_book)
(array([1, 1, 0],'i'), array([ 0.43588989, 0.73484692, 0.83066239]))
"""
obs = _asarray_validated(obs, check_finite=check_finite)
code_book = _asarray_validated(code_book, check_finite=check_finite)
ct = np.common_type(obs, code_book)
c_obs = obs.astype(ct, copy=False)
c_code_book = code_book.astype(ct, copy=False)
if np.issubdtype(ct, np.float64) or np.issubdtype(ct, np.float32):
return _vq.vq(c_obs, c_code_book)
return py_vq(obs, code_book, check_finite=False)
def py_vq(obs, code_book, check_finite=True):
""" Python version of vq algorithm.
The algorithm computes the euclidian distance between each
observation and every frame in the code_book.
Parameters
----------
obs : ndarray
Expects a rank 2 array. Each row is one observation.
code_book : ndarray
Code book to use. Same format than obs. Should have same number of
features (eg columns) than obs.
check_finite : bool, optional
Whether to check that the input matrices contain only finite numbers.
Disabling may give a performance gain, but may result in problems
(crashes, non-termination) if the inputs do contain infinities or NaNs.
Default: True
Returns
-------
code : ndarray
code[i] gives the label of the ith obversation, that its code is
code_book[code[i]].
mind_dist : ndarray
min_dist[i] gives the distance between the ith observation and its
corresponding code.
Notes
-----
This function is slower than the C version but works for
all input types. If the inputs have the wrong types for the
C versions of the function, this one is called as a last resort.
It is about 20 times slower than the C version.
"""
obs = _asarray_validated(obs, check_finite=check_finite)
code_book = _asarray_validated(code_book, check_finite=check_finite)
if obs.ndim != code_book.ndim:
raise ValueError("Observation and code_book should have the same rank")
if obs.ndim == 1:
obs = obs[:, np.newaxis]
code_book = code_book[:, np.newaxis]
dist = cdist(obs, code_book)
code = dist.argmin(axis=1)
min_dist = dist[np.arange(len(code)), code]
return code, min_dist
# py_vq2 was equivalent to py_vq
py_vq2 = np.deprecate(py_vq, old_name='py_vq2', new_name='py_vq')
def _kmeans(obs, guess, thresh=1e-5):
""" "raw" version of k-means.
Returns
-------
code_book
the lowest distortion codebook found.
avg_dist
the average distance a observation is from a code in the book.
Lower means the code_book matches the data better.
See Also
--------
kmeans : wrapper around k-means
Examples
--------
Note: not whitened in this example.
>>> from numpy import array
>>> from scipy.cluster.vq import _kmeans
>>> features = array([[ 1.9,2.3],
... [ 1.5,2.5],
... [ 0.8,0.6],
... [ 0.4,1.8],
... [ 1.0,1.0]])
>>> book = array((features[0],features[2]))
>>> _kmeans(features,book)
(array([[ 1.7 , 2.4 ],
[ 0.73333333, 1.13333333]]), 0.40563916697728591)
"""
code_book = np.asarray(guess)
diff = np.inf
prev_avg_dists = deque([diff], maxlen=2)
while diff > thresh:
# compute membership and distances between obs and code_book
obs_code, distort = vq(obs, code_book, check_finite=False)
prev_avg_dists.append(distort.mean(axis=-1))
# recalc code_book as centroids of associated obs
code_book, has_members = _vq.update_cluster_means(obs, obs_code,
code_book.shape[0])
code_book = code_book[has_members]
diff = prev_avg_dists[0] - prev_avg_dists[1]
return code_book, prev_avg_dists[1]
def kmeans(obs, k_or_guess, iter=20, thresh=1e-5, check_finite=True):
"""
Performs k-means on a set of observation vectors forming k clusters.
The k-means algorithm adjusts the classification of the observations
into clusters and updates the cluster centroids until the position of
the centroids is stable over successive iterations. In this
implementation of the algorithm, the stability of the centroids is
determined by comparing the absolute value of the change in the average
Euclidean distance between the observations and their corresponding
centroids against a threshold. This yields
a code book mapping centroids to codes and vice versa.
Parameters
----------
obs : ndarray
Each row of the M by N array is an observation vector. The
columns are the features seen during each observation.
The features must be whitened first with the `whiten` function.
k_or_guess : int or ndarray
The number of centroids to generate. A code is assigned to
each centroid, which is also the row index of the centroid
in the code_book matrix generated.
The initial k centroids are chosen by randomly selecting
observations from the observation matrix. Alternatively,
Loading ...