# -*- coding: utf-8 -*-
# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Mathieu Blondel <mathieu@mblondel.org>
# Robert Layton <robertlayton@gmail.com>
# Andreas Mueller <amueller@ais.uni-bonn.de>
# Philippe Gervais <philippe.gervais@inria.fr>
# Lars Buitinck <larsmans@gmail.com>
# Joel Nothman <joel.nothman@gmail.com>
# License: BSD 3 clause
import itertools
import numpy as np
from scipy.spatial import distance
from scipy.sparse import csr_matrix
from scipy.sparse import issparse
from ..utils import check_array
from ..utils import gen_even_slices
from ..utils import gen_batches
from ..utils.fixes import partial
from ..utils.extmath import row_norms, safe_sparse_dot
from ..preprocessing import normalize
from ..externals.joblib import Parallel
from ..externals.joblib import delayed
from ..externals.joblib.parallel import cpu_count
from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan
# Utility Functions
def _return_float_dtype(X, Y):
"""
1. If dtype of X and Y is float32, then dtype float32 is returned.
2. Else dtype float is returned.
"""
if not issparse(X) and not isinstance(X, np.ndarray):
X = np.asarray(X)
if Y is None:
Y_dtype = X.dtype
elif not issparse(Y) and not isinstance(Y, np.ndarray):
Y = np.asarray(Y)
Y_dtype = Y.dtype
else:
Y_dtype = Y.dtype
if X.dtype == Y_dtype == np.float32:
dtype = np.float32
else:
dtype = np.float
return X, Y, dtype
def check_pairwise_arrays(X, Y, precomputed=False):
""" Set X and Y appropriately and checks inputs
If Y is None, it is set as a pointer to X (i.e. not a copy).
If Y is given, this does not happen.
All distance metrics should use this function first to assert that the
given parameters are correct and safe to use.
Specifically, this function first ensures that both X and Y are arrays,
then checks that they are at least two dimensional while ensuring that
their elements are floats. Finally, the function checks that the size
of the second dimension of the two arrays is equal, or the equivalent
check for a precomputed distance matrix.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples_a, n_features)
Y : {array-like, sparse matrix}, shape (n_samples_b, n_features)
precomputed : bool
True if X is to be treated as precomputed distances to the samples in
Y.
Returns
-------
safe_X : {array-like, sparse matrix}, shape (n_samples_a, n_features)
An array equal to X, guaranteed to be a numpy array.
safe_Y : {array-like, sparse matrix}, shape (n_samples_b, n_features)
An array equal to Y if Y was not None, guaranteed to be a numpy array.
If Y was None, safe_Y will be a pointer to X.
"""
X, Y, dtype = _return_float_dtype(X, Y)
if Y is X or Y is None:
X = Y = check_array(X, accept_sparse='csr', dtype=dtype)
else:
X = check_array(X, accept_sparse='csr', dtype=dtype)
Y = check_array(Y, accept_sparse='csr', dtype=dtype)
if precomputed:
if X.shape[1] != Y.shape[0]:
raise ValueError("Precomputed metric requires shape "
"(n_queries, n_indexed). Got (%d, %d) "
"for %d indexed." %
(X.shape[0], X.shape[1], Y.shape[0]))
elif X.shape[1] != Y.shape[1]:
raise ValueError("Incompatible dimension for X and Y matrices: "
"X.shape[1] == %d while Y.shape[1] == %d" % (
X.shape[1], Y.shape[1]))
return X, Y
def check_paired_arrays(X, Y):
""" Set X and Y appropriately and checks inputs for paired distances
All paired distance metrics should use this function first to assert that
the given parameters are correct and safe to use.
Specifically, this function first ensures that both X and Y are arrays,
then checks that they are at least two dimensional while ensuring that
their elements are floats. Finally, the function checks that the size
of the dimensions of the two arrays are equal.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples_a, n_features)
Y : {array-like, sparse matrix}, shape (n_samples_b, n_features)
Returns
-------
safe_X : {array-like, sparse matrix}, shape (n_samples_a, n_features)
An array equal to X, guaranteed to be a numpy array.
safe_Y : {array-like, sparse matrix}, shape (n_samples_b, n_features)
An array equal to Y if Y was not None, guaranteed to be a numpy array.
If Y was None, safe_Y will be a pointer to X.
"""
X, Y = check_pairwise_arrays(X, Y)
if X.shape != Y.shape:
raise ValueError("X and Y should be of same shape. They were "
"respectively %r and %r long." % (X.shape, Y.shape))
return X, Y
# Pairwise distances
def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,
X_norm_squared=None):
"""
Considering the rows of X (and Y=X) as vectors, compute the
distance matrix between each pair of vectors.
For efficiency reasons, the euclidean distance between a pair of row
vector x and y is computed as::
dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y))
This formulation has two advantages over other ways of computing distances.
First, it is computationally efficient when dealing with sparse data.
Second, if one argument varies but the other remains unchanged, then
`dot(x, x)` and/or `dot(y, y)` can be pre-computed.
However, this is not the most precise way of doing this computation, and
the distance matrix returned by this function may not be exactly
symmetric as required by, e.g., ``scipy.spatial.distance`` functions.
Read more in the :ref:`User Guide <metrics>`.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples_1, n_features)
Y : {array-like, sparse matrix}, shape (n_samples_2, n_features)
Y_norm_squared : array-like, shape (n_samples_2, ), optional
Pre-computed dot-products of vectors in Y (e.g.,
``(Y**2).sum(axis=1)``)
squared : boolean, optional
Return squared Euclidean distances.
X_norm_squared : array-like, shape = [n_samples_1], optional
Pre-computed dot-products of vectors in X (e.g.,
``(X**2).sum(axis=1)``)
Returns
-------
distances : {array, sparse matrix}, shape (n_samples_1, n_samples_2)
Examples
--------
>>> from sklearn.metrics.pairwise import euclidean_distances
>>> X = [[0, 1], [1, 1]]
>>> # distance between rows of X
>>> euclidean_distances(X, X)
array([[ 0., 1.],
[ 1., 0.]])
>>> # get distance to origin
>>> euclidean_distances(X, [[0, 0]])
array([[ 1. ],
[ 1.41421356]])
See also
--------
paired_distances : distances betweens pairs of elements of X and Y.
"""
X, Y = check_pairwise_arrays(X, Y)
if X_norm_squared is not None:
XX = check_array(X_norm_squared)
if XX.shape == (1, X.shape[0]):
XX = XX.T
elif XX.shape != (X.shape[0], 1):
raise ValueError(
"Incompatible dimensions for X and X_norm_squared")
else:
XX = row_norms(X, squared=True)[:, np.newaxis]
if X is Y: # shortcut in the common case euclidean_distances(X, X)
YY = XX.T
elif Y_norm_squared is not None:
YY = np.atleast_2d(Y_norm_squared)
if YY.shape != (1, Y.shape[0]):
raise ValueError(
"Incompatible dimensions for Y and Y_norm_squared")
else:
YY = row_norms(Y, squared=True)[np.newaxis, :]
distances = safe_sparse_dot(X, Y.T, dense_output=True)
distances *= -2
distances += XX
distances += YY
np.maximum(distances, 0, out=distances)
if X is Y:
# Ensure that distances between vectors and themselves are set to 0.0.
# This may not be the case due to floating point rounding errors.
distances.flat[::distances.shape[0] + 1] = 0.0
return distances if squared else np.sqrt(distances, out=distances)
def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean",
batch_size=500, metric_kwargs=None):
"""Compute minimum distances between one point and a set of points.
This function computes for each row in X, the index of the row of Y which
is closest (according to the specified distance). The minimal distances are
also returned.
This is mostly equivalent to calling:
(pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis),
pairwise_distances(X, Y=Y, metric=metric).min(axis=axis))
but uses much less memory, and is faster for large arrays.
Parameters
----------
X, Y : {array-like, sparse matrix}
Arrays containing points. Respective shapes (n_samples1, n_features)
and (n_samples2, n_features)
batch_size : integer
To reduce memory consumption over the naive solution, data are
processed in batches, comprising batch_size rows of X and
batch_size rows of Y. The default value is quite conservative, but
can be changed for fine-tuning. The larger the number, the larger the
memory usage.
metric : string or callable, default 'euclidean'
metric to use for distance computation. Any metric from scikit-learn
or scipy.spatial.distance can be used.
If metric is a callable function, it is called on each
pair of instances (rows) and the resulting value recorded. The callable
should take two arrays as input and return one value indicating the
distance between them. This works for Scipy's metrics, but is less
efficient than passing the metric name as a string.
Distance matrices are not supported.
Valid values for metric are:
- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
'manhattan']
- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
'sqeuclidean', 'yule']
See the documentation for scipy.spatial.distance for details on these
metrics.
metric_kwargs : dict, optional
Keyword arguments to pass to specified metric function.
axis : int, optional, default 1
Axis along which the argmin and distances are to be computed.
Returns
-------
argmin : numpy.ndarray
Y[argmin[i], :] is the row in Y that is closest to X[i, :].
distances : numpy.ndarray
distances[i] is the distance between the i-th row in X and the
argmin[i]-th row in Y.
See also
--------
sklearn.metrics.pairwise_distances
sklearn.metrics.pairwise_distances_argmin
"""
dist_func = None
if metric in PAIRWISE_DISTANCE_FUNCTIONS:
dist_func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
elif not callable(metric) and not isinstance(metric, str):
raise ValueError("'metric' must be a string or a callable")
X, Y = check_pairwise_arrays(X, Y)
if metric_kwargs is None:
metric_kwargs = {}
if axis == 0:
X, Y = Y, X
# Allocate output arrays
indices = np.empty(X.shape[0], dtype=np.intp)
values = np.empty(X.shape[0])
values.fill(np.infty)
for chunk_x in gen_batches(X.shape[0], batch_size):
X_chunk = X[chunk_x, :]
for chunk_y in gen_batches(Y.shape[0], batch_size):
Y_chunk = Y[chunk_y, :]
if dist_func is not None:
if metric == 'euclidean': # special case, for speed
Loading ...