"""
Distance computations (:mod:`scipy.spatial.distance`)
=====================================================
.. sectionauthor:: Damian Eads
Function Reference
------------------
Distance matrix computation from a collection of raw observation vectors
stored in a rectangular array.
.. autosummary::
:toctree: generated/
pdist -- pairwise distances between observation vectors.
cdist -- distances between two collections of observation vectors
squareform -- convert distance matrix to a condensed one and vice versa
directed_hausdorff -- directed Hausdorff distance between arrays
Predicates for checking the validity of distance matrices, both
condensed and redundant. Also contained in this module are functions
for computing the number of observations in a distance matrix.
.. autosummary::
:toctree: generated/
is_valid_dm -- checks for a valid distance matrix
is_valid_y -- checks for a valid condensed distance matrix
num_obs_dm -- # of observations in a distance matrix
num_obs_y -- # of observations in a condensed distance matrix
Distance functions between two numeric vectors ``u`` and ``v``. Computing
distances over a large collection of vectors is inefficient for these
functions. Use ``pdist`` for this purpose.
.. autosummary::
:toctree: generated/
braycurtis -- the Bray-Curtis distance.
canberra -- the Canberra distance.
chebyshev -- the Chebyshev distance.
cityblock -- the Manhattan distance.
correlation -- the Correlation distance.
cosine -- the Cosine distance.
euclidean -- the Euclidean distance.
jensenshannon -- the Jensen-Shannon distance.
mahalanobis -- the Mahalanobis distance.
minkowski -- the Minkowski distance.
seuclidean -- the normalized Euclidean distance.
sqeuclidean -- the squared Euclidean distance.
wminkowski -- (deprecated) alias of `minkowski`.
Distance functions between two boolean vectors (representing sets) ``u`` and
``v``. As in the case of numerical vectors, ``pdist`` is more efficient for
computing the distances between all pairs.
.. autosummary::
:toctree: generated/
dice -- the Dice dissimilarity.
hamming -- the Hamming distance.
jaccard -- the Jaccard distance.
kulsinski -- the Kulsinski distance.
rogerstanimoto -- the Rogers-Tanimoto dissimilarity.
russellrao -- the Russell-Rao dissimilarity.
sokalmichener -- the Sokal-Michener dissimilarity.
sokalsneath -- the Sokal-Sneath dissimilarity.
yule -- the Yule dissimilarity.
:func:`hamming` also operates over discrete numerical vectors.
"""
# Copyright (C) Damian Eads, 2007-2008. New BSD License.
from __future__ import division, print_function, absolute_import
# Names exported by ``from scipy.spatial.distance import *``; the list is
# kept in alphabetical order.
__all__ = [
    'braycurtis',
    'canberra',
    'cdist',
    'chebyshev',
    'cityblock',
    'correlation',
    'cosine',
    'dice',
    'directed_hausdorff',
    'euclidean',
    'hamming',
    'is_valid_dm',
    'is_valid_y',
    'jaccard',
    'jensenshannon',
    'kulsinski',
    'mahalanobis',
    'matching',
    'minkowski',
    'num_obs_dm',
    'num_obs_y',
    'pdist',
    'rogerstanimoto',
    'russellrao',
    'seuclidean',
    'sokalmichener',
    'sokalsneath',
    'sqeuclidean',
    'squareform',
    'wminkowski',
    'yule'
]
import warnings
import numpy as np
from functools import partial
from collections import namedtuple
from scipy._lib.six import callable, string_types
from scipy._lib.six import xrange
from scipy._lib._util import _asarray_validated
from . import _distance_wrap
from . import _hausdorff
from ..linalg import norm
from ..special import rel_entr
def _args_to_kwargs_xdist(args, kwargs, metric, func_name):
    """
    Convert legacy positional arguments to keyword arguments for pdist/cdist.

    Parameters
    ----------
    args : sequence
        Extra positional metric parameters received by the caller.
    kwargs : dict
        Keyword arguments received by the caller. The dictionary is updated
        in place with the converted positional values.
    metric : str or callable
        The metric requested by the caller.
    func_name : str
        Name of the calling function. ``'pdist'`` selects the legacy
        positional order ``(p, w, V, VI)``; any other value selects
        ``(p, V, VI, w)``.

    Returns
    -------
    kwargs : dict
        The same dictionary object that was passed in.

    Raises
    ------
    TypeError
        If positional parameters are combined with a callable metric that is
        not one of this module's distance functions, or if a parameter is
        given both positionally and by keyword.
    ValueError
        If more than four positional parameters are given.

    Warns
    -----
    DeprecationWarning
        Whenever positional metric parameters are present.
    """
    if not args:
        return kwargs

    # Only the module's own metrics have a known legacy argument order;
    # a user-supplied callable cannot be mapped to parameter names.
    if (callable(metric) and metric not in [
            braycurtis, canberra, chebyshev, cityblock, correlation, cosine,
            dice, euclidean, hamming, jaccard, jensenshannon, kulsinski,
            mahalanobis, matching, minkowski, rogerstanimoto, russellrao,
            seuclidean, sokalmichener, sokalsneath, sqeuclidean, yule,
            wminkowski]):
        raise TypeError('When using a custom metric arguments must be passed '
                        'as keyword (i.e., ARGNAME=ARGVALUE)')

    if func_name == 'pdist':
        old_arg_names = ['p', 'w', 'V', 'VI']
    else:
        old_arg_names = ['p', 'V', 'VI', 'w']

    num_args = len(args)
    warnings.warn('%d metric parameters have been passed as positional. '
                  'This will raise an error in a future version. '
                  'Please pass arguments as keywords '
                  '(i.e., ARGNAME=ARGVALUE)'
                  % num_args, DeprecationWarning)

    if num_args > 4:
        raise ValueError('Deprecated %s signature accepts only 4 '
                         'positional arguments (%s), %d given.'
                         % (func_name, ', '.join(old_arg_names), num_args))

    for old_arg, arg in zip(old_arg_names, args):
        if old_arg in kwargs:
            raise TypeError('%s() got multiple values for argument %s'
                            % (func_name, old_arg))
        kwargs[old_arg] = arg
    return kwargs
def _copy_array_if_base_present(a):
    """
    Return ``a`` unchanged when ``a.base`` is None, otherwise ``a.copy()``.

    Parameters
    ----------
    a : ndarray
        Array to inspect.

    Returns
    -------
    ndarray
        ``a`` itself, or a new copy when ``a.base`` refers to another object.
    """
    return a if a.base is None else a.copy()
def _correlation_cdist_wrap(XA, XB, dm, **kwargs):
    """
    Centre each row of ``XA`` and ``XB`` and run the cosine cdist routine.

    Every row has its own mean (taken along axis 1) subtracted, and the
    centred arrays are then forwarded, together with ``dm`` and any keyword
    arguments, to ``_distance_wrap.cdist_cosine_double_wrap``. Nothing is
    returned; ``dm`` is presumably the output buffer filled by the wrapped
    routine (confirm against ``_distance_wrap``).
    """
    centred_a = XA - XA.mean(axis=1, keepdims=True)
    centred_b = XB - XB.mean(axis=1, keepdims=True)
    _distance_wrap.cdist_cosine_double_wrap(centred_a, centred_b, dm,
                                            **kwargs)
def _correlation_pdist_wrap(X, dm, **kwargs):
    """
    Centre each row of ``X`` and run the cosine pdist routine on the result.

    Each row has its own mean (taken along axis 1) subtracted before the
    array is forwarded, together with ``dm`` and any keyword arguments, to
    ``_distance_wrap.pdist_cosine_double_wrap``. Nothing is returned; ``dm``
    is presumably the output buffer filled by the wrapped routine (confirm
    against ``_distance_wrap``).
    """
    row_means = X.mean(axis=1, keepdims=True)
    centred = X - row_means
    _distance_wrap.pdist_cosine_double_wrap(centred, dm, **kwargs)
def _convert_to_type(X, out_type):
    """
    Return ``X`` as a C-contiguous array with dtype ``out_type``.

    Parameters
    ----------
    X : array_like
        Input data.
    out_type : dtype
        Requested dtype of the result.

    Returns
    -------
    ndarray
        Result of ``np.ascontiguousarray(X, dtype=out_type)``.
    """
    converted = np.ascontiguousarray(X, dtype=out_type)
    return converted
def _filter_deprecated_kwargs(kwargs, args_blacklist):
    """
    Strip blacklisted keywords from ``kwargs`` in place.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments to clean; modified in place.
    args_blacklist : iterable
        Keyword names that are no longer accepted.

    Warns
    -----
    DeprecationWarning
        Once for every blacklisted name that was present in ``kwargs``.
    """
    # Filtering out old default keywords
    for name in args_blacklist:
        if name not in kwargs:
            continue
        kwargs.pop(name)
        message = ('Got unexpected kwarg %s. This will raise an error'
                   ' in a future version.' % name)
        warnings.warn(message, DeprecationWarning)
def _nbool_correspond_all(u, v, w=None):
    """
    Count the four true/false combinations between ``u`` and ``v``.

    Parameters
    ----------
    u, v : ndarray
        Input vectors. When both are boolean and no weights are given the
        counts are obtained with bitwise operations; otherwise the inputs are
        promoted to a common numeric dtype and treated as 0/1 indicators.
    w : ndarray, optional
        Weights applied element-wise to each position. Default is None,
        which leaves every position unweighted.

    Returns
    -------
    (nff, nft, ntf, ntt) : tuple
        Sums over positions where ``(u, v)`` is respectively
        (false, false), (false, true), (true, false) and (true, true).
    """
    if u.dtype == v.dtype == bool and w is None:
        not_u = ~u
        not_v = ~v
        nff = (not_u & not_v).sum()
        nft = (not_u & v).sum()
        ntf = (u & not_v).sum()
        ntt = (u & v).sum()
    else:
        # ``np.find_common_type`` was removed in NumPy 2.0; ``result_type``
        # promotes the inputs to a common dtype that is at least the
        # default integer.
        dtype = np.result_type(int, u.dtype, v.dtype)
        u = u.astype(dtype)
        v = v.astype(dtype)
        not_u = 1.0 - u
        not_v = 1.0 - v
        if w is not None:
            # Weighting only one operand of each product is enough to
            # weight the product itself.
            not_u = w * not_u
            u = w * u
        nff = (not_u * not_v).sum()
        nft = (not_u * v).sum()
        ntf = (u * not_v).sum()
        ntt = (u * v).sum()
    return (nff, nft, ntf, ntt)
def _nbool_correspond_ft_tf(u, v, w=None):
    """
    Count the positions where ``u`` and ``v`` disagree, split by direction.

    Parameters
    ----------
    u, v : ndarray
        Input vectors. When both are boolean and no weights are given the
        counts are obtained with bitwise operations; otherwise the inputs are
        promoted to a common numeric dtype and treated as 0/1 indicators.
    w : ndarray, optional
        Weights applied element-wise to each position. Default is None,
        which leaves every position unweighted.

    Returns
    -------
    (nft, ntf) : tuple
        Sums over positions where ``(u, v)`` is respectively (false, true)
        and (true, false).
    """
    if u.dtype == v.dtype == bool and w is None:
        not_u = ~u
        not_v = ~v
        nft = (not_u & v).sum()
        ntf = (u & not_v).sum()
    else:
        # ``np.find_common_type`` was removed in NumPy 2.0; ``result_type``
        # promotes the inputs to a common dtype that is at least the
        # default integer.
        dtype = np.result_type(int, u.dtype, v.dtype)
        u = u.astype(dtype)
        v = v.astype(dtype)
        not_u = 1.0 - u
        not_v = 1.0 - v
        if w is not None:
            # Weighting only one operand of each product is enough to
            # weight the product itself.
            not_u = w * not_u
            u = w * u
        nft = (not_u * v).sum()
        ntf = (u * not_v).sum()
    return (nft, ntf)
def _validate_cdist_input(XA, XB, mA, mB, n, metric_name, **kwargs):
    """
    Coerce the two cdist inputs to a dtype the metric supports.

    Parameters
    ----------
    XA, XB : ndarray
        The two collections of observations.
    mA, mB : int
        Passed to the metric's keyword validator as the sum ``mA + mB``.
    n : int
        Passed through to the metric's keyword validator.
    metric_name : str or None
        Key into ``_METRICS``. When None, the inputs are returned untouched.
    **kwargs
        Metric parameters, forwarded to the metric's validator if it has one.

    Returns
    -------
    XA, XB : ndarray
        The inputs, converted to ``typ`` when a metric name was given.
    typ : dtype or None
        The chosen dtype, or None when ``metric_name`` is None.
    kwargs : dict
        The (possibly validated) metric parameters.
    """
    if metric_name is None:
        # No registered metric to consult: hand everything back as-is.
        return XA, XB, None, kwargs

    metric_info = _METRICS[metric_name]

    # Keep XA's dtype when the metric supports it; otherwise fall back to
    # the first supported type.
    supported = metric_info.types
    if XA.dtype in supported:
        typ = supported[supported.index(XA.dtype)]
    else:
        typ = supported[0]

    XA = _convert_to_type(XA, out_type=typ)
    XB = _convert_to_type(XB, out_type=typ)

    # The validator sees both collections stacked into a single array.
    validator = metric_info.validator
    if validator:
        stacked = np.vstack([XA, XB])
        kwargs = validator(stacked, mA + mB, n, **kwargs)
    return XA, XB, typ, kwargs
def _validate_hamming_kwargs(X, m, n, **kwargs):
    """
    Validate the ``w`` (weights) keyword for the Hamming metric.

    Parameters
    ----------
    X : ndarray
        Observation matrix (not used here).
    m : int
        Number of observations (not used here).
    n : int
        Required length of the weight vector.
    **kwargs
        Metric parameters. ``w`` may be any array-like; when it is missing
        or None, a vector of ``n`` ones is used.

    Returns
    -------
    kwargs : dict
        The keyword arguments with ``w`` replaced by the result of
        ``_validate_weights``.

    Raises
    ------
    ValueError
        If ``w`` is not one-dimensional with exactly ``n`` entries.
    """
    w = kwargs.get('w')
    if w is None:
        w = np.ones((n,), dtype='double')
    else:
        # Accept lists/tuples/scalars so the shape check below cannot fail
        # with an AttributeError.
        w = np.asarray(w)
    if w.ndim != 1 or w.shape[0] != n:
        # ``size`` is defined for every rank, unlike ``shape[0]``.
        raise ValueError("Weights must have same size as input vector. "
                         "%d vs. %d" % (w.size, n))
    kwargs['w'] = _validate_weights(w)
    return kwargs
def _validate_mahalanobis_kwargs(X, m, n, **kwargs):
    """
    Validate, or compute, the ``VI`` keyword for the Mahalanobis metric.

    Parameters
    ----------
    X : ndarray
        Observation matrix; used to estimate the covariance when ``VI`` is
        not supplied.
    m : int
        Number of observations.
    n : int
        Dimension of each observation.
    **kwargs
        Metric parameters. ``VI`` is taken from here when present.

    Returns
    -------
    kwargs : dict
        The keyword arguments with ``VI`` passed through
        ``_convert_to_double``.

    Raises
    ------
    ValueError
        If ``VI`` is missing (or None) and ``m <= n``.
    """
    inv_cov = kwargs.pop('VI', None)
    if inv_cov is None:
        if m <= n:
            # There are fewer observations than the dimension of
            # the observations.
            raise ValueError("The number of observations (%d) is too "
                             "small; the covariance matrix is "
                             "singular. For observations with %d "
                             "dimensions, at least %d observations "
                             "are required." % (m, n, n + 1))
        as_double = X.astype(np.double)
        cov = np.atleast_2d(np.cov(as_double.T))
        inv_cov = np.linalg.inv(cov).T.copy()
    kwargs["VI"] = _convert_to_double(inv_cov)
    return kwargs
def _validate_minkowski_kwargs(X, m, n, **kwargs):
    """
    Supply the default order ``p = 2.`` for the Minkowski metric.

    ``X``, ``m`` and ``n`` are accepted for signature compatibility with the
    other validators and are not used. All other keyword arguments are
    returned unchanged.
    """
    kwargs.setdefault('p', 2.)
    return kwargs
def _validate_pdist_input(X, m, n, metric_name, **kwargs):
    """
    Coerce the pdist input to a dtype the metric supports.

    Parameters
    ----------
    X : ndarray
        Observation matrix.
    m, n : int
        Passed through to the metric's keyword validator.
    metric_name : str or None
        Key into ``_METRICS``. When None, the input is returned untouched.
    **kwargs
        Metric parameters, forwarded to the metric's validator if it has one.

    Returns
    -------
    X : ndarray
        The input, converted to ``typ`` when a metric name was given.
    typ : dtype or None
        The chosen dtype, or None when ``metric_name`` is None.
    kwargs : dict
        The (possibly validated) metric parameters.
    """
    if metric_name is None:
        # No registered metric to consult: hand everything back as-is.
        return X, None, kwargs

    metric_info = _METRICS[metric_name]

    # Keep X's dtype when the metric supports it; otherwise fall back to
    # the first supported type.
    supported = metric_info.types
    if X.dtype in supported:
        typ = supported[supported.index(X.dtype)]
    else:
        typ = supported[0]

    X = _convert_to_type(X, out_type=typ)

    validator = metric_info.validator
    if validator:
        kwargs = validator(X, m, n, **kwargs)
    return X, typ, kwargs
def _validate_seuclidean_kwargs(X, m, n, **kwargs):
    """
    Validate, or compute, the ``V`` keyword for the standardized Euclidean
    metric.

    Parameters
    ----------
    X : ndarray
        Observation matrix; used to estimate the variances when ``V`` is
        not supplied.
    m : int
        Number of observations (not used here).
    n : int
        Required length of the variance vector.
    **kwargs
        Metric parameters. ``V`` is taken from here when present.

    Returns
    -------
    kwargs : dict
        The keyword arguments with ``V`` passed through
        ``_convert_to_double``.

    Raises
    ------
    TypeError
        If a supplied ``V`` does not have dtype double.
    ValueError
        If a supplied ``V`` is not one-dimensional or its length is not ``n``.
    """
    variances = kwargs.pop('V', None)
    if variances is not None:
        variances = np.asarray(variances, order='c')
        if variances.dtype != np.double:
            raise TypeError('Variance vector V must contain doubles.')
        if variances.ndim != 1:
            raise ValueError('Variance vector V must '
                             'be one-dimensional.')
        if variances.shape[0] != n:
            raise ValueError('Variance vector V must be of the same '
                             'dimension as the vectors on which the distances '
                             'are computed.')
    else:
        # Sample variance (ddof=1) of every column of X.
        variances = np.var(X.astype(np.double), axis=0, ddof=1)
    kwargs['V'] = _convert_to_double(variances)
    return kwargs
def _validate_vector(u, dtype=None):
    """
    Convert ``u`` to a one-dimensional array.

    Parameters
    ----------
    u : array_like
        Input; length-1 axes are squeezed away before the rank is checked.
    dtype : dtype, optional
        Forwarded to ``np.asarray``. Default is None.

    Returns
    -------
    ndarray
        A 1-D array.

    Raises
    ------
    ValueError
        If more than one axis remains after squeezing.
    """
    # XXX Is order='c' really necessary?
    arr = np.asarray(u, dtype=dtype, order='c')
    # Squeezing can leave a 0-d array; promote it so that values such as
    # u=1 and u=[1] still return 1-D arrays.
    vec = np.atleast_1d(arr.squeeze())
    if vec.ndim <= 1:
        return vec
    raise ValueError("Input vector should be 1-D.")
def _validate_weights(w, dtype=np.double):
w = _validate_vector(w, dtype=dtype)
if np.any(w < 0):
raise ValueError("Input weights should be all non-negative")
Loading ...