Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

aaronreidsmith / scipy   python

Repository URL to install this package:

Version: 1.3.3 

/ spatial / distance.py

"""
Distance computations (:mod:`scipy.spatial.distance`)
=====================================================

.. sectionauthor:: Damian Eads

Function Reference
------------------

Distance matrix computation from a collection of raw observation vectors
stored in a rectangular array.

.. autosummary::
   :toctree: generated/

   pdist   -- pairwise distances between observation vectors.
   cdist   -- distances between two collections of observation vectors
   squareform -- convert distance matrix to a condensed one and vice versa
   directed_hausdorff -- directed Hausdorff distance between arrays

Predicates for checking the validity of distance matrices, both
condensed and redundant. Also contained in this module are functions
for computing the number of observations in a distance matrix.

.. autosummary::
   :toctree: generated/

   is_valid_dm -- checks for a valid distance matrix
   is_valid_y  -- checks for a valid condensed distance matrix
   num_obs_dm  -- # of observations in a distance matrix
   num_obs_y   -- # of observations in a condensed distance matrix

Distance functions between two numeric vectors ``u`` and ``v``. Computing
distances over a large collection of vectors is inefficient for these
functions. Use ``pdist`` for this purpose.

.. autosummary::
   :toctree: generated/

   braycurtis       -- the Bray-Curtis distance.
   canberra         -- the Canberra distance.
   chebyshev        -- the Chebyshev distance.
   cityblock        -- the Manhattan distance.
   correlation      -- the Correlation distance.
   cosine           -- the Cosine distance.
   euclidean        -- the Euclidean distance.
   jensenshannon    -- the Jensen-Shannon distance.
   mahalanobis      -- the Mahalanobis distance.
   minkowski        -- the Minkowski distance.
   seuclidean       -- the normalized Euclidean distance.
   sqeuclidean      -- the squared Euclidean distance.
   wminkowski       -- (deprecated) alias of `minkowski`.

Distance functions between two boolean vectors (representing sets) ``u`` and
``v``.  As in the case of numerical vectors, ``pdist`` is more efficient for
computing the distances between all pairs.

.. autosummary::
   :toctree: generated/

   dice             -- the Dice dissimilarity.
   hamming          -- the Hamming distance.
   jaccard          -- the Jaccard distance.
   kulsinski        -- the Kulsinski distance.
   rogerstanimoto   -- the Rogers-Tanimoto dissimilarity.
   russellrao       -- the Russell-Rao dissimilarity.
   sokalmichener    -- the Sokal-Michener dissimilarity.
   sokalsneath      -- the Sokal-Sneath dissimilarity.
   yule             -- the Yule dissimilarity.

:func:`hamming` also operates over discrete numerical vectors.
"""

# Copyright (C) Damian Eads, 2007-2008. New BSD License.

from __future__ import division, print_function, absolute_import

# Public names exported by this module, kept in alphabetical order.
# NOTE(review): 'matching' is exported here but does not appear in the
# function reference of the module docstring -- confirm this is intended.
__all__ = [
    'braycurtis',
    'canberra',
    'cdist',
    'chebyshev',
    'cityblock',
    'correlation',
    'cosine',
    'dice',
    'directed_hausdorff',
    'euclidean',
    'hamming',
    'is_valid_dm',
    'is_valid_y',
    'jaccard',
    'jensenshannon',
    'kulsinski',
    'mahalanobis',
    'matching',
    'minkowski',
    'num_obs_dm',
    'num_obs_y',
    'pdist',
    'rogerstanimoto',
    'russellrao',
    'seuclidean',
    'sokalmichener',
    'sokalsneath',
    'sqeuclidean',
    'squareform',
    'wminkowski',
    'yule'
]


import warnings
import numpy as np

from functools import partial
from collections import namedtuple
from scipy._lib.six import callable, string_types
from scipy._lib.six import xrange
from scipy._lib._util import _asarray_validated

from . import _distance_wrap
from . import _hausdorff
from ..linalg import norm
from ..special import rel_entr


def _args_to_kwargs_xdist(args, kwargs, metric, func_name):
    """
    Convert legacy positional arguments to keyword arguments for pdist/cdist.

    Parameters
    ----------
    args : sequence
        Extra positional metric parameters received by the caller.
    kwargs : dict
        Keyword metric parameters received by the caller. It is updated in
        place with the converted positional values.
    metric : str or callable
        The requested metric. A callable that is not one of this module's
        own distance functions cannot be combined with positional
        parameters.
    func_name : str
        Name of the calling function. ``'pdist'`` maps positionals to
        ``(p, w, V, VI)``; any other value maps them to ``(p, V, VI, w)``.

    Returns
    -------
    kwargs : dict
        The same dictionary that was passed in, with one new entry per
        positional argument.

    Raises
    ------
    TypeError
        If positional parameters are used with a custom callable metric, or
        if a parameter is given both positionally and by keyword.
    ValueError
        If more than four positional parameters are given.

    Warns
    -----
    DeprecationWarning
        Whenever at least one positional parameter is converted.
    """
    if not args:
        return kwargs

    if (callable(metric) and metric not in [
            braycurtis, canberra, chebyshev, cityblock, correlation, cosine,
            dice, euclidean, hamming, jaccard, jensenshannon, kulsinski,
            mahalanobis, matching, minkowski, rogerstanimoto, russellrao,
            seuclidean, sokalmichener, sokalsneath, sqeuclidean, yule,
            wminkowski]):
        raise TypeError('When using a custom metric arguments must be passed '
                        'as keyword (i.e., ARGNAME=ARGVALUE)')

    # The legacy positional order differs between pdist and cdist.
    if func_name == 'pdist':
        old_arg_names = ['p', 'w', 'V', 'VI']
    else:
        old_arg_names = ['p', 'V', 'VI', 'w']

    num_args = len(args)
    # Adjacent literals are joined verbatim, so each piece carries its own
    # trailing space to keep the sentences separated.
    warnings.warn('%d metric parameters have been passed as positional. '
                  'This will raise an error in a future version. '
                  'Please pass arguments as keywords (i.e., ARGNAME=ARGVALUE)'
                  % num_args, DeprecationWarning)

    if num_args > 4:
        raise ValueError('Deprecated %s signature accepts only 4 '
                         'positional arguments (%s), %d given.'
                         % (func_name, ', '.join(old_arg_names), num_args))

    for old_arg, arg in zip(old_arg_names, args):
        if old_arg in kwargs:
            raise TypeError('%s() got multiple values for argument %s'
                            % (func_name, old_arg))
        kwargs[old_arg] = arg
    return kwargs


def _copy_array_if_base_present(a):
    """Return ``a`` unchanged when ``a.base`` is None, else ``a.copy()``."""
    has_parent = a.base is not None
    return a.copy() if has_parent else a


def _correlation_cdist_wrap(XA, XB, dm, **kwargs):
    """Run the cosine cdist wrapper on row-centred copies of ``XA``/``XB``.

    The mean of every row (``axis=1``) is subtracted from that row before
    the centred arrays, ``dm`` and ``kwargs`` are handed to
    ``_distance_wrap.cdist_cosine_double_wrap``. Nothing is returned;
    presumably the wrapper writes its result into ``dm`` -- confirm against
    the extension module.
    """
    centered_a = XA - XA.mean(axis=1, keepdims=True)
    centered_b = XB - XB.mean(axis=1, keepdims=True)
    _distance_wrap.cdist_cosine_double_wrap(centered_a, centered_b, dm,
                                            **kwargs)


def _correlation_pdist_wrap(X, dm, **kwargs):
    """Run the cosine pdist wrapper on a row-centred copy of ``X``.

    The mean of every row (``axis=1``) is subtracted from that row before
    the centred array, ``dm`` and ``kwargs`` are handed to
    ``_distance_wrap.pdist_cosine_double_wrap``. Nothing is returned;
    presumably the wrapper writes its result into ``dm`` -- confirm against
    the extension module.
    """
    row_means = X.mean(axis=1, keepdims=True)
    centered = X - row_means
    _distance_wrap.pdist_cosine_double_wrap(centered, dm, **kwargs)


def _convert_to_type(X, out_type):
    """Return ``X`` as a C-contiguous array with dtype ``out_type``."""
    converted = np.ascontiguousarray(X, dtype=out_type)
    return converted


def _filter_deprecated_kwargs(kwargs, args_blacklist):
    """Remove blacklisted keys from ``kwargs`` in place.

    A ``DeprecationWarning`` is emitted for every key of ``args_blacklist``
    that was present in ``kwargs``. Nothing is returned.
    """
    for name in args_blacklist:
        if name not in kwargs:
            continue
        # Drop the old default keyword, then tell the caller about it.
        del kwargs[name]
        warnings.warn('Got unexpected kwarg %s. This will raise an error'
                      ' in a future version.' % name, DeprecationWarning)


def _nbool_correspond_all(u, v, w=None):
    """Count all four truth-value combinations of ``u`` and ``v``.

    Parameters
    ----------
    u, v : ndarray
        Input vectors. Unweighted boolean inputs are combined with bitwise
        operations; anything else is treated arithmetically, with ``1 - x``
        standing in for "not x".
    w : ndarray, optional
        Weights. When given, each term of every sum is scaled by the
        matching weight.

    Returns
    -------
    (nff, nft, ntf, ntt) : tuple
        Sums over positions where ``u``/``v`` are respectively
        false/false, false/true, true/false and true/true.
    """
    if u.dtype == v.dtype == bool and w is None:
        not_u = ~u
        not_v = ~v
        nff = (not_u & not_v).sum()
        nft = (not_u & v).sum()
        ntf = (u & not_v).sum()
        ntt = (u & v).sum()
    else:
        # np.find_common_type was deprecated in NumPy 1.25 and removed in
        # NumPy 2.0; np.result_type gives an at-least-integer common dtype.
        dtype = np.result_type(int, u.dtype, v.dtype)
        u = u.astype(dtype)
        v = v.astype(dtype)
        not_u = 1.0 - u
        not_v = 1.0 - v
        if w is not None:
            # Weight only the u-side factors so each product carries
            # exactly one factor of w.
            not_u = w * not_u
            u = w * u
        nff = (not_u * not_v).sum()
        nft = (not_u * v).sum()
        ntf = (u * not_v).sum()
        ntt = (u * v).sum()
    return (nff, nft, ntf, ntt)


def _nbool_correspond_ft_tf(u, v, w=None):
    """Count the positions where ``u`` and ``v`` disagree.

    Parameters
    ----------
    u, v : ndarray
        Input vectors. Unweighted boolean inputs are combined with bitwise
        operations; anything else is treated arithmetically, with ``1 - x``
        standing in for "not x".
    w : ndarray, optional
        Weights. When given, each term of every sum is scaled by the
        matching weight.

    Returns
    -------
    (nft, ntf) : tuple
        Sums over positions where ``u``/``v`` are respectively false/true
        and true/false.
    """
    if u.dtype == v.dtype == bool and w is None:
        not_u = ~u
        not_v = ~v
        nft = (not_u & v).sum()
        ntf = (u & not_v).sum()
    else:
        # np.find_common_type was deprecated in NumPy 1.25 and removed in
        # NumPy 2.0; np.result_type gives an at-least-integer common dtype.
        dtype = np.result_type(int, u.dtype, v.dtype)
        u = u.astype(dtype)
        v = v.astype(dtype)
        not_u = 1.0 - u
        not_v = 1.0 - v
        if w is not None:
            # Weight only the u-side factors so each product carries
            # exactly one factor of w.
            not_u = w * not_u
            u = w * u
        nft = (not_u * v).sum()
        ntf = (u * not_v).sum()
    return (nft, ntf)


def _validate_cdist_input(XA, XB, mA, mB, n, metric_name, **kwargs):
    """Coerce the two cdist inputs and validate the metric's kwargs.

    When ``metric_name`` is None the inputs and ``kwargs`` are returned
    untouched with a type of None. Otherwise both arrays are converted to
    a dtype taken from the ``_METRICS`` entry (the entry matching
    ``XA.dtype`` if there is one, else the first listed), and the entry's
    validator, if any, is called on the stacked data.

    Returns
    -------
    (XA, XB, typ, kwargs) : tuple
    """
    if metric_name is None:
        # No registered metric: nothing to convert or validate.
        return XA, XB, None, kwargs

    # Pick the dtype: the registered one equal to XA's, else the first.
    supported = _METRICS[metric_name].types
    if XA.dtype in supported:
        typ = supported[supported.index(XA.dtype)]
    else:
        typ = supported[0]

    XA = _convert_to_type(XA, out_type=typ)
    XB = _convert_to_type(XB, out_type=typ)

    # The validator sees both collections as a single stack of rows.
    validator = _METRICS[metric_name].validator
    if validator:
        stacked = np.vstack([XA, XB])
        kwargs = validator(stacked, mA + mB, n, **kwargs)
    return XA, XB, typ, kwargs


def _validate_hamming_kwargs(X, m, n, **kwargs):
    """Validate (or create) the weight vector ``w`` for the Hamming metric.

    Parameters
    ----------
    X : ndarray
        Observation data; not inspected here.
    m : int
        Number of observations; not inspected here.
    n : int
        Required length of the weight vector.
    **kwargs
        Metric keyword arguments. ``w`` may be any array-like; when it is
        absent a vector of ``n`` ones is used.

    Returns
    -------
    kwargs : dict
        The keyword arguments with ``'w'`` replaced by the result of
        ``_validate_weights``.

    Raises
    ------
    ValueError
        If ``w`` is not one-dimensional with exactly ``n`` entries.
    """
    if 'w' in kwargs:
        # Lists and scalars have no .ndim/.shape, so coerce first. An
        # ndarray passes through np.asarray unchanged.
        w = np.asarray(kwargs['w'])
    else:
        w = np.ones((n,), dtype='double')

    if w.ndim != 1 or w.shape[0] != n:
        # A 0-d array has no shape[0]; report its size instead so that
        # building the message cannot itself fail.
        given = w.shape[0] if w.ndim else w.size
        raise ValueError("Weights must have same size as input vector. "
                         "%d vs. %d" % (given, n))

    kwargs['w'] = _validate_weights(w)
    return kwargs


def _validate_mahalanobis_kwargs(X, m, n, **kwargs):
    """Make sure ``kwargs['VI']`` holds an inverse covariance matrix.

    A caller-supplied ``VI`` is used as is. Otherwise it is computed as the
    transposed inverse of the covariance of ``X`` (cast to double), which
    requires more observations (``m``) than dimensions (``n``).

    Returns
    -------
    kwargs : dict
        With ``'VI'`` set to the result of ``_convert_to_double``.

    Raises
    ------
    ValueError
        If ``VI`` is missing and ``m <= n``.
    """
    inv_cov = kwargs.pop('VI', None)
    if inv_cov is None and m <= n:
        # Not more observations than dimensions.
        raise ValueError("The number of observations (%d) is too small; "
                         "the covariance matrix is singular. "
                         "For observations with %d dimensions, "
                         "at least %d observations are required."
                         % (m, n, n + 1))
    if inv_cov is None:
        as_double = X.astype(np.double)
        covariance = np.atleast_2d(np.cov(as_double.T))
        inv_cov = np.linalg.inv(covariance).T.copy()
    kwargs["VI"] = _convert_to_double(inv_cov)
    return kwargs


def _validate_minkowski_kwargs(X, m, n, **kwargs):
    """Return ``kwargs`` with ``p`` defaulting to 2.0 when it was not given.

    ``X``, ``m`` and ``n`` are accepted but not used.
    """
    kwargs.setdefault('p', 2.)
    return kwargs


def _validate_pdist_input(X, m, n, metric_name, **kwargs):
    """Coerce the pdist input and validate the metric's kwargs.

    When ``metric_name`` is None the input and ``kwargs`` are returned
    untouched with a type of None. Otherwise ``X`` is converted to a dtype
    taken from the ``_METRICS`` entry (the entry matching ``X.dtype`` if
    there is one, else the first listed), and the entry's validator, if
    any, is called on the converted data.

    Returns
    -------
    (X, typ, kwargs) : tuple
    """
    if metric_name is None:
        # No registered metric: nothing to convert or validate.
        return X, None, kwargs

    # Pick the dtype: the registered one equal to X's, else the first.
    supported = _METRICS[metric_name].types
    if X.dtype in supported:
        typ = supported[supported.index(X.dtype)]
    else:
        typ = supported[0]

    X = _convert_to_type(X, out_type=typ)

    validator = _METRICS[metric_name].validator
    if validator:
        kwargs = validator(X, m, n, **kwargs)
    return X, typ, kwargs


def _validate_seuclidean_kwargs(X, m, n, **kwargs):
    """Make sure ``kwargs['V']`` holds a variance vector.

    A caller-supplied ``V`` must be a one-dimensional array of doubles with
    ``n`` entries. When ``V`` is missing it is computed as the per-column
    variance of ``X`` (cast to double) with ``ddof=1``.

    Returns
    -------
    kwargs : dict
        With ``'V'`` set to the result of ``_convert_to_double``.

    Raises
    ------
    TypeError
        If the supplied ``V`` does not have dtype double.
    ValueError
        If the supplied ``V`` is not one-dimensional or not of length ``n``.
    """
    variances = kwargs.pop('V', None)
    if variances is not None:
        variances = np.asarray(variances, order='c')
        if variances.dtype != np.double:
            raise TypeError('Variance vector V must contain doubles.')
        if len(variances.shape) != 1:
            raise ValueError('Variance vector V must be one-dimensional.')
        if variances.shape[0] != n:
            raise ValueError('Variance vector V must be of the same dimension '
                             'as the vectors on which the distances are '
                             'computed.')
    else:
        variances = np.var(X.astype(np.double), axis=0, ddof=1)
    kwargs['V'] = _convert_to_double(variances)
    return kwargs


def _validate_vector(u, dtype=None):
    """Return ``u`` as a 1-D array, optionally converted to ``dtype``.

    Length-one axes are squeezed away first, so inputs such as ``1``,
    ``[1]`` or ``[[1, 2, 3]]`` are accepted.

    Raises
    ------
    ValueError
        If more than one dimension remains after squeezing.
    """
    # XXX Is order='c' really necessary?
    arr = np.asarray(u, dtype=dtype, order='c')
    # Squeezing can leave a 0-d array (u=1, u=[1]); promote it back to 1-D.
    arr = np.atleast_1d(arr.squeeze())
    if arr.ndim <= 1:
        return arr
    raise ValueError("Input vector should be 1-D.")


def _validate_weights(w, dtype=np.double):
    w = _validate_vector(w, dtype=dtype)
    if np.any(w < 0):
        raise ValueError("Input weights should be all non-negative")
Loading ...