Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

aaronreidsmith / scipy   python

Repository URL to install this package:

Version: 1.3.3 

/ cluster / tests / test_hierarchy.py

#
# Author: Damian Eads
# Date: April 17, 2008
#
# Copyright (C) 2008 Damian Eads
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above
#    copyright notice, this list of conditions and the following
#    disclaimer in the documentation and/or other materials provided
#    with the distribution.
#
# 3. The name of the author may not be used to endorse or promote
#    products derived from this software without specific prior
#    written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
# OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from __future__ import division, print_function, absolute_import

import numpy as np
from numpy.testing import assert_allclose, assert_equal, assert_, assert_warns
import pytest
from pytest import raises as assert_raises

from scipy._lib.six import xrange, u

import scipy.cluster.hierarchy
from scipy.cluster.hierarchy import (
    ClusterWarning, linkage, from_mlab_linkage, to_mlab_linkage,
    num_obs_linkage, inconsistent, cophenet, fclusterdata, fcluster,
    is_isomorphic, single, leaders, complete, weighted, centroid,
    correspond, is_monotonic, maxdists, maxinconsts, maxRstat,
    is_valid_linkage, is_valid_im, to_tree, leaves_list, dendrogram,
    set_link_color_palette, cut_tree, optimal_leaf_ordering,
    _order_cluster_tree, _hierarchy, _LINKAGE_METHODS)
from scipy.spatial.distance import pdist
from scipy.cluster._hierarchy import Heap

from . import hierarchy_test_data


# Matplotlib is not a scipy dependency but is optionally used in dendrogram, so
# check if it's available
try:
    import matplotlib
    # and set the backend to be Agg (no gui)
    matplotlib.use('Agg')
    # before importing pyplot
    import matplotlib.pyplot as plt
    have_matplotlib = True
except Exception:
    have_matplotlib = False


class TestLinkage(object):
    def test_linkage_non_finite_elements_in_distance_matrix(self):
        # Tests linkage(Y) where Y contains a non-finite element (e.g. NaN or Inf).
        # Exception expected.
        y = np.zeros((6,))
        y[0] = np.nan
        assert_raises(ValueError, linkage, y)

    def test_linkage_empty_distance_matrix(self):
        # Tests linkage(Y) where Y is a 0x4 linkage matrix. Exception expected.
        y = np.zeros((0,))
        assert_raises(ValueError, linkage, y)

    def test_linkage_tdist(self):
        for method in ['single', 'complete', 'average', 'weighted', u('single')]:
            self.check_linkage_tdist(method)

    def check_linkage_tdist(self, method):
        # Tests linkage(Y, method) on the tdist data set.
        Z = linkage(hierarchy_test_data.ytdist, method)
        expectedZ = getattr(hierarchy_test_data, 'linkage_ytdist_' + method)
        assert_allclose(Z, expectedZ, atol=1e-10)

    def test_linkage_X(self):
        for method in ['centroid', 'median', 'ward']:
            self.check_linkage_q(method)

    def check_linkage_q(self, method):
        # Tests linkage(Y, method) on the Q data set.
        Z = linkage(hierarchy_test_data.X, method)
        expectedZ = getattr(hierarchy_test_data, 'linkage_X_' + method)
        assert_allclose(Z, expectedZ, atol=1e-06)

        y = scipy.spatial.distance.pdist(hierarchy_test_data.X,
                                         metric="euclidean")
        Z = linkage(y, method)
        assert_allclose(Z, expectedZ, atol=1e-06)

    def test_compare_with_trivial(self):
        rng = np.random.RandomState(0)
        n = 20
        X = rng.rand(n, 2)
        d = pdist(X)

        for method, code in _LINKAGE_METHODS.items():
            Z_trivial = _hierarchy.linkage(d, n, code)
            Z = linkage(d, method)
            assert_allclose(Z_trivial, Z, rtol=1e-14, atol=1e-15)

    def test_optimal_leaf_ordering(self):
        Z = linkage(hierarchy_test_data.ytdist, optimal_ordering=True)
        expectedZ = getattr(hierarchy_test_data, 'linkage_ytdist_single_olo')
        assert_allclose(Z, expectedZ, atol=1e-10)


class TestLinkageTies(object):
    _expectations = {
        'single': np.array([[0, 1, 1.41421356, 2],
                            [2, 3, 1.41421356, 3]]),
        'complete': np.array([[0, 1, 1.41421356, 2],
                              [2, 3, 2.82842712, 3]]),
        'average': np.array([[0, 1, 1.41421356, 2],
                             [2, 3, 2.12132034, 3]]),
        'weighted': np.array([[0, 1, 1.41421356, 2],
                              [2, 3, 2.12132034, 3]]),
        'centroid': np.array([[0, 1, 1.41421356, 2],
                              [2, 3, 2.12132034, 3]]),
        'median': np.array([[0, 1, 1.41421356, 2],
                            [2, 3, 2.12132034, 3]]),
        'ward': np.array([[0, 1, 1.41421356, 2],
                          [2, 3, 2.44948974, 3]]),
    }

    def test_linkage_ties(self):
        for method in ['single', 'complete', 'average', 'weighted', 'centroid', 'median', 'ward']:
            self.check_linkage_ties(method)

    def check_linkage_ties(self, method):
        X = np.array([[-1, -1], [0, 0], [1, 1]])
        Z = linkage(X, method=method)
        expectedZ = self._expectations[method]
        assert_allclose(Z, expectedZ, atol=1e-06)


class TestInconsistent(object):
    def test_inconsistent_tdist(self):
        for depth in hierarchy_test_data.inconsistent_ytdist:
            self.check_inconsistent_tdist(depth)

    def check_inconsistent_tdist(self, depth):
        Z = hierarchy_test_data.linkage_ytdist_single
        assert_allclose(inconsistent(Z, depth),
                        hierarchy_test_data.inconsistent_ytdist[depth])


class TestCopheneticDistance(object):
    def test_linkage_cophenet_tdist_Z(self):
        # Tests cophenet(Z) on tdist data set.
        expectedM = np.array([268, 295, 255, 255, 295, 295, 268, 268, 295, 295,
                              295, 138, 219, 295, 295])
        Z = hierarchy_test_data.linkage_ytdist_single
        M = cophenet(Z)
        assert_allclose(M, expectedM, atol=1e-10)

    def test_linkage_cophenet_tdist_Z_Y(self):
        # Tests cophenet(Z, Y) on tdist data set.
        Z = hierarchy_test_data.linkage_ytdist_single
        (c, M) = cophenet(Z, hierarchy_test_data.ytdist)
        expectedM = np.array([268, 295, 255, 255, 295, 295, 268, 268, 295, 295,
                              295, 138, 219, 295, 295])
        expectedc = 0.639931296433393415057366837573
        assert_allclose(c, expectedc, atol=1e-10)
        assert_allclose(M, expectedM, atol=1e-10)


class TestMLabLinkageConversion(object):
    def test_mlab_linkage_conversion_empty(self):
        # Tests from/to_mlab_linkage on empty linkage array.
        X = np.asarray([])
        assert_equal(from_mlab_linkage([]), X)
        assert_equal(to_mlab_linkage([]), X)

    def test_mlab_linkage_conversion_single_row(self):
        # Tests from/to_mlab_linkage on linkage array with single row.
        Z = np.asarray([[0., 1., 3., 2.]])
        Zm = [[1, 2, 3]]
        assert_equal(from_mlab_linkage(Zm), Z)
        assert_equal(to_mlab_linkage(Z), Zm)

    def test_mlab_linkage_conversion_multiple_rows(self):
        # Tests from/to_mlab_linkage on linkage array with multiple rows.
        Zm = np.asarray([[3, 6, 138], [4, 5, 219],
                         [1, 8, 255], [2, 9, 268], [7, 10, 295]])
        Z = np.array([[2., 5., 138., 2.],
                      [3., 4., 219., 2.],
                      [0., 7., 255., 3.],
                      [1., 8., 268., 4.],
                      [6., 9., 295., 6.]],
                      dtype=np.double)
        assert_equal(from_mlab_linkage(Zm), Z)
        assert_equal(to_mlab_linkage(Z), Zm)


class TestFcluster(object):
    def test_fclusterdata(self):
        for t in hierarchy_test_data.fcluster_inconsistent:
            self.check_fclusterdata(t, 'inconsistent')
        for t in hierarchy_test_data.fcluster_distance:
            self.check_fclusterdata(t, 'distance')
        for t in hierarchy_test_data.fcluster_maxclust:
            self.check_fclusterdata(t, 'maxclust')

    def check_fclusterdata(self, t, criterion):
        # Tests fclusterdata(X, criterion=criterion, t=t) on a random 3-cluster data set.
        expectedT = getattr(hierarchy_test_data, 'fcluster_' + criterion)[t]
        X = hierarchy_test_data.Q_X
        T = fclusterdata(X, criterion=criterion, t=t)
        assert_(is_isomorphic(T, expectedT))

    def test_fcluster(self):
        for t in hierarchy_test_data.fcluster_inconsistent:
            self.check_fcluster(t, 'inconsistent')
        for t in hierarchy_test_data.fcluster_distance:
            self.check_fcluster(t, 'distance')
        for t in hierarchy_test_data.fcluster_maxclust:
            self.check_fcluster(t, 'maxclust')

    def check_fcluster(self, t, criterion):
        # Tests fcluster(Z, criterion=criterion, t=t) on a random 3-cluster data set.
        expectedT = getattr(hierarchy_test_data, 'fcluster_' + criterion)[t]
        Z = single(hierarchy_test_data.Q_X)
        T = fcluster(Z, criterion=criterion, t=t)
        assert_(is_isomorphic(T, expectedT))

    def test_fcluster_monocrit(self):
        for t in hierarchy_test_data.fcluster_distance:
            self.check_fcluster_monocrit(t)
        for t in hierarchy_test_data.fcluster_maxclust:
            self.check_fcluster_maxclust_monocrit(t)

    def check_fcluster_monocrit(self, t):
        expectedT = hierarchy_test_data.fcluster_distance[t]
        Z = single(hierarchy_test_data.Q_X)
        T = fcluster(Z, t, criterion='monocrit', monocrit=maxdists(Z))
        assert_(is_isomorphic(T, expectedT))

    def check_fcluster_maxclust_monocrit(self, t):
        expectedT = hierarchy_test_data.fcluster_maxclust[t]
        Z = single(hierarchy_test_data.Q_X)
        T = fcluster(Z, t, criterion='maxclust_monocrit', monocrit=maxdists(Z))
        assert_(is_isomorphic(T, expectedT))


class TestLeaders(object):
    def test_leaders_single(self):
        # Tests leaders using a flat clustering generated by single linkage.
        X = hierarchy_test_data.Q_X
        Y = pdist(X)
        Z = linkage(Y)
        T = fcluster(Z, criterion='maxclust', t=3)
        Lright = (np.array([53, 55, 56]), np.array([2, 3, 1]))
        L = leaders(Z, T)
        assert_equal(L, Lright)


class TestIsIsomorphic(object):
    def test_is_isomorphic_1(self):
        # Tests is_isomorphic on test case #1 (one flat cluster, different labellings)
        a = [1, 1, 1]
        b = [2, 2, 2]
        assert_(is_isomorphic(a, b))
        assert_(is_isomorphic(b, a))

    def test_is_isomorphic_2(self):
        # Tests is_isomorphic on test case #2 (two flat clusters, different labelings)
        a = [1, 7, 1]
        b = [2, 3, 2]
        assert_(is_isomorphic(a, b))
        assert_(is_isomorphic(b, a))

    def test_is_isomorphic_3(self):
        # Tests is_isomorphic on test case #3 (no flat clusters)
        a = []
        b = []
        assert_(is_isomorphic(a, b))

    def test_is_isomorphic_4A(self):
        # Tests is_isomorphic on test case #4A (3 flat clusters, different labelings, isomorphic)
        a = [1, 2, 3]
        b = [1, 3, 2]
        assert_(is_isomorphic(a, b))
        assert_(is_isomorphic(b, a))

    def test_is_isomorphic_4B(self):
        # Tests is_isomorphic on test case #4B (3 flat clusters, different labelings, nonisomorphic)
        a = [1, 2, 3, 3]
        b = [1, 3, 2, 3]
        assert_(is_isomorphic(a, b) == False)
        assert_(is_isomorphic(b, a) == False)

    def test_is_isomorphic_4C(self):
        # Tests is_isomorphic on test case #4C (3 flat clusters, different labelings, isomorphic)
        a = [7, 2, 3]
        b = [6, 3, 2]
        assert_(is_isomorphic(a, b))
        assert_(is_isomorphic(b, a))

    def test_is_isomorphic_5(self):
        # Tests is_isomorphic on test case #5 (1000 observations, 2/3/5 random
        # clusters, random permutation of the labeling).
        for nc in [2, 3, 5]:
            self.help_is_isomorphic_randperm(1000, nc)

    def test_is_isomorphic_6(self):
        # Tests is_isomorphic on test case #5A (1000 observations, 2/3/5 random
        # clusters, random permutation of the labeling, slightly
        # nonisomorphic.)
        for nc in [2, 3, 5]:
            self.help_is_isomorphic_randperm(1000, nc, True, 5)

    def test_is_isomorphic_7(self):
        # Regression test for gh-6271
        assert_(not is_isomorphic([1, 2, 3], [1, 1, 1]))

    def help_is_isomorphic_randperm(self, nobs, nclusters, noniso=False, nerrors=0):
        for k in range(3):
            a = np.int_(np.random.rand(nobs) * nclusters)
            b = np.zeros(a.size, dtype=np.int_)
            P = np.random.permutation(nclusters)
            for i in xrange(0, a.shape[0]):
                b[i] = P[a[i]]
            if noniso:
                Q = np.random.permutation(nobs)
                b[Q[0:nerrors]] += 1
                b[Q[0:nerrors]] %= nclusters
Loading ...