"""
Several basic tests for hierarchical clustering procedures
"""
# Authors: Vincent Michel, 2010, Gael Varoquaux 2012,
# Matteo Visconti di Oleggio Castello 2014
# License: BSD 3 clause
from tempfile import mkdtemp
import shutil
from functools import partial
import numpy as np
from scipy import sparse
from scipy.cluster import hierarchy
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_raise_message
from sklearn.utils.testing import ignore_warnings
from sklearn.cluster import ward_tree
from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration
from sklearn.cluster.hierarchical import (_hc_cut, _TREE_BUILDERS,
linkage_tree)
from sklearn.feature_extraction.image import grid_to_graph
from sklearn.metrics.pairwise import PAIRED_DISTANCES, cosine_distances,\
manhattan_distances, pairwise_distances
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.neighbors.graph import kneighbors_graph
from sklearn.cluster._hierarchical import average_merge, max_merge
from sklearn.utils.fast_dict import IntFloatDict
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_warns
def test_linkage_misc():
# Misc tests on linkage
rng = np.random.RandomState(42)
X = rng.normal(size=(5, 5))
assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
assert_raises(ValueError, linkage_tree, X, linkage='foo')
assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))
# Smoke test FeatureAgglomeration
FeatureAgglomeration().fit(X)
# test hiearchical clustering on a precomputed distances matrix
dis = cosine_distances(X)
res = linkage_tree(dis, affinity="precomputed")
assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])
# test hiearchical clustering on a precomputed distances matrix
res = linkage_tree(X, affinity=manhattan_distances)
assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
def test_structured_linkage_tree():
# Check that we obtain the correct solution for structured linkage trees.
rng = np.random.RandomState(0)
mask = np.ones([10, 10], dtype=np.bool)
# Avoiding a mask with only 'True' entries
mask[4:7, 4:7] = 0
X = rng.randn(50, 100)
connectivity = grid_to_graph(*mask.shape)
for tree_builder in _TREE_BUILDERS.values():
children, n_components, n_leaves, parent = \
tree_builder(X.T, connectivity)
n_nodes = 2 * X.shape[1] - 1
assert_true(len(children) + n_leaves == n_nodes)
# Check that ward_tree raises a ValueError with a connectivity matrix
# of the wrong shape
assert_raises(ValueError,
tree_builder, X.T, np.ones((4, 4)))
# Check that fitting with no samples raises an error
assert_raises(ValueError,
tree_builder, X.T[:0], connectivity)
def test_unstructured_linkage_tree():
# Check that we obtain the correct solution for unstructured linkage trees.
rng = np.random.RandomState(0)
X = rng.randn(50, 100)
for this_X in (X, X[0]):
# With specified a number of clusters just for the sake of
# raising a warning and testing the warning code
with ignore_warnings():
children, n_nodes, n_leaves, parent = assert_warns(
UserWarning, ward_tree, this_X.T, n_clusters=10)
n_nodes = 2 * X.shape[1] - 1
assert_equal(len(children) + n_leaves, n_nodes)
for tree_builder in _TREE_BUILDERS.values():
for this_X in (X, X[0]):
with ignore_warnings():
children, n_nodes, n_leaves, parent = assert_warns(
UserWarning, tree_builder, this_X.T, n_clusters=10)
n_nodes = 2 * X.shape[1] - 1
assert_equal(len(children) + n_leaves, n_nodes)
def test_height_linkage_tree():
# Check that the height of the results of linkage tree is sorted.
rng = np.random.RandomState(0)
mask = np.ones([10, 10], dtype=np.bool)
X = rng.randn(50, 100)
connectivity = grid_to_graph(*mask.shape)
for linkage_func in _TREE_BUILDERS.values():
children, n_nodes, n_leaves, parent = linkage_func(X.T, connectivity)
n_nodes = 2 * X.shape[1] - 1
assert_true(len(children) + n_leaves == n_nodes)
def test_agglomerative_clustering():
# Check that we obtain the correct number of clusters with
# agglomerative clustering.
rng = np.random.RandomState(0)
mask = np.ones([10, 10], dtype=np.bool)
n_samples = 100
X = rng.randn(n_samples, 50)
connectivity = grid_to_graph(*mask.shape)
for linkage in ("ward", "complete", "average"):
clustering = AgglomerativeClustering(n_clusters=10,
connectivity=connectivity,
linkage=linkage)
clustering.fit(X)
# test caching
try:
tempdir = mkdtemp()
clustering = AgglomerativeClustering(
n_clusters=10, connectivity=connectivity,
memory=tempdir,
linkage=linkage)
clustering.fit(X)
labels = clustering.labels_
assert_true(np.size(np.unique(labels)) == 10)
finally:
shutil.rmtree(tempdir)
# Turn caching off now
clustering = AgglomerativeClustering(
n_clusters=10, connectivity=connectivity, linkage=linkage)
# Check that we obtain the same solution with early-stopping of the
# tree building
clustering.compute_full_tree = False
clustering.fit(X)
assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
labels), 1)
clustering.connectivity = None
clustering.fit(X)
assert_true(np.size(np.unique(clustering.labels_)) == 10)
# Check that we raise a TypeError on dense matrices
clustering = AgglomerativeClustering(
n_clusters=10,
connectivity=sparse.lil_matrix(
connectivity.toarray()[:10, :10]),
linkage=linkage)
assert_raises(ValueError, clustering.fit, X)
# Test that using ward with another metric than euclidean raises an
# exception
clustering = AgglomerativeClustering(
n_clusters=10,
connectivity=connectivity.toarray(),
affinity="manhattan",
linkage="ward")
assert_raises(ValueError, clustering.fit, X)
# Test using another metric than euclidean works with linkage complete
for affinity in PAIRED_DISTANCES.keys():
# Compare our (structured) implementation to scipy
clustering = AgglomerativeClustering(
n_clusters=10,
connectivity=np.ones((n_samples, n_samples)),
affinity=affinity,
linkage="complete")
clustering.fit(X)
clustering2 = AgglomerativeClustering(
n_clusters=10,
connectivity=None,
affinity=affinity,
linkage="complete")
clustering2.fit(X)
assert_almost_equal(normalized_mutual_info_score(clustering2.labels_,
clustering.labels_),
1)
# Test that using a distance matrix (affinity = 'precomputed') has same
# results (with connectivity constraints)
clustering = AgglomerativeClustering(n_clusters=10,
connectivity=connectivity,
linkage="complete")
clustering.fit(X)
X_dist = pairwise_distances(X)
clustering2 = AgglomerativeClustering(n_clusters=10,
connectivity=connectivity,
affinity='precomputed',
linkage="complete")
clustering2.fit(X_dist)
assert_array_equal(clustering.labels_, clustering2.labels_)
def test_ward_agglomeration():
# Check that we obtain the correct solution in a simplistic case
rng = np.random.RandomState(0)
mask = np.ones([10, 10], dtype=np.bool)
X = rng.randn(50, 100)
connectivity = grid_to_graph(*mask.shape)
agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
agglo.fit(X)
assert_true(np.size(np.unique(agglo.labels_)) == 5)
X_red = agglo.transform(X)
assert_true(X_red.shape[1] == 5)
X_full = agglo.inverse_transform(X_red)
assert_true(np.unique(X_full[0]).size == 5)
assert_array_almost_equal(agglo.transform(X_full), X_red)
# Check that fitting with no samples raises a ValueError
assert_raises(ValueError, agglo.fit, X[:0])
def assess_same_labelling(cut1, cut2):
"""Util for comparison with scipy"""
co_clust = []
for cut in [cut1, cut2]:
n = len(cut)
k = cut.max() + 1
ecut = np.zeros((n, k))
ecut[np.arange(n), cut] = 1
co_clust.append(np.dot(ecut, ecut.T))
assert_true((co_clust[0] == co_clust[1]).all())
def test_scikit_vs_scipy():
# Test scikit linkage with full connectivity (i.e. unstructured) vs scipy
n, p, k = 10, 5, 3
rng = np.random.RandomState(0)
# Not using a lil_matrix here, just to check that non sparse
# matrices are well handled
connectivity = np.ones((n, n))
for linkage in _TREE_BUILDERS.keys():
for i in range(5):
X = .1 * rng.normal(size=(n, p))
X -= 4. * np.arange(n)[:, np.newaxis]
X -= X.mean(axis=1)[:, np.newaxis]
out = hierarchy.linkage(X, method=linkage)
children_ = out[:, :2].astype(np.int)
children, _, n_leaves, _ = _TREE_BUILDERS[linkage](X, connectivity)
cut = _hc_cut(k, children, n_leaves)
cut_ = _hc_cut(k, children_, n_leaves)
assess_same_labelling(cut, cut_)
# Test error management in _hc_cut
assert_raises(ValueError, _hc_cut, n_leaves + 1, children, n_leaves)
def test_connectivity_propagation():
# Check that connectivity in the ward tree is propagated correctly during
# merging.
X = np.array([(.014, .120), (.014, .099), (.014, .097),
(.017, .153), (.017, .153), (.018, .153),
(.018, .153), (.018, .153), (.018, .153),
(.018, .153), (.018, .153), (.018, .153),
(.018, .152), (.018, .149), (.018, .144)])
connectivity = kneighbors_graph(X, 10, include_self=False)
ward = AgglomerativeClustering(
n_clusters=4, connectivity=connectivity, linkage='ward')
# If changes are not propagated correctly, fit crashes with an
# IndexError
ward.fit(X)
def test_ward_tree_children_order():
# Check that children are ordered in the same way for both structured and
# unstructured versions of ward_tree.
# test on five random datasets
n, p = 10, 5
rng = np.random.RandomState(0)
connectivity = np.ones((n, n))
for i in range(5):
X = .1 * rng.normal(size=(n, p))
X -= 4. * np.arange(n)[:, np.newaxis]
X -= X.mean(axis=1)[:, np.newaxis]
out_unstructured = ward_tree(X)
out_structured = ward_tree(X, connectivity=connectivity)
assert_array_equal(out_unstructured[0], out_structured[0])
def test_ward_linkage_tree_return_distance():
# Test return_distance option on linkage and ward trees
# test that return_distance when set true, gives same
# output on both structured and unstructured clustering.
n, p = 10, 5
rng = np.random.RandomState(0)
connectivity = np.ones((n, n))
for i in range(5):
X = .1 * rng.normal(size=(n, p))
X -= 4. * np.arange(n)[:, np.newaxis]
X -= X.mean(axis=1)[:, np.newaxis]
out_unstructured = ward_tree(X, return_distance=True)
out_structured = ward_tree(X, connectivity=connectivity,
return_distance=True)
# get children
children_unstructured = out_unstructured[0]
children_structured = out_structured[0]
# check if we got the same clusters
assert_array_equal(children_unstructured, children_structured)
# check if the distances are the same
dist_unstructured = out_unstructured[-1]
dist_structured = out_structured[-1]
assert_array_almost_equal(dist_unstructured, dist_structured)
for linkage in ['average', 'complete']:
structured_items = linkage_tree(
X, connectivity=connectivity, linkage=linkage,
return_distance=True)[-1]
unstructured_items = linkage_tree(
X, linkage=linkage, return_distance=True)[-1]
structured_dist = structured_items[-1]
unstructured_dist = unstructured_items[-1]
structured_children = structured_items[0]
unstructured_children = unstructured_items[0]
assert_array_almost_equal(structured_dist, unstructured_dist)
assert_array_almost_equal(
structured_children, unstructured_children)
Loading ...