Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

alkaline-ml / scikit-learn   python

Repository URL to install this package:

/ mixture / tests / test_gaussian_mixture.py

# Author: Wei Xue <xuewei4d@gmail.com>
#         Thierry Guillemot <thierry.guillemot.work@gmail.com>
# License: BSD 3 clause

import sys
import copy
import warnings
import pytest

import numpy as np
from scipy import stats, linalg

from sklearn.covariance import EmpiricalCovariance
from sklearn.datasets import make_spd_matrix
from io import StringIO
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.mixture import GaussianMixture
from sklearn.mixture._gaussian_mixture import (
    _estimate_gaussian_covariances_full,
    _estimate_gaussian_covariances_tied,
    _estimate_gaussian_covariances_diag,
    _estimate_gaussian_covariances_spherical,
    _compute_precision_cholesky,
    _compute_log_det_cholesky,
    )
from sklearn.exceptions import ConvergenceWarning, NotFittedError
from sklearn.utils.extmath import fast_logdet
from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_raise_message
from sklearn.utils._testing import assert_warns_message
from sklearn.utils._testing import ignore_warnings


COVARIANCE_TYPE = ['full', 'tied', 'diag', 'spherical']


def generate_data(n_samples, n_features, weights, means, precisions,
                  covariance_type):
    rng = np.random.RandomState(0)

    X = []
    if covariance_type == 'spherical':
        for _, (w, m, c) in enumerate(zip(weights, means,
                                          precisions['spherical'])):
            X.append(rng.multivariate_normal(m, c * np.eye(n_features),
                                             int(np.round(w * n_samples))))
    if covariance_type == 'diag':
        for _, (w, m, c) in enumerate(zip(weights, means,
                                          precisions['diag'])):
            X.append(rng.multivariate_normal(m, np.diag(c),
                                             int(np.round(w * n_samples))))
    if covariance_type == 'tied':
        for _, (w, m) in enumerate(zip(weights, means)):
            X.append(rng.multivariate_normal(m, precisions['tied'],
                                             int(np.round(w * n_samples))))
    if covariance_type == 'full':
        for _, (w, m, c) in enumerate(zip(weights, means,
                                          precisions['full'])):
            X.append(rng.multivariate_normal(m, c,
                                             int(np.round(w * n_samples))))

    X = np.vstack(X)
    return X


class RandomData:
    def __init__(self, rng, n_samples=200, n_components=2, n_features=2,
                 scale=50):
        self.n_samples = n_samples
        self.n_components = n_components
        self.n_features = n_features

        self.weights = rng.rand(n_components)
        self.weights = self.weights / self.weights.sum()
        self.means = rng.rand(n_components, n_features) * scale
        self.covariances = {
            'spherical': .5 + rng.rand(n_components),
            'diag': (.5 + rng.rand(n_components, n_features)) ** 2,
            'tied': make_spd_matrix(n_features, random_state=rng),
            'full': np.array([
                make_spd_matrix(n_features, random_state=rng) * .5
                for _ in range(n_components)])}
        self.precisions = {
            'spherical': 1. / self.covariances['spherical'],
            'diag': 1. / self.covariances['diag'],
            'tied': linalg.inv(self.covariances['tied']),
            'full': np.array([linalg.inv(covariance)
                             for covariance in self.covariances['full']])}

        self.X = dict(zip(COVARIANCE_TYPE, [generate_data(
            n_samples, n_features, self.weights, self.means, self.covariances,
            covar_type) for covar_type in COVARIANCE_TYPE]))
        self.Y = np.hstack([np.full(int(np.round(w * n_samples)), k,
                                    dtype=np.int)
                            for k, w in enumerate(self.weights)])


def test_gaussian_mixture_attributes():
    # test bad parameters
    rng = np.random.RandomState(0)
    X = rng.rand(10, 2)

    n_components_bad = 0
    gmm = GaussianMixture(n_components=n_components_bad)
    assert_raise_message(ValueError,
                         "Invalid value for 'n_components': %d "
                         "Estimation requires at least one component"
                         % n_components_bad, gmm.fit, X)

    # covariance_type should be in [spherical, diag, tied, full]
    covariance_type_bad = 'bad_covariance_type'
    gmm = GaussianMixture(covariance_type=covariance_type_bad)
    assert_raise_message(ValueError,
                         "Invalid value for 'covariance_type': %s "
                         "'covariance_type' should be in "
                         "['spherical', 'tied', 'diag', 'full']"
                         % covariance_type_bad,
                         gmm.fit, X)

    tol_bad = -1
    gmm = GaussianMixture(tol=tol_bad)
    assert_raise_message(ValueError,
                         "Invalid value for 'tol': %.5f "
                         "Tolerance used by the EM must be non-negative"
                         % tol_bad, gmm.fit, X)

    reg_covar_bad = -1
    gmm = GaussianMixture(reg_covar=reg_covar_bad)
    assert_raise_message(ValueError,
                         "Invalid value for 'reg_covar': %.5f "
                         "regularization on covariance must be "
                         "non-negative" % reg_covar_bad, gmm.fit, X)

    max_iter_bad = 0
    gmm = GaussianMixture(max_iter=max_iter_bad)
    assert_raise_message(ValueError,
                         "Invalid value for 'max_iter': %d "
                         "Estimation requires at least one iteration"
                         % max_iter_bad, gmm.fit, X)

    n_init_bad = 0
    gmm = GaussianMixture(n_init=n_init_bad)
    assert_raise_message(ValueError,
                         "Invalid value for 'n_init': %d "
                         "Estimation requires at least one run"
                         % n_init_bad, gmm.fit, X)

    init_params_bad = 'bad_method'
    gmm = GaussianMixture(init_params=init_params_bad)
    assert_raise_message(ValueError,
                         "Unimplemented initialization method '%s'"
                         % init_params_bad,
                         gmm.fit, X)

    # test good parameters
    n_components, tol, n_init, max_iter, reg_covar = 2, 1e-4, 3, 30, 1e-1
    covariance_type, init_params = 'full', 'random'
    gmm = GaussianMixture(n_components=n_components, tol=tol, n_init=n_init,
                          max_iter=max_iter, reg_covar=reg_covar,
                          covariance_type=covariance_type,
                          init_params=init_params).fit(X)

    assert gmm.n_components == n_components
    assert gmm.covariance_type == covariance_type
    assert gmm.tol == tol
    assert gmm.reg_covar == reg_covar
    assert gmm.max_iter == max_iter
    assert gmm.n_init == n_init
    assert gmm.init_params == init_params


def test_check_X():
    from sklearn.mixture._base import _check_X
    rng = np.random.RandomState(0)

    n_samples, n_components, n_features = 10, 2, 2

    X_bad_dim = rng.rand(n_components - 1, n_features)
    assert_raise_message(ValueError,
                         'Expected n_samples >= n_components '
                         'but got n_components = %d, n_samples = %d'
                         % (n_components, X_bad_dim.shape[0]),
                         _check_X, X_bad_dim, n_components)

    X_bad_dim = rng.rand(n_components, n_features + 1)
    assert_raise_message(ValueError,
                         'Expected the input data X have %d features, '
                         'but got %d features'
                         % (n_features, X_bad_dim.shape[1]),
                         _check_X, X_bad_dim, n_components, n_features)

    X = rng.rand(n_samples, n_features)
    assert_array_equal(X, _check_X(X, n_components, n_features))


def test_check_weights():
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)

    n_components = rand_data.n_components
    X = rand_data.X['full']

    g = GaussianMixture(n_components=n_components)

    # Check bad shape
    weights_bad_shape = rng.rand(n_components, 1)
    g.weights_init = weights_bad_shape
    assert_raise_message(ValueError,
                         "The parameter 'weights' should have the shape of "
                         "(%d,), but got %s" %
                         (n_components, str(weights_bad_shape.shape)),
                         g.fit, X)

    # Check bad range
    weights_bad_range = rng.rand(n_components) + 1
    g.weights_init = weights_bad_range
    assert_raise_message(ValueError,
                         "The parameter 'weights' should be in the range "
                         "[0, 1], but got max value %.5f, min value %.5f"
                         % (np.min(weights_bad_range),
                            np.max(weights_bad_range)),
                         g.fit, X)

    # Check bad normalization
    weights_bad_norm = rng.rand(n_components)
    weights_bad_norm = weights_bad_norm / (weights_bad_norm.sum() + 1)
    g.weights_init = weights_bad_norm
    assert_raise_message(ValueError,
                         "The parameter 'weights' should be normalized, "
                         "but got sum(weights) = %.5f"
                         % np.sum(weights_bad_norm),
                         g.fit, X)

    # Check good weights matrix
    weights = rand_data.weights
    g = GaussianMixture(weights_init=weights, n_components=n_components)
    g.fit(X)
    assert_array_equal(weights, g.weights_init)


def test_check_means():
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)

    n_components, n_features = rand_data.n_components, rand_data.n_features
    X = rand_data.X['full']

    g = GaussianMixture(n_components=n_components)

    # Check means bad shape
    means_bad_shape = rng.rand(n_components + 1, n_features)
    g.means_init = means_bad_shape
    assert_raise_message(ValueError,
                         "The parameter 'means' should have the shape of ",
                         g.fit, X)

    # Check good means matrix
    means = rand_data.means
    g.means_init = means
    g.fit(X)
    assert_array_equal(means, g.means_init)


def test_check_precisions():
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)

    n_components, n_features = rand_data.n_components, rand_data.n_features

    # Define the bad precisions for each covariance_type
    precisions_bad_shape = {
        'full': np.ones((n_components + 1, n_features, n_features)),
        'tied': np.ones((n_features + 1, n_features + 1)),
        'diag': np.ones((n_components + 1, n_features)),
        'spherical': np.ones((n_components + 1))}

    # Define not positive-definite precisions
    precisions_not_pos = np.ones((n_components, n_features, n_features))
    precisions_not_pos[0] = np.eye(n_features)
    precisions_not_pos[0, 0, 0] = -1.

    precisions_not_positive = {
        'full': precisions_not_pos,
        'tied': precisions_not_pos[0],
        'diag': np.full((n_components, n_features), -1.),
        'spherical': np.full(n_components, -1.)}

    not_positive_errors = {
        'full': 'symmetric, positive-definite',
        'tied': 'symmetric, positive-definite',
        'diag': 'positive',
        'spherical': 'positive'}

    for covar_type in COVARIANCE_TYPE:
        X = RandomData(rng).X[covar_type]
        g = GaussianMixture(n_components=n_components,
                            covariance_type=covar_type,
                            random_state=rng)

        # Check precisions with bad shapes
        g.precisions_init = precisions_bad_shape[covar_type]
        assert_raise_message(ValueError,
                             "The parameter '%s precision' should have "
                             "the shape of" % covar_type,
                             g.fit, X)

        # Check not positive precisions
        g.precisions_init = precisions_not_positive[covar_type]
        assert_raise_message(ValueError,
                             "'%s precision' should be %s"
                             % (covar_type, not_positive_errors[covar_type]),
                             g.fit, X)

        # Check the correct init of precisions_init
        g.precisions_init = rand_data.precisions[covar_type]
        g.fit(X)
        assert_array_equal(rand_data.precisions[covar_type], g.precisions_init)


def test_suffstat_sk_full():
    # compare the precision matrix compute from the
    # EmpiricalCovariance.covariance fitted on X*sqrt(resp)
    # with _sufficient_sk_full, n_components=1
    rng = np.random.RandomState(0)
    n_samples, n_features = 500, 2

    # special case 1, assuming data is "centered"
    X = rng.rand(n_samples, n_features)
    resp = rng.rand(n_samples, 1)
    X_resp = np.sqrt(resp) * X
    nk = np.array([n_samples])
    xk = np.zeros((1, n_features))
    covars_pred = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0)
    ecov = EmpiricalCovariance(assume_centered=True)
    ecov.fit(X_resp)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='frobenius'), 0)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='spectral'), 0)

    # check the precision computation
    precs_chol_pred = _compute_precision_cholesky(covars_pred, 'full')
    precs_pred = np.array([np.dot(prec, prec.T) for prec in precs_chol_pred])
    precs_est = np.array([linalg.inv(cov) for cov in covars_pred])
Loading ...