Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

stream / scikit-learn   python

Repository URL to install this package:

Version: 0.17.1 

/ tests / test_dummy.py

from __future__ import division

import numpy as np
import scipy.sparse as sp

from sklearn.base import clone
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_warns_message
from sklearn.utils.testing import ignore_warnings
from sklearn.utils.stats import _weighted_percentile

from sklearn.dummy import DummyClassifier, DummyRegressor


@ignore_warnings
def _check_predict_proba(clf, X, y):
    proba = clf.predict_proba(X)
    # We know that we can have division by zero
    log_proba = clf.predict_log_proba(X)

    y = np.atleast_1d(y)
    if y.ndim == 1:
        y = np.reshape(y, (-1, 1))

    n_outputs = y.shape[1]
    n_samples = len(X)

    if n_outputs == 1:
        proba = [proba]
        log_proba = [log_proba]

    for k in range(n_outputs):
        assert_equal(proba[k].shape[0], n_samples)
        assert_equal(proba[k].shape[1], len(np.unique(y[:, k])))
        assert_array_equal(proba[k].sum(axis=1), np.ones(len(X)))
        # We know that we can have division by zero
        assert_array_equal(np.log(proba[k]), log_proba[k])


def _check_behavior_2d(clf):
    # 1d case
    X = np.array([[0], [0], [0], [0]])  # ignored
    y = np.array([1, 2, 1, 1])
    est = clone(clf)
    est.fit(X, y)
    y_pred = est.predict(X)
    assert_equal(y.shape, y_pred.shape)

    # 2d case
    y = np.array([[1, 0],
                  [2, 0],
                  [1, 0],
                  [1, 3]])
    est = clone(clf)
    est.fit(X, y)
    y_pred = est.predict(X)
    assert_equal(y.shape, y_pred.shape)


def _check_behavior_2d_for_constant(clf):
    # 2d case only
    X = np.array([[0], [0], [0], [0]])  # ignored
    y = np.array([[1, 0, 5, 4, 3],
                  [2, 0, 1, 2, 5],
                  [1, 0, 4, 5, 2],
                  [1, 3, 3, 2, 0]])
    est = clone(clf)
    est.fit(X, y)
    y_pred = est.predict(X)
    assert_equal(y.shape, y_pred.shape)


def _check_equality_regressor(statistic, y_learn, y_pred_learn,
                              y_test, y_pred_test):
    assert_array_equal(np.tile(statistic, (y_learn.shape[0], 1)),
                       y_pred_learn)
    assert_array_equal(np.tile(statistic, (y_test.shape[0], 1)),
                       y_pred_test)


def test_most_frequent_and_prior_strategy():
    X = [[0], [0], [0], [0]]  # ignored
    y = [1, 2, 1, 1]

    for strategy in ("most_frequent", "prior"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)
        assert_array_equal(clf.predict(X), np.ones(len(X)))
        _check_predict_proba(clf, X, y)

        if strategy == "prior":
            assert_array_equal(clf.predict_proba([X[0]]),
                               clf.class_prior_.reshape((1, -1)))
        else:
            assert_array_equal(clf.predict_proba([X[0]]),
                               clf.class_prior_.reshape((1, -1)) > 0.5)


def test_most_frequent_and_prior_strategy_multioutput():
    X = [[0], [0], [0], [0]]  # ignored
    y = np.array([[1, 0],
                  [2, 0],
                  [1, 0],
                  [1, 3]])

    n_samples = len(X)

    for strategy in ("prior", "most_frequent"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)
        assert_array_equal(clf.predict(X),
                           np.hstack([np.ones((n_samples, 1)),
                                      np.zeros((n_samples, 1))]))
        _check_predict_proba(clf, X, y)
        _check_behavior_2d(clf)


def test_stratified_strategy():
    X = [[0]] * 5  # ignored
    y = [1, 2, 1, 1, 2]
    clf = DummyClassifier(strategy="stratified", random_state=0)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)
    p = np.bincount(y_pred) / float(len(X))
    assert_almost_equal(p[1], 3. / 5, decimal=1)
    assert_almost_equal(p[2], 2. / 5, decimal=1)
    _check_predict_proba(clf, X, y)


def test_stratified_strategy_multioutput():
    X = [[0]] * 5  # ignored
    y = np.array([[2, 1],
                  [2, 2],
                  [1, 1],
                  [1, 2],
                  [1, 1]])

    clf = DummyClassifier(strategy="stratified", random_state=0)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)

    for k in range(y.shape[1]):
        p = np.bincount(y_pred[:, k]) / float(len(X))
        assert_almost_equal(p[1], 3. / 5, decimal=1)
        assert_almost_equal(p[2], 2. / 5, decimal=1)
        _check_predict_proba(clf, X, y)

    _check_behavior_2d(clf)


def test_uniform_strategy():
    X = [[0]] * 4  # ignored
    y = [1, 2, 1, 1]
    clf = DummyClassifier(strategy="uniform", random_state=0)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)
    p = np.bincount(y_pred) / float(len(X))
    assert_almost_equal(p[1], 0.5, decimal=1)
    assert_almost_equal(p[2], 0.5, decimal=1)
    _check_predict_proba(clf, X, y)


def test_uniform_strategy_multioutput():
    X = [[0]] * 4  # ignored
    y = np.array([[2, 1],
                  [2, 2],
                  [1, 2],
                  [1, 1]])
    clf = DummyClassifier(strategy="uniform", random_state=0)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)

    for k in range(y.shape[1]):
        p = np.bincount(y_pred[:, k]) / float(len(X))
        assert_almost_equal(p[1], 0.5, decimal=1)
        assert_almost_equal(p[2], 0.5, decimal=1)
        _check_predict_proba(clf, X, y)

    _check_behavior_2d(clf)


def test_string_labels():
    X = [[0]] * 5
    y = ["paris", "paris", "tokyo", "amsterdam", "berlin"]
    clf = DummyClassifier(strategy="most_frequent")
    clf.fit(X, y)
    assert_array_equal(clf.predict(X), ["paris"] * 5)


def test_classifier_exceptions():
    clf = DummyClassifier(strategy="unknown")
    assert_raises(ValueError, clf.fit, [], [])

    assert_raises(ValueError, clf.predict, [])
    assert_raises(ValueError, clf.predict_proba, [])


def test_mean_strategy_regressor():

    random_state = np.random.RandomState(seed=1)

    X = [[0]] * 4  # ignored
    y = random_state.randn(4)

    reg = DummyRegressor()
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.mean(y)] * len(X))


def test_mean_strategy_multioutput_regressor():

    random_state = np.random.RandomState(seed=1)

    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    mean = np.mean(y_learn, axis=0).reshape((1, -1))

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor()
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(mean, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d(est)


def test_regressor_exceptions():
    reg = DummyRegressor()
    assert_raises(ValueError, reg.predict, [])


def test_median_strategy_regressor():

    random_state = np.random.RandomState(seed=1)

    X = [[0]] * 5  # ignored
    y = random_state.randn(5)

    reg = DummyRegressor(strategy="median")
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.median(y)] * len(X))


def test_median_strategy_multioutput_regressor():

    random_state = np.random.RandomState(seed=1)

    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    median = np.median(y_learn, axis=0).reshape((1, -1))

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor(strategy="median")
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(
        median, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d(est)


def test_quantile_strategy_regressor():

    random_state = np.random.RandomState(seed=1)

    X = [[0]] * 5  # ignored
    y = random_state.randn(5)

    reg = DummyRegressor(strategy="quantile", quantile=0.5)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.median(y)] * len(X))

    reg = DummyRegressor(strategy="quantile", quantile=0)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.min(y)] * len(X))

    reg = DummyRegressor(strategy="quantile", quantile=1)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.max(y)] * len(X))

    reg = DummyRegressor(strategy="quantile", quantile=0.3)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.percentile(y, q=30)] * len(X))


def test_quantile_strategy_multioutput_regressor():

    random_state = np.random.RandomState(seed=1)

    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    median = np.median(y_learn, axis=0).reshape((1, -1))
    quantile_values = np.percentile(y_learn, axis=0, q=80).reshape((1, -1))

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor(strategy="quantile", quantile=0.5)
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(
        median, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d(est)

    # Correctness oracle
    est = DummyRegressor(strategy="quantile", quantile=0.8)
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(
        quantile_values, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d(est)


def test_quantile_invalid():

    X = [[0]] * 5  # ignored
Loading ...