Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Debian packages RPM packages NuGet packages

Repository URL to install this package:

Details    
scikit-learn / metrics / tests / test_metrics.py
Size: Mime:
from __future__ import division, print_function

import numpy as np
from functools import partial
from itertools import product
import warnings

from sklearn import datasets
from sklearn import svm
from sklearn import ensemble

from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
from sklearn.datasets import make_multilabel_classification
from sklearn.utils import check_random_state, shuffle
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.fixes import np_version
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.testing import (assert_true,
                                   assert_raises,
                                   assert_raise_message,
                                   assert_equal,
                                   assert_almost_equal,
                                   assert_not_equal,
                                   assert_array_equal,
                                   assert_array_almost_equal,
                                   assert_warns,
                                   assert_no_warnings,
                                   assert_greater,
                                   ignore_warnings,
                                   assert_warns_message)


from sklearn.metrics import (accuracy_score,
                             average_precision_score,
                             auc,
                             auc_score,
                             classification_report,
                             confusion_matrix,
                             explained_variance_score,
                             f1_score,
                             fbeta_score,
                             hamming_loss,
                             hinge_loss,
                             jaccard_similarity_score,
                             log_loss,
                             matthews_corrcoef,
                             mean_squared_error,
                             mean_absolute_error,
                             precision_recall_curve,
                             precision_recall_fscore_support,
                             precision_score,
                             recall_score,
                             r2_score,
                             roc_auc_score,
                             roc_curve,
                             zero_one_loss)

from sklearn.metrics.metrics import _average_binary_score
from sklearn.metrics.metrics import _check_clf_targets
from sklearn.metrics.metrics import _check_reg_targets
from sklearn.metrics.metrics import UndefinedMetricWarning

from sklearn.externals.six.moves import xrange

# Note toward developers about metric testing
# -------------------------------------------
# It is often possible to write one general test for several metrics:
#
#   - invariance properties, e.g. invariance to sample order
#   - common behavior for an argument, e.g. the "normalize" with value True
#     will return the mean of the metrics and with value False will return
#     the sum of the metrics.
#
# In order to improve the overall metric testing, it is a good idea to write
# first a specific test for the given metric and then add a general test for
# all metrics that have the same behavior.
#
# Two types of datastructures are used in order to implement this system:
# dictionaries of metrics and lists of metrics wit common properties.
#
# Dictionaries of metrics
# ------------------------
# The goal of having those dictionaries is to have an easy way to call a
# particular metric and associate a name to each function:
#
#   - REGRESSION_METRICS: all regression metrics.
#   - CLASSIFICATION_METRICS: all classification metrics
#     which compare a ground truth and the estimated targets as returned by a
#     classifier.
#   - THRESHOLDED_METRICS: all classification metrics which
#     compare a ground truth and a score, e.g. estimated probabilities or
#     decision function (format might vary)
#
# Those dictionaries will be used to test systematically some invariance
# properties, e.g. invariance toward several input layout.
#

REGRESSION_METRICS = {
    "mean_absolute_error": mean_absolute_error,
    "mean_squared_error": mean_squared_error,
    "explained_variance_score": explained_variance_score,
    "r2_score": r2_score,
}

CLASSIFICATION_METRICS = {
    "accuracy_score": accuracy_score,
    "unnormalized_accuracy_score": partial(accuracy_score, normalize=False),
    "confusion_matrix": confusion_matrix,
    "hamming_loss": hamming_loss,

    "jaccard_similarity_score": jaccard_similarity_score,
    "unnormalized_jaccard_similarity_score":
    partial(jaccard_similarity_score, normalize=False),

    "zero_one_loss": zero_one_loss,
    "unnormalized_zero_one_loss": partial(zero_one_loss, normalize=False),

    "precision_score": precision_score,
    "recall_score": recall_score,
    "f1_score": f1_score,
    "f2_score": partial(fbeta_score, beta=2),
    "f0.5_score": partial(fbeta_score, beta=0.5),
    "matthews_corrcoef_score": matthews_corrcoef,

    "weighted_f0.5_score": partial(fbeta_score, average="weighted", beta=0.5),
    "weighted_f1_score": partial(f1_score, average="weighted"),
    "weighted_f2_score": partial(fbeta_score, average="weighted", beta=2),
    "weighted_precision_score": partial(precision_score, average="weighted"),
    "weighted_recall_score": partial(recall_score, average="weighted"),

    "micro_f0.5_score": partial(fbeta_score, average="micro", beta=0.5),
    "micro_f1_score": partial(f1_score, average="micro"),
    "micro_f2_score": partial(fbeta_score, average="micro", beta=2),
    "micro_precision_score": partial(precision_score, average="micro"),
    "micro_recall_score": partial(recall_score, average="micro"),

    "macro_f0.5_score": partial(fbeta_score, average="macro", beta=0.5),
    "macro_f1_score": partial(f1_score, average="macro"),
    "macro_f2_score": partial(fbeta_score, average="macro", beta=2),
    "macro_precision_score": partial(precision_score, average="macro"),
    "macro_recall_score": partial(recall_score, average="macro"),

    "samples_f0.5_score": partial(fbeta_score, average="samples", beta=0.5),
    "samples_f1_score": partial(f1_score, average="samples"),
    "samples_f2_score": partial(fbeta_score, average="samples", beta=2),
    "samples_precision_score": partial(precision_score, average="samples"),
    "samples_recall_score": partial(recall_score, average="samples"),
}

THRESHOLDED_METRICS = {
    "log_loss": log_loss,
    "hinge_loss": hinge_loss,

    "roc_auc_score": roc_auc_score,
    "weighted_roc_auc": partial(roc_auc_score, average="weighted"),
    "samples_roc_auc": partial(roc_auc_score, average="samples"),
    "micro_roc_auc": partial(roc_auc_score, average="micro"),
    "macro_roc_auc": partial(roc_auc_score, average="macro"),

    "average_precision_score": average_precision_score,
    "weighted_average_precision_score":
    partial(average_precision_score, average="weighted"),
    "samples_average_precision_score":
    partial(average_precision_score, average="samples"),
    "micro_average_precision_score":
    partial(average_precision_score, average="micro"),
    "macro_average_precision_score":
    partial(average_precision_score, average="macro"),
}

ALL_METRICS = dict()
ALL_METRICS.update(THRESHOLDED_METRICS)
ALL_METRICS.update(CLASSIFICATION_METRICS)
ALL_METRICS.update(REGRESSION_METRICS)

# Lists of metrics with common properties
# ---------------------------------------
# Lists of metrics with common properties are used to test systematically some
# functionalities and invariance, e.g. SYMMETRIC_METRICS lists all metrics that
# are symmetric with respect to their input argument y_true and y_pred.
#
# When you add a new metric or functionality, check if a general test
# is already written.

# Metric undefined with "binary" or "multiclass" input
METRIC_UNDEFINED_MULTICLASS = [
    "samples_f0.5_score", "samples_f1_score", "samples_f2_score",
    "samples_precision_score", "samples_recall_score",

    # Those metrics don't support multiclass outputs
    "average_precision_score", "weighted_average_precision_score",
    "micro_average_precision_score", "macro_average_precision_score",
    "samples_average_precision_score",

    "roc_auc_score", "micro_roc_auc", "weighted_roc_auc",
    "macro_roc_auc",  "samples_roc_auc",
]

# Metrics with an "average" argument
METRICS_WITH_AVERAGING = [
    "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score"
]

# Treshold-based metrics with an "average" argument
THRESHOLDED_METRICS_WITH_AVERAGING = [
    "roc_auc_score", "average_precision_score",
]

# Metrics with a "pos_label" argument
METRICS_WITH_POS_LABEL = [
    "roc_curve", "hinge_loss",

    "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score",

    "weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score",
    "weighted_precision_score", "weighted_recall_score",

    "micro_f0.5_score", "micro_f1_score", "micro_f2_score",
    "micro_precision_score", "micro_recall_score",

    "macro_f0.5_score", "macro_f1_score", "macro_f2_score",
    "macro_precision_score", "macro_recall_score",
]

# Metrics with a "labels" argument
METRICS_WITH_LABELS = [
    "confusion_matrix",

    "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score",

    "weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score",
    "weighted_precision_score", "weighted_recall_score",

    "micro_f0.5_score", "micro_f1_score", "micro_f2_score",
    "micro_precision_score", "micro_recall_score",

    "macro_f0.5_score", "macro_f1_score", "macro_f2_score",
    "macro_precision_score", "macro_recall_score",
]

# Metrics with a "normalize" option
METRICS_WITH_NORMALIZE_OPTION = [
    "accuracy_score",
    "jaccard_similarity_score",
    "zero_one_loss",
]

# Threshold-based metrics with "multilabel-indicator" format support
THRESHOLDED_MULTILABEL_METRICS = [
    "log_loss",

    "roc_auc_score", "weighted_roc_auc", "samples_roc_auc",
    "micro_roc_auc", "macro_roc_auc",

    "average_precision_score", "weighted_average_precision_score",
    "samples_average_precision_score", "micro_average_precision_score",
    "macro_average_precision_score",
]

# Classification metrics with  "multilabel-indicator" and
# "multilabel-sequence" format support
MULTILABELS_METRICS = [
    "accuracy_score", "unnormalized_accuracy_score",
    "hamming_loss",
    "jaccard_similarity_score", "unnormalized_jaccard_similarity_score",
    "zero_one_loss", "unnormalized_zero_one_loss",

    "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score",

    "weighted_f0.5_score", "weighted_f1_score", "weighted_f2_score",
    "weighted_precision_score", "weighted_recall_score",

    "micro_f0.5_score", "micro_f1_score", "micro_f2_score",
    "micro_precision_score", "micro_recall_score",

    "macro_f0.5_score", "macro_f1_score", "macro_f2_score",
    "macro_precision_score", "macro_recall_score",

    "samples_f0.5_score", "samples_f1_score", "samples_f2_score",
    "samples_precision_score", "samples_recall_score",
]

# Regression metrics with "multioutput-continuous" format support
MULTIOUTPUT_METRICS = [
    "mean_absolute_error", "mean_squared_error", "r2_score",
]

# Symmetric with respect to their input arguments y_true and y_pred
# metric(y_true, y_pred) == metric(y_pred, y_true).
SYMMETRIC_METRICS = [
    "accuracy_score", "unnormalized_accuracy_score",
    "hamming_loss",
    "jaccard_similarity_score", "unnormalized_jaccard_similarity_score",
    "zero_one_loss", "unnormalized_zero_one_loss",

    "f1_score", "weighted_f1_score", "micro_f1_score", "macro_f1_score",

    "matthews_corrcoef_score", "mean_absolute_error", "mean_squared_error"

]

# Asymmetric with respect to their input arguments y_true and y_pred
# metric(y_true, y_pred) != metric(y_pred, y_true).
NOT_SYMMETRIC_METRICS = [
    "explained_variance_score",
    "r2_score",
    "confusion_matrix",

    "precision_score", "recall_score", "f2_score", "f0.5_score",

    "weighted_f0.5_score", "weighted_f2_score", "weighted_precision_score",
    "weighted_recall_score",

    "micro_f0.5_score", "micro_f2_score", "micro_precision_score",
    "micro_recall_score",

    "macro_f0.5_score", "macro_f2_score", "macro_precision_score",
    "macro_recall_score", "log_loss", "hinge_loss"
]


# No Sample weight support
METRICS_WITHOUT_SAMPLE_WEIGHT = [
    "confusion_matrix",
    "hamming_loss",
    "hinge_loss",
    "jaccard_similarity_score", "unnormalized_jaccard_similarity_score",
    "log_loss",
    "matthews_corrcoef_score",
]

###############################################################################
# Utilities for testing


def make_prediction(dataset=None, binary=False):
    """Make some classification predictions on a toy dataset using a SVC

    If binary is True restrict to a binary classification problem instead of a
    multiclass classification problem
    """

    if dataset is None:
        # import some data to play with
        dataset = datasets.load_iris()

    X = dataset.data
    y = dataset.target

    if binary:
        # restrict to a binary classification task
        X, y = X[y < 2], y[y < 2]

    n_samples, n_features = X.shape
    p = np.arange(n_samples)

    rng = check_random_state(37)
    rng.shuffle(p)
    X, y = X[p], y[p]
    half = int(n_samples / 2)

    # add noisy features to make the problem harder and avoid perfect results
    rng = np.random.RandomState(0)
    X = np.c_[X, rng.randn(n_samples, 200 * n_features)]

    # run classifier, get class probabilities and label predictions
    clf = svm.SVC(kernel='linear', probability=True, random_state=0)
    probas_pred = clf.fit(X[:half], y[:half]).predict_proba(X[half:])

    if binary:
        # only interested in probabilities of the positive case
        # XXX: do we really want a special API for the binary case?
        probas_pred = probas_pred[:, 1]

    y_pred = clf.predict(X[half:])
    y_true = y[half:]
    return y_true, y_pred, probas_pred


def _auc(y_true, y_score):
    """Alternative implementation to check for correctness of
    `roc_auc_score`."""
    pos_label = np.unique(y_true)[1]

    # Count the number of times positive samples are correctly ranked above
    # negative samples.
    pos = y_score[y_true == pos_label]
    neg = y_score[y_true != pos_label]
    diff_matrix = pos.reshape(1, -1) - neg.reshape(-1, 1)
    n_correct = np.sum(diff_matrix > 0)

    return n_correct / float(len(pos) * len(neg))


###############################################################################
# Tests
def _average_precision(y_true, y_score):
    """Alternative implementation to check for correctness of
    `average_precision_score`."""
    pos_label = np.unique(y_true)[1]
    n_pos = np.sum(y_true == pos_label)
    order = np.argsort(y_score)[::-1]
    y_score = y_score[order]
    y_true = y_true[order]

    score = 0
    for i in xrange(len(y_score)):
        if y_true[i] == pos_label:
            # Compute precision up to document i
            # i.e, percentage of relevant documents up to document i.
            prec = 0
            for j in xrange(0, i + 1):
                if y_true[j] == pos_label:
                    prec += 1.0
            prec /= (i + 1.0)
            score += prec

    return score / n_pos


def test_roc_curve():
    """Test Area under Receiver Operating Characteristic (ROC) curve"""
    y_true, _, probas_pred = make_prediction(binary=True)

    fpr, tpr, thresholds = roc_curve(y_true, probas_pred)
    roc_auc = auc(fpr, tpr)
    expected_auc = _auc(y_true, probas_pred)
    assert_array_almost_equal(roc_auc, expected_auc, decimal=2)
    assert_almost_equal(roc_auc, roc_auc_score(y_true, probas_pred))

    assert_almost_equal(roc_auc,
                        ignore_warnings(auc_score)(y_true, probas_pred))

    assert_equal(fpr.shape, tpr.shape)
    assert_equal(fpr.shape, thresholds.shape)


def test_roc_curve_end_points():
    # Make sure that roc_curve returns a curve start at 0 and ending and
    # 1 even in corner cases
    rng = np.random.RandomState(0)
    y_true = np.array([0] * 50 + [1] * 50)
    y_pred = rng.randint(3, size=100)
    fpr, tpr, thr = roc_curve(y_true, y_pred)
    assert_equal(fpr[0], 0)
    assert_equal(fpr[-1], 1)
    assert_equal(fpr.shape, tpr.shape)
    assert_equal(fpr.shape, thr.shape)


def test_roc_returns_consistency():
    """Test whether the returned threshold matches up with tpr"""
    # make small toy dataset
    y_true, _, probas_pred = make_prediction(binary=True)
    fpr, tpr, thresholds = roc_curve(y_true, probas_pred)

    # use the given thresholds to determine the tpr
    tpr_correct = []
    for t in thresholds:
        tp = np.sum((probas_pred >= t) & y_true)
        p = np.sum(y_true)
        tpr_correct.append(1.0 * tp / p)

    # compare tpr and tpr_correct to see if the thresholds' order was correct
    assert_array_almost_equal(tpr, tpr_correct, decimal=2)
    assert_equal(fpr.shape, tpr.shape)
    assert_equal(fpr.shape, thresholds.shape)


def test_roc_nonrepeating_thresholds():
    """Test to ensure that we don't return spurious repeating thresholds.

    Duplicated thresholds can arise due to machine precision issues.
    """
    dataset = datasets.load_digits()
    X = dataset['data']
    y = dataset['target']

    # This random forest classifier can only return probabilities
    # significant to two decimal places
    clf = ensemble.RandomForestClassifier(n_estimators=100, random_state=0)

    # How well can the classifier predict whether a digit is less than 5?
    # This task contributes floating point roundoff errors to the probabilities
    train, test = slice(None, None, 2), slice(1, None, 2)
    probas_pred = clf.fit(X[train], y[train]).predict_proba(X[test])
    y_score = probas_pred[:, :5].sum(axis=1)  # roundoff errors begin here
    y_true = [yy < 5 for yy in y[test]]

    # Check for repeating values in the thresholds
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    assert_equal(thresholds.size, np.unique(np.round(thresholds, 2)).size)


def test_roc_curve_multi():
    """roc_curve not applicable for multi-class problems"""
    y_true, _, probas_pred = make_prediction(binary=False)

    assert_raises(ValueError, roc_curve, y_true, probas_pred)


def test_roc_curve_confidence():
    """roc_curve for confidence scores"""
    y_true, _, probas_pred = make_prediction(binary=True)

    fpr, tpr, thresholds = roc_curve(y_true, probas_pred - 0.5)
    roc_auc = auc(fpr, tpr)
    assert_array_almost_equal(roc_auc, 0.90, decimal=2)
    assert_equal(fpr.shape, tpr.shape)
    assert_equal(fpr.shape, thresholds.shape)


def test_roc_curve_hard():
    """roc_curve for hard decisions"""
    y_true, pred, probas_pred = make_prediction(binary=True)

    # always predict one
    trivial_pred = np.ones(y_true.shape)
    fpr, tpr, thresholds = roc_curve(y_true, trivial_pred)
    roc_auc = auc(fpr, tpr)
    assert_array_almost_equal(roc_auc, 0.50, decimal=2)
    assert_equal(fpr.shape, tpr.shape)
    assert_equal(fpr.shape, thresholds.shape)

    # always predict zero
    trivial_pred = np.zeros(y_true.shape)
    fpr, tpr, thresholds = roc_curve(y_true, trivial_pred)
    roc_auc = auc(fpr, tpr)
    assert_array_almost_equal(roc_auc, 0.50, decimal=2)
    assert_equal(fpr.shape, tpr.shape)
    assert_equal(fpr.shape, thresholds.shape)

    # hard decisions
    fpr, tpr, thresholds = roc_curve(y_true, pred)
    roc_auc = auc(fpr, tpr)
    assert_array_almost_equal(roc_auc, 0.78, decimal=2)
    assert_equal(fpr.shape, tpr.shape)
    assert_equal(fpr.shape, thresholds.shape)


def test_roc_curve_one_label():
    y_true = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    y_pred = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
    # assert there are warnings
    w = UndefinedMetricWarning
    fpr, tpr, thresholds = assert_warns(w, roc_curve, y_true, y_pred)
    # all true labels, all fpr should be nan
    assert_array_equal(fpr,
                       np.nan * np.ones(len(thresholds)))
    assert_equal(fpr.shape, tpr.shape)
    assert_equal(fpr.shape, thresholds.shape)

    # assert there are warnings
    fpr, tpr, thresholds = assert_warns(w, roc_curve,
                                        [1 - x for x in y_true],
                                        y_pred)
    # all negative labels, all tpr should be nan
    assert_array_equal(tpr,
                       np.nan * np.ones(len(thresholds)))
    assert_equal(fpr.shape, tpr.shape)
    assert_equal(fpr.shape, thresholds.shape)


def test_roc_curve_toydata():
    # Binary classification
    y_true = [0, 1]
    y_score = [0, 1]
    tpr, fpr, _ = roc_curve(y_true, y_score)
    roc_auc = roc_auc_score(y_true, y_score)
    assert_array_almost_equal(tpr, [0, 1])
    assert_array_almost_equal(fpr, [1, 1])
    assert_almost_equal(roc_auc, 1.)

    y_true = [0, 1]
    y_score = [1, 0]
    tpr, fpr, _ = roc_curve(y_true, y_score)
    roc_auc = roc_auc_score(y_true, y_score)
    assert_array_almost_equal(tpr, [0, 1, 1])
    assert_array_almost_equal(fpr, [0, 0, 1])
    assert_almost_equal(roc_auc, 0.)

    y_true = [1, 0]
    y_score = [1, 1]
    tpr, fpr, _ = roc_curve(y_true, y_score)
    roc_auc = roc_auc_score(y_true, y_score)
    assert_array_almost_equal(tpr, [0, 1])
    assert_array_almost_equal(fpr, [0, 1])
    assert_almost_equal(roc_auc, 0.5)

    y_true = [1, 0]
    y_score = [1, 0]
    tpr, fpr, _ = roc_curve(y_true, y_score)
    roc_auc = roc_auc_score(y_true, y_score)
    assert_array_almost_equal(tpr, [0, 1])
    assert_array_almost_equal(fpr, [1, 1])
    assert_almost_equal(roc_auc, 1.)

    y_true = [1, 0]
    y_score = [0.5, 0.5]
    tpr, fpr, _ = roc_curve(y_true, y_score)
    roc_auc = roc_auc_score(y_true, y_score)
    assert_array_almost_equal(tpr, [0, 1])
    assert_array_almost_equal(fpr, [0, 1])
    assert_almost_equal(roc_auc, .5)

    y_true = [0, 0]
    y_score = [0.25, 0.75]
    tpr, fpr, _ = roc_curve(y_true, y_score)
    assert_raises(ValueError, roc_auc_score, y_true, y_score)
    assert_array_almost_equal(tpr, [0., 0.5, 1.])
    assert_array_almost_equal(fpr, [np.nan,  np.nan,  np.nan])

    y_true = [1, 1]
    y_score = [0.25, 0.75]
    tpr, fpr, _ = roc_curve(y_true, y_score)
    assert_raises(ValueError, roc_auc_score, y_true, y_score)
    assert_array_almost_equal(tpr, [np.nan, np.nan])
    assert_array_almost_equal(fpr, [0.5, 1.])

    # Multi-label classification task
    y_true = np.array([[0, 1], [0, 1]])
    y_score = np.array([[0, 1], [0, 1]])
    assert_raises(ValueError, roc_auc_score, y_true, y_score, average="macro")
    assert_raises(ValueError, roc_auc_score, y_true, y_score,
                  average="weighted")
    assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 1.)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 1.)

    y_true = np.array([[0, 1], [0, 1]])
    y_score = np.array([[0, 1], [1, 0]])
    assert_raises(ValueError, roc_auc_score, y_true, y_score, average="macro")
    assert_raises(ValueError, roc_auc_score, y_true, y_score,
                  average="weighted")
    assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0.5)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0.5)

    y_true = np.array([[1, 0], [0, 1]])
    y_score = np.array([[0, 1], [1, 0]])
    assert_almost_equal(roc_auc_score(y_true, y_score, average="macro"), 0)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="weighted"), 0)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0)

    y_true = np.array([[1, 0], [0, 1]])
    y_score = np.array([[0.5, 0.5], [0.5, 0.5]])
    assert_almost_equal(roc_auc_score(y_true, y_score, average="macro"), .5)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="weighted"), .5)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), .5)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), .5)


def test_auc():
    """Test Area Under Curve (AUC) computation"""
    x = [0, 1]
    y = [0, 1]
    assert_array_almost_equal(auc(x, y), 0.5)
    x = [1, 0]
    y = [0, 1]
    assert_array_almost_equal(auc(x, y), 0.5)
    x = [1, 0, 0]
    y = [0, 1, 1]
    assert_array_almost_equal(auc(x, y), 0.5)
    x = [0, 1]
    y = [1, 1]
    assert_array_almost_equal(auc(x, y), 1)
    x = [0, 0.5, 1]
    y = [0, 0.5, 1]
    assert_array_almost_equal(auc(x, y), 0.5)


def test_auc_duplicate_values():
    # Test Area Under Curve (AUC) computation with duplicate values

    # auc() was previously sorting the x and y arrays according to the indices
    # from numpy.argsort(x), which was reordering the tied 0's in this example
    # and resulting in an incorrect area computation. This test detects the
    # error.
    x = [-2.0, 0.0, 0.0, 0.0, 1.0]
    y1 = [2.0, 0.0, 0.5, 1.0, 1.0]
    y2 = [2.0, 1.0, 0.0, 0.5, 1.0]
    y3 = [2.0, 1.0, 0.5, 0.0, 1.0]

    for y in (y1, y2, y3):
        assert_array_almost_equal(auc(x, y, reorder=True), 3.0)


def test_auc_errors():
    # Incompatible shapes
    assert_raises(ValueError, auc, [0.0, 0.5, 1.0], [0.1, 0.2])

    # Too few x values
    assert_raises(ValueError, auc, [0.0], [0.1])

    # x is not in order
    assert_raises(ValueError, auc, [1.0, 0.0, 0.5], [0.0, 0.0, 0.0])


def test_auc_score_non_binary_class():
    """Test that roc_auc_score function returns an error when trying
    to compute AUC for non-binary class values.
    """
    rng = check_random_state(404)
    y_pred = rng.rand(10)
    # y_true contains only one class value
    y_true = np.zeros(10, dtype="int")
    assert_raise_message(ValueError, "ROC AUC score is not defined",
                         roc_auc_score, y_true, y_pred)
    y_true = np.ones(10, dtype="int")
    assert_raise_message(ValueError, "ROC AUC score is not defined",
                         roc_auc_score, y_true, y_pred)
    y_true = -np.ones(10, dtype="int")
    assert_raise_message(ValueError, "ROC AUC score is not defined",
                         roc_auc_score, y_true, y_pred)
    # y_true contains three different class values
    y_true = rng.randint(0, 3, size=10)
    assert_raise_message(ValueError, "multiclass format is not supported",
                         roc_auc_score, y_true, y_pred)

    with warnings.catch_warnings(record=True):
        rng = check_random_state(404)
        y_pred = rng.rand(10)
        # y_true contains only one class value
        y_true = np.zeros(10, dtype="int")
        assert_raise_message(ValueError, "ROC AUC score is not defined",
                             roc_auc_score, y_true, y_pred)
        y_true = np.ones(10, dtype="int")
        assert_raise_message(ValueError, "ROC AUC score is not defined",
                             roc_auc_score, y_true, y_pred)
        y_true = -np.ones(10, dtype="int")
        assert_raise_message(ValueError, "ROC AUC score is not defined",
                             roc_auc_score, y_true, y_pred)

        # y_true contains three different class values
        y_true = rng.randint(0, 3, size=10)
        assert_raise_message(ValueError, "multiclass format is not supported",
                             roc_auc_score, y_true, y_pred)


def test_precision_recall_f1_score_binary():
    """Test Precision Recall and F1 Score for binary classification task"""
    y_true, y_pred, _ = make_prediction(binary=True)

    # detailed measures for each class
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    assert_array_almost_equal(p, [0.73, 0.85], 2)
    assert_array_almost_equal(r, [0.88, 0.68], 2)
    assert_array_almost_equal(f, [0.80, 0.76], 2)
    assert_array_equal(s, [25, 25])

    # individual scoring function that can be used for grid search: in the
    # binary class case the score is the value of the measure for the positive
    # class (e.g. label == 1)
    ps = precision_score(y_true, y_pred)
    assert_array_almost_equal(ps, 0.85, 2)

    rs = recall_score(y_true, y_pred)
    assert_array_almost_equal(rs, 0.68, 2)

    fs = f1_score(y_true, y_pred)
    assert_array_almost_equal(fs, 0.76, 2)

    assert_almost_equal(fbeta_score(y_true, y_pred, beta=2),
                        (1 + 2 ** 2) * ps * rs / (2 ** 2 * ps + rs), 2)


@ignore_warnings
def test_precision_recall_f_binary_single_class():
    """Test precision, recall and F1 score behave with a single positive or
    negative class

    Such a case may occur with non-stratified cross-validation"""
    assert_equal(1., precision_score([1, 1], [1, 1]))
    assert_equal(1., recall_score([1, 1], [1, 1]))
    assert_equal(1., f1_score([1, 1], [1, 1]))

    assert_equal(0., precision_score([-1, -1], [-1, -1]))
    assert_equal(0., recall_score([-1, -1], [-1, -1]))
    assert_equal(0., f1_score([-1, -1], [-1, -1]))


def test_average_precision_score_score_non_binary_class():
    """Test that average_precision_score function returns an error when trying
    to compute average_precision_score for multiclass task.
    """
    rng = check_random_state(404)
    y_pred = rng.rand(10)

    # y_true contains three different class values
    y_true = rng.randint(0, 3, size=10)
    assert_raise_message(ValueError, "multiclass format is not supported",
                         average_precision_score, y_true, y_pred)


def test_average_precision_score_duplicate_values():
    # Duplicate values with precision-recall require a different
    # processing than when computing the AUC of a ROC, because the
    # precision-recall curve is a decreasing curve
    # The following situtation corresponds to a perfect
    # test statistic, the average_precision_score should be 1
    y_true = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
    y_score = [0, .1, .1, .4, .5, .6, .6, .9, .9, 1, 1]
    assert_equal(average_precision_score(y_true, y_score), 1)


def test_average_precision_score_tied_values():
    # Here if we go from left to right in y_true, the 0 values are
    # are separated from the 1 values, so it appears that we've
    # Correctly sorted our classifications. But in fact the first two
    # values have the same score (0.5) and so the first two values
    # could be swapped around, creating an imperfect sorting. This
    # imperfection should come through in the end score, making it less
    # than one.
    y_true = [0, 1, 1]
    y_score = [.5, .5, .6]
    assert_not_equal(average_precision_score(y_true, y_score), 1.)


def test_precision_recall_fscore_support_errors():
    y_true, y_pred, _ = make_prediction(binary=True)

    # Bad beta
    assert_raises(ValueError, precision_recall_fscore_support,
                  y_true, y_pred, beta=0.0)

    # Bad pos_label
    assert_raises(ValueError, precision_recall_fscore_support,
                  y_true, y_pred, pos_label=2, average='macro')

    # Bad average option
    assert_raises(ValueError, precision_recall_fscore_support,
                  [0, 1, 2], [1, 2, 0], average='mega')


def test_confusion_matrix_binary():
    """Test confusion matrix - binary classification case"""
    y_true, y_pred, _ = make_prediction(binary=True)

    def test(y_true, y_pred):
        cm = confusion_matrix(y_true, y_pred)
        assert_array_equal(cm, [[22, 3], [8, 17]])

        tp, fp, fn, tn = cm.flatten()
        num = (tp * tn - fp * fn)
        den = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

        true_mcc = 0 if den == 0 else num / den
        mcc = matthews_corrcoef(y_true, y_pred)
        assert_array_almost_equal(mcc, true_mcc, decimal=2)
        assert_array_almost_equal(mcc, 0.57, decimal=2)

    test(y_true, y_pred)
    test([str(y) for y in y_true],
         [str(y) for y in y_pred])


@ignore_warnings
def test_matthews_corrcoef_nan():
    assert_equal(matthews_corrcoef([0], [1]), 0.0)
    assert_equal(matthews_corrcoef([0, 0], [0, 1]), 0.0)


def test_precision_recall_f1_score_multiclass():
    """Test Precision Recall and F1 Score for multiclass classification task"""
    y_true, y_pred, _ = make_prediction(binary=False)

    # compute scores with default labels introspection
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    assert_array_almost_equal(p, [0.83, 0.33, 0.42], 2)
    assert_array_almost_equal(r, [0.79, 0.09, 0.90], 2)
    assert_array_almost_equal(f, [0.81, 0.15, 0.57], 2)
    assert_array_equal(s, [24, 31, 20])

    # averaging tests
    ps = precision_score(y_true, y_pred, pos_label=1, average='micro')
    assert_array_almost_equal(ps, 0.53, 2)

    rs = recall_score(y_true, y_pred, average='micro')
    assert_array_almost_equal(rs, 0.53, 2)

    fs = f1_score(y_true, y_pred, average='micro')
    assert_array_almost_equal(fs, 0.53, 2)

    ps = precision_score(y_true, y_pred, average='macro')
    assert_array_almost_equal(ps, 0.53, 2)

    rs = recall_score(y_true, y_pred, average='macro')
    assert_array_almost_equal(rs, 0.60, 2)

    fs = f1_score(y_true, y_pred, average='macro')
    assert_array_almost_equal(fs, 0.51, 2)

    ps = precision_score(y_true, y_pred, average='weighted')
    assert_array_almost_equal(ps, 0.51, 2)

    rs = recall_score(y_true, y_pred, average='weighted')
    assert_array_almost_equal(rs, 0.53, 2)

    fs = f1_score(y_true, y_pred, average='weighted')
    assert_array_almost_equal(fs, 0.47, 2)

    assert_raises(ValueError, precision_score, y_true, y_pred,
                  average="samples")
    assert_raises(ValueError, recall_score, y_true, y_pred, average="samples")
    assert_raises(ValueError, f1_score, y_true, y_pred, average="samples")
    assert_raises(ValueError, fbeta_score, y_true, y_pred, average="samples",
                  beta=0.5)

    # same prediction but with and explicit label ordering
    p, r, f, s = precision_recall_fscore_support(
        y_true, y_pred, labels=[0, 2, 1], average=None)
    assert_array_almost_equal(p, [0.83, 0.41, 0.33], 2)
    assert_array_almost_equal(r, [0.79, 0.90, 0.10], 2)
    assert_array_almost_equal(f, [0.81, 0.57, 0.15], 2)
    assert_array_equal(s, [24, 20, 31])


def test_precision_recall_f1_score_multiclass_pos_label_none():
    """Test Precision Recall and F1 Score for multiclass classification task

    GH Issue #1296
    """
    # initialize data
    y_true = np.array([0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1])
    y_pred = np.array([1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1])

    # compute scores with default labels introspection
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                 pos_label=None,
                                                 average='weighted')


def test_zero_precision_recall():
    """Check that pathological cases do not bring NaNs"""

    old_error_settings = np.seterr(all='raise')

    try:
        y_true = np.array([0, 1, 2, 0, 1, 2])
        y_pred = np.array([2, 0, 1, 1, 2, 0])

        assert_almost_equal(precision_score(y_true, y_pred,
                                            average='weighted'), 0.0, 2)
        assert_almost_equal(recall_score(y_true, y_pred, average='weighted'),
                            0.0, 2)
        assert_almost_equal(f1_score(y_true, y_pred, average='weighted'),
                            0.0, 2)

    finally:
        np.seterr(**old_error_settings)


def test_confusion_matrix_multiclass():
    """Test confusion matrix - multi-class case"""
    y_true, y_pred, _ = make_prediction(binary=False)

    def test(y_true, y_pred, string_type=False):
        # compute confusion matrix with default labels introspection
        cm = confusion_matrix(y_true, y_pred)
        assert_array_equal(cm, [[19, 4, 1],
                                [4, 3, 24],
                                [0, 2, 18]])

        # compute confusion matrix with explicit label ordering
        labels = ['0', '2', '1'] if string_type else [0, 2, 1]
        cm = confusion_matrix(y_true,
                              y_pred,
                              labels=labels)
        assert_array_equal(cm, [[19, 1, 4],
                                [0, 18, 2],
                                [4, 24, 3]])

    test(y_true, y_pred)
    test(list(str(y) for y in y_true),
         list(str(y) for y in y_pred),
         string_type=True)


def test_confusion_matrix_multiclass_subset_labels():
    """Test confusion matrix - multi-class case with subset of labels"""
    y_true, y_pred, _ = make_prediction(binary=False)

    # compute confusion matrix with only first two labels considered
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    assert_array_equal(cm, [[19, 4],
                            [4, 3]])

    # compute confusion matrix with explicit label ordering for only subset
    # of labels
    cm = confusion_matrix(y_true, y_pred, labels=[2, 1])
    assert_array_equal(cm, [[18, 2],
                            [24, 3]])


def test_classification_report_multiclass():
    """Test performance report"""
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)

    # print classification report with class names
    expected_report = """\
             precision    recall  f1-score   support

     setosa       0.83      0.79      0.81        24
 versicolor       0.33      0.10      0.15        31
  virginica       0.42      0.90      0.57        20

avg / total       0.51      0.53      0.47        75
"""
    report = classification_report(
        y_true, y_pred, labels=np.arange(len(iris.target_names)),
        target_names=iris.target_names)
    assert_equal(report, expected_report)

    # print classification report with label detection
    expected_report = """\
             precision    recall  f1-score   support

          0       0.83      0.79      0.81        24
          1       0.33      0.10      0.15        31
          2       0.42      0.90      0.57        20

avg / total       0.51      0.53      0.47        75
"""
    report = classification_report(y_true, y_pred)
    assert_equal(report, expected_report)


def test_classification_report_multiclass_with_string_label():
    y_true, y_pred, _ = make_prediction(binary=False)

    y_true = np.array(["blue", "green", "red"])[y_true]
    y_pred = np.array(["blue", "green", "red"])[y_pred]

    expected_report = """\
             precision    recall  f1-score   support

       blue       0.83      0.79      0.81        24
      green       0.33      0.10      0.15        31
        red       0.42      0.90      0.57        20

avg / total       0.51      0.53      0.47        75
"""
    report = classification_report(y_true, y_pred)
    assert_equal(report, expected_report)

    expected_report = """\
             precision    recall  f1-score   support

          a       0.83      0.79      0.81        24
          b       0.33      0.10      0.15        31
          c       0.42      0.90      0.57        20

avg / total       0.51      0.53      0.47        75
"""
    report = classification_report(y_true, y_pred,
                                   target_names=["a", "b", "c"])
    assert_equal(report, expected_report)


def test_classification_report_multiclass_with_unicode_label():
    y_true, y_pred, _ = make_prediction(binary=False)

    labels = np.array([u"blue\xa2", u"green\xa2", u"red\xa2"])
    y_true = labels[y_true]
    y_pred = labels[y_pred]

    expected_report = u"""\
             precision    recall  f1-score   support

      blue\xa2       0.83      0.79      0.81        24
     green\xa2       0.33      0.10      0.15        31
       red\xa2       0.42      0.90      0.57        20

avg / total       0.51      0.53      0.47        75
"""
    if np_version[:3] < (1, 7, 0):
        expected_message = ("NumPy < 1.7.0 does not implement"
                            " searchsorted on unicode data correctly.")
        assert_raise_message(RuntimeError, expected_message,
                             classification_report, y_true, y_pred)
    else:
        report = classification_report(y_true, y_pred)
        assert_equal(report, expected_report)


def test_multilabel_classification_report():

    n_classes = 4
    n_samples = 50
    # using sequence of sequences is deprecated, but still tested
    make_ml = ignore_warnings(make_multilabel_classification)
    _, y_true_ll = make_ml(n_features=1, n_classes=n_classes, random_state=0,
                           n_samples=n_samples)
    _, y_pred_ll = make_ml(n_features=1, n_classes=n_classes, random_state=1,
                           n_samples=n_samples)

    expected_report = """\
             precision    recall  f1-score   support

          0       0.39      0.73      0.51        15
          1       0.57      0.75      0.65        28
          2       0.33      0.11      0.17        18
          3       0.44      0.50      0.47        24

avg / total       0.45      0.54      0.47        85
"""

    lb = MultiLabelBinarizer()
    lb.fit([range(4)])
    y_true_bi = lb.transform(y_true_ll)
    y_pred_bi = lb.transform(y_pred_ll)

    for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]:
        report = classification_report(y_true, y_pred)
        assert_equal(report, expected_report)


def test_precision_recall_curve():
    y_true, _, probas_pred = make_prediction(binary=True)
    _test_precision_recall_curve(y_true, probas_pred)

    # Use {-1, 1} for labels; make sure original labels aren't modified
    y_true[np.where(y_true == 0)] = -1
    y_true_copy = y_true.copy()
    _test_precision_recall_curve(y_true, probas_pred)
    assert_array_equal(y_true_copy, y_true)

    labels = [1, 0, 0, 1]
    predict_probas = [1, 2, 3, 4]
    p, r, t = precision_recall_curve(labels, predict_probas)
    assert_array_almost_equal(p, np.array([0.5, 0.33333333, 0.5, 1., 1.]))
    assert_array_almost_equal(r, np.array([1., 0.5, 0.5, 0.5, 0.]))
    assert_array_almost_equal(t, np.array([1, 2, 3, 4]))
    assert_equal(p.size, r.size)
    assert_equal(p.size, t.size + 1)


def test_precision_recall_curve_pos_label():
    y_true, _, probas_pred = make_prediction(binary=False)
    pos_label = 2
    p, r, thresholds = precision_recall_curve(y_true,
                                              probas_pred[:, pos_label],
                                              pos_label=pos_label)
    p2, r2, thresholds2 = precision_recall_curve(y_true == pos_label,
                                                 probas_pred[:, pos_label])
    assert_array_almost_equal(p, p2)
    assert_array_almost_equal(r, r2)
    assert_array_almost_equal(thresholds, thresholds2)
    assert_equal(p.size, r.size)
    assert_equal(p.size, thresholds.size + 1)


def _test_precision_recall_curve(y_true, probas_pred):
    """Test Precision-Recall and aread under PR curve"""
    p, r, thresholds = precision_recall_curve(y_true, probas_pred)
    precision_recall_auc = auc(r, p)
    assert_array_almost_equal(precision_recall_auc, 0.85, 2)
    assert_array_almost_equal(precision_recall_auc,
                              average_precision_score(y_true, probas_pred))
    assert_almost_equal(_average_precision(y_true, probas_pred),
                        precision_recall_auc, 1)
    assert_equal(p.size, r.size)
    assert_equal(p.size, thresholds.size + 1)
    # Smoke test in the case of proba having only one value
    p, r, thresholds = precision_recall_curve(y_true,
                                              np.zeros_like(probas_pred))
    precision_recall_auc = auc(r, p)
    assert_array_almost_equal(precision_recall_auc, 0.75, 3)
    assert_equal(p.size, r.size)
    assert_equal(p.size, thresholds.size + 1)


def test_precision_recall_curve_errors():
    # Contains non-binary labels
    assert_raises(ValueError, precision_recall_curve,
                  [0, 1, 2], [[0.0], [1.0], [1.0]])


def test_precision_recall_curve_toydata():
    with np.errstate(all="raise"):
        # Binary classification
        y_true = [0, 1]
        y_score = [0, 1]
        p, r, _ = precision_recall_curve(y_true, y_score)
        auc_prc = average_precision_score(y_true, y_score)
        assert_array_almost_equal(p, [1, 1])
        assert_array_almost_equal(r, [1, 0])
        assert_almost_equal(auc_prc, 1.)

        y_true = [0, 1]
        y_score = [1, 0]
        p, r, _ = precision_recall_curve(y_true, y_score)
        auc_prc = average_precision_score(y_true, y_score)
        assert_array_almost_equal(p, [0.5, 0., 1.])
        assert_array_almost_equal(r, [1., 0.,  0.])
        assert_almost_equal(auc_prc, 0.25)

        y_true = [1, 0]
        y_score = [1, 1]
        p, r, _ = precision_recall_curve(y_true, y_score)
        auc_prc = average_precision_score(y_true, y_score)
        assert_array_almost_equal(p, [0.5, 1])
        assert_array_almost_equal(r, [1., 0])
        assert_almost_equal(auc_prc, .75)

        y_true = [1, 0]
        y_score = [1, 0]
        p, r, _ = precision_recall_curve(y_true, y_score)
        auc_prc = average_precision_score(y_true, y_score)
        assert_array_almost_equal(p, [1, 1])
        assert_array_almost_equal(r, [1, 0])
        assert_almost_equal(auc_prc, 1.)

        y_true = [1, 0]
        y_score = [0.5, 0.5]
        p, r, _ = precision_recall_curve(y_true, y_score)
        auc_prc = average_precision_score(y_true, y_score)
        assert_array_almost_equal(p, [0.5, 1])
        assert_array_almost_equal(r, [1, 0.])
        assert_almost_equal(auc_prc, .75)

        y_true = [0, 0]
        y_score = [0.25, 0.75]
        assert_raises(Exception, precision_recall_curve, y_true, y_score)
        assert_raises(Exception, average_precision_score, y_true, y_score)

        y_true = [1, 1]
        y_score = [0.25, 0.75]
        p, r, _ = precision_recall_curve(y_true, y_score)
        assert_almost_equal(average_precision_score(y_true, y_score), 1.)
        assert_array_almost_equal(p, [1., 1., 1.])
        assert_array_almost_equal(r, [1, 0.5, 0.])

        # Multi-label classification task
        y_true = np.array([[0, 1], [0, 1]])
        y_score = np.array([[0, 1], [0, 1]])
        assert_raises(Exception, average_precision_score, y_true, y_score,
                      average="macro")
        assert_raises(Exception, average_precision_score, y_true, y_score,
                      average="weighted")
        assert_almost_equal(average_precision_score(y_true, y_score,
                            average="samples"), 1.)
        assert_almost_equal(average_precision_score(y_true, y_score,
                            average="micro"), 1.)

        y_true = np.array([[0, 1], [0, 1]])
        y_score = np.array([[0, 1], [1, 0]])
        assert_raises(Exception, average_precision_score, y_true, y_score,
                      average="macro")
        assert_raises(Exception, average_precision_score, y_true, y_score,
                      average="weighted")
        assert_almost_equal(average_precision_score(y_true, y_score,
                            average="samples"), 0.625)
        assert_almost_equal(average_precision_score(y_true, y_score,
                            average="micro"), 0.625)

        y_true = np.array([[1, 0], [0, 1]])
        y_score = np.array([[0, 1], [1, 0]])
        assert_almost_equal(average_precision_score(y_true, y_score,
                            average="macro"), 0.25)
        assert_almost_equal(average_precision_score(y_true, y_score,
                            average="weighted"), 0.25)
        assert_almost_equal(average_precision_score(y_true, y_score,
                            average="samples"), 0.25)
        assert_almost_equal(average_precision_score(y_true, y_score,
                            average="micro"), 0.25)

        y_true = np.array([[1, 0], [0, 1]])
        y_score = np.array([[0.5, 0.5], [0.5, 0.5]])
        assert_almost_equal(average_precision_score(y_true, y_score,
                            average="macro"), 0.75)
        assert_almost_equal(average_precision_score(y_true, y_score,
                            average="weighted"), 0.75)
        assert_almost_equal(average_precision_score(y_true, y_score,
                            average="samples"), 0.75)
        assert_almost_equal(average_precision_score(y_true, y_score,
                            average="micro"), 0.75)


def test_score_scale_invariance():
    # Test that average_precision_score and roc_auc_score are invariant by
    # the scaling or shifting of probabilities
    y_true, _, probas_pred = make_prediction(binary=True)

    roc_auc = roc_auc_score(y_true, probas_pred)
    roc_auc_scaled = roc_auc_score(y_true, 100 * probas_pred)
    roc_auc_shifted = roc_auc_score(y_true, probas_pred - 10)
    assert_equal(roc_auc, roc_auc_scaled)
    assert_equal(roc_auc, roc_auc_shifted)

    f = ignore_warnings(auc_score)
    roc_auc = f(y_true, probas_pred)
    roc_auc_scaled = f(y_true, 100 * probas_pred)
    roc_auc_shifted = f(y_true, probas_pred - 10)
    assert_equal(roc_auc, roc_auc_scaled)
    assert_equal(roc_auc, roc_auc_shifted)

    pr_auc = average_precision_score(y_true, probas_pred)
    pr_auc_scaled = average_precision_score(y_true, 100 * probas_pred)
    pr_auc_shifted = average_precision_score(y_true, probas_pred - 10)
    assert_equal(pr_auc, pr_auc_scaled)
    assert_equal(pr_auc, pr_auc_shifted)


def test_losses():
    """Test loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)
    n_samples = y_true.shape[0]
    n_classes = np.size(unique_labels(y_true))

    # Classification
    # --------------
    # Throw deprecated warning

    assert_almost_equal(zero_one_loss(y_true, y_pred),
                        11 / float(n_samples), 2)
    assert_equal(zero_one_loss(y_true, y_pred, normalize=False), 11)

    assert_almost_equal(zero_one_loss(y_true, y_true), 0.0, 2)
    assert_almost_equal(hamming_loss(y_true, y_pred),
                        2 * 11. / (n_samples * n_classes), 2)

    assert_equal(accuracy_score(y_true, y_pred),
                 1 - zero_one_loss(y_true, y_pred))

    # Regression
    # ----------
    assert_almost_equal(mean_squared_error(y_true, y_pred),
                        10.999 / n_samples, 2)
    assert_almost_equal(mean_squared_error(y_true, y_true),
                        0.00, 2)

    # mean_absolute_error and mean_squared_error are equal because
    # it is a binary problem.
    assert_almost_equal(mean_absolute_error(y_true, y_pred),
                        10.999 / n_samples, 2)
    assert_almost_equal(mean_absolute_error(y_true, y_true), 0.00, 2)

    assert_almost_equal(explained_variance_score(y_true, y_pred), 0.16, 2)
    assert_almost_equal(explained_variance_score(y_true, y_true), 1.00, 2)
    assert_equal(explained_variance_score([0, 0, 0], [0, 1, 1]), 0.0)

    assert_almost_equal(r2_score(y_true, y_pred), 0.12, 2)
    assert_almost_equal(r2_score(y_true, y_true), 1.00, 2)
    assert_equal(r2_score([0, 0, 0], [0, 0, 0]), 1.0)
    assert_equal(r2_score([0, 0, 0], [0, 1, 1]), 0.0)


def test_losses_at_limits():
    # test limit cases
    assert_almost_equal(mean_squared_error([0.], [0.]), 0.00, 2)
    assert_almost_equal(mean_absolute_error([0.], [0.]), 0.00, 2)
    assert_almost_equal(explained_variance_score([0.], [0.]), 1.00, 2)
    assert_almost_equal(r2_score([0., 1], [0., 1]), 1.00, 2)


def test_symmetry():
    """Test the symmetry of score and loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)

    # We shouldn't forget any metrics
    assert_equal(set(SYMMETRIC_METRICS).union(NOT_SYMMETRIC_METRICS,
                                              THRESHOLDED_METRICS,
                                              METRIC_UNDEFINED_MULTICLASS),
                 set(ALL_METRICS))

    assert_equal(
        set(SYMMETRIC_METRICS).intersection(set(NOT_SYMMETRIC_METRICS)),
        set([]))

    # Symmetric metric
    for name in SYMMETRIC_METRICS:
        metric = ALL_METRICS[name]
        assert_almost_equal(metric(y_true, y_pred),
                            metric(y_pred, y_true),
                            err_msg="%s is not symmetric" % name)

    # Not symmetric metrics
    for name in NOT_SYMMETRIC_METRICS:
        metric = ALL_METRICS[name]
        assert_true(np.any(metric(y_true, y_pred) != metric(y_pred, y_true)),
                    msg="%s seems to be symmetric" % name)


def test_sample_order_invariance():
    y_true, y_pred, _ = make_prediction(binary=True)
    y_true_shuffle, y_pred_shuffle = shuffle(y_true, y_pred, random_state=0)

    for name, metric in ALL_METRICS.items():
        if name in METRIC_UNDEFINED_MULTICLASS:
            continue

        assert_almost_equal(metric(y_true, y_pred),
                            metric(y_true_shuffle, y_pred_shuffle),
                            err_msg="%s is not sample order invariant"
                                    % name)


def test_sample_order_invariance_multilabel_and_multioutput():
    random_state = check_random_state(0)

    # Generate some data
    y_true = random_state.randint(0, 2, size=(20, 25))
    y_pred = random_state.randint(0, 2, size=(20, 25))
    y_score = random_state.normal(size=y_true.shape)

    y_true_shuffle, y_pred_shuffle, y_score_shuffle = shuffle(y_true,
                                                              y_pred,
                                                              y_score,
                                                              random_state=0)

    for name in MULTILABELS_METRICS:
        metric = ALL_METRICS[name]
        assert_almost_equal(metric(y_true, y_pred),
                            metric(y_true_shuffle, y_pred_shuffle),
                            err_msg="%s is not sample order invariant"
                                    % name)

    for name in THRESHOLDED_MULTILABEL_METRICS:
        metric = ALL_METRICS[name]
        assert_almost_equal(metric(y_true, y_score),
                            metric(y_true_shuffle, y_score_shuffle),
                            err_msg="%s is not sample order invariant"
                                    % name)

    for name in MULTIOUTPUT_METRICS:
        metric = ALL_METRICS[name]
        assert_almost_equal(metric(y_true, y_score),
                            metric(y_true_shuffle, y_score_shuffle),
                            err_msg="%s is not sample order invariant"
                                    % name)
        assert_almost_equal(metric(y_true, y_pred),
                            metric(y_true_shuffle, y_pred_shuffle),
                            err_msg="%s is not sample order invariant"
                                    % name)


def test_format_invariance_with_1d_vectors():
    y1, y2, _ = make_prediction(binary=True)

    y1_list = list(y1)
    y2_list = list(y2)

    y1_1d, y2_1d = np.array(y1), np.array(y2)
    assert_equal(y1_1d.ndim, 1)
    assert_equal(y2_1d.ndim, 1)
    y1_column = np.reshape(y1_1d, (-1, 1))
    y2_column = np.reshape(y2_1d, (-1, 1))
    y1_row = np.reshape(y1_1d, (1, -1))
    y2_row = np.reshape(y2_1d, (1, -1))

    for name, metric in ALL_METRICS.items():
        if name in METRIC_UNDEFINED_MULTICLASS:
            continue

        measure = metric(y1, y2)

        assert_almost_equal(metric(y1_list, y2_list), measure,
                            err_msg="%s is not representation invariant "
                                    "with list" % name)

        assert_almost_equal(metric(y1_1d, y2_1d), measure,
                            err_msg="%s is not representation invariant "
                                    "with np-array-1d" % name)

        assert_almost_equal(metric(y1_column, y2_column), measure,
                            err_msg="%s is not representation invariant "
                                    "with np-array-column" % name)

        # Mix format support
        assert_almost_equal(metric(y1_1d, y2_list), measure,
                            err_msg="%s is not representation invariant "
                                    "with mix np-array-1d and list" % name)

        assert_almost_equal(metric(y1_list, y2_1d), measure,
                            err_msg="%s is not representation invariant "
                                    "with mix np-array-1d and list" % name)

        assert_almost_equal(metric(y1_1d, y2_column), measure,
                            err_msg="%s is not representation invariant "
                                    "with mix np-array-1d and np-array-column"
                                    % name)

        assert_almost_equal(metric(y1_column, y2_1d), measure,
                            err_msg="%s is not representation invariant "
                                    "with mix np-array-1d and np-array-column"
                                    % name)

        assert_almost_equal(metric(y1_list, y2_column), measure,
                            err_msg="%s is not representation invariant "
                                    "with mix list and np-array-column"
                                    % name)

        assert_almost_equal(metric(y1_column, y2_list), measure,
                            err_msg="%s is not representation invariant "
                                    "with mix list and np-array-column"
                                    % name)

        # These mix representations aren't allowed
        assert_raises(ValueError, metric, y1_1d, y2_row)
        assert_raises(ValueError, metric, y1_row, y2_1d)
        assert_raises(ValueError, metric, y1_list, y2_row)
        assert_raises(ValueError, metric, y1_row, y2_list)
        assert_raises(ValueError, metric, y1_column, y2_row)
        assert_raises(ValueError, metric, y1_row, y2_column)

        # NB: We do not test for y1_row, y2_row as these may be
        # interpreted as multilabel or multioutput data.
        if (name not in (MULTIOUTPUT_METRICS + THRESHOLDED_MULTILABEL_METRICS +
                         MULTILABELS_METRICS)):
            assert_raises(ValueError, metric, y1_row, y2_row)


def test_invariance_string_vs_numbers_labels():
    """Ensure that classification metrics with string labels"""
    y1, y2, _ = make_prediction(binary=True)

    y1_str = np.array(["eggs", "spam"])[y1]
    y2_str = np.array(["eggs", "spam"])[y2]

    pos_label_str = "spam"
    labels_str = ["eggs", "spam"]

    for name, metric in CLASSIFICATION_METRICS.items():
        if name in METRIC_UNDEFINED_MULTICLASS:
            continue

        measure_with_number = metric(y1, y2)

        # Ugly, but handle case with a pos_label and label
        metric_str = metric
        if name in METRICS_WITH_POS_LABEL:
            metric_str = partial(metric_str, pos_label=pos_label_str)

        measure_with_str = metric_str(y1_str, y2_str)

        assert_array_equal(measure_with_number, measure_with_str,
                           err_msg="{0} failed string vs number invariance "
                                   "test".format(name))

        measure_with_strobj = metric_str(y1_str.astype('O'),
                                         y2_str.astype('O'))
        assert_array_equal(measure_with_number, measure_with_strobj,
                           err_msg="{0} failed string object vs number "
                                   "invariance test".format(name))

        if name in METRICS_WITH_LABELS:
            metric_str = partial(metric_str, labels=labels_str)
            measure_with_str = metric_str(y1_str, y2_str)
            assert_array_equal(measure_with_number, measure_with_str,
                               err_msg="{0} failed string vs number  "
                                       "invariance test".format(name))

            measure_with_strobj = metric_str(y1_str.astype('O'),
                                             y2_str.astype('O'))
            assert_array_equal(measure_with_number, measure_with_strobj,
                               err_msg="{0} failed string vs number  "
                                       "invariance test".format(name))

    for name, metric in THRESHOLDED_METRICS.items():
        if name in ("log_loss", "hinge_loss"):
            measure_with_number = metric(y1, y2)
            measure_with_str = metric(y1_str, y2)
            assert_array_equal(measure_with_number, measure_with_str,
                               err_msg="{0} failed string vs number "
                               "invariance test".format(name))

            measure_with_strobj = metric(y1_str.astype('O'), y2)
            assert_array_equal(measure_with_number, measure_with_strobj,
                               err_msg="{0} failed string object vs number "
                                       "invariance test".format(name))
        else:
            # TODO those metrics doesn't support string label yet
            assert_raises(ValueError, metric, y1_str, y2)
            assert_raises(ValueError, metric, y1_str.astype('O'), y2)


@ignore_warnings
def check_single_sample(name):
    """Non-regression test: scores should work with a single sample.

    This is important for leave-one-out cross validation.
    Score functions tested are those that formerly called np.squeeze,
    which turns an array of size 1 into a 0-d array (!).
    """
    metric = ALL_METRICS[name]

    # assert that no exception is thrown
    for i, j in product([0, 1], repeat=2):
        metric([i], [j])


@ignore_warnings
def check_single_sample_multioutput(name):
    metric = ALL_METRICS[name]
    for i, j, k, l in product([0, 1], repeat=4):
        metric(np.array([[i, j]]), np.array([[k, l]]))


def test_single_sample():
    for name in ALL_METRICS:
        if name in METRIC_UNDEFINED_MULTICLASS or name in THRESHOLDED_METRICS:
            # Those metrics are not always defined with one sample
            # or in multiclass classification
            continue

        yield check_single_sample, name

    for name in MULTIOUTPUT_METRICS + MULTILABELS_METRICS:
        yield check_single_sample_multioutput, name


def test_hinge_loss_binary():
    y_true = np.array([-1, 1, 1, -1])
    pred_decision = np.array([-8.5, 0.5, 1.5, -0.3])
    assert_equal(hinge_loss(y_true, pred_decision), 1.2 / 4)

    y_true = np.array([0, 2, 2, 0])
    pred_decision = np.array([-8.5, 0.5, 1.5, -0.3])
    assert_equal(hinge_loss(y_true, pred_decision), 1.2 / 4)


def test_multioutput_regression():
    y_true = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]])
    y_pred = np.array([[0, 0, 0, 1], [1, 0, 1, 1], [0, 0, 0, 1]])

    error = mean_squared_error(y_true, y_pred)
    assert_almost_equal(error, (1. / 3 + 2. / 3 + 2. / 3) / 4.)

    # mean_absolute_error and mean_squared_error are equal because
    # it is a binary problem.
    error = mean_absolute_error(y_true, y_pred)
    assert_almost_equal(error, (1. / 3 + 2. / 3 + 2. / 3) / 4.)

    error = r2_score(y_true, y_pred)
    assert_almost_equal(error, 1 - 5. / 2)


def test_multioutput_number_of_output_differ():
    y_true = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]])
    y_pred = np.array([[0, 0], [1, 0], [0, 0]])

    for name in MULTIOUTPUT_METRICS:
        metric = ALL_METRICS[name]
        assert_raises(ValueError, metric, y_true, y_pred)


def test_multioutput_regression_invariance_to_dimension_shuffling():
    # test invariance to dimension shuffling
    y_true, y_pred, _ = make_prediction()
    n_dims = 3
    y_true = np.reshape(y_true, (-1, n_dims))
    y_pred = np.reshape(y_pred, (-1, n_dims))

    rng = check_random_state(314159)
    for name in MULTIOUTPUT_METRICS:
        metric = ALL_METRICS[name]
        error = metric(y_true, y_pred)

        for _ in xrange(3):
            perm = rng.permutation(n_dims)
            assert_almost_equal(metric(y_true[:, perm], y_pred[:, perm]),
                                error,
                                err_msg="%s is not dimension shuffling "
                                        "invariant" % name)


def test_multilabel_representation_invariance():

    # Generate some data
    n_classes = 4
    n_samples = 50
    # using sequence of sequences is deprecated, but still tested
    make_ml = ignore_warnings(make_multilabel_classification)
    _, y1 = make_ml(n_features=1, n_classes=n_classes, random_state=0,
                    n_samples=n_samples)
    _, y2 = make_ml(n_features=1, n_classes=n_classes, random_state=1,
                    n_samples=n_samples)

    # Be sure to have at least one empty label
    y1 += ([], )
    y2 += ([], )

    # NOTE: The "sorted" trick is necessary to shuffle labels, because it
    # allows to return the shuffled tuple.
    rng = check_random_state(42)
    shuffled = lambda x: sorted(x, key=lambda *args: rng.rand())
    y1_shuffle = [shuffled(x) for x in y1]
    y2_shuffle = [shuffled(x) for x in y2]

    # Let's have redundant labels
    y1_redundant = [x * rng.randint(1, 4) for x in y1]
    y2_redundant = [x * rng.randint(1, 4) for x in y2]

    # Binary indicator matrix format
    lb = MultiLabelBinarizer().fit([range(n_classes)])
    y1_binary_indicator = lb.transform(y1)
    y2_binary_indicator = lb.transform(y2)

    y1_shuffle_binary_indicator = lb.transform(y1_shuffle)
    y2_shuffle_binary_indicator = lb.transform(y2_shuffle)

    for name in MULTILABELS_METRICS:
        metric = ALL_METRICS[name]

        # XXX cruel hack to work with partial functions
        if isinstance(metric, partial):
            metric.__module__ = 'tmp'
            metric.__name__ = name
        # Check warning for sequence of sequences
        measure = assert_warns(DeprecationWarning, metric, y1, y2)
        metric = ignore_warnings(metric)

        # Check representation invariance
        assert_almost_equal(metric(y1_binary_indicator,
                                   y2_binary_indicator),
                            measure,
                            err_msg="%s failed representation invariance  "
                                    "between list of list of labels "
                                    "format and dense binary indicator "
                                    "format." % name)

        # Check invariance with redundant labels with list of labels
        assert_almost_equal(metric(y1, y2_redundant), measure,
                            err_msg="%s failed rendundant label invariance"
                                    % name)

        assert_almost_equal(metric(y1_redundant, y2_redundant), measure,
                            err_msg="%s failed rendundant label invariance"
                                    % name)

        assert_almost_equal(metric(y1_redundant, y2), measure,
                            err_msg="%s failed rendundant label invariance"
                                    % name)

        # Check shuffling invariance with list of labels
        assert_almost_equal(metric(y1_shuffle, y2_shuffle), measure,
                            err_msg="%s failed shuffling invariance "
                                    "with list of list of labels format."
                                    % name)

        # Check shuffling invariance with dense binary indicator matrix
        assert_almost_equal(metric(y1_shuffle_binary_indicator,
                                   y2_shuffle_binary_indicator), measure,
                            err_msg="%s failed shuffling invariance "
                                    " with dense binary indicator format."
                                    % name)

        # Check raises error with mix input representation
        assert_raises(ValueError, metric, y1, y2_binary_indicator)
        assert_raises(ValueError, metric, y1_binary_indicator, y2)


def test_multilabel_zero_one_loss_subset():
    # Dense label indicator matrix format
    y1 = np.array([[0, 1, 1], [1, 0, 1]])
    y2 = np.array([[0, 0, 1], [1, 0, 1]])

    assert_equal(zero_one_loss(y1, y2), 0.5)
    assert_equal(zero_one_loss(y1, y1), 0)
    assert_equal(zero_one_loss(y2, y2), 0)
    assert_equal(zero_one_loss(y2, np.logical_not(y2)), 1)
    assert_equal(zero_one_loss(y1, np.logical_not(y1)), 1)
    assert_equal(zero_one_loss(y1, np.zeros(y1.shape)), 1)
    assert_equal(zero_one_loss(y2, np.zeros(y1.shape)), 1)

    # List of tuple of label
    y1 = [(1, 2,), (0, 2,)]
    y2 = [(2,), (0, 2,)]

    assert_equal(zero_one_loss(y1, y2), 0.5)
    assert_equal(zero_one_loss(y1, y1), 0)
    assert_equal(zero_one_loss(y2, y2), 0)
    assert_equal(zero_one_loss(y2, [(), ()]), 1)
    assert_equal(zero_one_loss(y2, [tuple(), (10, )]), 1)


def test_multilabel_hamming_loss():
    # Dense label indicator matrix format
    y1 = np.array([[0, 1, 1], [1, 0, 1]])
    y2 = np.array([[0, 0, 1], [1, 0, 1]])

    assert_equal(hamming_loss(y1, y2), 1 / 6)
    assert_equal(hamming_loss(y1, y1), 0)
    assert_equal(hamming_loss(y2, y2), 0)
    assert_equal(hamming_loss(y2, np.logical_not(y2)), 1)
    assert_equal(hamming_loss(y1, np.logical_not(y1)), 1)
    assert_equal(hamming_loss(y1, np.zeros(y1.shape)), 4 / 6)
    assert_equal(hamming_loss(y2, np.zeros(y1.shape)), 0.5)

    # List of tuple of label
    y1 = [(1, 2,), (0, 2,)]
    y2 = [(2,), (0, 2,)]

    assert_equal(hamming_loss(y1, y2), 1 / 6)
    assert_equal(hamming_loss(y1, y1), 0)
    assert_equal(hamming_loss(y2, y2), 0)
    assert_equal(hamming_loss(y2, [(), ()]), 0.75)
    assert_equal(hamming_loss(y1, [tuple(), (10, )]), 0.625)
    assert_almost_equal(hamming_loss(y2, [tuple(), (10, )],
                                     classes=np.arange(11)), 0.1818, 2)


def test_multilabel_accuracy_score_subset_accuracy():
    # Dense label indicator matrix format
    y1 = np.array([[0, 1, 1], [1, 0, 1]])
    y2 = np.array([[0, 0, 1], [1, 0, 1]])

    assert_equal(accuracy_score(y1, y2), 0.5)
    assert_equal(accuracy_score(y1, y1), 1)
    assert_equal(accuracy_score(y2, y2), 1)
    assert_equal(accuracy_score(y2, np.logical_not(y2)), 0)
    assert_equal(accuracy_score(y1, np.logical_not(y1)), 0)
    assert_equal(accuracy_score(y1, np.zeros(y1.shape)), 0)
    assert_equal(accuracy_score(y2, np.zeros(y1.shape)), 0)

    # List of tuple of label
    y1 = [(1, 2,), (0, 2,)]
    y2 = [(2,), (0, 2,)]

    assert_equal(accuracy_score(y1, y2), 0.5)
    assert_equal(accuracy_score(y1, y1), 1)
    assert_equal(accuracy_score(y2, y2), 1)
    assert_equal(accuracy_score(y2, [(), ()]), 0)
    assert_equal(accuracy_score(y1, y2, normalize=False), 1)
    assert_equal(accuracy_score(y1, y1, normalize=False), 2)
    assert_equal(accuracy_score(y2, y2, normalize=False), 2)
    assert_equal(accuracy_score(y2, [(), ()], normalize=False), 0)


def test_multilabel_jaccard_similarity_score():
    # Dense label indicator matrix format
    y1 = np.array([[0, 1, 1], [1, 0, 1]])
    y2 = np.array([[0, 0, 1], [1, 0, 1]])

    # size(y1 \inter y2) = [1, 2]
    # size(y1 \union y2) = [2, 2]

    assert_equal(jaccard_similarity_score(y1, y2), 0.75)
    assert_equal(jaccard_similarity_score(y1, y1), 1)
    assert_equal(jaccard_similarity_score(y2, y2), 1)
    assert_equal(jaccard_similarity_score(y2, np.logical_not(y2)), 0)
    assert_equal(jaccard_similarity_score(y1, np.logical_not(y1)), 0)
    assert_equal(jaccard_similarity_score(y1, np.zeros(y1.shape)), 0)
    assert_equal(jaccard_similarity_score(y2, np.zeros(y1.shape)), 0)

   # List of tuple of label
    y1 = [(1, 2,), (0, 2,)]
    y2 = [(2,), (0, 2,)]

    assert_equal(jaccard_similarity_score(y1, y2), 0.75)
    assert_equal(jaccard_similarity_score(y1, y1), 1)
    assert_equal(jaccard_similarity_score(y2, y2), 1)
    assert_equal(jaccard_similarity_score(y2, [(), ()]), 0)

    # |y3 inter y4 | = [0, 1, 1]
    # |y3 union y4 | = [2, 1, 3]
    y3 = [(0,), (1,), (3,)]
    y4 = [(4,), (4,), (5, 6)]
    assert_almost_equal(jaccard_similarity_score(y3, y4), 0)

    # |y5 inter y6 | = [0, 1, 1]
    # |y5 union y6 | = [2, 1, 3]
    y5 = [(0,), (1,), (2, 3)]
    y6 = [(1,), (1,), (2, 0)]

    assert_almost_equal(jaccard_similarity_score(y5, y6), (1 + 1 / 3) / 3)


def test_normalize_option_binary_classification():
    # Test in the binary case
    y_true, y_pred, _ = make_prediction(binary=True)
    n_samples = y_true.shape[0]

    for name in METRICS_WITH_NORMALIZE_OPTION:
        metrics = ALL_METRICS[name]
        measure = metrics(y_true, y_pred, normalize=True)
        assert_greater(measure, 0,
                       msg="We failed to test correctly the normalize option")
        assert_almost_equal(metrics(y_true, y_pred, normalize=False)
                            / n_samples, measure)


def test_normalize_option_multiclasss_classification():
    # Test in the multiclass case
    y_true, y_pred, _ = make_prediction(binary=False)
    n_samples = y_true.shape[0]

    for name in METRICS_WITH_NORMALIZE_OPTION:
        metrics = ALL_METRICS[name]
        measure = metrics(y_true, y_pred, normalize=True)
        assert_greater(measure, 0,
                       msg="We failed to test correctly the normalize option")
        assert_almost_equal(metrics(y_true, y_pred, normalize=False)
                            / n_samples, measure)


def test_normalize_option_multilabel_classification():
    # Test in the multilabel case
    n_classes = 4
    n_samples = 100
    # using sequence of sequences is deprecated, but still tested
    make_ml = ignore_warnings(make_multilabel_classification)
    _, y_true = make_ml(n_features=1, n_classes=n_classes,
                        random_state=0, n_samples=n_samples)
    _, y_pred = make_ml(n_features=1, n_classes=n_classes,
                        random_state=1, n_samples=n_samples)

    # Be sure to have at least one empty label
    y_true += ([], )
    y_pred += ([], )
    n_samples += 1

    lb = MultiLabelBinarizer().fit([range(n_classes)])
    y_true_binary_indicator = lb.transform(y_true)
    y_pred_binary_indicator = lb.transform(y_pred)

    for name in METRICS_WITH_NORMALIZE_OPTION:
        metrics = ALL_METRICS[name]

        # List of list of labels
        measure = assert_warns(DeprecationWarning, metrics, y_true, y_pred,
                               normalize=True)
        assert_greater(measure, 0,
                       msg="We failed to test correctly the normalize option")
        assert_almost_equal(metrics(y_true, y_pred, normalize=False)
                            / n_samples, measure,
                            err_msg="Failed with %s" % name)

        # Indicator matrix format
        measure = metrics(y_true_binary_indicator,
                          y_pred_binary_indicator, normalize=True)
        assert_greater(measure, 0,
                       msg="We failed to test correctly the normalize option")
        assert_almost_equal(metrics(y_true_binary_indicator,
                                    y_pred_binary_indicator, normalize=False)
                            / n_samples, measure,
                            err_msg="Failed with %s" % name)


@ignore_warnings
def test_precision_recall_f1_score_multilabel_1():
    """ Test precision_recall_f1_score on a crafted multilabel example
    """
    # First crafted example
    y_true_ll = [(0,), (1,), (2, 3)]
    y_pred_ll = [(1,), (1,), (2, 0)]
    lb = LabelBinarizer()
    lb.fit([range(4)])
    y_true_bi = lb.transform(y_true_ll)
    y_pred_bi = lb.transform(y_pred_ll)

    for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]:

        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average=None)
        #tp = [0, 1, 1, 0]
        #fn = [1, 0, 0, 1]
        #fp = [1, 1, 0, 0]
        # Check per class

        assert_array_almost_equal(p, [0.0, 0.5, 1.0, 0.0], 2)
        assert_array_almost_equal(r, [0.0, 1.0, 1.0, 0.0], 2)
        assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2)
        assert_array_almost_equal(s, [1, 1, 1, 1], 2)

        f2 = fbeta_score(y_true, y_pred, beta=2, average=None)
        support = s
        assert_array_almost_equal(f2, [0, 0.83, 1, 0], 2)

        # Check macro
        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="macro")
        assert_almost_equal(p, 1.5 / 4)
        assert_almost_equal(r, 0.5)
        assert_almost_equal(f, 2.5 / 1.5 * 0.25)
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="macro"),
                            np.mean(f2))

        # Check micro
        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="micro")
        assert_almost_equal(p, 0.5)
        assert_almost_equal(r, 0.5)
        assert_almost_equal(f, 0.5)
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="micro"),
                            (1 + 4) * p * r / (4 * p + r))

        # Check weigted
        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="weighted")
        assert_almost_equal(p, 1.5 / 4)
        assert_almost_equal(r, 0.5)
        assert_almost_equal(f, 2.5 / 1.5 * 0.25)
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="weighted"),
                            np.average(f2, weights=support))
        # Check weigted
        # |h(x_i) inter y_i | = [0, 1, 1]
        # |y_i| = [1, 1, 2]
        # |h(x_i)| = [1, 1, 2]
        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="samples")
        assert_almost_equal(p, 0.5)
        assert_almost_equal(r, 0.5)
        assert_almost_equal(f, 0.5)
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="samples"),
                            0.5)


@ignore_warnings
def test_precision_recall_f1_score_multilabel_2():
    """ Test precision_recall_f1_score on a crafted multilabel example 2
    """
    # Second crafted example
    y_true_ll = [(1,), (2,), (2, 3)]
    y_pred_ll = [(4,), (4,), (2, 1)]
    lb = LabelBinarizer()
    lb.fit([range(1, 5)])
    y_true_bi = lb.transform(y_true_ll)
    y_pred_bi = lb.transform(y_pred_ll)

    for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]:
        # tp = [ 0.  1.  0.  0.]
        # fp = [ 1.  0.  0.  2.]
        # fn = [ 1.  1.  1.  0.]

        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average=None)
        assert_array_almost_equal(p, [0.0, 1.0, 0.0, 0.0], 2)
        assert_array_almost_equal(r, [0.0, 0.5, 0.0, 0.0], 2)
        assert_array_almost_equal(f, [0.0, 0.66, 0.0, 0.0], 2)
        assert_array_almost_equal(s, [1, 2, 1, 0], 2)

        f2 = fbeta_score(y_true, y_pred, beta=2, average=None)
        support = s
        assert_array_almost_equal(f2, [0, 0.55, 0, 0], 2)

        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="micro")
        assert_almost_equal(p, 0.25)
        assert_almost_equal(r, 0.25)
        assert_almost_equal(f, 2 * 0.25 * 0.25 / 0.5)
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="micro"),
                            (1 + 4) * p * r / (4 * p + r))

        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="macro")
        assert_almost_equal(p, 0.25)
        assert_almost_equal(r, 0.125)
        assert_almost_equal(f, 2 / 12)
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="macro"),
                            np.mean(f2))

        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="weighted")
        assert_almost_equal(p, 2 / 4)
        assert_almost_equal(r, 1 / 4)
        assert_almost_equal(f, 2 / 3 * 2 / 4)
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="weighted"),
                            np.average(f2, weights=support))

        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="samples")
        # Check weigted
        # |h(x_i) inter y_i | = [0, 0, 1]
        # |y_i| = [1, 1, 2]
        # |h(x_i)| = [1, 1, 2]

        assert_almost_equal(p, 1 / 6)
        assert_almost_equal(r, 1 / 6)
        assert_almost_equal(f, 2 / 4 * 1 / 3)
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="samples"),
                            0.1666, 2)


@ignore_warnings
def test_precision_recall_f1_score_with_an_empty_prediction():
    y_true_ll = [(1,), (0,), (2, 1,)]
    y_pred_ll = [tuple(), (3,), (2, 1)]

    lb = LabelBinarizer()
    lb.fit([range(4)])
    y_true_bi = lb.transform(y_true_ll)
    y_pred_bi = lb.transform(y_pred_ll)

    for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]:
        # true_pos = [ 0.  1.  1.  0.]
        # false_pos = [ 0.  0.  0.  1.]
        # false_neg = [ 1.  1.  0.  0.]
        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average=None)
        assert_array_almost_equal(p, [0.0, 1.0, 1.0, 0.0], 2)
        assert_array_almost_equal(r, [0.0, 0.5, 1.0, 0.0], 2)
        assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2)
        assert_array_almost_equal(s, [1, 2, 1, 0], 2)

        f2 = fbeta_score(y_true, y_pred, beta=2, average=None)
        support = s
        assert_array_almost_equal(f2, [0, 0.55, 1, 0], 2)

        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="macro")
        assert_almost_equal(p, 0.5)
        assert_almost_equal(r, 1.5 / 4)
        assert_almost_equal(f, 2.5 / (4 * 1.5))
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="macro"),
                            np.mean(f2))

        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="micro")
        assert_almost_equal(p, 2 / 3)
        assert_almost_equal(r, 0.5)
        assert_almost_equal(f, 2 / 3 / (2 / 3 + 0.5))
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="micro"),
                            (1 + 4) * p * r / (4 * p + r))

        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="weighted")
        assert_almost_equal(p, 3 / 4)
        assert_almost_equal(r, 0.5)
        assert_almost_equal(f, (2 / 1.5 + 1) / 4)
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="weighted"),
                            np.average(f2, weights=support))

        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="samples")
        # |h(x_i) inter y_i | = [0, 0, 2]
        # |y_i| = [1, 1, 2]
        # |h(x_i)| = [0, 1, 2]
        assert_almost_equal(p, 1 / 3)
        assert_almost_equal(r, 1 / 3)
        assert_almost_equal(f, 1 / 3)
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="samples"),
                            0.333, 2)


def test_precision_recall_f1_no_labels():
    y_true = np.zeros((20, 3))
    y_pred = np.zeros_like(y_true)

    # tp = [0, 0, 0]
    # fn = [0, 0, 0]
    # fp = [0, 0, 0]
    # support = [0, 0, 0]
    # |y_hat_i inter y_i | = [0, 0, 0]
    # |y_i| = [0, 0, 0]
    # |y_hat_i| = [0, 0, 0]

    for beta in [1]:
        p, r, f, s = assert_warns(UndefinedMetricWarning,
                                  precision_recall_fscore_support,
                                  y_true, y_pred, average=None, beta=beta)
        assert_array_almost_equal(p, [0, 0, 0], 2)
        assert_array_almost_equal(r, [0, 0, 0], 2)
        assert_array_almost_equal(f, [0, 0, 0], 2)
        assert_array_almost_equal(s, [0, 0, 0], 2)

        fbeta = assert_warns(UndefinedMetricWarning, fbeta_score,
                             y_true, y_pred, beta=beta, average=None)
        assert_array_almost_equal(fbeta, [0, 0, 0], 2)

        for average in ["macro", "micro", "weighted", "samples"]:
            p, r, f, s = assert_warns(UndefinedMetricWarning,
                                      precision_recall_fscore_support,
                                      y_true, y_pred, average=average,
                                      beta=beta)
            assert_almost_equal(p, 0)
            assert_almost_equal(r, 0)
            assert_almost_equal(f, 0)
            assert_equal(s, None)

            fbeta = assert_warns(UndefinedMetricWarning, fbeta_score,
                                 y_true, y_pred,
                                 beta=beta, average=average)
            assert_almost_equal(fbeta, 0)


def test_prf_warnings():

    # average of per-label scores
    f, w = precision_recall_fscore_support, UndefinedMetricWarning
    my_assert = assert_warns_message
    for average in [None, 'weighted', 'macro']:
        msg = ('Precision and F-score are ill-defined and '
               'being set to 0.0 in labels with no predicted samples.')
        my_assert(w, msg, f, [0, 1, 2], [1, 1, 2], average=average)

        msg = ('Recall and F-score are ill-defined and '
               'being set to 0.0 in labels with no true samples.')
        my_assert(w, msg, f, [1, 1, 2], [0, 1, 2], average=average)

        # average of per-sample scores
        msg = ('Precision and F-score are ill-defined and '
               'being set to 0.0 in samples with no predicted labels.')
        my_assert(w, msg, f, np.array([[1, 0], [1, 0]]),
                  np.array([[1, 0], [0, 0]]), average='samples')

        msg = ('Recall and F-score are ill-defined and '
               'being set to 0.0 in samples with no true labels.')
        my_assert(w, msg, f, np.array([[1, 0], [0, 0]]),
                  np.array([[1, 0], [1, 0]]),
                  average='samples')

        # single score: micro-average
        msg = ('Precision and F-score are ill-defined and '
               'being set to 0.0 due to no predicted samples.')
        my_assert(w, msg, f, np.array([[1, 1], [1, 1]]),
                  np.array([[0, 0], [0, 0]]), average='micro')

        msg = ('Recall and F-score are ill-defined and '
               'being set to 0.0 due to no true samples.')
        my_assert(w, msg, f, np.array([[0, 0], [0, 0]]),
                  np.array([[1, 1], [1, 1]]), average='micro')

        # single postive label
        msg = ('Precision and F-score are ill-defined and '
               'being set to 0.0 due to no predicted samples.')
        my_assert(w, msg, f, [1, 1], [-1, -1], average='macro')

        msg = ('Recall and F-score are ill-defined and '
               'being set to 0.0 due to no true samples.')
        my_assert(w, msg, f, [-1, -1], [1, 1], average='macro')


def test_recall_warnings():
    assert_no_warnings(recall_score,
                       np.array([[1, 1], [1, 1]]),
                       np.array([[0, 0], [0, 0]]),
                       average='micro')

    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter('always')
        recall_score(np.array([[0, 0], [0, 0]]),
                     np.array([[1, 1], [1, 1]]),
                     average='micro')
        assert_equal(str(record.pop().message),
                     'Recall is ill-defined and '
                     'being set to 0.0 due to no true samples.')


def test_precision_warnings():
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter('always')

        precision_score(np.array([[1, 1], [1, 1]]),
                        np.array([[0, 0], [0, 0]]),
                        average='micro')
        assert_equal(str(record.pop().message),
                     'Precision is ill-defined and '
                     'being set to 0.0 due to no predicted samples.')

    assert_no_warnings(precision_score,
                       np.array([[0, 0], [0, 0]]),
                       np.array([[1, 1], [1, 1]]),
                       average='micro')


def test_fscore_warnings():
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter('always')

        for score in [f1_score, partial(fbeta_score, beta=2)]:
            score(np.array([[1, 1], [1, 1]]),
                  np.array([[0, 0], [0, 0]]),
                  average='micro')
            assert_equal(str(record.pop().message),
                         'F-score is ill-defined and '
                         'being set to 0.0 due to no predicted samples.')
            score(np.array([[0, 0], [0, 0]]),
                  np.array([[1, 1], [1, 1]]),
                  average='micro')
            assert_equal(str(record.pop().message),
                         'F-score is ill-defined and '
                         'being set to 0.0 due to no true samples.')


def test__check_clf_targets():
    """Check that _check_clf_targets correctly merges target types, squeezes
    output and fails if input lengths differ."""
    IND = 'multilabel-indicator'
    SEQ = 'multilabel-sequences'
    MC = 'multiclass'
    BIN = 'binary'
    CNT = 'continuous'
    MMC = 'multiclass-multioutput'
    MCN = 'continuous-multioutput'
    # all of length 3
    EXAMPLES = [
        (IND, np.array([[0, 1, 1], [1, 0, 0], [0, 0, 1]])),
        # must not be considered binary
        (IND, np.array([[0, 1], [1, 0], [1, 1]])),
        (SEQ, [[2, 3], [1], [3]]),
        (MC, [2, 3, 1]),
        (BIN, [0, 1, 1]),
        (CNT, [0., 1.5, 1.]),
        (MC, np.array([[2], [3], [1]])),
        (BIN, np.array([[0], [1], [1]])),
        (CNT, np.array([[0.], [1.5], [1.]])),
        (MMC, np.array([[0, 2], [1, 3], [2, 3]])),
        (MCN, np.array([[0.5, 2.], [1.1, 3.], [2., 3.]])),
    ]
    # expected type given input types, or None for error
    # (types will be tried in either order)
    EXPECTED = {
        (IND, IND): IND,
        (SEQ, SEQ): SEQ,
        (MC, MC): MC,
        (BIN, BIN): BIN,

        (IND, SEQ): None,
        (MC, SEQ): None,
        (BIN, SEQ): None,
        (MC, IND): None,
        (BIN, IND): None,
        (BIN, MC): MC,

        # Disallowed types
        (CNT, CNT): None,
        (MMC, MMC): None,
        (MCN, MCN): None,
        (IND, CNT): None,
        (SEQ, CNT): None,
        (MC, CNT): None,
        (BIN, CNT): None,
        (MMC, CNT): None,
        (MCN, CNT): None,
        (IND, MMC): None,
        (SEQ, MMC): None,
        (MC, MMC): None,
        (BIN, MMC): None,
        (MCN, MMC): None,
        (IND, MCN): None,
        (SEQ, MCN): None,
        (MC, MCN): None,
        (BIN, MCN): None,
    }

    for (type1, y1), (type2, y2) in product(EXAMPLES, repeat=2):
        try:
            expected = EXPECTED[type1, type2]
        except KeyError:
            expected = EXPECTED[type2, type1]
        if expected is None:
            assert_raises(ValueError, _check_clf_targets, y1, y2)

            if type1 != type2:
                assert_raise_message(
                    ValueError,
                    "Can't handle mix of {0} and {1}".format(type1, type2),
                    _check_clf_targets, y1, y2)

            else:
                if type1 not in (BIN, MC, SEQ, IND):
                    assert_raise_message(ValueError,
                                         "{0} is not supported".format(type1),
                                         _check_clf_targets, y1, y2)

        else:
            merged_type, y1out, y2out = _check_clf_targets(y1, y2)
            assert_equal(merged_type, expected)
            if not merged_type.startswith('multilabel'):
                assert_array_equal(y1out, np.squeeze(y1))
                assert_array_equal(y2out, np.squeeze(y2))
            assert_raises(ValueError, _check_clf_targets, y1[:-1], y2)


def test__check_reg_targets():
    # All of length 3
    EXAMPLES = [
        ("continuous", [1, 2, 3], 1),
        ("continuous", [[1], [2], [3]], 1),
        ("continuous-multioutput", [[1, 1], [2, 2], [3, 1]], 2),
        ("continuous-multioutput", [[5, 1], [4, 2], [3, 1]], 2),
        ("continuous-multioutput", [[1, 3, 4], [2, 2, 2], [3, 1, 1]], 3),
    ]

    for (type1, y1, n_out1), (type2, y2, n_out2) in product(EXAMPLES,
                                                            repeat=2):

        if type1 == type2 and n_out1 == n_out2:
            y_type, y_check1, y_check2 = _check_reg_targets(y1, y2)
            assert_equal(type1, y_type)
            if type1 == 'continuous':
                assert_array_equal(y_check1, np.reshape(y1, (-1, 1)))
                assert_array_equal(y_check2, np.reshape(y2, (-1, 1)))
            else:
                assert_array_equal(y_check1, y1)
                assert_array_equal(y_check2, y2)
        else:
            assert_raises(ValueError, _check_reg_targets, y1, y2)


def test_log_loss():
    # binary case with symbolic labels ("no" < "yes")
    y_true = ["no", "no", "no", "yes", "yes", "yes"]
    y_pred = np.array([[0.5, 0.5], [0.1, 0.9], [0.01, 0.99],
                       [0.9, 0.1], [0.75, 0.25], [0.001, 0.999]])
    loss = log_loss(y_true, y_pred)
    assert_almost_equal(loss, 1.8817971)

    # multiclass case; adapted from http://bit.ly/RJJHWA
    y_true = [1, 0, 2]
    y_pred = [[0.2, 0.7, 0.1], [0.6, 0.2, 0.2], [0.6, 0.1, 0.3]]
    loss = log_loss(y_true, y_pred, normalize=True)
    assert_almost_equal(loss, 0.6904911)

    # check that we got all the shapes and axes right
    # by doubling the length of y_true and y_pred
    y_true *= 2
    y_pred *= 2
    loss = log_loss(y_true, y_pred, normalize=False)
    assert_almost_equal(loss, 0.6904911 * 6, decimal=6)

    # check eps and handling of absolute zero and one probabilities
    y_pred = np.asarray(y_pred) > .5
    loss = log_loss(y_true, y_pred, normalize=True, eps=.1)
    assert_almost_equal(loss, log_loss(y_true, np.clip(y_pred, .1, .9)))

    # raise error if number of classes are not equal.
    y_true = [1, 0, 2]
    y_pred = [[0.2, 0.7], [0.6, 0.5], [0.4, 0.1]]
    assert_raises(ValueError, log_loss, y_true, y_pred)

    # case when y_true is a string array object
    y_true = ["ham", "spam", "spam", "ham"]
    y_pred = [[0.2, 0.7], [0.6, 0.5], [0.4, 0.1], [0.7, 0.2]]
    loss = log_loss(y_true, y_pred)
    assert_almost_equal(loss, 1.0383217, decimal=6)


@ignore_warnings
def _check_averaging(metric, y_true, y_pred, y_true_binarize, y_pred_binarize,
                     is_multilabel):
    n_samples, n_classes = y_true_binarize.shape

    # No averaging
    label_measure = metric(y_true, y_pred, average=None)
    assert_array_almost_equal(label_measure,
                              [metric(y_true_binarize[:, i],
                                      y_pred_binarize[:, i])
                               for i in range(n_classes)])

    # Micro measure
    micro_measure = metric(y_true, y_pred, average="micro")
    assert_almost_equal(micro_measure, metric(y_true_binarize.ravel(),
                                              y_pred_binarize.ravel()))

    # Macro measure
    macro_measure = metric(y_true, y_pred, average="macro")
    assert_almost_equal(macro_measure, np.mean(label_measure))

    # Weighted measure
    weights = np.sum(y_true_binarize, axis=0, dtype=int)

    if np.sum(weights) != 0:
        weighted_measure = metric(y_true, y_pred, average="weighted")
        assert_almost_equal(weighted_measure, np.average(label_measure,
                                                         weights=weights))
    else:
        weighted_measure = metric(y_true, y_pred, average="weighted")
        assert_almost_equal(weighted_measure, 0)

    # Sample measure
    if is_multilabel:
        sample_measure = metric(y_true, y_pred, average="samples")
        assert_almost_equal(sample_measure,
                            np.mean([metric(y_true_binarize[i],
                                            y_pred_binarize[i])
                                     for i in range(n_samples)]))

    assert_raises(ValueError, metric, y_true, y_pred, average="unknown")
    assert_raises(ValueError, metric, y_true, y_pred, average="garbage")


def check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize,
                    y_score):
    is_multilabel = type_of_target(y_true).startswith("multilabel")

    metric = ALL_METRICS[name]

    if name in METRICS_WITH_AVERAGING:
        _check_averaging(metric, y_true, y_pred, y_true_binarize,
                         y_pred_binarize, is_multilabel)
    elif name in THRESHOLDED_METRICS_WITH_AVERAGING:
        _check_averaging(metric, y_true, y_score, y_true_binarize,
                         y_score, is_multilabel)
    else:
        raise ValueError("Metric is not recorded as having an average option")


def test_averaging_multiclass():
    y_true, y_pred, y_score = make_prediction(binary=False)
    lb = LabelBinarizer().fit(y_true)
    y_true_binarize = lb.transform(y_true)
    y_pred_binarize = lb.transform(y_pred)

    for name in METRICS_WITH_AVERAGING:
        yield (check_averaging, name, y_true, y_true_binarize, y_pred,
               y_pred_binarize, y_score)


def test_averaging_multilabel():
    n_classes = 5
    n_samples = 40
    _, y = make_multilabel_classification(n_features=1, n_classes=n_classes,
                                          random_state=5, n_samples=n_samples,
                                          return_indicator=True,
                                          allow_unlabeled=False)
    y_true = y[:20]
    y_pred = y[20:]
    y_score = check_random_state(0).normal(size=(20, n_classes))
    y_true_binarize = y_true
    y_pred_binarize = y_pred

    for name in METRICS_WITH_AVERAGING + THRESHOLDED_METRICS_WITH_AVERAGING:
        yield (check_averaging, name, y_true, y_true_binarize, y_pred,
               y_pred_binarize, y_score)


def test_averaging_multilabel_all_zeroes():
    y_true = np.zeros((20, 3))
    y_pred = np.zeros((20, 3))
    y_score = np.zeros((20, 3))
    y_true_binarize = y_true
    y_pred_binarize = y_pred

    for name in METRICS_WITH_AVERAGING:
        yield (check_averaging, name, y_true, y_true_binarize, y_pred,
               y_pred_binarize, y_score)

    # Test _average_binary_score for weight.sum() == 0
    binary_metric = (lambda y_true, y_score, average="macro":
                     _average_binary_score(
                         precision_score, y_true, y_score, average))
    _check_averaging(binary_metric, y_true, y_pred, y_true_binarize,
                     y_pred_binarize, is_multilabel=True)


def test_averaging_multilabel_all_ones():
    y_true = np.ones((20, 3))
    y_pred = np.ones((20, 3))
    y_score = np.ones((20, 3))
    y_true_binarize = y_true
    y_pred_binarize = y_pred

    for name in METRICS_WITH_AVERAGING:
        yield (check_averaging, name, y_true, y_true_binarize, y_pred,
               y_pred_binarize, y_score)


@ignore_warnings
def check_sample_weight_invariance(name, metric, y1, y2):

    rng = np.random.RandomState(0)
    sample_weight = rng.randint(1, 10, size=len(y1))

    # check that unit weights gives the same score as no weight
    unweighted_score = metric(y1, y2, sample_weight=None)
    assert_equal(
        unweighted_score,
        metric(y1, y2, sample_weight=np.ones(shape=len(y1))),
        msg="For %s sample_weight=None is not equivalent to "
            "sample_weight=ones" % name)

    # check that the weighted and unweighted scores are unequal
    weighted_score = metric(y1, y2, sample_weight=sample_weight)
    assert_not_equal(
        unweighted_score, weighted_score,
        msg="Unweighted and weighted scores are unexpectedly "
            "equal (%f) for %s" % (weighted_score, name))

    # check that sample_weight can be a list
    weighted_score_list = metric(y1, y2,
                                 sample_weight=sample_weight.tolist())
    assert_equal(
        weighted_score, weighted_score_list,
        msg="Weighted scores for array and list sample_weight input are "
            "not equal (%f != %f) for %s" % (
                weighted_score, weighted_score_list, name))

    # check that integer weights is the same as repeated samples
    repeat_weighted_score = metric(
        np.repeat(y1, sample_weight, axis=0),
        np.repeat(y2, sample_weight, axis=0), sample_weight=None)
    assert_almost_equal(
        weighted_score, repeat_weighted_score,
        err_msg="Weighting %s is not equal to repeating samples" % name)

    # check that ignoring a fraction of the samples is equivalent to setting
    # the corresponding weights to zero
    sample_weight_subset = sample_weight[1::2]
    sample_weight_zeroed = np.copy(sample_weight)
    sample_weight_zeroed[::2] = 0
    y1_subset = y1[1::2]
    y2_subset = y2[1::2]
    weighted_score_subset = metric(y1_subset, y2_subset,
                                   sample_weight=sample_weight_subset)
    weighted_score_zeroed = metric(y1, y2,
                                   sample_weight=sample_weight_zeroed)
    assert_almost_equal(
        weighted_score_subset, weighted_score_zeroed,
        err_msg="Zeroing weights does not give the same result as "
            "removing the corresponding samples (%f != %f) for %s" % (
                weighted_score_zeroed, weighted_score_subset, name))

    if not name.startswith('unnormalized'):
        # check that the score is invariant under scaling of the weights by a
        # common factor
        for scaling in [2, 0.3]:
            assert_almost_equal(
                weighted_score,
                metric(y1, y2, sample_weight=sample_weight * scaling),
                err_msg="%s sample_weight is not invariant "
                        "under scaling" % name)


def test_sample_weight_invariance():
    # binary output
    y1, y2, _ = make_prediction(binary=True)
    for name in ALL_METRICS:
        if (name in METRICS_WITHOUT_SAMPLE_WEIGHT or
                name in METRIC_UNDEFINED_MULTICLASS):
            continue
        metric = ALL_METRICS[name]
        yield check_sample_weight_invariance, name, metric, y1, y2

    # multiclass
    y1, y2, _ = make_prediction()
    for name in ALL_METRICS:
        if (name in METRICS_WITHOUT_SAMPLE_WEIGHT or
                name in METRIC_UNDEFINED_MULTICLASS):
            continue
        metric = ALL_METRICS[name]
        yield check_sample_weight_invariance, name, metric, y1, y2


    # multilabel sequence
    _, ya = make_multilabel_classification(
        n_features=1, n_classes=3,
        random_state=0, n_samples=10)
    _, yb = make_multilabel_classification(
        n_features=1, n_classes=3,
        random_state=1, n_samples=10)
    y1 = ya + yb
    y2 = ya + ya

    for name in MULTILABELS_METRICS:
        if name in METRICS_WITHOUT_SAMPLE_WEIGHT:
            continue
        metric = ALL_METRICS[name]
        yield (check_sample_weight_invariance, name, metric, y1, y2)

    # multilabel indicator
    _, ya = make_multilabel_classification(
        n_features=1, n_classes=6,
        random_state=0, n_samples=10,
        return_indicator=True)
    _, yb = make_multilabel_classification(
        n_features=1, n_classes=6,
        random_state=1, n_samples=10,
        return_indicator=True)
    y1 = np.vstack([ya, yb])
    y2 = np.vstack([ya, ya])
    for name in (MULTILABELS_METRICS + THRESHOLDED_MULTILABEL_METRICS +
                 MULTIOUTPUT_METRICS):
        if name in METRICS_WITHOUT_SAMPLE_WEIGHT:
            continue

        metric = ALL_METRICS[name]
        yield (check_sample_weight_invariance, name, metric, y1, y2)