"""Metrics to assess performance on classification task given scores
Functions named as ``*_score`` return a scalar value to maximize: the higher
the better
Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize:
the lower the better
"""
# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Mathieu Blondel <mathieu@mblondel.org>
# Olivier Grisel <olivier.grisel@ensta.org>
# Arnaud Joly <a.joly@ulg.ac.be>
# Jochen Wersdorfer <jochen@wersdoerfer.de>
# Lars Buitinck
# Joel Nothman <joel.nothman@gmail.com>
# Noel Dawe <noel@dawe.me>
# License: BSD 3 clause
import warnings
from functools import partial
import numpy as np
from scipy.sparse import csr_matrix
from scipy.stats import rankdata
from ..utils import assert_all_finite
from ..utils import check_consistent_length
from ..utils import column_or_1d, check_array
from ..utils.multiclass import type_of_target
from ..utils.extmath import stable_cumsum
from ..utils.sparsefuncs import count_nonzero
from ..utils.validation import _deprecate_positional_args
from ..exceptions import UndefinedMetricWarning
from ..preprocessing import label_binarize
from ..preprocessing._label import _encode
from ._base import _average_binary_score, _average_multiclass_ovo_score
def auc(x, y):
"""Compute Area Under the Curve (AUC) using the trapezoidal rule
This is a general function, given points on a curve. For computing the
area under the ROC-curve, see :func:`roc_auc_score`. For an alternative
way to summarize a precision-recall curve, see
:func:`average_precision_score`.
Parameters
----------
x : array, shape = [n]
x coordinates. These must be either monotonic increasing or monotonic
decreasing.
y : array, shape = [n]
y coordinates.
Returns
-------
auc : float
Examples
--------
>>> import numpy as np
>>> from sklearn import metrics
>>> y = np.array([1, 1, 2, 2])
>>> pred = np.array([0.1, 0.4, 0.35, 0.8])
>>> fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2)
>>> metrics.auc(fpr, tpr)
0.75
See also
--------
roc_auc_score : Compute the area under the ROC curve
average_precision_score : Compute average precision from prediction scores
precision_recall_curve :
Compute precision-recall pairs for different probability thresholds
"""
check_consistent_length(x, y)
x = column_or_1d(x)
y = column_or_1d(y)
if x.shape[0] < 2:
raise ValueError('At least 2 points are needed to compute'
' area under curve, but x.shape = %s' % x.shape)
direction = 1
dx = np.diff(x)
if np.any(dx < 0):
if np.all(dx <= 0):
direction = -1
else:
raise ValueError("x is neither increasing nor decreasing "
": {}.".format(x))
area = direction * np.trapz(y, x)
if isinstance(area, np.memmap):
# Reductions such as .sum used internally in np.trapz do not return a
# scalar by default for numpy.memmap instances contrary to
# regular numpy.ndarray instances.
area = area.dtype.type(area)
return area
@_deprecate_positional_args
def average_precision_score(y_true, y_score, *, average="macro", pos_label=1,
sample_weight=None):
"""Compute average precision (AP) from prediction scores
AP summarizes a precision-recall curve as the weighted mean of precisions
achieved at each threshold, with the increase in recall from the previous
threshold used as the weight:
.. math::
\\text{AP} = \\sum_n (R_n - R_{n-1}) P_n
where :math:`P_n` and :math:`R_n` are the precision and recall at the nth
threshold [1]_. This implementation is not interpolated and is different
from computing the area under the precision-recall curve with the
trapezoidal rule, which uses linear interpolation and can be too
optimistic.
Note: this implementation is restricted to the binary classification task
or multilabel classification task.
Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`.
Parameters
----------
y_true : array, shape = [n_samples] or [n_samples, n_classes]
True binary labels or binary label indicators.
y_score : array, shape = [n_samples] or [n_samples, n_classes]
Target scores, can either be probability estimates of the positive
class, confidence values, or non-thresholded measure of decisions
(as returned by "decision_function" on some classifiers).
average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted']
If ``None``, the scores for each class are returned. Otherwise,
this determines the type of averaging performed on the data:
``'micro'``:
Calculate metrics globally by considering each element of the label
indicator matrix as a label.
``'macro'``:
Calculate metrics for each label, and find their unweighted
mean. This does not take label imbalance into account.
``'weighted'``:
Calculate metrics for each label, and find their average, weighted
by support (the number of true instances for each label).
``'samples'``:
Calculate metrics for each instance, and find their average.
Will be ignored when ``y_true`` is binary.
pos_label : int or str (default=1)
The label of the positive class. Only applied to binary ``y_true``.
For multilabel-indicator ``y_true``, ``pos_label`` is fixed to 1.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
Returns
-------
average_precision : float
References
----------
.. [1] `Wikipedia entry for the Average precision
<https://en.wikipedia.org/w/index.php?title=Information_retrieval&
oldid=793358396#Average_precision>`_
See also
--------
roc_auc_score : Compute the area under the ROC curve
precision_recall_curve :
Compute precision-recall pairs for different probability thresholds
Examples
--------
>>> import numpy as np
>>> from sklearn.metrics import average_precision_score
>>> y_true = np.array([0, 0, 1, 1])
>>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])
>>> average_precision_score(y_true, y_scores)
0.83...
Notes
-----
.. versionchanged:: 0.19
Instead of linearly interpolating between operating points, precisions
are weighted by the change in recall since the last operating point.
"""
def _binary_uninterpolated_average_precision(
y_true, y_score, pos_label=1, sample_weight=None):
precision, recall, _ = precision_recall_curve(
y_true, y_score, pos_label=pos_label, sample_weight=sample_weight)
# Return the step function integral
# The following works because the last entry of precision is
# guaranteed to be 1, as returned by precision_recall_curve
return -np.sum(np.diff(recall) * np.array(precision)[:-1])
y_type = type_of_target(y_true)
if y_type == "multilabel-indicator" and pos_label != 1:
raise ValueError("Parameter pos_label is fixed to 1 for "
"multilabel-indicator y_true. Do not set "
"pos_label or set pos_label to 1.")
elif y_type == "binary":
present_labels = np.unique(y_true)
if len(present_labels) == 2 and pos_label not in present_labels:
raise ValueError("pos_label=%r is invalid. Set it to a label in "
"y_true." % pos_label)
average_precision = partial(_binary_uninterpolated_average_precision,
pos_label=pos_label)
return _average_binary_score(average_precision, y_true, y_score,
average, sample_weight=sample_weight)
def _binary_roc_auc_score(y_true, y_score, sample_weight=None, max_fpr=None):
"""Binary roc auc score"""
if len(np.unique(y_true)) != 2:
raise ValueError("Only one class present in y_true. ROC AUC score "
"is not defined in that case.")
fpr, tpr, _ = roc_curve(y_true, y_score,
sample_weight=sample_weight)
if max_fpr is None or max_fpr == 1:
return auc(fpr, tpr)
if max_fpr <= 0 or max_fpr > 1:
raise ValueError("Expected max_fpr in range (0, 1], got: %r" % max_fpr)
# Add a single point at max_fpr by linear interpolation
stop = np.searchsorted(fpr, max_fpr, 'right')
x_interp = [fpr[stop - 1], fpr[stop]]
y_interp = [tpr[stop - 1], tpr[stop]]
tpr = np.append(tpr[:stop], np.interp(max_fpr, x_interp, y_interp))
fpr = np.append(fpr[:stop], max_fpr)
partial_auc = auc(fpr, tpr)
# McClish correction: standardize result to be 0.5 if non-discriminant
# and 1 if maximal
min_area = 0.5 * max_fpr**2
max_area = max_fpr
return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area))
@_deprecate_positional_args
def roc_auc_score(y_true, y_score, *, average="macro", sample_weight=None,
max_fpr=None, multi_class="raise", labels=None):
"""Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
from prediction scores.
Note: this implementation can be used with binary, multiclass and
multilabel classification, but some restrictions apply (see Parameters).
Read more in the :ref:`User Guide <roc_metrics>`.
Parameters
----------
y_true : array-like of shape (n_samples,) or (n_samples, n_classes)
True labels or binary label indicators. The binary and multiclass cases
expect labels with shape (n_samples,) while the multilabel case expects
binary label indicators with shape (n_samples, n_classes).
y_score : array-like of shape (n_samples,) or (n_samples, n_classes)
Target scores. In the binary and multilabel cases, these can be either
probability estimates or non-thresholded decision values (as returned
by `decision_function` on some classifiers). In the multiclass case,
these must be probability estimates which sum to 1. The binary
case expects a shape (n_samples,), and the scores must be the scores of
the class with the greater label. The multiclass and multilabel
cases expect a shape (n_samples, n_classes). In the multiclass case,
the order of the class scores must correspond to the order of
``labels``, if provided, or else to the numerical or lexicographical
order of the labels in ``y_true``.
average : {'micro', 'macro', 'samples', 'weighted'} or None, \
default='macro'
If ``None``, the scores for each class are returned. Otherwise,
this determines the type of averaging performed on the data:
Note: multiclass ROC AUC currently only handles the 'macro' and
'weighted' averages.
``'micro'``:
Calculate metrics globally by considering each element of the label
indicator matrix as a label.
``'macro'``:
Calculate metrics for each label, and find their unweighted
mean. This does not take label imbalance into account.
``'weighted'``:
Calculate metrics for each label, and find their average, weighted
by support (the number of true instances for each label).
``'samples'``:
Calculate metrics for each instance, and find their average.
Will be ignored when ``y_true`` is binary.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
max_fpr : float > 0 and <= 1, default=None
If not ``None``, the standardized partial AUC [2]_ over the range
[0, max_fpr] is returned. For the multiclass case, ``max_fpr``,
should be either equal to ``None`` or ``1.0`` as AUC ROC partial
computation currently is not supported for multiclass.
multi_class : {'raise', 'ovr', 'ovo'}, default='raise'
Multiclass only. Determines the type of configuration to use. The
default value raises an error, so either ``'ovr'`` or ``'ovo'`` must be
passed explicitly.
``'ovr'``:
Computes the AUC of each class against the rest [3]_ [4]_. This
treats the multiclass case in the same way as the multilabel case.
Sensitive to class imbalance even when ``average == 'macro'``,
because class imbalance affects the composition of each of the
'rest' groupings.
``'ovo'``:
Computes the average AUC of all possible pairwise combinations of
classes [5]_. Insensitive to class imbalance when
``average == 'macro'``.
labels : array-like of shape (n_classes,), default=None
Multiclass only. List of labels that index the classes in ``y_score``.
If ``None``, the numerical or lexicographical order of the labels in
``y_true`` is used.
Returns
-------
auc : float
References
----------
.. [1] `Wikipedia entry for the Receiver operating characteristic
<https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
.. [2] `Analyzing a portion of the ROC curve. McClish, 1989
<https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_
.. [3] Provost, F., Domingos, P. (2000). Well-trained PETs: Improving
probability estimation trees (Section 6.2), CeDER Working Paper
#IS-00-04, Stern School of Business, New York University.
.. [4] `Fawcett, T. (2006). An introduction to ROC analysis. Pattern
Loading ...