"""Utilities for input validation"""
# Authors: Olivier Grisel
# Gael Varoquaux
# Andreas Mueller
# Lars Buitinck
# Alexandre Gramfort
# Nicolas Tresegnie
# Sylvain Marie
# License: BSD 3 clause
from functools import wraps
import warnings
import numbers
import numpy as np
import scipy.sparse as sp
from distutils.version import LooseVersion
from inspect import signature, isclass, Parameter
from numpy.core.numeric import ComplexWarning
import joblib
from contextlib import suppress
from .fixes import _object_dtype_isnan
from .. import get_config as _get_config
from ..exceptions import NonBLASDotWarning, PositiveSpectrumWarning
from ..exceptions import NotFittedError
from ..exceptions import DataConversionWarning
FLOAT_DTYPES = (np.float64, np.float32, np.float16)
# Silenced by default to reduce verbosity. Turn on at runtime for
# performance profiling.
warnings.simplefilter('ignore', NonBLASDotWarning)
def _deprecate_positional_args(f):
"""Decorator for methods that issues warnings for positional arguments
Using the keyword-only argument syntax in pep 3102, arguments after the
* will issue a warning when passed as a positional argument.
Parameters
----------
f : function
function to check arguments on
"""
sig = signature(f)
kwonly_args = []
all_args = []
for name, param in sig.parameters.items():
if param.kind == Parameter.POSITIONAL_OR_KEYWORD:
all_args.append(name)
elif param.kind == Parameter.KEYWORD_ONLY:
kwonly_args.append(name)
@wraps(f)
def inner_f(*args, **kwargs):
extra_args = len(args) - len(all_args)
if extra_args > 0:
# ignore first 'self' argument for instance methods
args_msg = ['{}={}'.format(name, arg)
for name, arg in zip(kwonly_args[:extra_args],
args[-extra_args:])]
warnings.warn("Pass {} as keyword args. From version 0.25 "
"passing these as positional arguments will "
"result in an error".format(", ".join(args_msg)),
FutureWarning)
kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
return f(**kwargs)
return inner_f
def _assert_all_finite(X, allow_nan=False, msg_dtype=None):
"""Like assert_all_finite, but only for ndarray."""
# validation is also imported in extmath
from .extmath import _safe_accumulator_op
if _get_config()['assume_finite']:
return
X = np.asanyarray(X)
# First try an O(n) time, O(1) space solution for the common case that
# everything is finite; fall back to O(n) space np.isfinite to prevent
# false positives from overflow in sum method. The sum is also calculated
# safely to reduce dtype induced overflows.
is_float = X.dtype.kind in 'fc'
if is_float and (np.isfinite(_safe_accumulator_op(np.sum, X))):
pass
elif is_float:
msg_err = "Input contains {} or a value too large for {!r}."
if (allow_nan and np.isinf(X).any() or
not allow_nan and not np.isfinite(X).all()):
type_err = 'infinity' if allow_nan else 'NaN, infinity'
raise ValueError(
msg_err.format
(type_err,
msg_dtype if msg_dtype is not None else X.dtype)
)
# for object dtype data, we only check for NaNs (GH-13254)
elif X.dtype == np.dtype('object') and not allow_nan:
if _object_dtype_isnan(X).any():
raise ValueError("Input contains NaN")
@_deprecate_positional_args
def assert_all_finite(X, *, allow_nan=False):
"""Throw a ValueError if X contains NaN or infinity.
Parameters
----------
X : array or sparse matrix
allow_nan : bool
"""
_assert_all_finite(X.data if sp.issparse(X) else X, allow_nan)
@_deprecate_positional_args
def as_float_array(X, *, copy=True, force_all_finite=True):
"""Converts an array-like to an array of floats.
The new dtype will be np.float32 or np.float64, depending on the original
type. The function can create a copy or modify the argument depending
on the argument copy.
Parameters
----------
X : {array-like, sparse matrix}
copy : bool, optional
If True, a copy of X will be created. If False, a copy may still be
returned if X's dtype is not a floating point type.
force_all_finite : boolean or 'allow-nan', (default=True)
Whether to raise an error on np.inf, np.nan, pd.NA in X. The
possibilities are:
- True: Force all values of X to be finite.
- False: accepts np.inf, np.nan, pd.NA in X.
- 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot
be infinite.
.. versionadded:: 0.20
``force_all_finite`` accepts the string ``'allow-nan'``.
.. versionchanged:: 0.23
Accepts `pd.NA` and converts it into `np.nan`
Returns
-------
XT : {array, sparse matrix}
An array of type np.float
"""
if isinstance(X, np.matrix) or (not isinstance(X, np.ndarray)
and not sp.issparse(X)):
return check_array(X, accept_sparse=['csr', 'csc', 'coo'],
dtype=np.float64, copy=copy,
force_all_finite=force_all_finite, ensure_2d=False)
elif sp.issparse(X) and X.dtype in [np.float32, np.float64]:
return X.copy() if copy else X
elif X.dtype in [np.float32, np.float64]: # is numpy array
return X.copy('F' if X.flags['F_CONTIGUOUS'] else 'C') if copy else X
else:
if X.dtype.kind in 'uib' and X.dtype.itemsize <= 4:
return_dtype = np.float32
else:
return_dtype = np.float64
return X.astype(return_dtype)
def _is_arraylike(x):
"""Returns whether the input is array-like"""
return (hasattr(x, '__len__') or
hasattr(x, 'shape') or
hasattr(x, '__array__'))
def _num_samples(x):
"""Return number of samples in array-like x."""
message = 'Expected sequence or array-like, got %s' % type(x)
if hasattr(x, 'fit') and callable(x.fit):
# Don't get num_samples from an ensembles length!
raise TypeError(message)
if not hasattr(x, '__len__') and not hasattr(x, 'shape'):
if hasattr(x, '__array__'):
x = np.asarray(x)
else:
raise TypeError(message)
if hasattr(x, 'shape') and x.shape is not None:
if len(x.shape) == 0:
raise TypeError("Singleton array %r cannot be considered"
" a valid collection." % x)
# Check that shape is returning an integer or default to len
# Dask dataframes may not return numeric shape[0] value
if isinstance(x.shape[0], numbers.Integral):
return x.shape[0]
try:
return len(x)
except TypeError:
raise TypeError(message)
def check_memory(memory):
"""Check that ``memory`` is joblib.Memory-like.
joblib.Memory-like means that ``memory`` can be converted into a
joblib.Memory instance (typically a str denoting the ``location``)
or has the same interface (has a ``cache`` method).
Parameters
----------
memory : None, str or object with the joblib.Memory interface
Returns
-------
memory : object with the joblib.Memory interface
Raises
------
ValueError
If ``memory`` is not joblib.Memory-like.
"""
if memory is None or isinstance(memory, str):
if LooseVersion(joblib.__version__) < '0.12':
memory = joblib.Memory(cachedir=memory, verbose=0)
else:
memory = joblib.Memory(location=memory, verbose=0)
elif not hasattr(memory, 'cache'):
raise ValueError("'memory' should be None, a string or have the same"
" interface as joblib.Memory."
" Got memory='{}' instead.".format(memory))
return memory
def check_consistent_length(*arrays):
"""Check that all arrays have consistent first dimensions.
Checks whether all objects in arrays have the same shape or length.
Parameters
----------
*arrays : list or tuple of input objects.
Objects that will be checked for consistent length.
"""
lengths = [_num_samples(X) for X in arrays if X is not None]
uniques = np.unique(lengths)
if len(uniques) > 1:
raise ValueError("Found input variables with inconsistent numbers of"
" samples: %r" % [int(l) for l in lengths])
def _make_indexable(iterable):
"""Ensure iterable supports indexing or convert to an indexable variant.
Convert sparse matrices to csr and other non-indexable iterable to arrays.
Let `None` and indexable objects (e.g. pandas dataframes) pass unchanged.
Parameters
----------
iterable : {list, dataframe, array, sparse} or None
Object to be converted to an indexable iterable.
"""
if sp.issparse(iterable):
return iterable.tocsr()
elif hasattr(iterable, "__getitem__") or hasattr(iterable, "iloc"):
return iterable
elif iterable is None:
return iterable
return np.array(iterable)
def indexable(*iterables):
"""Make arrays indexable for cross-validation.
Checks consistent length, passes through None, and ensures that everything
can be indexed by converting sparse matrices to csr and converting
non-interable objects to arrays.
Parameters
----------
*iterables : lists, dataframes, arrays, sparse matrices
List of objects to ensure sliceability.
"""
result = [_make_indexable(X) for X in iterables]
check_consistent_length(*result)
return result
def _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy,
force_all_finite, accept_large_sparse):
"""Convert a sparse matrix to a given format.
Checks the sparse format of spmatrix and converts if necessary.
Parameters
----------
spmatrix : scipy sparse matrix
Input to validate and convert.
accept_sparse : string, boolean or list/tuple of strings
String[s] representing allowed sparse matrix formats ('csc',
'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'). If the input is sparse but
not in the allowed format, it will be converted to the first listed
format. True allows the input to be any format. False means
that a sparse matrix input will raise an error.
dtype : string, type or None
Data type of result. If None, the dtype of the input is preserved.
copy : boolean
Whether a forced copy will be triggered. If copy=False, a copy might
be triggered by a conversion.
force_all_finite : boolean or 'allow-nan', (default=True)
Whether to raise an error on np.inf, np.nan, pd.NA in X. The
possibilities are:
- True: Force all values of X to be finite.
- False: accepts np.inf, np.nan, pd.NA in X.
- 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot
be infinite.
.. versionadded:: 0.20
``force_all_finite`` accepts the string ``'allow-nan'``.
.. versionchanged:: 0.23
Accepts `pd.NA` and converts it into `np.nan`
Returns
-------
spmatrix_converted : scipy sparse matrix.
Matrix that is ensured to have an allowed type.
"""
if dtype is None:
dtype = spmatrix.dtype
changed_format = False
Loading ...