# Authors: Nicolas Tresegnie <nicolas.tresegnie@gmail.com>
# Sergey Feldman <sergeyfeldman@gmail.com>
# License: BSD 3 clause
import numbers
import warnings
import numpy as np
import numpy.ma as ma
from scipy import sparse
from scipy import stats
from ..base import BaseEstimator, TransformerMixin
from ..utils.sparsefuncs import _get_median
from ..utils.validation import check_is_fitted
from ..utils.validation import FLOAT_DTYPES
from ..utils._mask import _get_mask
from ..utils import is_scalar_nan
from ..utils import check_array
def _check_inputs_dtype(X, missing_values):
if (X.dtype.kind in ("f", "i", "u") and
not isinstance(missing_values, numbers.Real)):
raise ValueError("'X' and 'missing_values' types are expected to be"
" both numerical. Got X.dtype={} and "
" type(missing_values)={}."
.format(X.dtype, type(missing_values)))
def _most_frequent(array, extra_value, n_repeat):
"""Compute the most frequent value in a 1d array extended with
[extra_value] * n_repeat, where extra_value is assumed to be not part
of the array."""
# Compute the most frequent value in array only
if array.size > 0:
with warnings.catch_warnings():
# stats.mode raises a warning when input array contains objects due
# to incapacity to detect NaNs. Irrelevant here since input array
# has already been NaN-masked.
warnings.simplefilter("ignore", RuntimeWarning)
mode = stats.mode(array)
most_frequent_value = mode[0][0]
most_frequent_count = mode[1][0]
else:
most_frequent_value = 0
most_frequent_count = 0
# Compare to array + [extra_value] * n_repeat
if most_frequent_count == 0 and n_repeat == 0:
return np.nan
elif most_frequent_count < n_repeat:
return extra_value
elif most_frequent_count > n_repeat:
return most_frequent_value
elif most_frequent_count == n_repeat:
# Ties the breaks. Copy the behaviour of scipy.stats.mode
if most_frequent_value < extra_value:
return most_frequent_value
else:
return extra_value
class _BaseImputer(TransformerMixin, BaseEstimator):
"""Base class for all imputers.
It adds automatically support for `add_indicator`.
"""
def __init__(self, missing_values=np.nan, add_indicator=False):
self.missing_values = missing_values
self.add_indicator = add_indicator
def _fit_indicator(self, X):
"""Fit a MissingIndicator."""
if self.add_indicator:
self.indicator_ = MissingIndicator(
missing_values=self.missing_values, error_on_new=False
)
self.indicator_.fit(X)
else:
self.indicator_ = None
def _transform_indicator(self, X):
"""Compute the indicator mask.'
Note that X must be the original data as passed to the imputer before
any imputation, since imputation may be done inplace in some cases.
"""
if self.add_indicator:
if not hasattr(self, 'indicator_'):
raise ValueError(
"Make sure to call _fit_indicator before "
"_transform_indicator"
)
return self.indicator_.transform(X)
def _concatenate_indicator(self, X_imputed, X_indicator):
"""Concatenate indicator mask with the imputed data."""
if not self.add_indicator:
return X_imputed
hstack = sparse.hstack if sparse.issparse(X_imputed) else np.hstack
if X_indicator is None:
raise ValueError(
"Data from the missing indicator are not provided. Call "
"_fit_indicator and _transform_indicator in the imputer "
"implementation."
)
return hstack((X_imputed, X_indicator))
def _more_tags(self):
return {'allow_nan': is_scalar_nan(self.missing_values)}
class SimpleImputer(_BaseImputer):
"""Imputation transformer for completing missing values.
Read more in the :ref:`User Guide <impute>`.
Parameters
----------
missing_values : number, string, np.nan (default) or None
The placeholder for the missing values. All occurrences of
`missing_values` will be imputed.
strategy : string, optional (default="mean")
The imputation strategy.
- If "mean", then replace missing values using the mean along
each column. Can only be used with numeric data.
- If "median", then replace missing values using the median along
each column. Can only be used with numeric data.
- If "most_frequent", then replace missing using the most frequent
value along each column. Can be used with strings or numeric data.
- If "constant", then replace missing values with fill_value. Can be
used with strings or numeric data.
.. versionadded:: 0.20
strategy="constant" for fixed value imputation.
fill_value : string or numerical value, optional (default=None)
When strategy == "constant", fill_value is used to replace all
occurrences of missing_values.
If left to the default, fill_value will be 0 when imputing numerical
data and "missing_value" for strings or object data types.
verbose : integer, optional (default=0)
Controls the verbosity of the imputer.
copy : boolean, optional (default=True)
If True, a copy of X will be created. If False, imputation will
be done in-place whenever possible. Note that, in the following cases,
a new copy will always be made, even if `copy=False`:
- If X is not an array of floating values;
- If X is encoded as a CSR matrix;
- If add_indicator=True.
add_indicator : boolean, optional (default=False)
If True, a :class:`MissingIndicator` transform will stack onto output
of the imputer's transform. This allows a predictive estimator
to account for missingness despite imputation. If a feature has no
missing values at fit/train time, the feature won't appear on
the missing indicator even if there are missing values at
transform/test time.
Attributes
----------
statistics_ : array of shape (n_features,)
The imputation fill value for each feature.
Computing statistics can result in `np.nan` values.
During :meth:`transform`, features corresponding to `np.nan`
statistics will be discarded.
indicator_ : :class:`sklearn.impute.MissingIndicator`
Indicator used to add binary indicators for missing values.
``None`` if add_indicator is False.
See also
--------
IterativeImputer : Multivariate imputation of missing values.
Examples
--------
>>> import numpy as np
>>> from sklearn.impute import SimpleImputer
>>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
>>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])
SimpleImputer()
>>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
>>> print(imp_mean.transform(X))
[[ 7. 2. 3. ]
[ 4. 3.5 6. ]
[10. 3.5 9. ]]
Notes
-----
Columns which only contained missing values at :meth:`fit` are discarded
upon :meth:`transform` if strategy is not "constant".
"""
def __init__(self, missing_values=np.nan, strategy="mean",
fill_value=None, verbose=0, copy=True, add_indicator=False):
super().__init__(
missing_values=missing_values,
add_indicator=add_indicator
)
self.strategy = strategy
self.fill_value = fill_value
self.verbose = verbose
self.copy = copy
def _validate_input(self, X):
allowed_strategies = ["mean", "median", "most_frequent", "constant"]
if self.strategy not in allowed_strategies:
raise ValueError("Can only use these strategies: {0} "
" got strategy={1}".format(allowed_strategies,
self.strategy))
if self.strategy in ("most_frequent", "constant"):
dtype = None
else:
dtype = FLOAT_DTYPES
if not is_scalar_nan(self.missing_values):
force_all_finite = True
else:
force_all_finite = "allow-nan"
try:
X = check_array(X, accept_sparse='csc', dtype=dtype,
force_all_finite=force_all_finite, copy=self.copy)
except ValueError as ve:
if "could not convert" in str(ve):
new_ve = ValueError("Cannot use {} strategy with non-numeric "
"data:\n{}".format(self.strategy, ve))
raise new_ve from None
else:
raise ve
_check_inputs_dtype(X, self.missing_values)
if X.dtype.kind not in ("i", "u", "f", "O"):
raise ValueError("SimpleImputer does not support data with dtype "
"{0}. Please provide either a numeric array (with"
" a floating point or integer dtype) or "
"categorical data represented either as an array "
"with integer dtype or an array of string values "
"with an object dtype.".format(X.dtype))
return X
def fit(self, X, y=None):
"""Fit the imputer on X.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Input data, where ``n_samples`` is the number of samples and
``n_features`` is the number of features.
Returns
-------
self : SimpleImputer
"""
X = self._validate_input(X)
super()._fit_indicator(X)
# default fill_value is 0 for numerical input and "missing_value"
# otherwise
if self.fill_value is None:
if X.dtype.kind in ("i", "u", "f"):
fill_value = 0
else:
fill_value = "missing_value"
else:
fill_value = self.fill_value
# fill_value should be numerical in case of numerical input
if (self.strategy == "constant" and
X.dtype.kind in ("i", "u", "f") and
not isinstance(fill_value, numbers.Real)):
raise ValueError("'fill_value'={0} is invalid. Expected a "
"numerical value when imputing numerical "
"data".format(fill_value))
if sparse.issparse(X):
# missing_values = 0 not allowed with sparse data as it would
# force densification
if self.missing_values == 0:
raise ValueError("Imputation not possible when missing_values "
"== 0 and input is sparse. Provide a dense "
"array instead.")
else:
self.statistics_ = self._sparse_fit(X,
self.strategy,
self.missing_values,
fill_value)
else:
self.statistics_ = self._dense_fit(X,
self.strategy,
self.missing_values,
fill_value)
return self
def _sparse_fit(self, X, strategy, missing_values, fill_value):
"""Fit the transformer on sparse data."""
mask_data = _get_mask(X.data, missing_values)
n_implicit_zeros = X.shape[0] - np.diff(X.indptr)
statistics = np.empty(X.shape[1])
if strategy == "constant":
# for constant strategy, self.statistcs_ is used to store
# fill_value in each column
statistics.fill(fill_value)
else:
for i in range(X.shape[1]):
column = X.data[X.indptr[i]:X.indptr[i + 1]]
mask_column = mask_data[X.indptr[i]:X.indptr[i + 1]]
column = column[~mask_column]
# combine explicit and implicit zeros
mask_zeros = _get_mask(column, 0)
column = column[~mask_zeros]
n_explicit_zeros = mask_zeros.sum()
n_zeros = n_implicit_zeros[i] + n_explicit_zeros
if strategy == "mean":
s = column.size + n_zeros
statistics[i] = np.nan if s == 0 else column.sum() / s
elif strategy == "median":
statistics[i] = _get_median(column,
n_zeros)
elif strategy == "most_frequent":
statistics[i] = _most_frequent(column,
0,
n_zeros)
return statistics
def _dense_fit(self, X, strategy, missing_values, fill_value):
"""Fit the transformer on dense data."""
mask = _get_mask(X, missing_values)
masked_X = ma.masked_array(X, mask=mask)
# Mean
if strategy == "mean":
mean_masked = np.ma.mean(masked_X, axis=0)
# Avoid the warning "Warning: converting a masked element to nan."
mean = np.ma.getdata(mean_masked)
mean[np.ma.getmask(mean_masked)] = np.nan
return mean
# Median
elif strategy == "median":
median_masked = np.ma.median(masked_X, axis=0)
# Avoid the warning "Warning: converting a masked element to nan."
median = np.ma.getdata(median_masked)
median[np.ma.getmaskarray(median_masked)] = np.nan
return median
# Most frequent
elif strategy == "most_frequent":
# Avoid use of scipy.stats.mstats.mode due to the required
# additional overhead and slow benchmarking performance.
# See Issue 14325 and PR 14399 for full discussion.
# To be able access the elements by columns
X = X.transpose()
mask = mask.transpose()
if X.dtype.kind == "O":
most_frequent = np.empty(X.shape[0], dtype=object)
else:
most_frequent = np.empty(X.shape[0])
for i, (row, row_mask) in enumerate(zip(X[:], mask[:])):
row_mask = np.logical_not(row_mask).astype(np.bool)
row = row[row_mask]
most_frequent[i] = _most_frequent(row, np.nan, 0)
return most_frequent
# Constant
elif strategy == "constant":
# for constant strategy, self.statistcs_ is used to store
# fill_value in each column
return np.full(X.shape[1], fill_value, dtype=X.dtype)
def transform(self, X):
"""Impute all missing values in X.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
The input data to complete.
"""
check_is_fitted(self)
X = self._validate_input(X)
X_indicator = super()._transform_indicator(X)
statistics = self.statistics_
if X.shape[1] != statistics.shape[0]:
raise ValueError("X has %d features per sample, expected %d"
% (X.shape[1], self.statistics_.shape[0]))
# Delete the invalid columns if strategy is not constant
if self.strategy == "constant":
valid_statistics = statistics
else:
# same as np.isnan but also works for object dtypes
invalid_mask = _get_mask(statistics, np.nan)
valid_mask = np.logical_not(invalid_mask)
valid_statistics = statistics[valid_mask]
valid_statistics_indexes = np.flatnonzero(valid_mask)
if invalid_mask.any():
missing = np.arange(X.shape[1])[invalid_mask]
if self.verbose:
warnings.warn("Deleting features without "
"observed values: %s" % missing)
X = X[:, valid_statistics_indexes]
# Do actual imputation
if sparse.issparse(X):
if self.missing_values == 0:
raise ValueError("Imputation not possible when missing_values "
"== 0 and input is sparse. Provide a dense "
"array instead.")
else:
mask = _get_mask(X.data, self.missing_values)
indexes = np.repeat(
np.arange(len(X.indptr) - 1, dtype=np.int),
np.diff(X.indptr))[mask]
X.data[mask] = valid_statistics[indexes].astype(X.dtype,
copy=False)
else:
mask = _get_mask(X, self.missing_values)
n_missing = np.sum(mask, axis=0)
values = np.repeat(valid_statistics, n_missing)
coordinates = np.where(mask.transpose())[::-1]
X[coordinates] = values
return super()._concatenate_indicator(X, X_indicator)
class MissingIndicator(TransformerMixin, BaseEstimator):
"""Binary indicators for missing values.
Note that this component typically should not be used in a vanilla
:class:`Pipeline` consisting of transformers and a classifier, but rather
could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`.
Read more in the :ref:`User Guide <impute>`.
Parameters
----------
missing_values : number, string, np.nan (default) or None
The placeholder for the missing values. All occurrences of
`missing_values` will be indicated (True in the output array), the
other values will be marked as False.
features : str, optional
Whether the imputer mask should represent all or a subset of
features.
- If "missing-only" (default), the imputer mask will only represent
features containing missing values during fit time.
- If "all", the imputer mask will represent all features.
sparse : boolean or "auto", optional
Whether the imputer mask format should be sparse or dense.
- If "auto" (default), the imputer mask will be of same type as
input.
- If True, the imputer mask will be a sparse matrix.
- If False, the imputer mask will be a numpy array.
error_on_new : boolean, optional
If True (default), transform will raise an error when there are
features with missing values in transform that have no missing values
in fit. This is applicable only when ``features="missing-only"``.
Attributes
----------
features_ : ndarray, shape (n_missing_features,) or (n_features,)
The features indices which will be returned when calling ``transform``.
They are computed during ``fit``. For ``features='all'``, it is
to ``range(n_features)``.
Examples
--------
>>> import numpy as np
>>> from sklearn.impute import MissingIndicator
>>> X1 = np.array([[np.nan, 1, 3],
... [4, 0, np.nan],
... [8, 1, 0]])
>>> X2 = np.array([[5, 1, np.nan],
... [np.nan, 2, 3],
... [2, 4, 0]])
>>> indicator = MissingIndicator()
>>> indicator.fit(X1)
MissingIndicator()
>>> X2_tr = indicator.transform(X2)
>>> X2_tr
array([[False, True],
[ True, False],
[False, False]])
"""
def __init__(self, missing_values=np.nan, features="missing-only",
sparse="auto", error_on_new=True):
self.missing_values = missing_values
self.features = features
self.sparse = sparse
self.error_on_new = error_on_new
def _get_missing_features_info(self, X):
"""Compute the imputer mask and the indices of the features
containing missing values.
Parameters
----------
X : {ndarray or sparse matrix}, shape (n_samples, n_features)
The input data with missing values. Note that ``X`` has been
checked in ``fit`` and ``transform`` before to call this function.
Returns
-------
imputer_mask : {ndarray or sparse matrix}, shape \
(n_samples, n_features)
The imputer mask of the original data.
features_with_missing : ndarray, shape (n_features_with_missing)
The features containing missing values.
"""
if sparse.issparse(X):
mask = _get_mask(X.data, self.missing_values)
# The imputer mask will be constructed with the same sparse format
# as X.
sparse_constructor = (sparse.csr_matrix if X.format == 'csr'
else sparse.csc_matrix)
imputer_mask = sparse_constructor(
(mask, X.indices.copy(), X.indptr.copy()),
shape=X.shape, dtype=bool)
imputer_mask.eliminate_zeros()
if self.features == 'missing-only':
n_missing = imputer_mask.getnnz(axis=0)
if self.sparse is False:
imputer_mask = imputer_mask.toarray()
elif imputer_mask.format == 'csr':
imputer_mask = imputer_mask.tocsc()
else:
imputer_mask = _get_mask(X, self.missing_values)
if self.features == 'missing-only':
n_missing = imputer_mask.sum(axis=0)
if self.sparse is True:
imputer_mask = sparse.csc_matrix(imputer_mask)
if self.features == 'all':
features_indices = np.arange(X.shape[1])
else:
features_indices = np.flatnonzero(n_missing)
return imputer_mask, features_indices
def _validate_input(self, X):
if not is_scalar_nan(self.missing_values):
force_all_finite = True
else:
force_all_finite = "allow-nan"
X = check_array(X, accept_sparse=('csc', 'csr'), dtype=None,
force_all_finite=force_all_finite)
_check_inputs_dtype(X, self.missing_values)
if X.dtype.kind not in ("i", "u", "f", "O"):
raise ValueError("MissingIndicator does not support data with "
"dtype {0}. Please provide either a numeric array"
" (with a floating point or integer dtype) or "
"categorical data represented either as an array "
"with integer dtype or an array of string values "
"with an object dtype.".format(X.dtype))
if sparse.issparse(X) and self.missing_values == 0:
# missing_values = 0 not allowed with sparse data as it would
# force densification
raise ValueError("Sparse input with missing_values=0 is "
"not supported. Provide a dense "
"array instead.")
return X
def _fit(self, X, y=None):
"""Fit the transformer on X.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Input data, where ``n_samples`` is the number of samples and
``n_features`` is the number of features.
Returns
-------
imputer_mask : {ndarray or sparse matrix}, shape (n_samples, \
n_features)
The imputer mask of the original data.
"""
X = self._validate_input(X)
self._n_features = X.shape[1]
if self.features not in ('missing-only', 'all'):
raise ValueError("'features' has to be either 'missing-only' or "
"'all'. Got {} instead.".format(self.features))
if not ((isinstance(self.sparse, str) and
self.sparse == "auto") or isinstance(self.sparse, bool)):
raise ValueError("'sparse' has to be a boolean or 'auto'. "
"Got {!r} instead.".format(self.sparse))
missing_features_info = self._get_missing_features_info(X)
self.features_ = missing_features_info[1]
return missing_features_info[0]
def fit(self, X, y=None):
"""Fit the transformer on X.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Input data, where ``n_samples`` is the number of samples and
``n_features`` is the number of features.
Returns
-------
self : object
Returns self.
"""
self._fit(X, y)
return self
def transform(self, X):
"""Generate missing values indicator for X.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
The input data to complete.
Returns
-------
Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) \
or (n_samples, n_features_with_missing)
The missing indicator for input data. The data type of ``Xt``
will be boolean.
"""
check_is_fitted(self)
X = self._validate_input(X)
if X.shape[1] != self._n_features:
raise ValueError("X has a different number of features "
"than during fitting.")
imputer_mask, features = self._get_missing_features_info(X)
if self.features == "missing-only":
features_diff_fit_trans = np.setdiff1d(features, self.features_)
if (self.error_on_new and features_diff_fit_trans.size > 0):
raise ValueError("The features {} have missing values "
"in transform but have no missing values "
"in fit.".format(features_diff_fit_trans))
if self.features_.size < self._n_features:
imputer_mask = imputer_mask[:, self.features_]
return imputer_mask
def fit_transform(self, X, y=None):
"""Generate missing values indicator for X.
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
The input data to complete.
Returns
-------
Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) \
or (n_samples, n_features_with_missing)
The missing indicator for input data. The data type of ``Xt``
will be boolean.
"""
imputer_mask = self._fit(X, y)
if self.features_.size < self._n_features:
imputer_mask = imputer_mask[:, self.features_]
return imputer_mask
def _more_tags(self):
return {'allow_nan': True,
'X_types': ['2darray', 'string']}