import copy
import sys
import warnings
import numpy as np
from pandas._libs import lib
from pandas.compat import range, set_function_name, string_types
from pandas.util._decorators import cache_readonly
from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.cast import astype_nansafe
from pandas.core.dtypes.common import (
is_bool_dtype, is_float, is_float_dtype, is_integer, is_integer_dtype,
is_list_like, is_object_dtype, is_scalar)
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna, notna
from pandas.core import nanops
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
from pandas.core.tools.numeric import to_numeric
class _IntegerDtype(ExtensionDtype):
"""
An ExtensionDtype to hold a single size & kind of integer dtype.
These specific implementations are subclasses of the non-public
_IntegerDtype. For example we have Int8Dtype to represnt signed int 8s.
The attributes name & type are set when these subclasses are created.
"""
name = None
base = None
type = None
na_value = np.nan
def __repr__(self):
sign = 'U' if self.is_unsigned_integer else ''
return "{sign}Int{size}Dtype()".format(sign=sign,
size=8 * self.itemsize)
@cache_readonly
def is_signed_integer(self):
return self.kind == 'i'
@cache_readonly
def is_unsigned_integer(self):
return self.kind == 'u'
@property
def _is_numeric(self):
return True
@cache_readonly
def numpy_dtype(self):
""" Return an instance of our numpy dtype """
return np.dtype(self.type)
@cache_readonly
def kind(self):
return self.numpy_dtype.kind
@cache_readonly
def itemsize(self):
""" Return the number of bytes in this dtype """
return self.numpy_dtype.itemsize
@classmethod
def construct_array_type(cls):
"""Return the array type associated with this dtype
Returns
-------
type
"""
return IntegerArray
@classmethod
def construct_from_string(cls, string):
"""
Construction from a string, raise a TypeError if not
possible
"""
if string == cls.name:
return cls()
raise TypeError("Cannot construct a '{}' from "
"'{}'".format(cls, string))
def integer_array(values, dtype=None, copy=False):
"""
Infer and return an integer array of the values.
Parameters
----------
values : 1D list-like
dtype : dtype, optional
dtype to coerce
copy : boolean, default False
Returns
-------
IntegerArray
Raises
------
TypeError if incompatible types
"""
values, mask = coerce_to_array(values, dtype=dtype, copy=copy)
return IntegerArray(values, mask)
def safe_cast(values, dtype, copy):
"""
Safely cast the values to the dtype if they
are equivalent, meaning floats must be equivalent to the
ints.
"""
try:
return values.astype(dtype, casting='safe', copy=copy)
except TypeError:
casted = values.astype(dtype, copy=copy)
if (casted == values).all():
return casted
raise TypeError("cannot safely cast non-equivalent {} to {}".format(
values.dtype, np.dtype(dtype)))
def coerce_to_array(values, dtype, mask=None, copy=False):
"""
Coerce the input values array to numpy arrays with a mask
Parameters
----------
values : 1D list-like
dtype : integer dtype
mask : boolean 1D array, optional
copy : boolean, default False
if True, copy the input
Returns
-------
tuple of (values, mask)
"""
# if values is integer numpy array, preserve it's dtype
if dtype is None and hasattr(values, 'dtype'):
if is_integer_dtype(values.dtype):
dtype = values.dtype
if dtype is not None:
if (isinstance(dtype, string_types) and
(dtype.startswith("Int") or dtype.startswith("UInt"))):
# Avoid DeprecationWarning from NumPy about np.dtype("Int64")
# https://github.com/numpy/numpy/pull/7476
dtype = dtype.lower()
if not issubclass(type(dtype), _IntegerDtype):
try:
dtype = _dtypes[str(np.dtype(dtype))]
except KeyError:
raise ValueError("invalid dtype specified {}".format(dtype))
if isinstance(values, IntegerArray):
values, mask = values._data, values._mask
if dtype is not None:
values = values.astype(dtype.numpy_dtype, copy=False)
if copy:
values = values.copy()
mask = mask.copy()
return values, mask
values = np.array(values, copy=copy)
if is_object_dtype(values):
inferred_type = lib.infer_dtype(values, skipna=True)
if inferred_type == 'empty':
values = np.empty(len(values))
values.fill(np.nan)
elif inferred_type not in ['floating', 'integer',
'mixed-integer', 'mixed-integer-float']:
raise TypeError("{} cannot be converted to an IntegerDtype".format(
values.dtype))
elif not (is_integer_dtype(values) or is_float_dtype(values)):
raise TypeError("{} cannot be converted to an IntegerDtype".format(
values.dtype))
if mask is None:
mask = isna(values)
else:
assert len(mask) == len(values)
if not values.ndim == 1:
raise TypeError("values must be a 1D list-like")
if not mask.ndim == 1:
raise TypeError("mask must be a 1D list-like")
# infer dtype if needed
if dtype is None:
dtype = np.dtype('int64')
else:
dtype = dtype.type
# if we are float, let's make sure that we can
# safely cast
# we copy as need to coerce here
if mask.any():
values = values.copy()
values[mask] = 1
values = safe_cast(values, dtype, copy=False)
else:
values = safe_cast(values, dtype, copy=False)
return values, mask
class IntegerArray(ExtensionArray, ExtensionOpsMixin):
"""
Array of integer (optional missing) values.
.. versionadded:: 0.24.0
.. warning::
IntegerArray is currently experimental, and its API or internal
implementation may change without warning.
We represent an IntegerArray with 2 numpy arrays:
- data: contains a numpy integer array of the appropriate dtype
- mask: a boolean array holding a mask on the data, True is missing
To construct an IntegerArray from generic array-like input, use
:func:`pandas.array` with one of the integer dtypes (see examples).
See :ref:`integer_na` for more.
Parameters
----------
values : numpy.ndarray
A 1-d integer-dtype array.
mask : numpy.ndarray
A 1-d boolean-dtype array indicating missing values.
copy : bool, default False
Whether to copy the `values` and `mask`.
Returns
-------
IntegerArray
Examples
--------
Create an IntegerArray with :func:`pandas.array`.
>>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype())
>>> int_array
<IntegerArray>
[1, NaN, 3]
Length: 3, dtype: Int32
String aliases for the dtypes are also available. They are capitalized.
>>> pd.array([1, None, 3], dtype='Int32')
<IntegerArray>
[1, NaN, 3]
Length: 3, dtype: Int32
>>> pd.array([1, None, 3], dtype='UInt16')
<IntegerArray>
[1, NaN, 3]
Length: 3, dtype: UInt16
"""
@cache_readonly
def dtype(self):
return _dtypes[str(self._data.dtype)]
def __init__(self, values, mask, copy=False):
if not (isinstance(values, np.ndarray)
and is_integer_dtype(values.dtype)):
raise TypeError("values should be integer numpy array. Use "
"the 'integer_array' function instead")
if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)):
raise TypeError("mask should be boolean numpy array. Use "
"the 'integer_array' function instead")
if copy:
values = values.copy()
mask = mask.copy()
self._data = values
self._mask = mask
@classmethod
def _from_sequence(cls, scalars, dtype=None, copy=False):
return integer_array(scalars, dtype=dtype, copy=copy)
@classmethod
def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
scalars = to_numeric(strings, errors="raise")
return cls._from_sequence(scalars, dtype, copy)
@classmethod
def _from_factorized(cls, values, original):
return integer_array(values, dtype=original.dtype)
def _formatter(self, boxed=False):
def fmt(x):
if isna(x):
return 'NaN'
return str(x)
return fmt
def __getitem__(self, item):
if is_integer(item):
if self._mask[item]:
return self.dtype.na_value
return self._data[item]
return type(self)(self._data[item], self._mask[item])
def _coerce_to_ndarray(self):
"""
coerce to an ndarary of object dtype
"""
# TODO(jreback) make this better
data = self._data.astype(object)
data[self._mask] = self._na_value
return data
__array_priority__ = 1000 # higher than ndarray so ops dispatch to us
def __array__(self, dtype=None):
"""
the array interface, return my values
We return an object array here to preserve our scalar values
"""
return self._coerce_to_ndarray()
Loading ...