"""
Functions for preparing various inputs passed to the DataFrame or Series
constructors before passing them to a BlockManager.
"""
from collections import OrderedDict
import numpy as np
import numpy.ma as ma
from pandas._libs import lib
from pandas._libs.tslibs import IncompatibleFrequency
import pandas.compat as compat
from pandas.compat import (
get_range_parameters, lmap, lrange, raise_with_traceback, range)
from pandas.core.dtypes.cast import (
construct_1d_arraylike_from_scalar, construct_1d_ndarray_preserving_na,
construct_1d_object_array_from_listlike, infer_dtype_from_scalar,
maybe_cast_to_datetime, maybe_cast_to_integer_array, maybe_castable,
maybe_convert_platform, maybe_infer_to_datetimelike, maybe_upcast)
from pandas.core.dtypes.common import (
is_categorical_dtype, is_datetime64tz_dtype, is_dtype_equal,
is_extension_array_dtype, is_extension_type, is_float_dtype,
is_integer_dtype, is_iterator, is_list_like, is_object_dtype, pandas_dtype)
from pandas.core.dtypes.generic import (
ABCDataFrame, ABCDatetimeIndex, ABCIndexClass, ABCPandasArray,
ABCPeriodIndex, ABCSeries, ABCTimedeltaIndex)
from pandas.core.dtypes.missing import isna
from pandas.core import algorithms, common as com
from pandas.core.arrays import Categorical, ExtensionArray, period_array
from pandas.core.index import (
Index, _get_objs_combined_axis, _union_indexes, ensure_index)
from pandas.core.indexes import base as ibase
from pandas.core.internals import (
create_block_manager_from_arrays, create_block_manager_from_blocks)
from pandas.core.internals.arrays import extract_array
# ---------------------------------------------------------------------
# BlockManager Interface
def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None):
"""
Segregate Series based on type and coerce into matrices.
Needs to handle a lot of exceptional cases.
"""
# figure out the index, if necessary
if index is None:
index = extract_index(arrays)
else:
index = ensure_index(index)
# don't force copy because getting jammed in an ndarray anyway
arrays = _homogenize(arrays, index, dtype)
# from BlockManager perspective
axes = [ensure_index(columns), index]
return create_block_manager_from_arrays(arrays, arr_names, axes)
def masked_rec_array_to_mgr(data, index, columns, dtype, copy):
"""
Extract from a masked rec array and create the manager.
"""
# essentially process a record array then fill it
fill_value = data.fill_value
fdata = ma.getdata(data)
if index is None:
index = get_names_from_index(fdata)
if index is None:
index = ibase.default_index(len(data))
index = ensure_index(index)
if columns is not None:
columns = ensure_index(columns)
arrays, arr_columns = to_arrays(fdata, columns)
# fill if needed
new_arrays = []
for fv, arr, col in zip(fill_value, arrays, arr_columns):
mask = ma.getmaskarray(data[col])
if mask.any():
arr, fv = maybe_upcast(arr, fill_value=fv, copy=True)
arr[mask] = fv
new_arrays.append(arr)
# create the manager
arrays, arr_columns = reorder_arrays(new_arrays, arr_columns, columns)
if columns is None:
columns = arr_columns
mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype)
if copy:
mgr = mgr.copy()
return mgr
# ---------------------------------------------------------------------
# DataFrame Constructor Interface
def init_ndarray(values, index, columns, dtype=None, copy=False):
# input must be a ndarray, list, Series, index
if isinstance(values, ABCSeries):
if columns is None:
if values.name is not None:
columns = [values.name]
if index is None:
index = values.index
else:
values = values.reindex(index)
# zero len case (GH #2234)
if not len(values) and columns is not None and len(columns):
values = np.empty((0, 1), dtype=object)
# we could have a categorical type passed or coerced to 'category'
# recast this to an arrays_to_mgr
if (is_categorical_dtype(getattr(values, 'dtype', None)) or
is_categorical_dtype(dtype)):
if not hasattr(values, 'dtype'):
values = prep_ndarray(values, copy=copy)
values = values.ravel()
elif copy:
values = values.copy()
index, columns = _get_axes(len(values), 1, index, columns)
return arrays_to_mgr([values], columns, index, columns,
dtype=dtype)
elif (is_datetime64tz_dtype(values) or
is_extension_array_dtype(values)):
# GH#19157
if columns is None:
columns = [0]
return arrays_to_mgr([values], columns, index, columns,
dtype=dtype)
# by definition an array here
# the dtypes will be coerced to a single dtype
values = prep_ndarray(values, copy=copy)
if dtype is not None:
if not is_dtype_equal(values.dtype, dtype):
try:
values = values.astype(dtype)
except Exception as orig:
e = ValueError("failed to cast to '{dtype}' (Exception "
"was: {orig})".format(dtype=dtype,
orig=orig))
raise_with_traceback(e)
index, columns = _get_axes(*values.shape, index=index, columns=columns)
values = values.T
# if we don't have a dtype specified, then try to convert objects
# on the entire block; this is to convert if we have datetimelike's
# embedded in an object type
if dtype is None and is_object_dtype(values):
values = maybe_infer_to_datetimelike(values)
return create_block_manager_from_blocks([values], [columns, index])
def init_dict(data, index, columns, dtype=None):
"""
Segregate Series based on type and coerce into matrices.
Needs to handle a lot of exceptional cases.
"""
if columns is not None:
from pandas.core.series import Series
arrays = Series(data, index=columns, dtype=object)
data_names = arrays.index
missing = arrays.isnull()
if index is None:
# GH10856
# raise ValueError if only scalars in dict
index = extract_index(arrays[~missing])
else:
index = ensure_index(index)
# no obvious "empty" int column
if missing.any() and not is_integer_dtype(dtype):
if dtype is None or np.issubdtype(dtype, np.flexible):
# GH#1783
nan_dtype = object
else:
nan_dtype = dtype
val = construct_1d_arraylike_from_scalar(np.nan, len(index),
nan_dtype)
arrays.loc[missing] = [val] * missing.sum()
else:
for key in data:
if (isinstance(data[key], ABCDatetimeIndex) and
data[key].tz is not None):
# GH#24096 need copy to be deep for datetime64tz case
# TODO: See if we can avoid these copies
data[key] = data[key].copy(deep=True)
keys = com.dict_keys_to_ordered_list(data)
columns = data_names = Index(keys)
arrays = [data[k] for k in keys]
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
# ---------------------------------------------------------------------
def prep_ndarray(values, copy=True):
if not isinstance(values, (np.ndarray, ABCSeries, Index)):
if len(values) == 0:
return np.empty((0, 0), dtype=object)
def convert(v):
return maybe_convert_platform(v)
# we could have a 1-dim or 2-dim list here
# this is equiv of np.asarray, but does object conversion
# and platform dtype preservation
try:
if is_list_like(values[0]) or hasattr(values[0], 'len'):
values = np.array([convert(v) for v in values])
elif isinstance(values[0], np.ndarray) and values[0].ndim == 0:
# GH#21861
values = np.array([convert(v) for v in values])
else:
values = convert(values)
except (ValueError, TypeError):
values = convert(values)
else:
# drop subclass info, do not copy data
values = np.asarray(values)
if copy:
values = values.copy()
if values.ndim == 1:
values = values.reshape((values.shape[0], 1))
elif values.ndim != 2:
raise ValueError('Must pass 2-d input')
return values
def _homogenize(data, index, dtype=None):
oindex = None
homogenized = []
for val in data:
if isinstance(val, ABCSeries):
if dtype is not None:
val = val.astype(dtype)
if val.index is not index:
# Forces alignment. No need to copy data since we
# are putting it into an ndarray later
val = val.reindex(index, copy=False)
else:
if isinstance(val, dict):
if oindex is None:
oindex = index.astype('O')
if isinstance(index, (ABCDatetimeIndex, ABCTimedeltaIndex)):
val = com.dict_compat(val)
else:
val = dict(val)
val = lib.fast_multiget(val, oindex.values, default=np.nan)
val = sanitize_array(val, index, dtype=dtype, copy=False,
raise_cast_failure=False)
homogenized.append(val)
return homogenized
def extract_index(data):
index = None
if len(data) == 0:
index = Index([])
elif len(data) > 0:
raw_lengths = []
indexes = []
have_raw_arrays = False
have_series = False
have_dicts = False
for val in data:
if isinstance(val, ABCSeries):
have_series = True
indexes.append(val.index)
elif isinstance(val, dict):
have_dicts = True
indexes.append(list(val.keys()))
elif is_list_like(val) and getattr(val, 'ndim', 1) == 1:
have_raw_arrays = True
raw_lengths.append(len(val))
if not indexes and not raw_lengths:
raise ValueError('If using all scalar values, you must pass'
' an index')
if have_series or have_dicts:
index = _union_indexes(indexes)
if have_raw_arrays:
lengths = list(set(raw_lengths))
if len(lengths) > 1:
raise ValueError('arrays must all be same length')
if have_dicts:
raise ValueError('Mixing dicts with non-Series may lead to '
'ambiguous ordering.')
if have_series:
if lengths[0] != len(index):
msg = ('array length {length} does not match index '
'length {idx_len}'
.format(length=lengths[0], idx_len=len(index)))
raise ValueError(msg)
else:
index = ibase.default_index(lengths[0])
return ensure_index(index)
def reorder_arrays(arrays, arr_columns, columns):
# reorder according to the columns
if (columns is not None and len(columns) and arr_columns is not None and
len(arr_columns)):
indexer = ensure_index(arr_columns).get_indexer(columns)
arr_columns = ensure_index([arr_columns[i] for i in indexer])
arrays = [arrays[i] for i in indexer]
return arrays, arr_columns
def get_names_from_index(data):
Loading ...