"""
Data structures for sparse float data. Life is made simpler by dealing only
with float64 data
"""
from __future__ import division
import warnings
import numpy as np
from pandas._libs.sparse import BlockIndex, get_blocks
import pandas.compat as compat
from pandas.compat import lmap
from pandas.compat.numpy import function as nv
from pandas.util._decorators import Appender
from pandas.core.dtypes.cast import find_common_type, maybe_upcast
from pandas.core.dtypes.common import ensure_platform_int, is_scipy_sparse
from pandas.core.dtypes.missing import isna, notna
import pandas.core.algorithms as algos
from pandas.core.arrays.sparse import SparseArray, SparseDtype
import pandas.core.common as com
from pandas.core.frame import DataFrame
import pandas.core.generic as generic
from pandas.core.index import Index, MultiIndex, ensure_index
import pandas.core.indexes.base as ibase
from pandas.core.internals import (
BlockManager, create_block_manager_from_arrays)
from pandas.core.internals.construction import extract_index, prep_ndarray
import pandas.core.ops as ops
from pandas.core.series import Series
from pandas.core.sparse.series import SparseSeries
# pylint: disable=E1101,E1103,W0231,E0202
_shared_doc_kwargs = dict(klass='SparseDataFrame')
class SparseDataFrame(DataFrame):
"""
DataFrame containing sparse floating point data in the form of SparseSeries
objects
Parameters
----------
data : same types as can be passed to DataFrame or scipy.sparse.spmatrix
.. versionchanged :: 0.23.0
If data is a dict, argument order is maintained for Python 3.6
and later.
index : array-like, optional
column : array-like, optional
default_kind : {'block', 'integer'}, default 'block'
Default sparse kind for converting Series to SparseSeries. Will not
override SparseSeries passed into constructor
default_fill_value : float
Default fill_value for converting Series to SparseSeries
(default: nan). Will not override SparseSeries passed in.
"""
_subtyp = 'sparse_frame'
def __init__(self, data=None, index=None, columns=None, default_kind=None,
default_fill_value=None, dtype=None, copy=False):
# pick up the defaults from the Sparse structures
if isinstance(data, SparseDataFrame):
if index is None:
index = data.index
if columns is None:
columns = data.columns
if default_fill_value is None:
default_fill_value = data.default_fill_value
if default_kind is None:
default_kind = data.default_kind
elif isinstance(data, (SparseSeries, SparseArray)):
if index is None:
index = data.index
if default_fill_value is None:
default_fill_value = data.fill_value
if columns is None and hasattr(data, 'name'):
columns = [data.name]
if columns is None:
raise Exception("cannot pass a series w/o a name or columns")
data = {columns[0]: data}
if default_fill_value is None:
default_fill_value = np.nan
if default_kind is None:
default_kind = 'block'
self._default_kind = default_kind
self._default_fill_value = default_fill_value
if is_scipy_sparse(data):
mgr = self._init_spmatrix(data, index, columns, dtype=dtype,
fill_value=default_fill_value)
elif isinstance(data, dict):
mgr = self._init_dict(data, index, columns, dtype=dtype)
elif isinstance(data, (np.ndarray, list)):
mgr = self._init_matrix(data, index, columns, dtype=dtype)
elif isinstance(data, SparseDataFrame):
mgr = self._init_mgr(data._data,
dict(index=index, columns=columns),
dtype=dtype, copy=copy)
elif isinstance(data, DataFrame):
mgr = self._init_dict(data, data.index, data.columns, dtype=dtype)
elif isinstance(data, Series):
mgr = self._init_dict(data.to_frame(), data.index,
columns=None, dtype=dtype)
elif isinstance(data, BlockManager):
mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
dtype=dtype, copy=copy)
elif data is None:
data = DataFrame()
if index is None:
index = Index([])
else:
index = ensure_index(index)
if columns is None:
columns = Index([])
else:
for c in columns:
data[c] = SparseArray(np.nan, index=index,
kind=self._default_kind,
fill_value=self._default_fill_value)
mgr = to_manager(data, columns, index)
if dtype is not None:
mgr = mgr.astype(dtype)
else:
msg = ('SparseDataFrame called with unknown type "{data_type}" '
'for data argument')
raise TypeError(msg.format(data_type=type(data).__name__))
generic.NDFrame.__init__(self, mgr)
@property
def _constructor(self):
return SparseDataFrame
_constructor_sliced = SparseSeries
def _init_dict(self, data, index, columns, dtype=None):
# pre-filter out columns if we passed it
if columns is not None:
columns = ensure_index(columns)
data = {k: v for k, v in compat.iteritems(data) if k in columns}
else:
keys = com.dict_keys_to_ordered_list(data)
columns = Index(keys)
if index is None:
index = extract_index(list(data.values()))
def sp_maker(x):
return SparseArray(x, kind=self._default_kind,
fill_value=self._default_fill_value,
copy=True, dtype=dtype)
sdict = {}
for k, v in compat.iteritems(data):
if isinstance(v, Series):
# Force alignment, no copy necessary
if not v.index.equals(index):
v = v.reindex(index)
if not isinstance(v, SparseSeries):
v = sp_maker(v.values)
elif isinstance(v, SparseArray):
v = v.copy()
else:
if isinstance(v, dict):
v = [v.get(i, np.nan) for i in index]
v = sp_maker(v)
if index is not None and len(v) != len(index):
msg = "Length of passed values is {}, index implies {}"
raise ValueError(msg.format(len(v), len(index)))
sdict[k] = v
if len(columns.difference(sdict)):
# TODO: figure out how to handle this case, all nan's?
# add in any other columns we want to have (completeness)
nan_arr = np.empty(len(index), dtype='float64')
nan_arr.fill(np.nan)
nan_arr = SparseArray(nan_arr, kind=self._default_kind,
fill_value=self._default_fill_value,
copy=False)
sdict.update((c, nan_arr) for c in columns if c not in sdict)
return to_manager(sdict, columns, index)
def _init_matrix(self, data, index, columns, dtype=None):
""" Init self from ndarray or list of lists """
data = prep_ndarray(data, copy=False)
index, columns = self._prep_index(data, index, columns)
data = {idx: data[:, i] for i, idx in enumerate(columns)}
return self._init_dict(data, index, columns, dtype)
def _init_spmatrix(self, data, index, columns, dtype=None,
fill_value=None):
""" Init self from scipy.sparse matrix """
index, columns = self._prep_index(data, index, columns)
data = data.tocoo()
N = len(index)
# Construct a dict of SparseSeries
sdict = {}
values = Series(data.data, index=data.row, copy=False)
for col, rowvals in values.groupby(data.col):
# get_blocks expects int32 row indices in sorted order
rowvals = rowvals.sort_index()
rows = rowvals.index.values.astype(np.int32)
blocs, blens = get_blocks(rows)
sdict[columns[col]] = SparseSeries(
rowvals.values, index=index,
fill_value=fill_value,
sparse_index=BlockIndex(N, blocs, blens))
# Add any columns that were empty and thus not grouped on above
sdict.update({column: SparseSeries(index=index,
fill_value=fill_value,
sparse_index=BlockIndex(N, [], []))
for column in columns
if column not in sdict})
return self._init_dict(sdict, index, columns, dtype)
def _prep_index(self, data, index, columns):
N, K = data.shape
if index is None:
index = ibase.default_index(N)
if columns is None:
columns = ibase.default_index(K)
if len(columns) != K:
raise ValueError('Column length mismatch: {columns} vs. {K}'
.format(columns=len(columns), K=K))
if len(index) != N:
raise ValueError('Index length mismatch: {index} vs. {N}'
.format(index=len(index), N=N))
return index, columns
def to_coo(self):
"""
Return the contents of the frame as a sparse SciPy COO matrix.
.. versionadded:: 0.20.0
Returns
-------
coo_matrix : scipy.sparse.spmatrix
If the caller is heterogeneous and contains booleans or objects,
the result will be of dtype=object. See Notes.
Notes
-----
The dtype will be the lowest-common-denominator type (implicit
upcasting); that is to say if the dtypes (even of numeric types)
are mixed, the one that accommodates all will be chosen.
e.g. If the dtypes are float16 and float32, dtype will be upcast to
float32. By numpy.find_common_type convention, mixing int64 and
and uint64 will result in a float64 dtype.
"""
try:
from scipy.sparse import coo_matrix
except ImportError:
raise ImportError('Scipy is not installed')
dtype = find_common_type(self.dtypes)
if isinstance(dtype, SparseDtype):
dtype = dtype.subtype
cols, rows, datas = [], [], []
for col, name in enumerate(self):
s = self[name]
row = s.sp_index.to_int_index().indices
cols.append(np.repeat(col, len(row)))
rows.append(row)
datas.append(s.sp_values.astype(dtype, copy=False))
cols = np.concatenate(cols)
rows = np.concatenate(rows)
datas = np.concatenate(datas)
return coo_matrix((datas, (rows, cols)), shape=self.shape)
def __array_wrap__(self, result):
return self._constructor(
result, index=self.index, columns=self.columns,
default_kind=self._default_kind,
default_fill_value=self._default_fill_value).__finalize__(self)
def __getstate__(self):
# pickling
return dict(_typ=self._typ, _subtyp=self._subtyp, _data=self._data,
_default_fill_value=self._default_fill_value,
_default_kind=self._default_kind)
def _unpickle_sparse_frame_compat(self, state):
""" original pickle format """
series, cols, idx, fv, kind = state
if not isinstance(cols, Index): # pragma: no cover
from pandas.io.pickle import _unpickle_array
columns = _unpickle_array(cols)
else:
columns = cols
if not isinstance(idx, Index): # pragma: no cover
from pandas.io.pickle import _unpickle_array
index = _unpickle_array(idx)
else:
index = idx
series_dict = DataFrame()
for col, (sp_index, sp_values) in compat.iteritems(series):
series_dict[col] = SparseSeries(sp_values, sparse_index=sp_index,
fill_value=fv)
self._data = to_manager(series_dict, columns, index)
self._default_fill_value = fv
self._default_kind = kind
def to_dense(self):
"""
Convert to dense DataFrame
Returns
-------
df : DataFrame
"""
data = {k: v.to_dense() for k, v in compat.iteritems(self)}
return DataFrame(data, index=self.index, columns=self.columns)
def _apply_columns(self, func):
""" get new SparseDataFrame applying func to each columns """
new_data = {col: func(series)
for col, series in compat.iteritems(self)}
Loading ...