Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

agriconnect / pandas   python

Repository URL to install this package:

/ core / sparse / frame.py

"""
Data structures for sparse float data. Life is made simpler by dealing only
with float64 data
"""
from __future__ import division

import warnings

import numpy as np

from pandas._libs.sparse import BlockIndex, get_blocks
import pandas.compat as compat
from pandas.compat import lmap
from pandas.compat.numpy import function as nv
from pandas.util._decorators import Appender

from pandas.core.dtypes.cast import find_common_type, maybe_upcast
from pandas.core.dtypes.common import ensure_platform_int, is_scipy_sparse
from pandas.core.dtypes.missing import isna, notna

import pandas.core.algorithms as algos
from pandas.core.arrays.sparse import SparseArray, SparseDtype
import pandas.core.common as com
from pandas.core.frame import DataFrame
import pandas.core.generic as generic
from pandas.core.index import Index, MultiIndex, ensure_index
import pandas.core.indexes.base as ibase
from pandas.core.internals import (
    BlockManager, create_block_manager_from_arrays)
from pandas.core.internals.construction import extract_index, prep_ndarray
import pandas.core.ops as ops
from pandas.core.series import Series
from pandas.core.sparse.series import SparseSeries

# pylint: disable=E1101,E1103,W0231,E0202


# Keyword substitutions carrying this class's name. Not referenced in the
# part of the module shown here; presumably consumed by shared docstring
# templates further down -- TODO confirm.
_shared_doc_kwargs = dict(klass='SparseDataFrame')


class SparseDataFrame(DataFrame):
    """
    DataFrame containing sparse floating point data in the form of SparseSeries
    objects

    Parameters
    ----------
    data : same types as can be passed to DataFrame or scipy.sparse.spmatrix
        .. versionchanged:: 0.23.0
           If data is a dict, argument order is maintained for Python 3.6
           and later.

    index : array-like, optional
    columns : array-like, optional
    default_kind : {'block', 'integer'}, default 'block'
        Default sparse kind for converting Series to SparseSeries. Will not
        override SparseSeries passed into constructor
    default_fill_value : float
        Default fill_value for converting Series to SparseSeries
        (default: nan). Will not override SparseSeries passed in.
    dtype : optional
        Forwarded to the initialisation helper selected for ``data``.
    copy : bool, default False
        Only forwarded when ``data`` is a SparseDataFrame or a BlockManager.
    """
    # Subtype tag; stored in the pickled state by __getstate__.
    _subtyp = 'sparse_frame'

    def __init__(self, data=None, index=None, columns=None, default_kind=None,
                 default_fill_value=None, dtype=None, copy=False):
        """
        Build the frame's block manager from ``data`` and pass it to NDFrame.

        Unspecified arguments are first filled in from a sparse ``data``
        object, then the constructor dispatches on the type of ``data`` to
        one of the ``_init_*`` helpers. See the class docstring for the
        parameters.

        Raises
        ------
        Exception
            If ``data`` is a SparseSeries/SparseArray, ``columns`` is None
            and ``data`` has no ``name`` attribute.
        TypeError
            If ``data`` is of a type that none of the branches handles.
        """

        # pick up the defaults from the Sparse structures
        if isinstance(data, SparseDataFrame):
            if index is None:
                index = data.index
            if columns is None:
                columns = data.columns
            if default_fill_value is None:
                default_fill_value = data.default_fill_value
            if default_kind is None:
                default_kind = data.default_kind
        elif isinstance(data, (SparseSeries, SparseArray)):
            if index is None:
                index = data.index
            if default_fill_value is None:
                default_fill_value = data.fill_value
            # fall back to the object's own name as the single column label
            if columns is None and hasattr(data, 'name'):
                columns = [data.name]
            if columns is None:
                raise Exception("cannot pass a series w/o a name or columns")
            # from here on the input is handled as a one-column dict
            data = {columns[0]: data}

        # global defaults when neither the caller nor ``data`` supplied them
        if default_fill_value is None:
            default_fill_value = np.nan
        if default_kind is None:
            default_kind = 'block'

        # stored before dispatching: the _init_* helpers read these attributes
        self._default_kind = default_kind
        self._default_fill_value = default_fill_value

        # Dispatch on the input type. Branch order matters: SparseDataFrame
        # is tested before its base class DataFrame.
        if is_scipy_sparse(data):
            mgr = self._init_spmatrix(data, index, columns, dtype=dtype,
                                      fill_value=default_fill_value)
        elif isinstance(data, dict):
            mgr = self._init_dict(data, index, columns, dtype=dtype)
        elif isinstance(data, (np.ndarray, list)):
            mgr = self._init_matrix(data, index, columns, dtype=dtype)
        elif isinstance(data, SparseDataFrame):
            mgr = self._init_mgr(data._data,
                                 dict(index=index, columns=columns),
                                 dtype=dtype, copy=copy)
        elif isinstance(data, DataFrame):
            # NOTE(review): the ``index`` and ``columns`` arguments are not
            # used on this path; the frame's own axes are passed instead.
            mgr = self._init_dict(data, data.index, data.columns, dtype=dtype)
        elif isinstance(data, Series):
            # NOTE(review): as above, ``index`` and ``columns`` are ignored.
            mgr = self._init_dict(data.to_frame(), data.index,
                                  columns=None, dtype=dtype)
        elif isinstance(data, BlockManager):
            mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
                                 dtype=dtype, copy=copy)
        elif data is None:
            # no data: start from an empty DataFrame and add one sparse
            # column per requested label
            data = DataFrame()

            if index is None:
                index = Index([])
            else:
                index = ensure_index(index)

            if columns is None:
                columns = Index([])
            else:
                for c in columns:
                    # each column is built from the scalar NaN
                    data[c] = SparseArray(np.nan, index=index,
                                          kind=self._default_kind,
                                          fill_value=self._default_fill_value)
            # to_manager is not imported above; presumably defined later in
            # this module.
            mgr = to_manager(data, columns, index)
            if dtype is not None:
                mgr = mgr.astype(dtype)
        else:
            msg = ('SparseDataFrame called with unknown type "{data_type}" '
                   'for data argument')
            raise TypeError(msg.format(data_type=type(data).__name__))

        # NDFrame.__init__ is called directly, so DataFrame.__init__ is not
        # run for this class.
        generic.NDFrame.__init__(self, mgr)

    @property
    def _constructor(self):
        """Return the class used to build new frames: SparseDataFrame."""
        return SparseDataFrame

    # Companion to _constructor; presumably the type used for one-dimensional
    # results such as a single column -- confirm against DataFrame's usage.
    _constructor_sliced = SparseSeries

    def _init_dict(self, data, index, columns, dtype=None):
        """
        Build the block manager from a mapping of column label to values.

        Parameters
        ----------
        data : dict-like
            Column label to column values. Series, SparseSeries, SparseArray
            and dict values get dedicated handling; anything else is handed
            to SparseArray.
        index : Index or None
            Row labels. When None they are derived from the values with
            ``extract_index``.
        columns : array-like or None
            Column labels. When given, entries of ``data`` whose key is not
            in ``columns`` are dropped; when None the labels are taken from
            the keys of ``data``.
        dtype : optional
            Forwarded to SparseArray for values that have to be converted.

        Returns
        -------
        The result of ``to_manager`` for the sparse columns.

        Raises
        ------
        ValueError
            If a column's length differs from the length of ``index``.
        """
        if columns is None:
            columns = Index(com.dict_keys_to_ordered_list(data))
        else:
            # only keep the entries that were explicitly asked for
            columns = ensure_index(columns)
            data = {label: values
                    for label, values in compat.iteritems(data)
                    if label in columns}

        if index is None:
            index = extract_index(list(data.values()))

        def to_sparse(values):
            return SparseArray(values, kind=self._default_kind,
                               fill_value=self._default_fill_value,
                               copy=True, dtype=dtype)

        def convert(values):
            # Series first: SparseSeries instances are kept as they are,
            # other Series are converted from their underlying values.
            if isinstance(values, Series):
                # Force alignment, no copy necessary
                if not values.index.equals(index):
                    values = values.reindex(index)
                if isinstance(values, SparseSeries):
                    return values
                return to_sparse(values.values)
            if isinstance(values, SparseArray):
                return values.copy()
            if isinstance(values, dict):
                # look every row label up, NaN where it is absent
                values = [values.get(row, np.nan) for row in index]
            return to_sparse(values)

        length_msg = "Length of passed values is {}, index implies {}"
        sparse_cols = {}
        for label, values in compat.iteritems(data):
            converted = convert(values)
            if index is not None and len(converted) != len(index):
                raise ValueError(
                    length_msg.format(len(converted), len(index)))
            sparse_cols[label] = converted

        if len(columns.difference(sparse_cols)):
            # TODO: figure out how to handle this case, all nan's?
            # requested columns without data all share one all-NaN array
            filler = SparseArray(
                np.full(len(index), np.nan, dtype='float64'),
                kind=self._default_kind,
                fill_value=self._default_fill_value,
                copy=False)
            for label in columns:
                if label not in sparse_cols:
                    sparse_cols[label] = filler

        return to_manager(sparse_cols, columns, index)

    def _init_matrix(self, data, index, columns, dtype=None):
        """
        Init self from ndarray or list of lists.

        The input is passed through ``prep_ndarray``, split into one entry
        per column label and handed to ``_init_dict``.
        """
        values = prep_ndarray(data, copy=False)
        index, columns = self._prep_index(values, index, columns)

        # column label -> the matching column of the 2-D values
        by_label = {}
        for position, label in enumerate(columns):
            by_label[label] = values[:, position]

        return self._init_dict(by_label, index, columns, dtype)

    def _init_spmatrix(self, data, index, columns, dtype=None,
                       fill_value=None):
        """
        Init self from scipy.sparse matrix.

        The matrix is converted to COO form, its stored entries are grouped
        by column and each group becomes a SparseSeries with a BlockIndex.
        Columns without stored entries get an empty SparseSeries. The
        resulting mapping is passed to ``_init_dict``.
        """
        index, columns = self._prep_index(data, index, columns)
        coo = data.tocoo()
        n_rows = len(index)

        # stored values labelled by their row position
        stored = Series(coo.data, index=coo.row, copy=False)

        by_label = {}
        for col_pos, col_values in stored.groupby(coo.col):
            # get_blocks expects int32 row indices in sorted order
            col_values = col_values.sort_index()
            row_positions = col_values.index.values.astype(np.int32)
            starts, lengths = get_blocks(row_positions)
            sparse_index = BlockIndex(n_rows, starts, lengths)

            by_label[columns[col_pos]] = SparseSeries(
                col_values.values, index=index,
                fill_value=fill_value,
                sparse_index=sparse_index)

        # Columns that were empty never showed up in the groupby above
        untouched = {label: SparseSeries(index=index,
                                         fill_value=fill_value,
                                         sparse_index=BlockIndex(n_rows,
                                                                 [], []))
                     for label in columns
                     if label not in by_label}
        by_label.update(untouched)

        return self._init_dict(by_label, index, columns, dtype)

    def _prep_index(self, data, index, columns):
        N, K = data.shape
        if index is None:
            index = ibase.default_index(N)
        if columns is None:
            columns = ibase.default_index(K)

        if len(columns) != K:
            raise ValueError('Column length mismatch: {columns} vs. {K}'
                             .format(columns=len(columns), K=K))
        if len(index) != N:
            raise ValueError('Index length mismatch: {index} vs. {N}'
                             .format(index=len(index), N=N))
        return index, columns

    def to_coo(self):
        """
        Return the contents of the frame as a sparse SciPy COO matrix.

        .. versionadded:: 0.20.0

        Returns
        -------
        coo_matrix : scipy.sparse.spmatrix
            If the caller is heterogeneous and contains booleans or objects,
            the result will be of dtype=object. See Notes.

        Raises
        ------
        ImportError
            If scipy cannot be imported.

        Notes
        -----
        The dtype will be the lowest-common-denominator type (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. By numpy.find_common_type convention, mixing int64 and
        uint64 will result in a float64 dtype.
        """
        try:
            from scipy.sparse import coo_matrix
        except ImportError:
            raise ImportError('Scipy is not installed')

        # one dtype for all columns; unwrap the sparse dtype if needed
        common = find_common_type(self.dtypes)
        if isinstance(common, SparseDtype):
            common = common.subtype

        col_parts, row_parts, value_parts = [], [], []
        for position, label in enumerate(self):
            column = self[label]
            stored_rows = column.sp_index.to_int_index().indices
            # every stored value of this column shares the column position
            col_parts.append(np.repeat(position, len(stored_rows)))
            row_parts.append(stored_rows)
            value_parts.append(column.sp_values.astype(common, copy=False))

        col_idx = np.concatenate(col_parts)
        row_idx = np.concatenate(row_parts)
        stored = np.concatenate(value_parts)
        return coo_matrix((stored, (row_idx, col_idx)), shape=self.shape)

    def __array_wrap__(self, result):
        return self._constructor(
            result, index=self.index, columns=self.columns,
            default_kind=self._default_kind,
            default_fill_value=self._default_fill_value).__finalize__(self)

    def __getstate__(self):
        # pickling
        return dict(_typ=self._typ, _subtyp=self._subtyp, _data=self._data,
                    _default_fill_value=self._default_fill_value,
                    _default_kind=self._default_kind)

    def _unpickle_sparse_frame_compat(self, state):
        """
        Restore the frame from the original pickle format.

        ``state`` is unpacked as ``(series, cols, idx, fv, kind)``, where
        ``series`` maps each column label to a ``(sp_index, sp_values)``
        pair. Sets ``_data``, ``_default_fill_value`` and ``_default_kind``.
        """
        series, cols, idx, fv, kind = state

        def as_index(labels):
            # axes that are not Index objects are restored with the
            # pickle helper
            if not isinstance(labels, Index):  # pragma: no cover
                from pandas.io.pickle import _unpickle_array
                return _unpickle_array(labels)
            return labels

        columns = as_index(cols)
        index = as_index(idx)

        restored = DataFrame()
        for label, (sp_index, sp_values) in compat.iteritems(series):
            restored[label] = SparseSeries(sp_values, sparse_index=sp_index,
                                           fill_value=fv)

        self._data = to_manager(restored, columns, index)
        self._default_fill_value = fv
        self._default_kind = kind

    def to_dense(self):
        """
        Convert to dense DataFrame

        Every column is converted with its own ``to_dense``; index and
        columns are carried over.

        Returns
        -------
        df : DataFrame
        """
        dense_columns = {}
        for label, column in compat.iteritems(self):
            dense_columns[label] = column.to_dense()
        return DataFrame(dense_columns, index=self.index,
                         columns=self.columns)

    def _apply_columns(self, func):
        """ get new SparseDataFrame applying func to each columns """

        new_data = {col: func(series)
                    for col, series in compat.iteritems(self)}
Loading ...