"""
Data structures for sparse float data. Life is made simpler by dealing only
with float64 data.
"""
import warnings
import numpy as np
from pandas._libs.lib import is_scalar, item_from_zerodim
from pandas._libs.sparse import BlockIndex, get_blocks
from pandas.compat.numpy import function as nv
from pandas.util._decorators import Appender
from pandas.core.dtypes.cast import maybe_upcast
from pandas.core.dtypes.common import ensure_platform_int, is_scipy_sparse
from pandas.core.dtypes.missing import isna, notna
import pandas.core.algorithms as algos
from pandas.core.arrays.sparse import SparseArray, SparseFrameAccessor
import pandas.core.common as com
from pandas.core.frame import DataFrame
import pandas.core.generic as generic
from pandas.core.index import Index, MultiIndex, ensure_index
from pandas.core.internals import BlockManager, create_block_manager_from_arrays
from pandas.core.internals.construction import extract_index, prep_ndarray
import pandas.core.ops as ops
from pandas.core.series import Series
from pandas.core.sparse.series import SparseSeries
_shared_doc_kwargs = dict(klass="SparseDataFrame")
depr_msg = """\
SparseDataFrame is deprecated and will be removed in a future version.
Use a regular DataFrame whose columns are SparseArrays instead.
See http://pandas.pydata.org/pandas-docs/stable/\
user_guide/sparse.html#migrating for more.
"""
class SparseDataFrame(DataFrame):
"""
    DataFrame containing sparse floating point data in the form of SparseSeries
    objects.

    .. deprecated:: 0.25.0
       Use a DataFrame with sparse values instead.

    Parameters
    ----------
    data : same types as can be passed to DataFrame or scipy.sparse.spmatrix
        .. versionchanged:: 0.23.0
           If data is a dict, argument order is maintained for Python 3.6
           and later.
    index : array-like, optional
    columns : array-like, optional
    default_kind : {'block', 'integer'}, default 'block'
        Default sparse kind for converting Series to SparseSeries. Will not
        override SparseSeries passed into the constructor.
    default_fill_value : float
        Default fill_value for converting Series to SparseSeries
        (default: nan). Will not override SparseSeries passed in.
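
    Examples
    --------
    A minimal construction sketch (doctests skipped; the exact repr is
    version-dependent, and construction emits a FutureWarning):

    >>> sdf = SparseDataFrame({"A": [np.nan, 1.0], "B": [2.0, np.nan]})  # doctest: +SKIP
    >>> sdf.default_fill_value  # doctest: +SKIP
    nan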
"""
_subtyp = "sparse_frame"
def __init__(
self,
data=None,
index=None,
columns=None,
default_kind=None,
default_fill_value=None,
dtype=None,
copy=False,
):
if not is_scalar(default_fill_value):
raise ValueError("'default_fill_value' must be a scalar")
warnings.warn(depr_msg, FutureWarning, stacklevel=2)
# pick up the defaults from the Sparse structures
if isinstance(data, SparseDataFrame):
if index is None:
index = data.index
if columns is None:
columns = data.columns
if default_fill_value is None:
default_fill_value = data.default_fill_value
if default_kind is None:
default_kind = data.default_kind
elif isinstance(data, (SparseSeries, SparseArray)):
if index is None:
index = data.index
if default_fill_value is None:
default_fill_value = data.fill_value
if columns is None and hasattr(data, "name"):
columns = [data.name]
if columns is None:
                raise ValueError("cannot pass a Series without a name or columns")
data = {columns[0]: data}
if default_fill_value is None:
default_fill_value = np.nan
if default_kind is None:
default_kind = "block"
self._default_kind = default_kind
self._default_fill_value = default_fill_value
if is_scipy_sparse(data):
mgr = self._init_spmatrix(
data, index, columns, dtype=dtype, fill_value=default_fill_value
)
elif isinstance(data, dict):
mgr = self._init_dict(data, index, columns, dtype=dtype)
elif isinstance(data, (np.ndarray, list)):
mgr = self._init_matrix(data, index, columns, dtype=dtype)
elif isinstance(data, SparseDataFrame):
mgr = self._init_mgr(
data._data, dict(index=index, columns=columns), dtype=dtype, copy=copy
)
elif isinstance(data, DataFrame):
mgr = self._init_dict(data, data.index, data.columns, dtype=dtype)
elif isinstance(data, Series):
mgr = self._init_dict(
data.to_frame(), data.index, columns=None, dtype=dtype
)
elif isinstance(data, BlockManager):
mgr = self._init_mgr(
data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
)
elif data is None:
data = DataFrame()
if index is None:
index = Index([])
else:
index = ensure_index(index)
if columns is None:
columns = Index([])
else:
for c in columns:
data[c] = SparseArray(
self._default_fill_value,
index=index,
kind=self._default_kind,
fill_value=self._default_fill_value,
)
mgr = to_manager(data, columns, index)
if dtype is not None:
mgr = mgr.astype(dtype)
else:
msg = (
'SparseDataFrame called with unknown type "{data_type}" '
"for data argument"
)
raise TypeError(msg.format(data_type=type(data).__name__))
generic.NDFrame.__init__(self, mgr)
@property
def _constructor(self):
return SparseDataFrame
_constructor_sliced = SparseSeries
def _init_dict(self, data, index, columns, dtype=None):
        # pre-filter out columns if they were passed
if columns is not None:
columns = ensure_index(columns)
data = {k: v for k, v in data.items() if k in columns}
else:
keys = com.dict_keys_to_ordered_list(data)
columns = Index(keys)
if index is None:
index = extract_index(list(data.values()))
def sp_maker(x):
return SparseArray(
x,
kind=self._default_kind,
fill_value=self._default_fill_value,
copy=True,
dtype=dtype,
)
sdict = {}
for k, v in data.items():
if isinstance(v, Series):
# Force alignment, no copy necessary
if not v.index.equals(index):
v = v.reindex(index)
if not isinstance(v, SparseSeries):
v = sp_maker(v.values)
elif isinstance(v, SparseArray):
v = v.copy()
else:
if isinstance(v, dict):
v = [v.get(i, np.nan) for i in index]
v = sp_maker(v)
if index is not None and len(v) != len(index):
msg = "Length of passed values is {}, index implies {}"
raise ValueError(msg.format(len(v), len(index)))
sdict[k] = v
if len(columns.difference(sdict)):
# TODO: figure out how to handle this case, all nan's?
# add in any other columns we want to have (completeness)
nan_arr = np.empty(len(index), dtype="float64")
nan_arr.fill(np.nan)
nan_arr = SparseArray(
nan_arr,
kind=self._default_kind,
fill_value=self._default_fill_value,
copy=False,
)
sdict.update((c, nan_arr) for c in columns if c not in sdict)
return to_manager(sdict, columns, index)
def _init_matrix(self, data, index, columns, dtype=None):
"""
Init self from ndarray or list of lists.
"""
data = prep_ndarray(data, copy=False)
index, columns = SparseFrameAccessor._prep_index(data, index, columns)
data = {idx: data[:, i] for i, idx in enumerate(columns)}
return self._init_dict(data, index, columns, dtype)
def _init_spmatrix(self, data, index, columns, dtype=None, fill_value=None):
"""
Init self from scipy.sparse matrix.
"""
index, columns = SparseFrameAccessor._prep_index(data, index, columns)
data = data.tocoo()
N = len(index)
# Construct a dict of SparseSeries
sdict = {}
values = Series(data.data, index=data.row, copy=False)
for col, rowvals in values.groupby(data.col):
# get_blocks expects int32 row indices in sorted order
rowvals = rowvals.sort_index()
rows = rowvals.index.values.astype(np.int32)
blocs, blens = get_blocks(rows)
sdict[columns[col]] = SparseSeries(
rowvals.values,
index=index,
fill_value=fill_value,
sparse_index=BlockIndex(N, blocs, blens),
)
# Add any columns that were empty and thus not grouped on above
sdict.update(
{
column: SparseSeries(
index=index,
fill_value=fill_value,
sparse_index=BlockIndex(N, [], []),
)
for column in columns
if column not in sdict
}
)
return self._init_dict(sdict, index, columns, dtype)
@Appender(SparseFrameAccessor.to_coo.__doc__)
def to_coo(self):
return SparseFrameAccessor(self).to_coo()
def __repr__(self):
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "Sparse")
return super().__repr__()
def __getstate__(self):
# pickling
return dict(
_typ=self._typ,
_subtyp=self._subtyp,
_data=self._data,
_default_fill_value=self._default_fill_value,
_default_kind=self._default_kind,
)
def _unpickle_sparse_frame_compat(self, state):
"""
Original pickle format
"""
series, cols, idx, fv, kind = state
if not isinstance(cols, Index): # pragma: no cover
from pandas.io.pickle import _unpickle_array
columns = _unpickle_array(cols)
else:
columns = cols
if not isinstance(idx, Index): # pragma: no cover
from pandas.io.pickle import _unpickle_array
index = _unpickle_array(idx)
else:
index = idx
series_dict = DataFrame()
for col, (sp_index, sp_values) in series.items():
series_dict[col] = SparseSeries(
sp_values, sparse_index=sp_index, fill_value=fv
)
self._data = to_manager(series_dict, columns, index)
self._default_fill_value = fv
self._default_kind = kind
@Appender(SparseFrameAccessor.to_dense.__doc__)
def to_dense(self):
return SparseFrameAccessor(self).to_dense()
def _apply_columns(self, func):
"""
        Get a new SparseDataFrame by applying func to each column.
"""
new_data = {col: func(series) for col, series in self.items()}
return self._constructor(
data=new_data,
index=self.index,
columns=self.columns,
default_fill_value=self.default_fill_value,
).__finalize__(self)
def astype(self, dtype):
return self._apply_columns(lambda x: x.astype(dtype))
def copy(self, deep=True):
"""
        Make a copy of this SparseDataFrame.
"""
result = super().copy(deep=deep)
result._default_fill_value = self._default_fill_value
result._default_kind = self._default_kind
return result
@property
def default_fill_value(self):
return self._default_fill_value
@property
def default_kind(self):
return self._default_kind
@property
def density(self):
"""
        Ratio of non-sparse points to total (dense) data points
        represented in the frame.
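
        Examples
        --------
        A small illustration (doctests skipped; assumes the default NaN fill
        value, under which only the non-NaN point is stored):

        >>> SparseDataFrame({"A": [np.nan, 1.0]}).density  # doctest: +SKIP
        0.5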
"""
tot_nonsparse = sum(ser.sp_index.npoints for _, ser in self.items())
tot = len(self.index) * len(self.columns)
return tot_nonsparse / float(tot)
def fillna(
self, value=None, method=None, axis=0, inplace=False, limit=None, downcast=None
):
new_self = super().fillna(
value=value,
method=method,
axis=axis,
inplace=inplace,
limit=limit,
downcast=downcast,
)
if not inplace:
self = new_self
# set the fill value if we are filling as a scalar with nothing special
# going on
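        # ("value == value" is False when value is NaN, so a NaN fill
        # candidate never overwrites the default fill value)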
if value is not None and value == value and method is None and limit is None:
self._default_fill_value = value
if not inplace:
return self
# ----------------------------------------------------------------------
# Support different internal representation of SparseDataFrame
def _sanitize_column(self, key, value, **kwargs):
"""
        Creates a new SparseArray from the input value.

        Parameters
        ----------
        key : object
        value : scalar, Series, or array-like
        kwargs : dict

        Returns
        -------
        sanitized_column : SparseArray
"""
def sp_maker(x, index=None):
return SparseArray(
x,
index=index,
fill_value=self._default_fill_value,
kind=self._default_kind,
)
if isinstance(value, SparseSeries):
clean = value.reindex(self.index).as_sparse_array(
fill_value=self._default_fill_value, kind=self._default_kind
)
elif isinstance(value, SparseArray):
if len(value) != len(self.index):
                raise ValueError("Length of values does not match length of index")
clean = value
elif hasattr(value, "__iter__"):
if isinstance(value, Series):
clean = value.reindex(self.index)
if not isinstance(value, SparseSeries):
clean = sp_maker(clean)
else:
if len(value) != len(self.index):
                    raise ValueError(
                        "Length of values does not match length of index"
                    )
clean = sp_maker(value)
# Scalar
else:
clean = sp_maker(value, self.index)
# always return a SparseArray!
return clean
def get_value(self, index, col, takeable=False):
"""
        Quickly retrieve a single value at the passed column and index.

        .. deprecated:: 0.21.0
            Please use .at[] or .iat[] accessors.

        Parameters
        ----------
        index : row label
        col : column label
        takeable : interpret the index/col as indexers, default False

        Returns
        -------
        value : scalar value
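
        Examples
        --------
        A sketch with hypothetical labels "r" and "c", alongside the
        preferred accessor (doctests skipped):

        >>> sdf.get_value("r", "c")  # doctest: +SKIP
        >>> sdf.at["r", "c"]  # preferred replacement  # doctest: +SKIP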
"""
warnings.warn(
"get_value is deprecated and will be removed "
"in a future release. Please use "
".at[] or .iat[] accessors instead",
FutureWarning,
stacklevel=2,
)
return self._get_value(index, col, takeable=takeable)
def _get_value(self, index, col, takeable=False):
if takeable is True:
series = self._iget_item_cache(col)
else:
series = self._get_item_cache(col)
return series._get_value(index, takeable=takeable)
_get_value.__doc__ = get_value.__doc__
def set_value(self, index, col, value, takeable=False):
"""
        Put a single value at the passed column and index.

        .. deprecated:: 0.21.0
            Please use .at[] or .iat[] accessors.

        Parameters
        ----------
        index : row label
        col : column label
        value : scalar value
        takeable : interpret the index/col as indexers, default False

        Returns
        -------
        frame : DataFrame

        Notes
        -----
        This method *always* returns a new object. It is currently not
        particularly efficient (and potentially very expensive) but is
        provided for API compatibility with DataFrame.
"""
warnings.warn(
"set_value is deprecated and will be removed "
"in a future release. Please use "
".at[] or .iat[] accessors instead",
FutureWarning,
stacklevel=2,
)
return self._set_value(index, col, value, takeable=takeable)
def _set_value(self, index, col, value, takeable=False):
dense = self.to_dense()._set_value(index, col, value, takeable=takeable)
return dense.to_sparse(
kind=self._default_kind, fill_value=self._default_fill_value
)
_set_value.__doc__ = set_value.__doc__
def _slice(self, slobj, axis=0, kind=None):
if axis == 0:
new_index = self.index[slobj]
new_columns = self.columns
else:
new_index = self.index
new_columns = self.columns[slobj]
return self.reindex(index=new_index, columns=new_columns)
def xs(self, key, axis=0, copy=False):
"""
        Returns a row (cross-section) from the SparseDataFrame as a Series
        object.

        Parameters
        ----------
        key : some index contained in the index

        Returns
        -------
        xs : Series
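
        Examples
        --------
        A sketch with a hypothetical row label (doctest skipped):

        >>> sdf.xs("row_label")  # doctest: +SKIP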
"""
if axis == 1:
data = self[key]
return data
i = self.index.get_loc(key)
data = self.take([i])._internal_get_values()[0]
return Series(data, index=self.columns)
# ----------------------------------------------------------------------
# Arithmetic-related methods
def _combine_frame(self, other, func, fill_value=None, level=None):
if level is not None:
raise NotImplementedError("'level' argument is not supported")
this, other = self.align(other, join="outer", level=level, copy=False)
new_index, new_columns = this.index, this.columns
if self.empty and other.empty:
return self._constructor(index=new_index).__finalize__(self)
new_data = {}
if fill_value is not None:
# TODO: be a bit more intelligent here
for col in new_columns:
if col in this and col in other:
dleft = this[col].to_dense()
dright = other[col].to_dense()
result = dleft._binop(dright, func, fill_value=fill_value)
result = result.to_sparse(fill_value=this[col].fill_value)
new_data[col] = result
else:
for col in new_columns:
if col in this and col in other:
new_data[col] = func(this[col], other[col])
new_fill_value = self._get_op_result_fill_value(other, func)
return self._constructor(
data=new_data,
index=new_index,
columns=new_columns,
default_fill_value=new_fill_value,
).__finalize__(self)
def _combine_match_index(self, other, func, level=None):
new_data = {}
if level is not None:
raise NotImplementedError("'level' argument is not supported")
this, other = self.align(other, join="outer", axis=0, level=level, copy=False)
for col, series in this.items():
new_data[col] = func(series.values, other.values)
fill_value = self._get_op_result_fill_value(other, func)
return self._constructor(
new_data,
index=this.index,
columns=self.columns,
default_fill_value=fill_value,
).__finalize__(self)
def _combine_match_columns(self, other, func, level=None):
# patched version of DataFrame._combine_match_columns to account for
# NumPy circumventing __rsub__ with float64 types, e.g.: 3.0 - series,
# where 3.0 is numpy.float64 and series is a SparseSeries. Still
# possible for this to happen, which is bothersome
if level is not None:
raise NotImplementedError("'level' argument is not supported")
left, right = self.align(other, join="outer", axis=1, level=level, copy=False)
assert left.columns.equals(right.index)
new_data = {}
for col in left.columns:
new_data[col] = func(left[col], float(right[col]))
return self._constructor(
new_data,
index=left.index,
columns=left.columns,
default_fill_value=self.default_fill_value,
).__finalize__(self)
def _combine_const(self, other, func):
return self._apply_columns(lambda x: func(x, other))
def _get_op_result_fill_value(self, other, func):
own_default = self.default_fill_value
if isinstance(other, DataFrame):
# i.e. called from _combine_frame
other_default = getattr(other, "default_fill_value", np.nan)
# if the fill values are the same use them? or use a valid one
if own_default == other_default:
                # TODO: won't this evaluate as False if both are np.nan?
fill_value = own_default
elif np.isnan(own_default) and not np.isnan(other_default):
fill_value = other_default
elif not np.isnan(own_default) and np.isnan(other_default):
fill_value = own_default
else:
fill_value = None
elif isinstance(other, SparseSeries):
# i.e. called from _combine_match_index
# fill_value is a function of our operator
if isna(other.fill_value) or isna(own_default):
fill_value = np.nan
else:
fill_value = func(np.float64(own_default), np.float64(other.fill_value))
fill_value = item_from_zerodim(fill_value)
else:
raise NotImplementedError(type(other))
return fill_value
def _reindex_index(
self, index, method, copy, level, fill_value=np.nan, limit=None, takeable=False
):
if level is not None:
raise TypeError("Reindex by level not supported for sparse")
if self.index.equals(index):
if copy:
return self.copy()
else:
return self
if len(self.index) == 0:
return self._constructor(index=index, columns=self.columns).__finalize__(
self
)
indexer = self.index.get_indexer(index, method, limit=limit)
indexer = ensure_platform_int(indexer)
mask = indexer == -1
need_mask = mask.any()
new_series = {}
for col, series in self.items():
if mask.all():
continue
values = series.values
# .take returns SparseArray
new = values.take(indexer)
if need_mask:
new = new.to_dense()
                # convert integer to float if necessary. need to do a lot
                # more than that to handle booleans etc. as well
new, fill_value = maybe_upcast(new, fill_value=fill_value)
np.putmask(new, mask, fill_value)
new_series[col] = new
return self._constructor(
new_series,
index=index,
columns=self.columns,
default_fill_value=self._default_fill_value,
).__finalize__(self)
def _reindex_columns(
self, columns, method, copy, level, fill_value=None, limit=None, takeable=False
):
if level is not None:
raise TypeError("Reindex by level not supported for sparse")
if notna(fill_value):
raise NotImplementedError("'fill_value' argument is not supported")
if limit:
raise NotImplementedError("'limit' argument is not supported")
if method is not None:
raise NotImplementedError("'method' argument is not supported")
# TODO: fill value handling
sdict = {k: v for k, v in self.items() if k in columns}
return self._constructor(
sdict,
index=self.index,
columns=columns,
default_fill_value=self._default_fill_value,
).__finalize__(self)
def _reindex_with_indexers(
self,
reindexers,
method=None,
fill_value=None,
limit=None,
copy=False,
allow_dups=False,
):
if method is not None or limit is not None:
            raise NotImplementedError(
                "cannot reindex with a method or limit with sparse"
            )
if fill_value is None:
fill_value = np.nan
reindexers = {self._get_axis_number(a): val for (a, val) in reindexers.items()}
index, row_indexer = reindexers.get(0, (None, None))
columns, col_indexer = reindexers.get(1, (None, None))
if columns is None:
columns = self.columns
new_arrays = {}
for col in columns:
if col not in self:
continue
if row_indexer is not None:
new_arrays[col] = algos.take_1d(
self[col]._internal_get_values(), row_indexer, fill_value=fill_value
)
else:
new_arrays[col] = self[col]
return self._constructor(new_arrays, index=index, columns=columns).__finalize__(
self
)
def _join_compat(
self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False
):
if on is not None:
            raise NotImplementedError("'on' keyword parameter is not yet implemented")
return self._join_index(other, how, lsuffix, rsuffix)
def _join_index(self, other, how, lsuffix, rsuffix):
if isinstance(other, Series):
if other.name is None:
raise ValueError("Other Series must have a name")
other = SparseDataFrame(
{other.name: other}, default_fill_value=self._default_fill_value
)
join_index = self.index.join(other.index, how=how)
this = self.reindex(join_index)
other = other.reindex(join_index)
this, other = this._maybe_rename_join(other, lsuffix, rsuffix)
from pandas import concat
return concat([this, other], axis=1, verify_integrity=True)
def _maybe_rename_join(self, other, lsuffix, rsuffix):
to_rename = self.columns.intersection(other.columns)
if len(to_rename) > 0:
if not lsuffix and not rsuffix:
raise ValueError(
"columns overlap but no suffix specified: "
"{to_rename}".format(to_rename=to_rename)
)
def lrenamer(x):
if x in to_rename:
return "{x}{lsuffix}".format(x=x, lsuffix=lsuffix)
return x
def rrenamer(x):
if x in to_rename:
return "{x}{rsuffix}".format(x=x, rsuffix=rsuffix)
return x
this = self.rename(columns=lrenamer)
other = other.rename(columns=rrenamer)
else:
this = self
return this, other
def transpose(self, *args, **kwargs):
"""
        Returns a SparseDataFrame with the rows/columns switched.
"""
nv.validate_transpose(args, kwargs)
return self._constructor(
self.values.T,
index=self.columns,
columns=self.index,
default_fill_value=self._default_fill_value,
default_kind=self._default_kind,
).__finalize__(self)
T = property(transpose)
@Appender(DataFrame.count.__doc__)
def count(self, axis=0, **kwds):
if axis is None:
axis = self._stat_axis_number
return self.apply(lambda x: x.count(), axis=axis)
def cumsum(self, axis=0, *args, **kwargs):
"""
        Return SparseDataFrame of cumulative sums over requested axis.

        Parameters
        ----------
        axis : {0, 1}
            0 to accumulate over the index (down each column),
            1 to accumulate across columns (along each row)

        Returns
        -------
        y : SparseDataFrame
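
        Examples
        --------
        A sketch (doctest skipped); sums accumulate down each column by
        default:

        >>> SparseDataFrame({"A": [1.0, 2.0]}).cumsum()  # doctest: +SKIP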
"""
nv.validate_cumsum(args, kwargs)
if axis is None:
axis = self._stat_axis_number
return self.apply(lambda x: x.cumsum(), axis=axis)
@Appender(generic._shared_docs["isna"] % _shared_doc_kwargs)
def isna(self):
return self._apply_columns(lambda x: x.isna())
isnull = isna
@Appender(generic._shared_docs["notna"] % _shared_doc_kwargs)
def notna(self):
return self._apply_columns(lambda x: x.notna())
notnull = notna
def apply(self, func, axis=0, broadcast=None, reduce=None, result_type=None):
"""
        Analogous to DataFrame.apply, for SparseDataFrame.

        Parameters
        ----------
        func : function
            Function to apply to each column
        axis : {0, 1, 'index', 'columns'}
        broadcast : bool, default False
            For aggregation functions, return object of same size with values
            propagated

            .. deprecated:: 0.23.0
               This argument will be removed in a future version, replaced
               by result_type='broadcast'.

        reduce : boolean or None, default None
            Try to apply reduction procedures. If the DataFrame is empty,
            apply will use reduce to determine whether the result should be a
            Series or a DataFrame. If reduce is None (the default), apply's
            return value will be guessed by calling func on an empty Series
            (note: while guessing, exceptions raised by func will be
            ignored). If reduce is True a Series will always be returned,
            and if False a DataFrame will always be returned.

            .. deprecated:: 0.23.0
               This argument will be removed in a future version, replaced
               by result_type='reduce'.

        result_type : {'expand', 'reduce', 'broadcast', None}
            These only act when axis=1 (columns):

            * 'expand' : list-like results will be turned into columns.
            * 'reduce' : return a Series if possible rather than expanding
              list-like results. This is the opposite of 'expand'.
            * 'broadcast' : results will be broadcast to the original shape
              of the frame; the original index & columns will be retained.

            The default behaviour (None) depends on the return value of the
            applied function: list-like results will be returned as a Series
            of those. However, if the apply function returns a Series these
            are expanded to columns.

            .. versionadded:: 0.23.0

        Returns
        -------
        applied : Series or SparseDataFrame
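
        Examples
        --------
        A sketch with a NumPy ufunc, which is applied to each column and to
        the fill value as well (doctest skipped):

        >>> SparseDataFrame({"A": [1.0, 4.0]}).apply(np.sqrt)  # doctest: +SKIP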
"""
if not len(self.columns):
return self
axis = self._get_axis_number(axis)
if isinstance(func, np.ufunc):
new_series = {}
for k, v in self.items():
applied = func(v)
applied.fill_value = func(v.fill_value)
new_series[k] = applied
return self._constructor(
new_series,
index=self.index,
columns=self.columns,
default_fill_value=self._default_fill_value,
default_kind=self._default_kind,
).__finalize__(self)
from pandas.core.apply import frame_apply
op = frame_apply(
self,
func=func,
axis=axis,
reduce=reduce,
broadcast=broadcast,
result_type=result_type,
)
return op.get_result()
def applymap(self, func):
"""
        Apply a function to a DataFrame that is intended to operate
        elementwise, i.e. like doing map(func, series) for each series in the
        DataFrame.

        Parameters
        ----------
        func : function
            Python function, returns a single value from a single value

        Returns
        -------
        applied : DataFrame
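
        Examples
        --------
        A sketch (doctest skipped):

        >>> sdf.applymap(lambda x: x ** 2)  # doctest: +SKIP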
"""
return self.apply(lambda x: [func(y) for y in x])
def to_manager(sdf, columns, index):
""" create and return the block manager from a dataframe of series,
columns, index
"""
# from BlockManager perspective
axes = [ensure_index(columns), ensure_index(index)]
return create_block_manager_from_arrays([sdf[c] for c in columns], columns, axes)
def stack_sparse_frame(frame):
"""
    Stack a SparseDataFrame into a single-column DataFrame indexed by a
    (row, column) MultiIndex, keeping only the stored values. Only makes
    sense when fill_value is NaN.
"""
lengths = [s.sp_index.npoints for _, s in frame.items()]
nobs = sum(lengths)
# this is pretty fast
minor_codes = np.repeat(np.arange(len(frame.columns)), lengths)
inds_to_concat = []
vals_to_concat = []
# TODO: Figure out whether this can be reached.
# I think this currently can't be reached because you can't build a
# SparseDataFrame with a non-np.NaN fill value (fails earlier).
for _, series in frame.items():
if not np.isnan(series.fill_value):
raise TypeError("This routine assumes NaN fill value")
int_index = series.sp_index.to_int_index()
inds_to_concat.append(int_index.indices)
vals_to_concat.append(series.sp_values)
major_codes = np.concatenate(inds_to_concat)
stacked_values = np.concatenate(vals_to_concat)
index = MultiIndex(
levels=[frame.index, frame.columns],
codes=[major_codes, minor_codes],
verify_integrity=False,
)
lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index, columns=["foo"])
return lp.sort_index(level=0)
def homogenize(series_dict):
"""
    Conform a set of SparseSeries (with NaN fill_value) to a common
    SparseIndex corresponding to the locations where they all have data.

    Parameters
    ----------
    series_dict : dict or DataFrame

    Returns
    -------
    homogenized : dict of SparseSeries

    Notes
    -----
    Using the dumbest algorithm I could think of. Should put some more
    thought into this.
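
    Examples
    --------
    A sketch (doctests skipped); the only location where both inputs store
    a value is position 2, so both outputs are conformed to it:

    >>> s1 = SparseSeries([np.nan, np.nan, 2.0])  # doctest: +SKIP
    >>> s2 = SparseSeries([1.0, np.nan, 3.0])  # doctest: +SKIP
    >>> homogenize({"a": s1, "b": s2})  # doctest: +SKIP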
"""
index = None
need_reindex = False
for _, series in series_dict.items():
if not np.isnan(series.fill_value):
raise TypeError("this method is only valid with NaN fill values")
if index is None:
index = series.sp_index
elif not series.sp_index.equals(index):
need_reindex = True
index = index.intersect(series.sp_index)
if need_reindex:
output = {}
for name, series in series_dict.items():
if not series.sp_index.equals(index):
series = series.sparse_reindex(index)
output[name] = series
else:
output = series_dict
return output
# use unaccelerated ops for sparse objects
ops.add_flex_arithmetic_methods(SparseDataFrame)
ops.add_special_arithmetic_methods(SparseDataFrame)