Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Debian packages RPM packages NuGet packages

Repository URL to install this package:

Details    
modin / pandas / general.py
Size: Mime:
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership.  The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

"""Implement pandas general API."""

import pandas
import numpy as np

from typing import Hashable, Iterable, Mapping, Union
from pandas.core.dtypes.common import is_list_like

from modin.error_message import ErrorMessage
from .base import BasePandasDataset
from .dataframe import DataFrame
from .series import Series
from modin.utils import to_pandas
from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler
from modin.utils import _inherit_docstrings
from modin.logging import logger_decorator


@_inherit_docstrings(pandas.isna, apilink="pandas.isna")
@logger_decorator("PANDAS-API", "isna", "info")
def isna(obj):  # noqa: PR01, RT01, D200
    """
    Detect missing values for an array-like object.
    """
    if isinstance(obj, BasePandasDataset):
        return obj.isna()
    else:
        return pandas.isna(obj)


isnull = isna


@_inherit_docstrings(pandas.notna, apilink="pandas.notna")
@logger_decorator("PANDAS-API", "notna", "info")
def notna(obj):  # noqa: PR01, RT01, D200
    """
    Detect non-missing values for an array-like object.
    """
    if isinstance(obj, BasePandasDataset):
        return obj.notna()
    else:
        return pandas.notna(obj)


notnull = notna


@_inherit_docstrings(pandas.merge, apilink="pandas.merge")
@logger_decorator("PANDAS-API", "merge", "info")
def merge(
    left,
    right,
    how: str = "inner",
    on=None,
    left_on=None,
    right_on=None,
    left_index: bool = False,
    right_index: bool = False,
    sort: bool = False,
    suffixes=("_x", "_y"),
    copy: bool = True,
    indicator: bool = False,
    validate=None,
):  # noqa: PR01, RT01, D200
    """
    Merge DataFrame or named Series objects with a database-style join.
    """
    if isinstance(left, Series):
        if left.name is None:
            raise ValueError("Cannot merge a Series without a name")
        else:
            left = left.to_frame()

    if not isinstance(left, DataFrame):
        raise TypeError(
            f"Can only merge Series or DataFrame objects, a {type(left)} was passed"
        )

    return left.merge(
        right,
        how=how,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        sort=sort,
        suffixes=suffixes,
        copy=copy,
        indicator=indicator,
        validate=validate,
    )


@_inherit_docstrings(pandas.merge_ordered, apilink="pandas.merge_ordered")
@logger_decorator("PANDAS-API", "merge_ordered", "info")
def merge_ordered(
    left,
    right,
    on=None,
    left_on=None,
    right_on=None,
    left_by=None,
    right_by=None,
    fill_method=None,
    suffixes=("_x", "_y"),
    how: str = "outer",
) -> DataFrame:  # noqa: PR01, RT01, D200
    """
    Perform a merge for ordered data with optional filling/interpolation.
    """
    if not isinstance(left, DataFrame):
        raise ValueError(
            "can not merge DataFrame with instance of type {}".format(type(right))
        )
    ErrorMessage.default_to_pandas("`merge_ordered`")
    if isinstance(right, DataFrame):
        right = to_pandas(right)
    return DataFrame(
        pandas.merge_ordered(
            to_pandas(left),
            right,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_by=left_by,
            right_by=right_by,
            fill_method=fill_method,
            suffixes=suffixes,
            how=how,
        )
    )


@_inherit_docstrings(pandas.merge_asof, apilink="pandas.merge_asof")
@logger_decorator("PANDAS-API", "merge_asof", "info")
def merge_asof(
    left,
    right,
    on=None,
    left_on=None,
    right_on=None,
    left_index: bool = False,
    right_index: bool = False,
    by=None,
    left_by=None,
    right_by=None,
    suffixes=("_x", "_y"),
    tolerance=None,
    allow_exact_matches: bool = True,
    direction: str = "backward",
) -> DataFrame:  # noqa: PR01, RT01, D200
    """
    Perform a merge by key distance.
    """
    if not isinstance(left, DataFrame):
        raise ValueError(
            "can not merge DataFrame with instance of type {}".format(type(right))
        )
    ErrorMessage.default_to_pandas("`merge_asof`")

    # As of Pandas 1.2 these should raise an error; before that it did
    # something likely random:
    if (
        (on and (left_index or right_index))
        or (left_on and left_index)
        or (right_on and right_index)
    ):
        raise ValueError("Can't combine left/right_index with left/right_on or on.")

    # Pandas fallbacks for tricky cases:
    if (
        # No idea how this works or why it does what it does; and in fact
        # there's a Pandas bug suggesting it's wrong:
        # https://github.com/pandas-dev/pandas/issues/33463
        (left_index and right_on is not None)
        # This is the case where by is a list of columns. If we're copying lots
        # of columns out of Pandas, maybe not worth trying our path, it's not
        # clear it's any better:
        or not isinstance(by, (str, type(None)))
        or not isinstance(left_by, (str, type(None)))
        or not isinstance(right_by, (str, type(None)))
        # The implementation below assumes that the right index is unique
        # because it uses merge_asof to map each position in the merged
        # index to the label of the one right row that should be merged
        # at that row position.
        or not right.index.is_unique
    ):
        if isinstance(right, DataFrame):
            right = to_pandas(right)
        return DataFrame(
            pandas.merge_asof(
                to_pandas(left),
                right,
                on=on,
                left_on=left_on,
                right_on=right_on,
                left_index=left_index,
                right_index=right_index,
                by=by,
                left_by=left_by,
                right_by=right_by,
                suffixes=suffixes,
                tolerance=tolerance,
                allow_exact_matches=allow_exact_matches,
                direction=direction,
            )
        )

    left_column = None
    right_column = None

    if on is not None:
        if left_on is not None or right_on is not None:
            raise ValueError("If 'on' is set, 'left_on' and 'right_on' can't be set.")
        left_on = on
        right_on = on

    if left_on is not None:
        left_column = to_pandas(left[left_on])
    elif left_index:
        left_column = left.index
    else:
        raise ValueError("Need some sort of 'on' spec")

    if right_on is not None:
        right_column = to_pandas(right[right_on])
    elif right_index:
        right_column = right.index
    else:
        raise ValueError("Need some sort of 'on' spec")

    # If we haven't set these by now, there's a bug in this function.
    assert left_column is not None
    assert right_column is not None

    if by is not None:
        if left_by is not None or right_by is not None:
            raise ValueError("Can't have both 'by' and 'left_by' or 'right_by'")
        left_by = right_by = by

    # List of columns case should have been handled by direct Pandas fallback
    # earlier:
    assert isinstance(left_by, (str, type(None)))
    assert isinstance(right_by, (str, type(None)))

    left_pandas_limited = {"on": left_column}
    right_pandas_limited = {"on": right_column, "right_labels": right.index}
    extra_kwargs = {}  # extra arguments to Pandas merge_asof

    if left_by is not None or right_by is not None:
        extra_kwargs["by"] = "by"
        left_pandas_limited["by"] = to_pandas(left[left_by])
        right_pandas_limited["by"] = to_pandas(right[right_by])

    # 1. Construct Pandas DataFrames with just the 'on' and optional 'by'
    # columns, and the index as another column.
    left_pandas_limited = pandas.DataFrame(left_pandas_limited, index=left.index)
    right_pandas_limited = pandas.DataFrame(right_pandas_limited)

    # 2. Use Pandas' merge_asof to figure out how to map labels on left to
    # labels on the right.
    merged = pandas.merge_asof(
        left_pandas_limited,
        right_pandas_limited,
        on="on",
        direction=direction,
        allow_exact_matches=allow_exact_matches,
        tolerance=tolerance,
        **extra_kwargs,
    )
    # Now merged["right_labels"] shows which labels from right map to left's index.

    # 3. Re-index right using the merged["right_labels"]; at this point right
    # should be same length and (semantically) same order as left:
    right_subset = right.reindex(index=pandas.Index(merged["right_labels"]))
    if not right_index:
        right_subset.drop(columns=[right_on], inplace=True)
    if right_by is not None and left_by == right_by:
        right_subset.drop(columns=[right_by], inplace=True)
    right_subset.index = left.index

    # 4. Merge left and the new shrunken right:
    result = merge(
        left,
        right_subset,
        left_index=True,
        right_index=True,
        suffixes=suffixes,
        how="left",
    )

    # 5. Clean up to match Pandas output:
    if left_on is not None and right_index:
        result.insert(
            # In theory this could use get_indexer_for(), but that causes an error:
            list(result.columns).index(left_on + suffixes[0]),
            left_on,
            result[left_on + suffixes[0]],
        )
    if not left_index and not right_index:
        result.index = pandas.RangeIndex(start=0, stop=len(result))

    return result


@_inherit_docstrings(pandas.pivot_table, apilink="pandas.pivot_table")
@logger_decorator("PANDAS-API", "pivot_table", "info")
def pivot_table(
    data,
    values=None,
    index=None,
    columns=None,
    aggfunc="mean",
    fill_value=None,
    margins=False,
    dropna=True,
    margins_name="All",
    observed=False,
    sort=True,
):  # noqa: PR01, RT01, D200
    """
    Create a spreadsheet-style pivot table as a DataFrame.
    """
    if not isinstance(data, DataFrame):
        raise ValueError(
            "can not create pivot table with instance of type {}".format(type(data))
        )

    return data.pivot_table(
        values=values,
        index=index,
        columns=columns,
        aggfunc=aggfunc,
        fill_value=fill_value,
        margins=margins,
        dropna=dropna,
        margins_name=margins_name,
        sort=sort,
    )


@_inherit_docstrings(pandas.pivot, apilink="pandas.pivot")
@logger_decorator("PANDAS-API", "pivot", "info")
def pivot(data, index=None, columns=None, values=None):  # noqa: PR01, RT01, D200
    """
    Return reshaped DataFrame organized by given index / column values.
    """
    if not isinstance(data, DataFrame):
        raise ValueError("can not pivot with instance of type {}".format(type(data)))
    return data.pivot(index=index, columns=columns, values=values)


@_inherit_docstrings(pandas.to_numeric, apilink="pandas.to_numeric")
@logger_decorator("PANDAS-API", "to_numeric", "info")
def to_numeric(arg, errors="raise", downcast=None):  # noqa: PR01, RT01, D200
    """
    Convert argument to a numeric type.
    """
    if not isinstance(arg, Series):
        return pandas.to_numeric(arg, errors=errors, downcast=downcast)
    return arg._to_numeric(errors=errors, downcast=downcast)


@_inherit_docstrings(pandas.unique, apilink="pandas.unique")
@logger_decorator("PANDAS-API", "unique", "info")
def unique(values):  # noqa: PR01, RT01, D200
    """
    Return unique values based on a hash table.
    """
    return Series(values).unique()


# Adding docstring since pandas docs don't have web section for this function.
@logger_decorator("PANDAS-API", "value_counts", "info")
def value_counts(
    values, sort=True, ascending=False, normalize=False, bins=None, dropna=True
):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
        Values to perform computation.
    sort : bool, default: True
        Sort by values.
    ascending : bool, default: False
        Sort in ascending order.
    normalize : bool, default: False
        If True then compute a relative histogram.
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data.
    dropna : bool, default: True
        Don't include counts of NaN.

    Returns
    -------
    Series
    """
    return Series(values).value_counts(
        sort=sort,
        ascending=ascending,
        normalize=normalize,
        bins=bins,
        dropna=dropna,
    )


@_inherit_docstrings(pandas.concat, apilink="pandas.concat")
@logger_decorator("PANDAS-API", "concat", "info")
def concat(
    objs: "Iterable[DataFrame | Series] | Mapping[Hashable, DataFrame | Series]",
    axis=0,
    join="outer",
    ignore_index: bool = False,
    keys=None,
    levels=None,
    names=None,
    verify_integrity: bool = False,
    sort: bool = False,
    copy: bool = True,
) -> "DataFrame | Series":  # noqa: PR01, RT01, D200
    """
    Concatenate Modin objects along a particular axis.
    """
    if isinstance(objs, (pandas.Series, Series, DataFrame, str, pandas.DataFrame)):
        raise TypeError(
            "first argument must be an iterable of pandas "
            + "objects, you passed an object of type "
            + f'"{type(objs).__name__}"'
        )
    axis = pandas.DataFrame()._get_axis_number(axis)
    if isinstance(objs, dict):
        list_of_objs = list(objs.values())
    else:
        list_of_objs = list(objs)
    if len(list_of_objs) == 0:
        raise ValueError("No objects to concatenate")

    list_of_objs = [obj for obj in list_of_objs if obj is not None]

    if len(list_of_objs) == 0:
        raise ValueError("All objects passed were None")
    try:
        type_check = next(
            obj
            for obj in list_of_objs
            if not isinstance(obj, (pandas.Series, Series, pandas.DataFrame, DataFrame))
        )
    except StopIteration:
        type_check = None
    if type_check is not None:
        raise ValueError(
            'cannot concatenate object of type "{0}"; only '
            + "modin.pandas.Series "
            + "and modin.pandas.DataFrame objs are "
            + "valid",
            type(type_check),
        )
    all_series = all(isinstance(obj, Series) for obj in list_of_objs)
    if all_series and axis == 0:
        return Series(
            query_compiler=list_of_objs[0]._query_compiler.concat(
                axis,
                [o._query_compiler for o in list_of_objs[1:]],
                join=join,
                join_axes=None,
                ignore_index=ignore_index,
                keys=None,
                levels=None,
                names=None,
                verify_integrity=False,
                copy=True,
                sort=sort,
            )
        )
    if join not in ["inner", "outer"]:
        raise ValueError(
            "Only can inner (intersect) or outer (union) join the other axis"
        )
    # We have the weird Series and axis check because, when concatenating a
    # dataframe to a series on axis=0, pandas ignores the name of the series,
    # and this check aims to mirror that (possibly buggy) functionality
    list_of_objs = [
        obj
        if isinstance(obj, DataFrame)
        else DataFrame(obj.rename())
        if isinstance(obj, (pandas.Series, Series)) and axis == 0
        else DataFrame(obj)
        for obj in list_of_objs
    ]
    list_of_objs = [
        obj._query_compiler
        for obj in list_of_objs
        if (not obj._query_compiler.lazy_execution and len(obj.index))
        or len(obj.columns)
    ]
    if keys is not None:
        if all_series:
            new_idx = keys
        else:
            list_of_objs = [
                list_of_objs[i] for i in range(min(len(list_of_objs), len(keys)))
            ]
            new_idx_labels = {
                k: v.index if axis == 0 else v.columns
                for k, v in zip(keys, list_of_objs)
            }
            tuples = [
                (k, *o) if isinstance(o, tuple) else (k, o)
                for k, obj in new_idx_labels.items()
                for o in obj
            ]
            new_idx = pandas.MultiIndex.from_tuples(tuples)
            if names is not None:
                new_idx.names = names
            else:
                old_name = _determine_name(list_of_objs, axis)
                if old_name is not None:
                    new_idx.names = [None] + old_name
    elif isinstance(objs, dict):
        new_idx = pandas.concat(
            {k: pandas.Series(index=obj.axes[axis]) for k, obj in objs.items()}
        ).index
    else:
        new_idx = None
    new_query_compiler = list_of_objs[0].concat(
        axis,
        list_of_objs[1:],
        join=join,
        join_axes=None,
        ignore_index=ignore_index,
        keys=None,
        levels=None,
        names=None,
        verify_integrity=False,
        copy=True,
        sort=sort,
    )
    result_df = DataFrame(query_compiler=new_query_compiler)
    if new_idx is not None:
        if axis == 0:
            result_df.index = new_idx
        else:
            result_df.columns = new_idx
    return result_df


@_inherit_docstrings(pandas.to_datetime, apilink="pandas.to_datetime")
@logger_decorator("PANDAS-API", "to_datetime", "info")
def to_datetime(
    arg,
    errors="raise",
    dayfirst=False,
    yearfirst=False,
    utc=None,
    format=None,
    exact=True,
    unit=None,
    infer_datetime_format=False,
    origin="unix",
    cache=True,
):  # noqa: PR01, RT01, D200
    """
    Convert argument to datetime.
    """
    if not isinstance(arg, (DataFrame, Series)):
        return pandas.to_datetime(
            arg,
            errors=errors,
            dayfirst=dayfirst,
            yearfirst=yearfirst,
            utc=utc,
            format=format,
            exact=exact,
            unit=unit,
            infer_datetime_format=infer_datetime_format,
            origin=origin,
            cache=cache,
        )
    return arg._to_datetime(
        errors=errors,
        dayfirst=dayfirst,
        yearfirst=yearfirst,
        utc=utc,
        format=format,
        exact=exact,
        unit=unit,
        infer_datetime_format=infer_datetime_format,
        origin=origin,
        cache=cache,
    )


@_inherit_docstrings(pandas.get_dummies, apilink="pandas.get_dummies")
@logger_decorator("PANDAS-API", "get_dummies", "info")
def get_dummies(
    data,
    prefix=None,
    prefix_sep="_",
    dummy_na=False,
    columns=None,
    sparse=False,
    drop_first=False,
    dtype=None,
):  # noqa: PR01, RT01, D200
    """
    Convert categorical variable into dummy/indicator variables.
    """
    if sparse:
        raise NotImplementedError(
            "SparseDataFrame is not implemented. "
            + "To contribute to Modin, please visit "
            + "github.com/modin-project/modin."
        )
    if not isinstance(data, DataFrame):
        ErrorMessage.default_to_pandas("`get_dummies` on non-DataFrame")
        if isinstance(data, Series):
            data = data._to_pandas()
        return DataFrame(
            pandas.get_dummies(
                data,
                prefix=prefix,
                prefix_sep=prefix_sep,
                dummy_na=dummy_na,
                columns=columns,
                sparse=sparse,
                drop_first=drop_first,
                dtype=dtype,
            )
        )
    else:
        new_manager = data._query_compiler.get_dummies(
            columns,
            prefix=prefix,
            prefix_sep=prefix_sep,
            dummy_na=dummy_na,
            drop_first=drop_first,
            dtype=dtype,
        )
        return DataFrame(query_compiler=new_manager)


@_inherit_docstrings(pandas.melt, apilink="pandas.melt")
@logger_decorator("PANDAS-API", "melt", "info")
def melt(
    frame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
    ignore_index: bool = True,
):  # noqa: PR01, RT01, D200
    """
    Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.
    """
    return frame.melt(
        id_vars=id_vars,
        value_vars=value_vars,
        var_name=var_name,
        value_name=value_name,
        col_level=col_level,
        ignore_index=ignore_index,
    )


@_inherit_docstrings(pandas.crosstab, apilink="pandas.crosstab")
@logger_decorator("PANDAS-API", "crosstab", "info")
def crosstab(
    index,
    columns,
    values=None,
    rownames=None,
    colnames=None,
    aggfunc=None,
    margins=False,
    margins_name: str = "All",
    dropna: bool = True,
    normalize=False,
) -> DataFrame:  # noqa: PR01, RT01, D200
    """
    Compute a simple cross tabulation of two (or more) factors.
    """
    ErrorMessage.default_to_pandas("`crosstab`")
    pandas_crosstab = pandas.crosstab(
        index,
        columns,
        values,
        rownames,
        colnames,
        aggfunc,
        margins,
        margins_name,
        dropna,
        normalize,
    )
    return DataFrame(pandas_crosstab)


# Adding docstring since pandas docs don't have web section for this function.
@logger_decorator("PANDAS-API", "lreshape", "info")
def lreshape(data: DataFrame, groups, dropna=True, label=None):
    """
    Reshape wide-format data to long. Generalized inverse of ``DataFrame.pivot``.

    Accepts a dictionary, `groups`, in which each key is a new column name
    and each value is a list of old column names that will be "melted" under
    the new column name as part of the reshape.

    Parameters
    ----------
    data : DataFrame
        The wide-format DataFrame.
    groups : dict
        Dictionary in the form: `{new_name : list_of_columns}`.
    dropna : bool, default: True
        Whether include columns whose entries are all NaN or not.
    label : optional
        Deprecated parameter.

    Returns
    -------
    DataFrame
        Reshaped DataFrame.
    """
    if not isinstance(data, DataFrame):
        raise ValueError("can not lreshape with instance of type {}".format(type(data)))
    ErrorMessage.default_to_pandas("`lreshape`")
    return DataFrame(
        pandas.lreshape(to_pandas(data), groups, dropna=dropna, label=label)
    )


@_inherit_docstrings(pandas.wide_to_long, apilink="pandas.wide_to_long")
@logger_decorator("PANDAS-API", "wide_to_long", "info")
def wide_to_long(
    df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+"
) -> DataFrame:  # noqa: PR01, RT01, D200
    """
    Unpivot a DataFrame from wide to long format.
    """
    if not isinstance(df, DataFrame):
        raise ValueError(
            "can not wide_to_long with instance of type {}".format(type(df))
        )
    ErrorMessage.default_to_pandas("`wide_to_long`")
    return DataFrame(
        pandas.wide_to_long(to_pandas(df), stubnames, i, j, sep=sep, suffix=suffix)
    )


def _determine_name(objs: Iterable[BaseQueryCompiler], axis: Union[int, str]):
    """
    Determine names of index after concatenation along passed axis.

    Parameters
    ----------
    objs : iterable of QueryCompilers
        Objects to concatenate.
    axis : int or str
        The axis to concatenate along.

    Returns
    -------
    list with single element
        Computed index name, `None` if it could not be determined.
    """
    axis = pandas.DataFrame()._get_axis_number(axis)

    def get_names(obj):
        return obj.columns.names if axis else obj.index.names

    names = np.array([get_names(obj) for obj in objs])

    # saving old name, only if index names of all objs are the same
    if np.all(names == names[0]):
        # we must do this check to avoid this calls `list(str_like_name)`
        return list(names[0]) if is_list_like(names[0]) else [names[0]]
    else:
        return None