Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

agriconnect / pandas   python

Repository URL to install this package:

/ core / reshape / merge.py

"""
SQL-style merge routines
"""

import copy
import string
import warnings

import numpy as np

from pandas._libs import hashtable as libhashtable, join as libjoin, lib
import pandas.compat as compat
from pandas.compat import filter, lzip, map, range, zip
from pandas.errors import MergeError
from pandas.util._decorators import Appender, Substitution

from pandas.core.dtypes.common import (
    ensure_float64, ensure_int64, ensure_object, is_array_like, is_bool,
    is_bool_dtype, is_categorical_dtype, is_datetime64_dtype,
    is_datetime64tz_dtype, is_datetimelike, is_dtype_equal,
    is_extension_array_dtype, is_float_dtype, is_int64_dtype, is_integer,
    is_integer_dtype, is_list_like, is_number, is_numeric_dtype,
    is_object_dtype, needs_i8_conversion)
from pandas.core.dtypes.missing import isnull, na_value_for_dtype

from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta
import pandas.core.algorithms as algos
from pandas.core.arrays.categorical import _recode_for_categories
import pandas.core.common as com
from pandas.core.frame import _merge_doc
from pandas.core.internals import (
    concatenate_block_managers, items_overlap_with_suffix)
import pandas.core.sorting as sorting
from pandas.core.sorting import is_int64_overflow_possible


@Substitution('\nleft : DataFrame')
@Appender(_merge_doc, indents=0)
def merge(left, right, how='inner', on=None, left_on=None, right_on=None,
          left_index=False, right_index=False, sort=False,
          suffixes=('_x', '_y'), copy=True, indicator=False,
          validate=None):
    op = _MergeOperation(left, right, how=how, on=on, left_on=left_on,
                         right_on=right_on, left_index=left_index,
                         right_index=right_index, sort=sort, suffixes=suffixes,
                         copy=copy, indicator=indicator,
                         validate=validate)
    return op.get_result()


if __debug__:
    merge.__doc__ = _merge_doc % '\nleft : DataFrame'


def _groupby_and_merge(by, on, left, right, _merge_pieces,
                       check_duplicates=True):
    """
    groupby & merge; we are always performing a left-by type operation

    Parameters
    ----------
    by: field to group
    on: duplicates field
    left: left frame
    right: right frame
    _merge_pieces: function for merging
    check_duplicates: boolean, default True
        should we check & clean duplicates
    """

    pieces = []
    if not isinstance(by, (list, tuple)):
        by = [by]

    lby = left.groupby(by, sort=False)

    # if we can groupby the rhs
    # then we can get vastly better perf
    try:

        # we will check & remove duplicates if indicated
        if check_duplicates:
            if on is None:
                on = []
            elif not isinstance(on, (list, tuple)):
                on = [on]

            if right.duplicated(by + on).any():
                right = right.drop_duplicates(by + on, keep='last')
        rby = right.groupby(by, sort=False)
    except KeyError:
        rby = None

    for key, lhs in lby:

        if rby is None:
            rhs = right
        else:
            try:
                rhs = right.take(rby.indices[key])
            except KeyError:
                # key doesn't exist in left
                lcols = lhs.columns.tolist()
                cols = lcols + [r for r in right.columns
                                if r not in set(lcols)]
                merged = lhs.reindex(columns=cols)
                merged.index = range(len(merged))
                pieces.append(merged)
                continue

        merged = _merge_pieces(lhs, rhs)

        # make sure join keys are in the merged
        # TODO, should _merge_pieces do this?
        for k in by:
            try:
                if k in merged:
                    merged[k] = key
            except KeyError:
                pass

        pieces.append(merged)

    # preserve the original order
    # if we have a missing piece this can be reset
    from pandas.core.reshape.concat import concat
    result = concat(pieces, ignore_index=True)
    result = result.reindex(columns=pieces[0].columns, copy=False)
    return result, lby


def merge_ordered(left, right, on=None,
                  left_on=None, right_on=None,
                  left_by=None, right_by=None,
                  fill_method=None, suffixes=('_x', '_y'),
                  how='outer'):
    """Perform merge with optional filling/interpolation designed for ordered
    data like time series data. Optionally perform group-wise merge (see
    examples)

    Parameters
    ----------
    left : DataFrame
    right : DataFrame
    on : label or list
        Field names to join on. Must be found in both DataFrames.
    left_on : label or list, or array-like
        Field names to join on in left DataFrame. Can be a vector or list of
        vectors of the length of the DataFrame to use a particular vector as
        the join key instead of columns
    right_on : label or list, or array-like
        Field names to join on in right DataFrame or vector/list of vectors per
        left_on docs
    left_by : column name or list of column names
        Group left DataFrame by group columns and merge piece by piece with
        right DataFrame
    right_by : column name or list of column names
        Group right DataFrame by group columns and merge piece by piece with
        left DataFrame
    fill_method : {'ffill', None}, default None
        Interpolation method for data
    suffixes : 2-length sequence (tuple, list, ...)
        Suffix to apply to overlapping column names in the left and right
        side, respectively
    how : {'left', 'right', 'outer', 'inner'}, default 'outer'
        * left: use only keys from left frame (SQL: left outer join)
        * right: use only keys from right frame (SQL: right outer join)
        * outer: use union of keys from both frames (SQL: full outer join)
        * inner: use intersection of keys from both frames (SQL: inner join)

        .. versionadded:: 0.19.0

    Returns
    -------
    merged : DataFrame
        The output type will the be same as 'left', if it is a subclass
        of DataFrame.

    See Also
    --------
    merge
    merge_asof

    Examples
    --------
    >>> A                      >>> B
          key  lvalue group        key  rvalue
    0   a       1     a        0     b       1
    1   c       2     a        1     c       2
    2   e       3     a        2     d       3
    3   a       1     b
    4   c       2     b
    5   e       3     b

    >>> merge_ordered(A, B, fill_method='ffill', left_by='group')
      group key  lvalue  rvalue
    0     a   a       1     NaN
    1     a   b       1     1.0
    2     a   c       2     2.0
    3     a   d       2     3.0
    4     a   e       3     3.0
    5     b   a       1     NaN
    6     b   b       1     1.0
    7     b   c       2     2.0
    8     b   d       2     3.0
    9     b   e       3     3.0
    """
    def _merger(x, y):
        # perform the ordered merge operation
        op = _OrderedMerge(x, y, on=on, left_on=left_on, right_on=right_on,
                           suffixes=suffixes, fill_method=fill_method,
                           how=how)
        return op.get_result()

    if left_by is not None and right_by is not None:
        raise ValueError('Can only group either left or right frames')
    elif left_by is not None:
        result, _ = _groupby_and_merge(left_by, on, left, right,
                                       lambda x, y: _merger(x, y),
                                       check_duplicates=False)
    elif right_by is not None:
        result, _ = _groupby_and_merge(right_by, on, right, left,
                                       lambda x, y: _merger(y, x),
                                       check_duplicates=False)
    else:
        result = _merger(left, right)
    return result


def merge_asof(left, right, on=None,
               left_on=None, right_on=None,
               left_index=False, right_index=False,
               by=None, left_by=None, right_by=None,
               suffixes=('_x', '_y'),
               tolerance=None,
               allow_exact_matches=True,
               direction='backward'):
    """Perform an asof merge. This is similar to a left-join except that we
    match on nearest key rather than equal keys.

    Both DataFrames must be sorted by the key.

    For each row in the left DataFrame:

      - A "backward" search selects the last row in the right DataFrame whose
        'on' key is less than or equal to the left's key.

      - A "forward" search selects the first row in the right DataFrame whose
        'on' key is greater than or equal to the left's key.

      - A "nearest" search selects the row in the right DataFrame whose 'on'
        key is closest in absolute distance to the left's key.

    The default is "backward" and is compatible in versions below 0.20.0.
    The direction parameter was added in version 0.20.0 and introduces
    "forward" and "nearest".

    Optionally match on equivalent keys with 'by' before searching with 'on'.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    left : DataFrame
    right : DataFrame
    on : label
        Field name to join on. Must be found in both DataFrames.
        The data MUST be ordered. Furthermore this must be a numeric column,
        such as datetimelike, integer, or float. On or left_on/right_on
        must be given.
    left_on : label
        Field name to join on in left DataFrame.
    right_on : label
        Field name to join on in right DataFrame.
    left_index : boolean
        Use the index of the left DataFrame as the join key.

        .. versionadded:: 0.19.2

    right_index : boolean
        Use the index of the right DataFrame as the join key.

        .. versionadded:: 0.19.2

    by : column name or list of column names
        Match on these columns before performing merge operation.
    left_by : column name
        Field names to match on in the left DataFrame.

        .. versionadded:: 0.19.2

    right_by : column name
        Field names to match on in the right DataFrame.

        .. versionadded:: 0.19.2

    suffixes : 2-length sequence (tuple, list, ...)
        Suffix to apply to overlapping column names in the left and right
        side, respectively.
    tolerance : integer or Timedelta, optional, default None
        Select asof tolerance within this range; must be compatible
        with the merge index.
    allow_exact_matches : boolean, default True

        - If True, allow matching with the same 'on' value
          (i.e. less-than-or-equal-to / greater-than-or-equal-to)
        - If False, don't match the same 'on' value
          (i.e., strictly less-than / strictly greater-than)

    direction : 'backward' (default), 'forward', or 'nearest'
        Whether to search for prior, subsequent, or closest matches.

        .. versionadded:: 0.20.0

    Returns
    -------
    merged : DataFrame

    See Also
    --------
    merge
    merge_ordered

    Examples
    --------
    >>> left = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c']})
    >>> left
        a left_val
    0   1        a
    1   5        b
    2  10        c

    >>> right = pd.DataFrame({'a': [1, 2, 3, 6, 7],
    ...                       'right_val': [1, 2, 3, 6, 7]})
    >>> right
       a  right_val
    0  1          1
    1  2          2
    2  3          3
    3  6          6
    4  7          7

    >>> pd.merge_asof(left, right, on='a')
        a left_val  right_val
    0   1        a          1
Loading ...