"""
SQL-style merge routines
"""
import copy
import string
import warnings
import numpy as np
from pandas._libs import hashtable as libhashtable, join as libjoin, lib
import pandas.compat as compat
from pandas.compat import filter, lzip, map, range, zip
from pandas.errors import MergeError
from pandas.util._decorators import Appender, Substitution
from pandas.core.dtypes.common import (
ensure_float64, ensure_int64, ensure_object, is_array_like, is_bool,
is_bool_dtype, is_categorical_dtype, is_datetime64_dtype,
is_datetime64tz_dtype, is_datetimelike, is_dtype_equal,
is_extension_array_dtype, is_float_dtype, is_int64_dtype, is_integer,
is_integer_dtype, is_list_like, is_number, is_numeric_dtype,
is_object_dtype, needs_i8_conversion)
from pandas.core.dtypes.missing import isnull, na_value_for_dtype
from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta
import pandas.core.algorithms as algos
from pandas.core.arrays.categorical import _recode_for_categories
import pandas.core.common as com
from pandas.core.frame import _merge_doc
from pandas.core.internals import (
concatenate_block_managers, items_overlap_with_suffix)
import pandas.core.sorting as sorting
from pandas.core.sorting import is_int64_overflow_possible
@Substitution('\nleft : DataFrame')
@Appender(_merge_doc, indents=0)
def merge(left, right, how='inner', on=None, left_on=None, right_on=None,
left_index=False, right_index=False, sort=False,
suffixes=('_x', '_y'), copy=True, indicator=False,
validate=None):
op = _MergeOperation(left, right, how=how, on=on, left_on=left_on,
right_on=right_on, left_index=left_index,
right_index=right_index, sort=sort, suffixes=suffixes,
copy=copy, indicator=indicator,
validate=validate)
return op.get_result()
if __debug__:
merge.__doc__ = _merge_doc % '\nleft : DataFrame'
def _groupby_and_merge(by, on, left, right, _merge_pieces,
check_duplicates=True):
"""
groupby & merge; we are always performing a left-by type operation
Parameters
----------
by: field to group
on: duplicates field
left: left frame
right: right frame
_merge_pieces: function for merging
check_duplicates: boolean, default True
should we check & clean duplicates
"""
pieces = []
if not isinstance(by, (list, tuple)):
by = [by]
lby = left.groupby(by, sort=False)
# if we can groupby the rhs
# then we can get vastly better perf
try:
# we will check & remove duplicates if indicated
if check_duplicates:
if on is None:
on = []
elif not isinstance(on, (list, tuple)):
on = [on]
if right.duplicated(by + on).any():
right = right.drop_duplicates(by + on, keep='last')
rby = right.groupby(by, sort=False)
except KeyError:
rby = None
for key, lhs in lby:
if rby is None:
rhs = right
else:
try:
rhs = right.take(rby.indices[key])
except KeyError:
# key doesn't exist in left
lcols = lhs.columns.tolist()
cols = lcols + [r for r in right.columns
if r not in set(lcols)]
merged = lhs.reindex(columns=cols)
merged.index = range(len(merged))
pieces.append(merged)
continue
merged = _merge_pieces(lhs, rhs)
# make sure join keys are in the merged
# TODO, should _merge_pieces do this?
for k in by:
try:
if k in merged:
merged[k] = key
except KeyError:
pass
pieces.append(merged)
# preserve the original order
# if we have a missing piece this can be reset
from pandas.core.reshape.concat import concat
result = concat(pieces, ignore_index=True)
result = result.reindex(columns=pieces[0].columns, copy=False)
return result, lby
def merge_ordered(left, right, on=None,
left_on=None, right_on=None,
left_by=None, right_by=None,
fill_method=None, suffixes=('_x', '_y'),
how='outer'):
"""Perform merge with optional filling/interpolation designed for ordered
data like time series data. Optionally perform group-wise merge (see
examples)
Parameters
----------
left : DataFrame
right : DataFrame
on : label or list
Field names to join on. Must be found in both DataFrames.
left_on : label or list, or array-like
Field names to join on in left DataFrame. Can be a vector or list of
vectors of the length of the DataFrame to use a particular vector as
the join key instead of columns
right_on : label or list, or array-like
Field names to join on in right DataFrame or vector/list of vectors per
left_on docs
left_by : column name or list of column names
Group left DataFrame by group columns and merge piece by piece with
right DataFrame
right_by : column name or list of column names
Group right DataFrame by group columns and merge piece by piece with
left DataFrame
fill_method : {'ffill', None}, default None
Interpolation method for data
suffixes : 2-length sequence (tuple, list, ...)
Suffix to apply to overlapping column names in the left and right
side, respectively
how : {'left', 'right', 'outer', 'inner'}, default 'outer'
* left: use only keys from left frame (SQL: left outer join)
* right: use only keys from right frame (SQL: right outer join)
* outer: use union of keys from both frames (SQL: full outer join)
* inner: use intersection of keys from both frames (SQL: inner join)
.. versionadded:: 0.19.0
Returns
-------
merged : DataFrame
The output type will the be same as 'left', if it is a subclass
of DataFrame.
See Also
--------
merge
merge_asof
Examples
--------
>>> A >>> B
key lvalue group key rvalue
0 a 1 a 0 b 1
1 c 2 a 1 c 2
2 e 3 a 2 d 3
3 a 1 b
4 c 2 b
5 e 3 b
>>> merge_ordered(A, B, fill_method='ffill', left_by='group')
group key lvalue rvalue
0 a a 1 NaN
1 a b 1 1.0
2 a c 2 2.0
3 a d 2 3.0
4 a e 3 3.0
5 b a 1 NaN
6 b b 1 1.0
7 b c 2 2.0
8 b d 2 3.0
9 b e 3 3.0
"""
def _merger(x, y):
# perform the ordered merge operation
op = _OrderedMerge(x, y, on=on, left_on=left_on, right_on=right_on,
suffixes=suffixes, fill_method=fill_method,
how=how)
return op.get_result()
if left_by is not None and right_by is not None:
raise ValueError('Can only group either left or right frames')
elif left_by is not None:
result, _ = _groupby_and_merge(left_by, on, left, right,
lambda x, y: _merger(x, y),
check_duplicates=False)
elif right_by is not None:
result, _ = _groupby_and_merge(right_by, on, right, left,
lambda x, y: _merger(y, x),
check_duplicates=False)
else:
result = _merger(left, right)
return result
def merge_asof(left, right, on=None,
left_on=None, right_on=None,
left_index=False, right_index=False,
by=None, left_by=None, right_by=None,
suffixes=('_x', '_y'),
tolerance=None,
allow_exact_matches=True,
direction='backward'):
"""Perform an asof merge. This is similar to a left-join except that we
match on nearest key rather than equal keys.
Both DataFrames must be sorted by the key.
For each row in the left DataFrame:
- A "backward" search selects the last row in the right DataFrame whose
'on' key is less than or equal to the left's key.
- A "forward" search selects the first row in the right DataFrame whose
'on' key is greater than or equal to the left's key.
- A "nearest" search selects the row in the right DataFrame whose 'on'
key is closest in absolute distance to the left's key.
The default is "backward" and is compatible in versions below 0.20.0.
The direction parameter was added in version 0.20.0 and introduces
"forward" and "nearest".
Optionally match on equivalent keys with 'by' before searching with 'on'.
.. versionadded:: 0.19.0
Parameters
----------
left : DataFrame
right : DataFrame
on : label
Field name to join on. Must be found in both DataFrames.
The data MUST be ordered. Furthermore this must be a numeric column,
such as datetimelike, integer, or float. On or left_on/right_on
must be given.
left_on : label
Field name to join on in left DataFrame.
right_on : label
Field name to join on in right DataFrame.
left_index : boolean
Use the index of the left DataFrame as the join key.
.. versionadded:: 0.19.2
right_index : boolean
Use the index of the right DataFrame as the join key.
.. versionadded:: 0.19.2
by : column name or list of column names
Match on these columns before performing merge operation.
left_by : column name
Field names to match on in the left DataFrame.
.. versionadded:: 0.19.2
right_by : column name
Field names to match on in the right DataFrame.
.. versionadded:: 0.19.2
suffixes : 2-length sequence (tuple, list, ...)
Suffix to apply to overlapping column names in the left and right
side, respectively.
tolerance : integer or Timedelta, optional, default None
Select asof tolerance within this range; must be compatible
with the merge index.
allow_exact_matches : boolean, default True
- If True, allow matching with the same 'on' value
(i.e. less-than-or-equal-to / greater-than-or-equal-to)
- If False, don't match the same 'on' value
(i.e., strictly less-than / strictly greater-than)
direction : 'backward' (default), 'forward', or 'nearest'
Whether to search for prior, subsequent, or closest matches.
.. versionadded:: 0.20.0
Returns
-------
merged : DataFrame
See Also
--------
merge
merge_ordered
Examples
--------
>>> left = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c']})
>>> left
a left_val
0 1 a
1 5 b
2 10 c
>>> right = pd.DataFrame({'a': [1, 2, 3, 6, 7],
... 'right_val': [1, 2, 3, 6, 7]})
>>> right
a right_val
0 1 1
1 2 2
2 3 3
3 6 6
4 7 7
>>> pd.merge_asof(left, right, on='a')
a left_val right_val
0 1 a 1
Loading ...