Learn more  » Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

agriconnect / pandas   python

Repository URL to install this package:

Version: 0.24.2 

/ tests / reshape / merge / test_multi.py

# pylint: disable=E1103

from collections import OrderedDict

import numpy as np
from numpy import nan
from numpy.random import randn
import pytest

import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series
from pandas.core.reshape.concat import concat
from pandas.core.reshape.merge import merge
import pandas.util.testing as tm


@pytest.fixture
def left():
    """left dataframe (not multi-indexed) for multi-index join tests"""
    # a little relevant example with NAs
    key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
            'qux', 'snap']
    key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
            'three', 'one']

    data = np.random.randn(len(key1))
    return DataFrame({'key1': key1, 'key2': key2, 'data': data})


@pytest.fixture
def right():
    """right dataframe (multi-indexed) for multi-index join tests"""
    index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                               ['one', 'two', 'three']],
                       codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                              [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                       names=['key1', 'key2'])

    return DataFrame(np.random.randn(10, 3), index=index,
                     columns=['j_one', 'j_two', 'j_three'])


@pytest.fixture
def left_multi():
    return (
        DataFrame(
            dict(Origin=['A', 'A', 'B', 'B', 'C'],
                 Destination=['A', 'B', 'A', 'C', 'A'],
                 Period=['AM', 'AM', 'IP', 'AM', 'OP'],
                 TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'],
                 Trips=[1987, 3647, 2470, 4296, 4444]),
            columns=['Origin', 'Destination', 'Period',
                     'TripPurp', 'Trips'])
        .set_index(['Origin', 'Destination', 'Period', 'TripPurp']))


@pytest.fixture
def right_multi():
    return (
        DataFrame(
            dict(Origin=['A', 'A', 'B', 'B', 'C', 'C', 'E'],
                 Destination=['A', 'B', 'A', 'B', 'A', 'B', 'F'],
                 Period=['AM', 'AM', 'IP', 'AM', 'OP', 'IP', 'AM'],
                 LinkType=['a', 'b', 'c', 'b', 'a', 'b', 'a'],
                 Distance=[100, 80, 90, 80, 75, 35, 55]),
            columns=['Origin', 'Destination', 'Period',
                     'LinkType', 'Distance'])
        .set_index(['Origin', 'Destination', 'Period', 'LinkType']))


@pytest.fixture
def on_cols_multi():
    return ['Origin', 'Destination', 'Period']


@pytest.fixture
def idx_cols_multi():
    return ['Origin', 'Destination', 'Period', 'TripPurp', 'LinkType']


class TestMergeMulti(object):

    def setup_method(self):
        self.index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                        ['one', 'two', 'three']],
                                codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                       [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                                names=['first', 'second'])
        self.to_join = DataFrame(np.random.randn(10, 3), index=self.index,
                                 columns=['j_one', 'j_two', 'j_three'])

        # a little relevant example with NAs
        key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
                'qux', 'snap']
        key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
                'three', 'one']

        data = np.random.randn(len(key1))
        self.data = DataFrame({'key1': key1, 'key2': key2,
                               'data': data})

    def test_merge_on_multikey(self, left, right, join_type):
        on_cols = ['key1', 'key2']
        result = (left.join(right, on=on_cols, how=join_type)
                  .reset_index(drop=True))

        expected = pd.merge(left, right.reset_index(),
                            on=on_cols, how=join_type)

        tm.assert_frame_equal(result, expected)

        result = (left.join(right, on=on_cols, how=join_type, sort=True)
                  .reset_index(drop=True))

        expected = pd.merge(left, right.reset_index(),
                            on=on_cols, how=join_type, sort=True)

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("sort", [False, True])
    def test_left_join_multi_index(self, left, right, sort):
        icols = ['1st', '2nd', '3rd']

        def bind_cols(df):
            iord = lambda a: 0 if a != a else ord(a)
            f = lambda ts: ts.map(iord) - ord('a')
            return (f(df['1st']) + f(df['3rd']) * 1e2 +
                    df['2nd'].fillna(0) * 1e4)

        def run_asserts(left, right, sort):
            res = left.join(right, on=icols, how='left', sort=sort)

            assert len(left) < len(res) + 1
            assert not res['4th'].isna().any()
            assert not res['5th'].isna().any()

            tm.assert_series_equal(
                res['4th'], - res['5th'], check_names=False)
            result = bind_cols(res.iloc[:, :-2])
            tm.assert_series_equal(res['4th'], result, check_names=False)
            assert result.name is None

            if sort:
                tm.assert_frame_equal(
                    res, res.sort_values(icols, kind='mergesort'))

            out = merge(left, right.reset_index(), on=icols,
                        sort=sort, how='left')

            res.index = np.arange(len(res))
            tm.assert_frame_equal(out, res)

        lc = list(map(chr, np.arange(ord('a'), ord('z') + 1)))
        left = DataFrame(np.random.choice(lc, (5000, 2)),
                         columns=['1st', '3rd'])
        left.insert(1, '2nd', np.random.randint(0, 1000, len(left)))

        i = np.random.permutation(len(left))
        right = left.iloc[i].copy()

        left['4th'] = bind_cols(left)
        right['5th'] = - bind_cols(right)
        right.set_index(icols, inplace=True)

        run_asserts(left, right, sort)

        # inject some nulls
        left.loc[1::23, '1st'] = np.nan
        left.loc[2::37, '2nd'] = np.nan
        left.loc[3::43, '3rd'] = np.nan
        left['4th'] = bind_cols(left)

        i = np.random.permutation(len(left))
        right = left.iloc[i, :-1]
        right['5th'] = - bind_cols(right)
        right.set_index(icols, inplace=True)

        run_asserts(left, right, sort)

    @pytest.mark.parametrize("sort", [False, True])
    def test_merge_right_vs_left(self, left, right, sort):
        # compare left vs right merge with multikey
        on_cols = ['key1', 'key2']
        merged_left_right = left.merge(right,
                                       left_on=on_cols, right_index=True,
                                       how='left', sort=sort)

        merge_right_left = right.merge(left,
                                       right_on=on_cols, left_index=True,
                                       how='right', sort=sort)

        # Reorder columns
        merge_right_left = merge_right_left[merged_left_right.columns]

        tm.assert_frame_equal(merged_left_right, merge_right_left)

    def test_compress_group_combinations(self):

        # ~ 40000000 possible unique groups
        key1 = tm.rands_array(10, 10000)
        key1 = np.tile(key1, 2)
        key2 = key1[::-1]

        df = DataFrame({'key1': key1, 'key2': key2,
                        'value1': np.random.randn(20000)})

        df2 = DataFrame({'key1': key1[::2], 'key2': key2[::2],
                         'value2': np.random.randn(10000)})

        # just to hit the label compression code path
        merge(df, df2, how='outer')

    def test_left_join_index_preserve_order(self):

        on_cols = ['k1', 'k2']
        left = DataFrame({'k1': [0, 1, 2] * 8,
                          'k2': ['foo', 'bar'] * 12,
                          'v': np.array(np.arange(24), dtype=np.int64)})

        index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
        right = DataFrame({'v2': [5, 7]}, index=index)

        result = left.join(right, on=on_cols)

        expected = left.copy()
        expected['v2'] = np.nan
        expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5
        expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7

        tm.assert_frame_equal(result, expected)

        result.sort_values(on_cols, kind='mergesort', inplace=True)
        expected = left.join(right, on=on_cols, sort=True)

        tm.assert_frame_equal(result, expected)

        # test join with multi dtypes blocks
        left = DataFrame({'k1': [0, 1, 2] * 8,
                          'k2': ['foo', 'bar'] * 12,
                          'k3': np.array([0, 1, 2] * 8, dtype=np.float32),
                          'v': np.array(np.arange(24), dtype=np.int32)})

        index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
        right = DataFrame({'v2': [5, 7]}, index=index)

        result = left.join(right, on=on_cols)

        expected = left.copy()
        expected['v2'] = np.nan
        expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5
        expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7

        tm.assert_frame_equal(result, expected)

        result = result.sort_values(on_cols, kind='mergesort')
        expected = left.join(right, on=on_cols, sort=True)

        tm.assert_frame_equal(result, expected)

    def test_left_join_index_multi_match_multiindex(self):
        left = DataFrame([
            ['X', 'Y', 'C', 'a'],
            ['W', 'Y', 'C', 'e'],
            ['V', 'Q', 'A', 'h'],
            ['V', 'R', 'D', 'i'],
            ['X', 'Y', 'D', 'b'],
            ['X', 'Y', 'A', 'c'],
            ['W', 'Q', 'B', 'f'],
            ['W', 'R', 'C', 'g'],
            ['V', 'Y', 'C', 'j'],
            ['X', 'Y', 'B', 'd']],
            columns=['cola', 'colb', 'colc', 'tag'],
            index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8])

        right = (DataFrame([
            ['W', 'R', 'C', 0],
            ['W', 'Q', 'B', 3],
            ['W', 'Q', 'B', 8],
            ['X', 'Y', 'A', 1],
            ['X', 'Y', 'A', 4],
            ['X', 'Y', 'B', 5],
            ['X', 'Y', 'C', 6],
            ['X', 'Y', 'C', 9],
            ['X', 'Q', 'C', -6],
            ['X', 'R', 'C', -9],
            ['V', 'Y', 'C', 7],
            ['V', 'R', 'D', 2],
            ['V', 'R', 'D', -1],
            ['V', 'Q', 'A', -3]],
            columns=['col1', 'col2', 'col3', 'val'])
            .set_index(['col1', 'col2', 'col3']))

        result = left.join(right, on=['cola', 'colb', 'colc'], how='left')

        expected = DataFrame([
            ['X', 'Y', 'C', 'a', 6],
            ['X', 'Y', 'C', 'a', 9],
            ['W', 'Y', 'C', 'e', nan],
            ['V', 'Q', 'A', 'h', -3],
            ['V', 'R', 'D', 'i', 2],
            ['V', 'R', 'D', 'i', -1],
            ['X', 'Y', 'D', 'b', nan],
            ['X', 'Y', 'A', 'c', 1],
            ['X', 'Y', 'A', 'c', 4],
            ['W', 'Q', 'B', 'f', 3],
            ['W', 'Q', 'B', 'f', 8],
            ['W', 'R', 'C', 'g', 0],
            ['V', 'Y', 'C', 'j', 7],
            ['X', 'Y', 'B', 'd', 5]],
            columns=['cola', 'colb', 'colc', 'tag', 'val'],
            index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8])

        tm.assert_frame_equal(result, expected)

        result = left.join(right, on=['cola', 'colb', 'colc'],
                           how='left', sort=True)

        expected = expected.sort_values(['cola', 'colb', 'colc'],
                                        kind='mergesort')

        tm.assert_frame_equal(result, expected)

    def test_left_join_index_multi_match(self):
        left = DataFrame([
            ['c', 0],
            ['b', 1],
            ['a', 2],
            ['b', 3]],
            columns=['tag', 'val'],
            index=[2, 0, 1, 3])

        right = (DataFrame([
            ['a', 'v'],
            ['c', 'w'],
            ['c', 'x'],
            ['d', 'y'],
            ['a', 'z'],
            ['c', 'r'],
            ['e', 'q'],
            ['c', 's']],
            columns=['tag', 'char'])
            .set_index('tag'))

        result = left.join(right, on='tag', how='left')
Loading ...