# -*- coding: utf-8 -*-
# pylint: disable-msg=W0612,E1101
import numpy as np
from numpy import nan
import pytest
from pandas.compat import range
import pandas as pd
from pandas import DataFrame, lreshape, melt, wide_to_long
import pandas.util.testing as tm
class TestMelt(object):
    """Tests for the top-level ``melt`` function and ``DataFrame.melt``."""

    def setup_method(self, method):
        """Create the frames and custom column names shared by the tests.

        ``self.df`` is the first ten rows of ``tm.makeTimeDataFrame()`` with
        two extra int64 columns: ``id1`` flags ``A > 0`` and ``id2`` flags
        ``B > 0``.  ``self.df1`` is a 3x3 frame whose columns carry two
        levels named ``CAP`` and ``low``.
        """
        self.var_name = 'var'
        self.value_name = 'val'

        frame = tm.makeTimeDataFrame()[:10]
        frame['id1'] = (frame['A'] > 0).astype(np.int64)
        frame['id2'] = (frame['B'] > 0).astype(np.int64)
        self.df = frame

        rows = [[1.067683, -1.110463, 0.20867],
                [-1.321405, 0.368915, -1.055342],
                [-0.807333, 0.08298, -0.873361]]
        multi = pd.DataFrame(rows)
        multi.columns = [list('ABC'), list('abc')]
        multi.columns.names = ['CAP', 'low']
        self.df1 = multi

    def _melted_ab(self, var_name='variable', value_name='value'):
        """Build the frame expected from melting ``A`` and ``B`` of ``self.df``.

        ``id1``/``id2`` are repeated twice, the variable column holds ten
        ``'A'`` labels followed by ten ``'B'`` labels, and the value column
        holds the ``A`` values followed by the ``B`` values.
        """
        source = self.df
        data = {'id1': source['id1'].tolist() * 2,
                'id2': source['id2'].tolist() * 2,
                var_name: ['A'] * 10 + ['B'] * 10,
                value_name: source['A'].tolist() + source['B'].tolist()}
        return DataFrame(data,
                         columns=['id1', 'id2', var_name, value_name])

    def test_top_level_method(self):
        """``melt(df)`` yields the default ``variable``/``value`` columns."""
        melted = melt(self.df)
        assert melted.columns.tolist() == ['variable', 'value']

    def test_method_signatures(self):
        """``DataFrame.melt`` and ``melt`` agree for the same arguments."""
        cases = [
            (self.df, {}),
            (self.df, {'id_vars': ['id1', 'id2'],
                       'value_vars': ['A', 'B']}),
            (self.df, {'var_name': self.var_name,
                       'value_name': self.value_name}),
            (self.df1, {'col_level': 0}),
        ]
        for frame, kwargs in cases:
            via_method = frame.melt(**kwargs)
            via_function = melt(frame, **kwargs)
            tm.assert_frame_equal(via_method, via_function)

    def test_default_col_names(self):
        """Default output columns are the id columns, then variable/value."""
        cases = [
            ({}, ['variable', 'value']),
            ({'id_vars': ['id1']}, ['id1', 'variable', 'value']),
            ({'id_vars': ['id1', 'id2']},
             ['id1', 'id2', 'variable', 'value']),
        ]
        for kwargs, columns in cases:
            melted = self.df.melt(**kwargs)
            assert melted.columns.tolist() == columns

    def test_value_vars(self):
        """``value_vars`` accepts a single label or a list of labels."""
        only_a = self.df.melt(id_vars=['id1', 'id2'], value_vars='A')
        assert len(only_a) == 10

        a_and_b = self.df.melt(id_vars=['id1', 'id2'],
                               value_vars=['A', 'B'])
        tm.assert_frame_equal(a_and_b, self._melted_ab())

    def test_value_vars_types(self):
        """``value_vars`` given as tuple, list or ndarray melts the same."""
        # GH 15348
        expected = self._melted_ab()
        for container in (tuple, list, np.array):
            value_vars = container(('A', 'B'))
            result = self.df.melt(id_vars=['id1', 'id2'],
                                  value_vars=value_vars)
            tm.assert_frame_equal(result, expected)

    def test_vars_work_with_multiindex(self):
        """Lists of tuples select columns when the columns are two-level."""
        id_col = ('A', 'a')
        value_col = ('B', 'b')
        n_rows = len(self.df1)
        expected = DataFrame({
            id_col: self.df1[id_col],
            'CAP': ['B'] * n_rows,
            'low': ['b'] * n_rows,
            'value': self.df1[value_col],
        }, columns=[id_col, 'CAP', 'low', 'value'])

        result = self.df1.melt(id_vars=[id_col], value_vars=[value_col])
        tm.assert_frame_equal(result, expected)

    def test_single_vars_work_with_multiindex(self):
        """Plain labels work on two-level columns when ``col_level=0``."""
        data = {
            'A': {0: 1.067683, 1: -1.321405, 2: -0.807333},
            'CAP': {0: 'B', 1: 'B', 2: 'B'},
            'value': {0: -1.110463, 1: 0.368915, 2: 0.08298}}
        expected = DataFrame(data)

        result = self.df1.melt(['A'], ['B'], col_level=0)
        tm.assert_frame_equal(result, expected)

    def test_tuple_vars_fail_with_multiindex(self):
        """A bare tuple for id_vars/value_vars raises on two-level columns."""
        # melt should fail with an informative error message if
        # the columns have a MultiIndex and a tuple is passed
        # for id_vars or value_vars.
        msg = (r"(id|value)_vars must be a list of tuples when columns are"
               " a MultiIndex")
        id_tuple = ('A', 'a')
        value_tuple = ('B', 'b')
        bad_combinations = [
            (id_tuple, [value_tuple]),
            ([id_tuple], value_tuple),
            (id_tuple, value_tuple),
        ]
        for id_vars, value_vars in bad_combinations:
            with pytest.raises(ValueError, match=msg):
                self.df1.melt(id_vars=id_vars, value_vars=value_vars)

    def test_custom_var_name(self):
        """``var_name`` replaces the default ``variable`` column name."""
        name = self.var_name
        column_cases = [
            ({}, ['var', 'value']),
            ({'id_vars': ['id1']}, ['id1', 'var', 'value']),
            ({'id_vars': ['id1', 'id2']}, ['id1', 'id2', 'var', 'value']),
            ({'id_vars': ['id1', 'id2'], 'value_vars': 'A'},
             ['id1', 'id2', 'var', 'value']),
        ]
        for kwargs, columns in column_cases:
            melted = self.df.melt(var_name=name, **kwargs)
            assert melted.columns.tolist() == columns

        full = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
                            var_name=name)
        tm.assert_frame_equal(full, self._melted_ab(var_name=name))

    def test_custom_value_name(self):
        """``value_name`` replaces the default ``value`` column name."""
        name = self.value_name
        column_cases = [
            ({}, ['variable', 'val']),
            ({'id_vars': ['id1']}, ['id1', 'variable', 'val']),
            ({'id_vars': ['id1', 'id2']},
             ['id1', 'id2', 'variable', 'val']),
            ({'id_vars': ['id1', 'id2'], 'value_vars': 'A'},
             ['id1', 'id2', 'variable', 'val']),
        ]
        for kwargs, columns in column_cases:
            melted = self.df.melt(value_name=name, **kwargs)
            assert melted.columns.tolist() == columns

        full = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
                            value_name=name)
        tm.assert_frame_equal(full, self._melted_ab(value_name=name))

    def test_custom_var_and_value_name(self):
        """Both custom names apply together; a columns name is also used."""
        names = {'var_name': self.var_name, 'value_name': self.value_name}
        column_cases = [
            ({}, ['var', 'val']),
            ({'id_vars': ['id1']}, ['id1', 'var', 'val']),
            ({'id_vars': ['id1', 'id2']}, ['id1', 'id2', 'var', 'val']),
            ({'id_vars': ['id1', 'id2'], 'value_vars': 'A'},
             ['id1', 'id2', 'var', 'val']),
        ]
        for kwargs, columns in column_cases:
            melted = self.df.melt(var_name=self.var_name,
                                  value_name=self.value_name, **kwargs)
            assert melted.columns.tolist() == columns

        full = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
                            var_name=self.var_name,
                            value_name=self.value_name)
        tm.assert_frame_equal(full, self._melted_ab(**names))

        # With no var_name given, the name of the columns index is used
        # for the variable column.
        named = self.df.copy()
        named.columns.name = 'foo'
        assert named.melt().columns.tolist() == ['foo', 'value']

    def test_col_level(self):
        """``col_level`` may be given by position or by level name."""
        by_position = self.df1.melt(col_level=0)
        by_name = self.df1.melt(col_level='CAP')
        for melted in (by_position, by_name):
            assert melted.columns.tolist() == ['CAP', 'value']

    def test_multiindex(self):
        """Melting two-level columns yields one variable column per level."""
        melted = self.df1.melt()
        assert melted.columns.tolist() == ['CAP', 'low', 'value']

    @pytest.mark.parametrize("col", [
        pd.Series(pd.date_range('2010', periods=5, tz='US/Pacific')),
        pd.Series(["a", "b", "c", "a", "d"], dtype="category"),
        pd.Series([0, 1, 0, 0, 0])])
    def test_pandas_dtypes(self, col):
        """Melt a frame where ``col`` is both an id and a value column."""
        # GH 15785
        frame = DataFrame({'klass': range(5),
                           'col': col,
                           'attr1': [1, 0, 0, 0, 0],
                           'attr2': col})
        result = melt(frame, id_vars=['klass', 'col'],
                      var_name='attribute', value_name='value')

        stacked_ids = pd.concat([col] * 2, ignore_index=True)
        stacked_values = pd.concat([pd.Series([1, 0, 0, 0, 0]), col],
                                   ignore_index=True)
        # Integer keys are used first and the real labels assigned after.
        expected = DataFrame({0: list(range(5)) * 2,
                              1: stacked_ids,
                              2: ['attr1'] * 5 + ['attr2'] * 5,
                              3: stacked_values})
        expected.columns = ['klass', 'col', 'attribute', 'value']
        tm.assert_frame_equal(result, expected)

    def test_melt_missing_columns_raises(self):
        """Melting with labels absent from the frame raises ``KeyError``."""
        # GH-23575
        # This test is to ensure that pandas raises an error if melting is
        # attempted with column names absent from the dataframe
        msg = "The following '{Var}' are not present in the DataFrame: {Col}"

        def pattern(var, col):
            # Regex expected in the error for the given argument name.
            return msg.format(Var=var, Col=col)

        # Generate data
        df = pd.DataFrame(np.random.randn(5, 4), columns=list('abcd'))

        # Try to melt with missing `value_vars` column name
        with pytest.raises(KeyError,
                           match=pattern('value_vars', "\\['C'\\]")):
            df.melt(['a', 'b'], ['C', 'd'])

        # Try to melt with missing `id_vars` column name
        with pytest.raises(KeyError,
                           match=pattern('id_vars', "\\['A'\\]")):
            df.melt(['A', 'b'], ['c', 'd'])

        # Multiple missing
        with pytest.raises(KeyError,
                           match=pattern('id_vars',
                                         "\\['not_here', 'or_there'\\]")):
            df.melt(['a', 'b', 'not_here', 'or_there'], ['c', 'd'])

        # Multiindex melt fails if column is missing from multilevel melt
        multi = df.copy()
        multi.columns = [list('ABCD'), list('abcd')]
        with pytest.raises(KeyError,
                           match=pattern('id_vars', "\\['E'\\]")):
            multi.melt([('E', 'a')], [('B', 'b')])

        # Multiindex fails if column is missing from single level melt
        with pytest.raises(KeyError,
                           match=pattern('value_vars', "\\['F'\\]")):
            multi.melt(['A'], ['F'], col_level=0)
class TestLreshape(object):
def test_pairs(self):
data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', '21dec2008',
'11jan2009'],
'birthwt': [1766, 3301, 1454, 3139, 4133],
'id': [101, 102, 103, 104, 105],
'sex': ['Male', 'Female', 'Female', 'Female', 'Female'],
'visitdt1': ['11jan2009', '22dec2008', '04jan2009',
'29dec2008', '20jan2009'],
'visitdt2':
['21jan2009', nan, '22jan2009', '31dec2008', '03feb2009'],
'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'],
'wt1': [1823, 3338, 1549, 3298, 4306],
'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0],
'wt3': [2293.0, nan, nan, 3377.0, 4805.0]}
df = DataFrame(data)
spec = {'visitdt': ['visitdt%d' % i for i in range(1, 4)],
'wt': ['wt%d' % i for i in range(1, 4)]}
result = lreshape(df, spec)
exp_data = {'birthdt':
['08jan2009', '20dec2008', '30dec2008', '21dec2008',
'11jan2009', '08jan2009', '30dec2008', '21dec2008',
'11jan2009', '08jan2009', '21dec2008', '11jan2009'],
'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 1454, 3139,
4133, 1766, 3139, 4133],
'id': [101, 102, 103, 104, 105, 101, 103, 104, 105, 101,
104, 105],
'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
'Male', 'Female', 'Female', 'Female', 'Male',
'Female', 'Female'],
'visitdt': ['11jan2009', '22dec2008', '04jan2009',
'29dec2008', '20jan2009', '21jan2009',
'22jan2009', '31dec2008', '03feb2009',
'05feb2009', '02jan2009', '15feb2009'],
'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0,
1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0]}
exp = DataFrame(exp_data, columns=result.columns)
tm.assert_frame_equal(result, exp)
result = lreshape(df, spec, dropna=False)
exp_data = {'birthdt':
['08jan2009', '20dec2008', '30dec2008', '21dec2008',
'11jan2009', '08jan2009', '20dec2008', '30dec2008',
'21dec2008', '11jan2009', '08jan2009', '20dec2008',
'30dec2008', '21dec2008', '11jan2009'],
'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 3301, 1454,
3139, 4133, 1766, 3301, 1454, 3139, 4133],
'id': [101, 102, 103, 104, 105, 101, 102, 103, 104, 105,
101, 102, 103, 104, 105],
'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
'Male', 'Female', 'Female', 'Female', 'Female',
'Male', 'Female', 'Female', 'Female', 'Female'],
'visitdt': ['11jan2009', '22dec2008', '04jan2009',
'29dec2008', '20jan2009', '21jan2009', nan,
Loading ...