Repository URL to install this package:
|
Version:
0.11.1 ▾
|
from statsmodels.compat.python import iterkeys, lrange, iteritems
from collections import OrderedDict
from io import BytesIO
from itertools import product
import numpy as np
import pandas as pd
import pytest
from numpy.testing import assert_, assert_raises
from statsmodels.api import datasets
# utilities for the tests
try:
import matplotlib.pyplot as plt # noqa:F401
except ImportError:
pass
# the main drawing function
from statsmodels.graphics.mosaicplot import mosaic
# other functions to be tested for accuracy
from statsmodels.graphics.mosaicplot import _hierarchical_split
from statsmodels.graphics.mosaicplot import _reduce_dict
from statsmodels.graphics.mosaicplot import _key_splitting
from statsmodels.graphics.mosaicplot import _normalize_split
from statsmodels.graphics.mosaicplot import _split_rect
@pytest.mark.matplotlib
def test_data_conversion(close_figures):
# It will not reorder the elements
# so the dictionary will look odd
# as it key order has the c and b
# keys swapped
import pandas
_, ax = plt.subplots(4, 4)
data = {'ax': 1, 'bx': 2, 'cx': 3}
mosaic(data, ax=ax[0, 0], title='basic dict', axes_label=False)
data = pandas.Series(data)
mosaic(data, ax=ax[0, 1], title='basic series', axes_label=False)
data = [1, 2, 3]
mosaic(data, ax=ax[0, 2], title='basic list', axes_label=False)
data = np.asarray(data)
mosaic(data, ax=ax[0, 3], title='basic array', axes_label=False)
data = {('ax', 'cx'): 1, ('bx', 'cx'): 2, ('ax', 'dx'): 3, ('bx', 'dx'): 4}
mosaic(data, ax=ax[1, 0], title='compound dict', axes_label=False)
mosaic(data, ax=ax[2, 0], title='inverted keys dict', index=[1, 0], axes_label=False)
data = pandas.Series(data)
mosaic(data, ax=ax[1, 1], title='compound series', axes_label=False)
mosaic(data, ax=ax[2, 1], title='inverted keys series', index=[1, 0])
data = [[1, 2], [3, 4]]
mosaic(data, ax=ax[1, 2], title='compound list', axes_label=False)
mosaic(data, ax=ax[2, 2], title='inverted keys list', index=[1, 0])
data = np.array([[1, 2], [3, 4]])
mosaic(data, ax=ax[1, 3], title='compound array', axes_label=False)
mosaic(data, ax=ax[2, 3], title='inverted keys array', index=[1, 0], axes_label=False)
gender = ['male', 'male', 'male', 'female', 'female', 'female']
pet = ['cat', 'dog', 'dog', 'cat', 'dog', 'cat']
data = pandas.DataFrame({'gender': gender, 'pet': pet})
mosaic(data, ['gender'], ax=ax[3, 0], title='dataframe by key 1', axes_label=False)
mosaic(data, ['pet'], ax=ax[3, 1], title='dataframe by key 2', axes_label=False)
mosaic(data, ['gender', 'pet'], ax=ax[3, 2], title='both keys', axes_label=False)
mosaic(data, ['pet', 'gender'], ax=ax[3, 3], title='keys inverted', axes_label=False)
plt.suptitle('testing data conversion (plot 1 of 4)')
@pytest.mark.matplotlib
def test_mosaic_simple(close_figures):
# display a simple plot of 4 categories of data, splitted in four
# levels with increasing size for each group
# creation of the levels
key_set = (['male', 'female'], ['old', 'adult', 'young'],
['worker', 'unemployed'], ['healty', 'ill'])
# the cartesian product of all the categories is
# the complete set of categories
keys = list(product(*key_set))
data = OrderedDict(zip(keys, range(1, 1 + len(keys))))
# which colours should I use for the various categories?
# put it into a dict
props = {}
#males and females in blue and red
props[('male',)] = {'color': 'b'}
props[('female',)] = {'color': 'r'}
# all the groups corresponding to ill groups have a different color
for key in keys:
if 'ill' in key:
if 'male' in key:
props[key] = {'color': 'BlueViolet' , 'hatch': '+'}
else:
props[key] = {'color': 'Crimson' , 'hatch': '+'}
# mosaic of the data, with given gaps and colors
mosaic(data, gap=0.05, properties=props, axes_label=False)
plt.suptitle('syntetic data, 4 categories (plot 2 of 4)')
@pytest.mark.matplotlib
def test_mosaic(close_figures):
# make the same analysis on a known dataset
# load the data and clean it a bit
affairs = datasets.fair.load_pandas()
datas = affairs.exog
# any time greater than 0 is cheating
datas['cheated'] = affairs.endog > 0
# sort by the marriage quality and give meaningful name
# [rate_marriage, age, yrs_married, children,
# religious, educ, occupation, occupation_husb]
datas = datas.sort_values(['rate_marriage', 'religious'])
num_to_desc = {1: 'awful', 2: 'bad', 3: 'intermediate',
4: 'good', 5: 'wonderful'}
datas['rate_marriage'] = datas['rate_marriage'].map(num_to_desc)
num_to_faith = {1: 'non religious', 2: 'poorly religious', 3: 'religious',
4: 'very religious'}
datas['religious'] = datas['religious'].map(num_to_faith)
num_to_cheat = {False: 'faithful', True: 'cheated'}
datas['cheated'] = datas['cheated'].map(num_to_cheat)
# finished cleaning
_, ax = plt.subplots(2, 2)
mosaic(datas, ['rate_marriage', 'cheated'], ax=ax[0, 0],
title='by marriage happiness')
mosaic(datas, ['religious', 'cheated'], ax=ax[0, 1],
title='by religiosity')
mosaic(datas, ['rate_marriage', 'religious', 'cheated'], ax=ax[1, 0],
title='by both', labelizer=lambda k:'')
ax[1, 0].set_xlabel('marriage rating')
ax[1, 0].set_ylabel('religion status')
mosaic(datas, ['religious', 'rate_marriage'], ax=ax[1, 1],
title='inter-dependence', axes_label=False)
plt.suptitle("extramarital affairs (plot 3 of 4)")
@pytest.mark.matplotlib
def test_mosaic_very_complex(close_figures):
# make a scattermatrix of mosaic plots to show the correlations between
# each pair of variable in a dataset. Could be easily converted into a
# new function that does this automatically based on the type of data
key_name = ['gender', 'age', 'health', 'work']
key_base = (['male', 'female'], ['old', 'young'],
['healty', 'ill'], ['work', 'unemployed'])
keys = list(product(*key_base))
data = OrderedDict(zip(keys, range(1, 1 + len(keys))))
props = {}
props[('male', 'old')] = {'color': 'r'}
props[('female',)] = {'color': 'pink'}
L = len(key_base)
_, axes = plt.subplots(L, L)
for i in range(L):
for j in range(L):
m = set(range(L)).difference(set((i, j)))
if i == j:
axes[i, i].text(0.5, 0.5, key_name[i],
ha='center', va='center')
axes[i, i].set_xticks([])
axes[i, i].set_xticklabels([])
axes[i, i].set_yticks([])
axes[i, i].set_yticklabels([])
else:
ji = max(i, j)
ij = min(i, j)
temp_data = OrderedDict([((k[ij], k[ji]) + tuple(k[r] for r in m), v)
for k, v in iteritems(data)])
keys = list(iterkeys(temp_data))
for k in keys:
value = _reduce_dict(temp_data, k[:2])
temp_data[k[:2]] = value
del temp_data[k]
mosaic(temp_data, ax=axes[i, j], axes_label=False,
properties=props, gap=0.05, horizontal=i > j)
plt.suptitle('old males should look bright red, (plot 4 of 4)')
@pytest.mark.matplotlib
def test_axes_labeling(close_figures):
from numpy.random import rand
key_set = (['male', 'female'], ['old', 'adult', 'young'],
['worker', 'unemployed'], ['yes', 'no'])
# the cartesian product of all the categories is
# the complete set of categories
keys = list(product(*key_set))
data = OrderedDict(zip(keys, rand(len(keys))))
lab = lambda k: ''.join(s[0] for s in k)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
mosaic(data, ax=ax1, labelizer=lab, horizontal=True, label_rotation=45)
mosaic(data, ax=ax2, labelizer=lab, horizontal=False,
label_rotation=[0, 45, 90, 0])
#fig.tight_layout()
fig.suptitle("correct alignment of the axes labels")
@pytest.mark.smoke
@pytest.mark.matplotlib
def test_mosaic_empty_cells(close_figures):
# GH#2286
import pandas as pd
mydata = pd.DataFrame({'id2': {64: 'Angelica',
65: 'DXW_UID', 66: 'casuid01',
67: 'casuid01', 68: 'EC93_uid',
69: 'EC93_uid', 70: 'EC93_uid',
60: 'DXW_UID', 61: 'AtmosFox',
62: 'DXW_UID', 63: 'DXW_UID'},
'id1': {64: 'TGP',
65: 'Retention01', 66: 'default',
67: 'default', 68: 'Musa_EC_9_3',
69: 'Musa_EC_9_3', 70: 'Musa_EC_9_3',
60: 'default', 61: 'default',
62: 'default', 63: 'default'}})
ct = pd.crosstab(mydata.id1, mydata.id2)
_, vals = mosaic(ct.T.unstack())
_, vals = mosaic(mydata, ['id1','id2'])
eq = lambda x, y: assert_(np.allclose(x, y))
def test_recursive_split():
keys = list(product('mf'))
data = OrderedDict(zip(keys, [1] * len(keys)))
res = _hierarchical_split(data, gap=0)
assert_(list(iterkeys(res)) == keys)
res[('m',)] = (0.0, 0.0, 0.5, 1.0)
res[('f',)] = (0.5, 0.0, 0.5, 1.0)
keys = list(product('mf', 'yao'))
data = OrderedDict(zip(keys, [1] * len(keys)))
res = _hierarchical_split(data, gap=0)
assert_(list(iterkeys(res)) == keys)
res[('m', 'y')] = (0.0, 0.0, 0.5, 1 / 3)
res[('m', 'a')] = (0.0, 1 / 3, 0.5, 1 / 3)
res[('m', 'o')] = (0.0, 2 / 3, 0.5, 1 / 3)
res[('f', 'y')] = (0.5, 0.0, 0.5, 1 / 3)
res[('f', 'a')] = (0.5, 1 / 3, 0.5, 1 / 3)
res[('f', 'o')] = (0.5, 2 / 3, 0.5, 1 / 3)
def test__reduce_dict():
data = OrderedDict(zip(list(product('mf', 'oy', 'wn')), [1] * 8))
eq(_reduce_dict(data, ('m',)), 4)
eq(_reduce_dict(data, ('m', 'o')), 2)
eq(_reduce_dict(data, ('m', 'o', 'w')), 1)
data = OrderedDict(zip(list(product('mf', 'oy', 'wn')), lrange(8)))
eq(_reduce_dict(data, ('m',)), 6)
eq(_reduce_dict(data, ('m', 'o')), 1)
eq(_reduce_dict(data, ('m', 'o', 'w')), 0)
def test__key_splitting():
# subdivide starting with an empty tuple
base_rect = {tuple(): (0, 0, 1, 1)}
res = _key_splitting(base_rect, ['a', 'b'], [1, 1], tuple(), True, 0)
assert_(list(iterkeys(res)) == [('a',), ('b',)])
eq(res[('a',)], (0, 0, 0.5, 1))
eq(res[('b',)], (0.5, 0, 0.5, 1))
# subdivide a in two sublevel
res_bis = _key_splitting(res, ['c', 'd'], [1, 1], ('a',), False, 0)
assert_(list(iterkeys(res_bis)) == [('a', 'c'), ('a', 'd'), ('b',)])
eq(res_bis[('a', 'c')], (0.0, 0.0, 0.5, 0.5))
eq(res_bis[('a', 'd')], (0.0, 0.5, 0.5, 0.5))
eq(res_bis[('b',)], (0.5, 0, 0.5, 1))
# starting with a non empty tuple and uneven distribution
base_rect = {('total',): (0, 0, 1, 1)}
res = _key_splitting(base_rect, ['a', 'b'], [1, 2], ('total',), True, 0)
assert_(list(iterkeys(res)) == [('total',) + (e,) for e in ['a', 'b']])
eq(res[('total', 'a')], (0, 0, 1 / 3, 1))
eq(res[('total', 'b')], (1 / 3, 0, 2 / 3, 1))
def test_proportion_normalization():
# extremes should give the whole set, as well
# as if 0 is inserted
eq(_normalize_split(0.), [0.0, 0.0, 1.0])
eq(_normalize_split(1.), [0.0, 1.0, 1.0])
eq(_normalize_split(2.), [0.0, 1.0, 1.0])
# negative values should raise ValueError
assert_raises(ValueError, _normalize_split, -1)
assert_raises(ValueError, _normalize_split, [1., -1])
assert_raises(ValueError, _normalize_split, [1., -1, 0.])
# if everything is zero it will complain
assert_raises(ValueError, _normalize_split, [0.])
assert_raises(ValueError, _normalize_split, [0., 0.])
# one-element array should return the whole interval
eq(_normalize_split([0.5]), [0.0, 1.0])
eq(_normalize_split([1.]), [0.0, 1.0])
eq(_normalize_split([2.]), [0.0, 1.0])
# simple division should give two pieces
for x in [0.3, 0.5, 0.9]:
eq(_normalize_split(x), [0., x, 1.0])
# multiple division should split as the sum of the components
for x, y in [(0.25, 0.5), (0.1, 0.8), (10., 30.)]:
eq(_normalize_split([x, y]), [0., x / (x + y), 1.0])
for x, y, z in [(1., 1., 1.), (0.1, 0.5, 0.7), (10., 30., 40)]:
eq(_normalize_split(
[x, y, z]), [0., x / (x + y + z), (x + y) / (x + y + z), 1.0])
def test_false_split():
# if you ask it to be divided in only one piece, just return the original
# one
pure_square = [0., 0., 1., 1.]
conf_h = dict(proportion=[1], gap=0.0, horizontal=True)
conf_v = dict(proportion=[1], gap=0.0, horizontal=False)
eq(_split_rect(*pure_square, **conf_h), pure_square)
eq(_split_rect(*pure_square, **conf_v), pure_square)
conf_h = dict(proportion=[1], gap=0.5, horizontal=True)
conf_v = dict(proportion=[1], gap=0.5, horizontal=False)
eq(_split_rect(*pure_square, **conf_h), pure_square)
eq(_split_rect(*pure_square, **conf_v), pure_square)
# identity on a void rectangle should not give anything strange
null_square = [0., 0., 0., 0.]
conf = dict(proportion=[1], gap=0.0, horizontal=True)
eq(_split_rect(*null_square, **conf), null_square)
conf = dict(proportion=[1], gap=1.0, horizontal=True)
eq(_split_rect(*null_square, **conf), null_square)
# splitting a negative rectangle should raise error
neg_square = [0., 0., -1., 0.]
conf = dict(proportion=[1], gap=0.0, horizontal=True)
assert_raises(ValueError, _split_rect, *neg_square, **conf)
conf = dict(proportion=[1, 1], gap=0.0, horizontal=True)
assert_raises(ValueError, _split_rect, *neg_square, **conf)
conf = dict(proportion=[1], gap=0.5, horizontal=True)
assert_raises(ValueError, _split_rect, *neg_square, **conf)
conf = dict(proportion=[1, 1], gap=0.5, horizontal=True)
assert_raises(ValueError, _split_rect, *neg_square, **conf)
def test_rect_pure_split():
pure_square = [0., 0., 1., 1.]
# division in two equal pieces from the perfect square
h_2split = [(0.0, 0.0, 0.5, 1.0), (0.5, 0.0, 0.5, 1.0)]
conf_h = dict(proportion=[1, 1], gap=0.0, horizontal=True)
eq(_split_rect(*pure_square, **conf_h), h_2split)
v_2split = [(0.0, 0.0, 1.0, 0.5), (0.0, 0.5, 1.0, 0.5)]
conf_v = dict(proportion=[1, 1], gap=0.0, horizontal=False)
eq(_split_rect(*pure_square, **conf_v), v_2split)
# division in two non-equal pieces from the perfect square
h_2split = [(0.0, 0.0, 1 / 3, 1.0), (1 / 3, 0.0, 2 / 3, 1.0)]
conf_h = dict(proportion=[1, 2], gap=0.0, horizontal=True)
eq(_split_rect(*pure_square, **conf_h), h_2split)
v_2split = [(0.0, 0.0, 1.0, 1 / 3), (0.0, 1 / 3, 1.0, 2 / 3)]
conf_v = dict(proportion=[1, 2], gap=0.0, horizontal=False)
eq(_split_rect(*pure_square, **conf_v), v_2split)
# division in three equal pieces from the perfect square
h_2split = [(0.0, 0.0, 1 / 3, 1.0), (1 / 3, 0.0, 1 / 3, 1.0), (2 / 3, 0.0,
1 / 3, 1.0)]
conf_h = dict(proportion=[1, 1, 1], gap=0.0, horizontal=True)
eq(_split_rect(*pure_square, **conf_h), h_2split)
v_2split = [(0.0, 0.0, 1.0, 1 / 3), (0.0, 1 / 3, 1.0, 1 / 3), (0.0, 2 / 3,
1.0, 1 / 3)]
conf_v = dict(proportion=[1, 1, 1], gap=0.0, horizontal=False)
eq(_split_rect(*pure_square, **conf_v), v_2split)
# division in three non-equal pieces from the perfect square
h_2split = [(0.0, 0.0, 1 / 4, 1.0), (1 / 4, 0.0, 1 / 2, 1.0), (3 / 4, 0.0,
1 / 4, 1.0)]
conf_h = dict(proportion=[1, 2, 1], gap=0.0, horizontal=True)
eq(_split_rect(*pure_square, **conf_h), h_2split)
v_2split = [(0.0, 0.0, 1.0, 1 / 4), (0.0, 1 / 4, 1.0, 1 / 2), (0.0, 3 / 4,
1.0, 1 / 4)]
conf_v = dict(proportion=[1, 2, 1], gap=0.0, horizontal=False)
eq(_split_rect(*pure_square, **conf_v), v_2split)
# splitting on a void rectangle should give multiple void
null_square = [0., 0., 0., 0.]
conf = dict(proportion=[1, 1], gap=0.0, horizontal=True)
eq(_split_rect(*null_square, **conf), [null_square, null_square])
conf = dict(proportion=[1, 2], gap=1.0, horizontal=True)
eq(_split_rect(*null_square, **conf), [null_square, null_square])
def test_rect_deformed_split():
non_pure_square = [1., -1., 1., 0.5]
# division in two equal pieces from the perfect square
h_2split = [(1.0, -1.0, 0.5, 0.5), (1.5, -1.0, 0.5, 0.5)]
conf_h = dict(proportion=[1, 1], gap=0.0, horizontal=True)
eq(_split_rect(*non_pure_square, **conf_h), h_2split)
v_2split = [(1.0, -1.0, 1.0, 0.25), (1.0, -0.75, 1.0, 0.25)]
conf_v = dict(proportion=[1, 1], gap=0.0, horizontal=False)
eq(_split_rect(*non_pure_square, **conf_v), v_2split)
# division in two non-equal pieces from the perfect square
h_2split = [(1.0, -1.0, 1 / 3, 0.5), (1 + 1 / 3, -1.0, 2 / 3, 0.5)]
conf_h = dict(proportion=[1, 2], gap=0.0, horizontal=True)
eq(_split_rect(*non_pure_square, **conf_h), h_2split)
v_2split = [(1.0, -1.0, 1.0, 1 / 6), (1.0, 1 / 6 - 1, 1.0, 2 / 6)]
conf_v = dict(proportion=[1, 2], gap=0.0, horizontal=False)
eq(_split_rect(*non_pure_square, **conf_v), v_2split)
def test_gap_split():
pure_square = [0., 0., 1., 1.]
# null split
conf_h = dict(proportion=[1], gap=1.0, horizontal=True)
eq(_split_rect(*pure_square, **conf_h), pure_square)
# equal split
h_2split = [(0.0, 0.0, 0.25, 1.0), (0.75, 0.0, 0.25, 1.0)]
conf_h = dict(proportion=[1, 1], gap=1.0, horizontal=True)
eq(_split_rect(*pure_square, **conf_h), h_2split)
# disequal split
h_2split = [(0.0, 0.0, 1 / 6, 1.0), (0.5 + 1 / 6, 0.0, 1 / 3, 1.0)]
conf_h = dict(proportion=[1, 2], gap=1.0, horizontal=True)
eq(_split_rect(*pure_square, **conf_h), h_2split)
@pytest.mark.matplotlib
def test_default_arg_index(close_figures):
# 2116
df = pd.DataFrame({'size' : ['small', 'large', 'large', 'small', 'large',
'small'],
'length' : ['long', 'short', 'short', 'long', 'long',
'short']})
assert_raises(ValueError, mosaic, data=df, title='foobar')
@pytest.mark.matplotlib
def test_missing_category(close_figures):
# GH5639
animal = ['dog', 'dog', 'dog', 'cat', 'dog', 'cat', 'cat',
'dog', 'dog', 'cat']
size = ['medium', 'large', 'medium', 'medium', 'medium', 'medium',
'large', 'large', 'large', 'small']
testdata = pd.DataFrame({'animal': animal, 'size': size})
testdata['size'] = pd.Categorical(testdata['size'],
categories=['small', 'medium', 'large'])
testdata = testdata.sort_values('size')
fig, _ = mosaic(testdata, ['animal', 'size'])
bio = BytesIO()
fig.savefig(bio, format='png')