test_highlevel.py · aaronreidsmith/patsy

aaronreidsmith / patsy python

Repository URL to install this package:
Version: 0.5.1

/ test_highlevel.py

# This file is part of Patsy
# Copyright (C) 2012-2013 Nathaniel Smith <njs@pobox.com>
# See file LICENSE.txt for license information.

# Exhaustive end-to-end tests of the top-level API.

import sys
import __future__
import six
import numpy as np
from nose.tools import assert_raises
from patsy import PatsyError
from patsy.design_info import DesignMatrix, DesignInfo
from patsy.eval import EvalEnvironment
from patsy.desc import ModelDesc, Term, INTERCEPT
from patsy.categorical import C
from patsy.contrasts import Helmert
from patsy.user_util import balanced, LookupFactor
from patsy.build import (design_matrix_builders,
                         build_design_matrices)
from patsy.highlevel import *
from patsy.util import (have_pandas,
                        have_pandas_categorical,
                        have_pandas_categorical_dtype,
                        pandas_Categorical_from_codes)
from patsy.origin import Origin

if have_pandas:
    import pandas

def check_result(expect_full_designs, lhs, rhs, data,
                 expected_rhs_values, expected_rhs_names,
                 expected_lhs_values, expected_lhs_names): # pragma: no cover
    assert np.allclose(rhs, expected_rhs_values)
    assert rhs.design_info.column_names == expected_rhs_names
    if lhs is not None:
        assert np.allclose(lhs, expected_lhs_values)
        assert lhs.design_info.column_names == expected_lhs_names
    else:
        assert expected_lhs_values is None
        assert expected_lhs_names is None
    
    if expect_full_designs:
        if lhs is None:
            new_rhs, = build_design_matrices([rhs.design_info], data)
        else:
            new_lhs, new_rhs = build_design_matrices([lhs.design_info,
                                                      rhs.design_info],
                                                     data)
            assert np.allclose(new_lhs, lhs)
            assert new_lhs.design_info.column_names == expected_lhs_names
        assert np.allclose(new_rhs, rhs)
        assert new_rhs.design_info.column_names == expected_rhs_names
    else:
        assert rhs.design_info.terms is None
        assert lhs is None or lhs.design_info.terms is None

def dmatrix_pandas(formula_like, data={}, depth=0, return_type="matrix"):
    return_type = "dataframe"
    if isinstance(depth, int):
        depth += 1
    return dmatrix(formula_like, data, depth, return_type=return_type)

def dmatrices_pandas(formula_like, data={}, depth=0, return_type="matrix"):
    return_type = "dataframe"
    if isinstance(depth, int):
        depth += 1
    return dmatrices(formula_like, data, depth, return_type=return_type)

def t(formula_like, data, depth,
      expect_full_designs,
      expected_rhs_values, expected_rhs_names,
      expected_lhs_values=None, expected_lhs_names=None): # pragma: no cover
    if isinstance(depth, int):
        depth += 1
    def data_iter_maker():
        return iter([data])
    if (isinstance(formula_like, six.string_types + (ModelDesc, DesignInfo))
        or (isinstance(formula_like, tuple)
            and isinstance(formula_like[0], DesignInfo))
        or hasattr(formula_like, "__patsy_get_model_desc__")):
        if expected_lhs_values is None:
            builder = incr_dbuilder(formula_like, data_iter_maker, depth)
            lhs = None
            (rhs,) = build_design_matrices([builder], data)
        else:
            builders = incr_dbuilders(formula_like, data_iter_maker, depth)
            lhs, rhs = build_design_matrices(builders, data)
        check_result(expect_full_designs, lhs, rhs, data,
                     expected_rhs_values, expected_rhs_names,
                     expected_lhs_values, expected_lhs_names)
    else:
        assert_raises(PatsyError, incr_dbuilders,
                      formula_like, data_iter_maker)
        assert_raises(PatsyError, incr_dbuilder,
                      formula_like, data_iter_maker)
    one_mat_fs = [dmatrix]
    two_mat_fs = [dmatrices]
    if have_pandas:
        one_mat_fs.append(dmatrix_pandas)
        two_mat_fs.append(dmatrices_pandas)
    if expected_lhs_values is None:
        for f in one_mat_fs:
            rhs = f(formula_like, data, depth)
            check_result(expect_full_designs, None, rhs, data,
                         expected_rhs_values, expected_rhs_names,
                         expected_lhs_values, expected_lhs_names)

        # We inline assert_raises here to avoid complications with the
        # depth argument.
        for f in two_mat_fs:
            try:
                f(formula_like, data, depth)
            except PatsyError:
                pass
            else:
                raise AssertionError
    else:
        for f in one_mat_fs:
            try:
                f(formula_like, data, depth)
            except PatsyError:
                pass
            else:
                raise AssertionError

        for f in two_mat_fs:
            (lhs, rhs) = f(formula_like, data, depth)
            check_result(expect_full_designs, lhs, rhs, data,
                         expected_rhs_values, expected_rhs_names,
                         expected_lhs_values, expected_lhs_names)

def t_invalid(formula_like, data, depth, exc=PatsyError): # pragma: no cover
    if isinstance(depth, int):
        depth += 1
    fs = [dmatrix, dmatrices]
    if have_pandas:
        fs += [dmatrix_pandas, dmatrices_pandas]
    for f in fs:
        try:
            f(formula_like, data, depth)
        except exc:
            pass
        else:
            raise AssertionError

# Exercise all the different calling conventions for the high-level API
def test_formula_likes():
    # Plain array-like, rhs only
    t([[1, 2, 3], [4, 5, 6]], {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    t((None, [[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    t(np.asarray([[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    t((None, np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo")
    t(dm, {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"])
    t((None, dm), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"])
      
    # Plain array-likes, lhs and rhs
    t(([1, 2], [[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    t(([[1], [2]], [[1, 2, 3], [4, 5, 6]]), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    t((np.asarray([1, 2]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    t((np.asarray([[1], [2]]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    x_dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo")
    y_dm = DesignMatrix([1, 2], default_column_prefix="bar")
    t((y_dm, x_dm), {}, 0,
      False,
      [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"],
      [[1], [2]], ["bar0"])
    # number of rows must match
    t_invalid(([1, 2, 3], [[1, 2, 3], [4, 5, 6]]), {}, 0)

    # tuples must have the right size
    t_invalid(([[1, 2, 3]],), {}, 0)
    t_invalid(([[1, 2, 3]], [[1, 2, 3]], [[1, 2, 3]]), {}, 0)

    # plain Series and DataFrames
    if have_pandas:
        # Names are extracted
        t(pandas.DataFrame({"x": [1, 2, 3]}), {}, 0,
          False,
          [[1], [2], [3]], ["x"])
        t(pandas.Series([1, 2, 3], name="asdf"), {}, 0,
          False,
          [[1], [2], [3]], ["asdf"])
        t((pandas.DataFrame({"y": [4, 5, 6]}),
           pandas.DataFrame({"x": [1, 2, 3]})), {}, 0,
          False,
          [[1], [2], [3]], ["x"],
          [[4], [5], [6]], ["y"])
        t((pandas.Series([4, 5, 6], name="y"),
           pandas.Series([1, 2, 3], name="x")), {}, 0,
          False,
          [[1], [2], [3]], ["x"],
          [[4], [5], [6]], ["y"])
        # Or invented
        t((pandas.DataFrame([[4, 5, 6]]),
           pandas.DataFrame([[1, 2, 3]], columns=[7, 8, 9])), {}, 0,
          False,
          [[1, 2, 3]], ["x7", "x8", "x9"],
          [[4, 5, 6]], ["y0", "y1", "y2"])
        t(pandas.Series([1, 2, 3]), {}, 0,
          False,
          [[1], [2], [3]], ["x0"])
        # indices must match
        t_invalid((pandas.DataFrame([[1]], index=[1]),
                   pandas.DataFrame([[1]], index=[2])),
                  {}, 0)

    # Foreign ModelDesc factories
    class ForeignModelSource(object):
        def __patsy_get_model_desc__(self, data):
            return ModelDesc([Term([LookupFactor("Y")])],
                             [Term([LookupFactor("X")])])
    foreign_model = ForeignModelSource()
    t(foreign_model,
      {"Y": [1, 2],
       "X": [[1, 2], [3, 4]]},
      0,
      True,
      [[1, 2], [3, 4]], ["X[0]", "X[1]"],
      [[1], [2]], ["Y"])
    class BadForeignModelSource(object):
        def __patsy_get_model_desc__(self, data):
            return data
    t_invalid(BadForeignModelSource(), {}, 0)

    # string formulas
    t("y ~ x", {"y": [1, 2], "x": [3, 4]}, 0,
      True,
      [[1, 3], [1, 4]], ["Intercept", "x"],
      [[1], [2]], ["y"])
    t("~ x", {"y": [1, 2], "x": [3, 4]}, 0,
      True,
      [[1, 3], [1, 4]], ["Intercept", "x"])
    t("x + y", {"y": [1, 2], "x": [3, 4]}, 0,
      True,
      [[1, 3, 1], [1, 4, 2]], ["Intercept", "x", "y"])

    # unicode objects on py2 (must be ascii only)
    if not six.PY3:
        # ascii is fine
        t(unicode("y ~ x"),
          {"y": [1, 2], "x": [3, 4]}, 0,
          True,
          [[1, 3], [1, 4]], ["Intercept", "x"],
          [[1], [2]], ["y"])
        # non-ascii is not (even if this would be valid on py3 with its less
        # restrict variable naming rules)
        eacute = "\xc3\xa9".decode("utf-8")
        assert isinstance(eacute, unicode)
        assert_raises(PatsyError, dmatrix, eacute, data={eacute: [1, 2]})

    # ModelDesc
    desc = ModelDesc([], [Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5]}, 0,
      True,
      [[1.5], [2.5], [3.5]], ["x"])
    desc = ModelDesc([], [Term([]), Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5]}, 0,
      True,
      [[1, 1.5], [1, 2.5], [1, 3.5]], ["Intercept", "x"])
    desc = ModelDesc([Term([LookupFactor("y")])],
                     [Term([]), Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5], "y": [10, 20, 30]}, 0,
      True,
      [[1, 1.5], [1, 2.5], [1, 3.5]], ["Intercept", "x"],
      [[10], [20], [30]], ["y"])

    # builders
    termlists = ([],
                 [Term([LookupFactor("x")])],
                 [Term([]), Term([LookupFactor("x")])],
                 )
    builders = design_matrix_builders(termlists,
                                      lambda: iter([{"x": [1, 2, 3]}]),
                                      eval_env=0)
    # twople but with no LHS
    t((builders[0], builders[2]), {"x": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"])
    # single DesignInfo
    t(builders[2], {"x": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"])
    # twople with LHS
    t((builders[1], builders[2]), {"x": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"],
      [[10], [20], [30]], ["x"])
    
    # check depth arguments
    x_in_env = [1, 2, 3]
    t("~ x_in_env", {}, 0,
      True,
      [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"])
    t("~ x_in_env", {"x_in_env": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x_in_env"])
    # Trying to pull x_in_env out of our *caller* shouldn't work.
    t_invalid("~ x_in_env", {}, 1, exc=(NameError, PatsyError))
    # But then again it should, if called from one down on the stack:
    def check_nested_call():
        x_in_env = "asdf"
        t("~ x_in_env", {}, 1,
          True,
          [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"])
    check_nested_call()
    # passing in an explicit EvalEnvironment also works:
    e = EvalEnvironment.capture(1)
    t_invalid("~ x_in_env", {}, e, exc=(NameError, PatsyError))
    e = EvalEnvironment.capture(0)
    def check_nested_call_2():
        x_in_env = "asdf"
        t("~ x_in_env", {}, e,
          True,
          [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"])
    check_nested_call_2()

def test_return_pandas():
    if not have_pandas:
        return
    # basic check of pulling a Series out of the environment
    s1 = pandas.Series([1, 2, 3], name="AA", index=[10, 20, 30])
    s2 = pandas.Series([4, 5, 6], name="BB", index=[10, 20, 30])
    df1 = dmatrix("s1", return_type="dataframe")
    assert np.allclose(df1, [[1, 1], [1, 2], [1, 3]])
    assert np.array_equal(df1.columns, ["Intercept", "s1"])
    assert df1.design_info.column_names == ["Intercept", "s1"]
    assert np.array_equal(df1.index, [10, 20, 30])
    df2, df3 = dmatrices("s2 ~ s1", return_type="dataframe")
    assert np.allclose(df2, [[4], [5], [6]])
    assert np.array_equal(df2.columns, ["s2"])
    assert df2.design_info.column_names == ["s2"]
    assert np.array_equal(df2.index, [10, 20, 30])
    assert np.allclose(df3, [[1, 1], [1, 2], [1, 3]])
    assert np.array_equal(df3.columns, ["Intercept", "s1"])
    assert df3.design_info.column_names == ["Intercept", "s1"]
    assert np.array_equal(df3.index, [10, 20, 30])
    # indices are preserved if pandas is passed in directly
    df4 = dmatrix(s1, return_type="dataframe")
    assert np.allclose(df4, [[1], [2], [3]])
    assert np.array_equal(df4.columns, ["AA"])
    assert df4.design_info.column_names == ["AA"]
    assert np.array_equal(df4.index, [10, 20, 30])
    df5, df6 = dmatrices((s2, s1), return_type="dataframe")
    assert np.allclose(df5, [[4], [5], [6]])
    assert np.array_equal(df5.columns, ["BB"])
    assert df5.design_info.column_names == ["BB"]
    assert np.array_equal(df5.index, [10, 20, 30])
    assert np.allclose(df6, [[1], [2], [3]])
    assert np.array_equal(df6.columns, ["AA"])
    assert df6.design_info.column_names == ["AA"]
    assert np.array_equal(df6.index, [10, 20, 30])
    # Both combinations of with-index and without-index
    df7, df8 = dmatrices((s1, [10, 11, 12]), return_type="dataframe")
    assert np.array_equal(df7.index, s1.index)
    assert np.array_equal(df8.index, s1.index)
    df9, df10 = dmatrices(([10, 11, 12], s1), return_type="dataframe")
    assert np.array_equal(df9.index, s1.index)
    assert np.array_equal(df10.index, s1.index)
    # pandas must be available
    import patsy.highlevel
    had_pandas = patsy.highlevel.have_pandas
    try:
        patsy.highlevel.have_pandas = False
        assert_raises(PatsyError,
                      dmatrix, "x", {"x": [1]}, 0, return_type="dataframe")
        assert_raises(PatsyError,
                      dmatrices, "y ~ x", {"x": [1], "y": [2]}, 0,
                      return_type="dataframe")
    finally:
        patsy.highlevel.have_pandas = had_pandas

def test_term_info():
    data = balanced(a=2, b=2)
    rhs = dmatrix("a:b", data)
    assert rhs.design_info.column_names == ["Intercept", "b[T.b2]",
                                            "a[T.a2]:b[b1]", "a[T.a2]:b[b2]"]
    assert rhs.design_info.term_names == ["Intercept", "a:b"]
    assert len(rhs.design_info.terms) == 2
    assert rhs.design_info.terms[0] == INTERCEPT
    
def test_data_types():
    data = {"a": [1, 2, 3],
            "b": [1.0, 2.0, 3.0],
            "c": np.asarray([1, 2, 3], dtype=np.float32),
            "d": [True, False, True],
            "e": ["foo", "bar", "baz"],
            "f": C([1, 2, 3]),
            "g": C(["foo", "bar", "baz"]),
            "h": np.array(["foo", 1, (1, "hi")], dtype=object),
            }
    t("~ 0 + a", data, 0, True,
      [[1], [2], [3]], ["a"])
    t("~ 0 + b", data, 0, True,
      [[1], [2], [3]], ["b"])
    t("~ 0 + c", data, 0, True,
      [[1], [2], [3]], ["c"])
    t("~ 0 + d", data, 0, True,
      [[0, 1], [1, 0], [0, 1]], ["d[False]", "d[True]"])
    t("~ 0 + e", data, 0, True,
      [[0, 0, 1], [1, 0, 0], [0, 1, 0]], ["e[bar]", "e[baz]", "e[foo]"])
    t("~ 0 + f", data, 0, True,
      [[1, 0, 0], [0, 1, 0], [0, 0, 1]], ["f[1]", "f[2]", "f[3]"])
    t("~ 0 + g", data, 0, True,
      [[0, 0, 1], [1, 0, 0], [0, 1, 0]], ["g[bar]", "g[baz]", "g[foo]"])
    # This depends on Python's sorting behavior:
    t("~ 0 + h", data, 0, True,
      [[0, 1, 0], [1, 0, 0], [0, 0, 1]],
      ["h[1]", "h[foo]", "h[(1, 'hi')]"])
    
def test_categorical():
    data = balanced(a=2, b=2)
    # There are more exhaustive tests for all the different coding options in
    # test_build; let's just make sure that C() and stuff works.
    t("~ C(a)", data, 0,
      True,
      [[1, 0], [1, 0], [1, 1], [1, 1]], ["Intercept", "C(a)[T.a2]"])
    t("~ C(a, levels=['a2', 'a1'])", data, 0,
      True,
      [[1, 1], [1, 1], [1, 0], [1, 0]],
      ["Intercept", "C(a, levels=['a2', 'a1'])[T.a1]"])
    t("~ C(a, Treatment(reference=-1))", data, 0,
      True,
      [[1, 1], [1, 1], [1, 0], [1, 0]],
      ["Intercept", "C(a, Treatment(reference=-1))[T.a1]"])

    # Different interactions
    t("a*b", data, 0,
      True,
      [[1, 0, 0, 0],
       [1, 0, 1, 0],
       [1, 1, 0, 0],
       [1, 1, 1, 1]],
      ["Intercept", "a[T.a2]", "b[T.b2]", "a[T.a2]:b[T.b2]"])
    t("0 + a:b", data, 0,
      True,
      [[1, 0, 0, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1]],
      ["a[a1]:b[b1]", "a[a2]:b[b1]", "a[a1]:b[b2]", "a[a2]:b[b2]"])
    t("1 + a + a:b", data, 0,
      True,
      [[1, 0, 0, 0],
       [1, 0, 1, 0],
       [1, 1, 0, 0],
       [1, 1, 0, 1]],
      ["Intercept", "a[T.a2]", "a[a1]:b[T.b2]", "a[a2]:b[T.b2]"])

    # Changing contrast with C()
    data["a"] = C(data["a"], Helmert)
    t("a", data, 0,
      True,
      [[1, -1], [1, -1], [1, 1], [1, 1]], ["Intercept", "a[H.a2]"])
    t("C(a, Treatment)", data, 0,
      True,
      [[1, 0], [1, 0], [1, 1], [1, 1]], ["Intercept", "C(a, Treatment)[T.a2]"])
    # That didn't affect the original object
    t("a", data, 0,
      True,
      [[1, -1], [1, -1], [1, 1], [1, 1]], ["Intercept", "a[H.a2]"])

def test_builtins():
    data = {"x": [1, 2, 3],
            "y": [4, 5, 6],
            "a b c": [10, 20, 30]}
    t("0 + I(x + y)", data, 0,
      True,
      [[1], [2], [3], [4], [5], [6]], ["I(x + y)"])
    t("Q('a b c')", data, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "Q('a b c')"])
    t("center(x)", data, 0,
      True,
      [[1, -1], [1, 0], [1, 1]], ["Intercept", "center(x)"])

def test_incremental():
    # incr_dbuilder(s)
    # stateful transformations
    datas = [
        {"a": ["a2", "a2", "a2"],
         "x": [1, 2, 3]},
        {"a": ["a2", "a2", "a1"],
         "x": [4, 5, 6]},
        ]
    x = np.asarray([1, 2, 3, 4, 5, 6])
    sin_center_x = np.sin(x - np.mean(x))
    x_col = sin_center_x - np.mean(sin_center_x)
    def data_iter_maker():
        return iter(datas)
    builders = incr_dbuilders("1 ~ a + center(np.sin(center(x)))",
                              data_iter_maker)
    lhs, rhs = build_design_matrices(builders, datas[1])
    assert lhs.design_info.column_names == ["Intercept"]
    assert rhs.design_info.column_names == ["Intercept",
                                            "a[T.a2]",
                                            "center(np.sin(center(x)))"]
    assert np.allclose(lhs, [[1], [1], [1]])
    assert np.allclose(rhs, np.column_stack(([1, 1, 1],
                                             [1, 1, 0],
                                             x_col[3:])))

    builder = incr_dbuilder("~ a + center(np.sin(center(x)))",
                            data_iter_maker)
    (rhs,) = build_design_matrices([builder], datas[1])
    assert rhs.design_info.column_names == ["Intercept",
                                            "a[T.a2]",
                                            "center(np.sin(center(x)))"]
    assert np.allclose(lhs, [[1], [1], [1]])
    assert np.allclose(rhs, np.column_stack(([1, 1, 1],
                                             [1, 1, 0],
                                             x_col[3:])))

    assert_raises(PatsyError, incr_dbuilder, "x ~ x", data_iter_maker)
    assert_raises(PatsyError, incr_dbuilders, "x", data_iter_maker)

def test_env_transform():
    t("~ np.sin(x)", {"x": [1, 2, 3]}, 0,
      True,
      [[1, np.sin(1)], [1, np.sin(2)], [1, np.sin(3)]],
      ["Intercept", "np.sin(x)"])

# Term ordering:
#   1) all 0-order no-numeric
#   2) all 1st-order no-numeric
#   3) all 2nd-order no-numeric
#   4) ...
#   5) all 0-order with the first numeric interaction encountered
#   6) all 1st-order with the first numeric interaction encountered
#   7) ...
#   8) all 0-order with the second numeric interaction encountered
#   9) ...
def test_term_order():
    data = balanced(a=2, b=2)
    data["x1"] = np.linspace(0, 1, 4)
    data["x2"] = data["x1"] ** 2

    def t_terms(formula, order):
        m = dmatrix(formula, data)
        assert m.design_info.term_names == order

    t_terms("a + b + x1 + x2", ["Intercept", "a", "b", "x1", "x2"])
    t_terms("b + a + x2 + x1", ["Intercept", "b", "a", "x2", "x1"])
    t_terms("0 + x1 + a + x2 + b + 1", ["Intercept", "a", "b", "x1", "x2"])
    t_terms("0 + a:b + a + b + 1", ["Intercept", "a", "b", "a:b"])
    t_terms("a + a:x1 + x2 + x1 + b",
            ["Intercept", "a", "b", "x1", "a:x1", "x2"])
    t_terms("0 + a:x1:x2 + a + x2:x1:b + x2 + x1 + a:x1 + x1:x2 + x1:a:x2:a:b",
            ["a",
             "x1:x2", "a:x1:x2", "x2:x1:b", "x1:a:x2:b",
             "x2",
             "x1",
             "a:x1"])

def _check_division(expect_true_division): # pragma: no cover
    # We evaluate the formula "I(x / y)" in our *caller's* scope, so the
    # result depends on whether our caller has done 'from __future__ import
    # division'.
    data = {"x": 5, "y": 2}
    m = dmatrix("0 + I(x / y)", data, 1)
    if expect_true_division:
        assert np.allclose(m, [[2.5]])
    else:
        assert np.allclose(m, [[2]])

def test_future():
    if __future__.division.getMandatoryRelease() < sys.version_info:
        # This is Python 3, where division is already default
        return
    # no __future__.division in this module's scope
    _check_division(False)
    # create an execution context where __future__.division is in effect
    exec ("from __future__ import division\n"
          "_check_division(True)\n")

def test_multicolumn():
    data = {
        "a": ["a1", "a2"],
        "X": [[1, 2], [3, 4]],
        "Y": [[1, 3], [2, 4]],
        }
    t("X*Y", data, 0,
      True,
      [[1, 1, 2, 1, 3, 1 * 1, 2 * 1, 1 * 3, 2 * 3],
       [1, 3, 4, 2, 4, 3 * 2, 4 * 2, 3 * 4, 4 * 4]],
      ["Intercept", "X[0]", "X[1]", "Y[0]", "Y[1]",
       "X[0]:Y[0]", "X[1]:Y[0]", "X[0]:Y[1]", "X[1]:Y[1]"])
    t("a:X + Y", data, 0,
      True,
      [[1, 1, 0, 2, 0, 1, 3],
       [1, 0, 3, 0, 4, 2, 4]],
      ["Intercept",
       "a[a1]:X[0]", "a[a2]:X[0]", "a[a1]:X[1]", "a[a2]:X[1]",
       "Y[0]", "Y[1]"])

def test_dmatrix_dmatrices_no_data():
    x = [1, 2, 3]
    y = [4, 5, 6]
    assert np.allclose(dmatrix("x"), [[1, 1], [1, 2], [1, 3]])
    lhs, rhs = dmatrices("y ~ x")
    assert np.allclose(lhs, [[4], [5], [6]])
    assert np.allclose(rhs, [[1, 1], [1, 2], [1, 3]])

def test_designinfo_describe():
    lhs, rhs = dmatrices("y ~ x + a", {"y": [1, 2, 3],
                                       "x": [4, 5, 6],
                                       "a": ["a1", "a2", "a3"]})
    assert lhs.design_info.describe() == "y"
    assert rhs.design_info.describe() == "1 + a + x"

def test_evalfactor_reraise():
    # This will produce a PatsyError, but buried inside the factor evaluation,
    # so the original code has no way to give it an appropriate origin=
    # attribute. EvalFactor should notice this, and add a useful origin:
    def raise_patsy_error(x):
        raise PatsyError("WHEEEEEE")
    formula = "raise_patsy_error(X) + Y"
    try:
        dmatrix(formula, {"X": [1, 2, 3], "Y": [4, 5, 6]})
    except PatsyError as e:
        assert e.origin == Origin(formula, 0, formula.index(" "))
    else:
        assert False
    # This will produce a KeyError, which on Python 3 we can do wrap without
    # destroying the traceback, so we do so. On Python 2 we let the original
    # exception escape.
    try:
        dmatrix("1 + x[1]", {"x": {}})
    except Exception as e:
        if sys.version_info[0] >= 3:
            assert isinstance(e, PatsyError)
            assert e.origin == Origin("1 + x[1]", 4, 8)
        else:
            assert isinstance(e, KeyError)
    else:
        assert False

def test_dmatrix_NA_action():
    data = {"x": [1, 2, 3, np.nan], "y": [np.nan, 20, 30, 40]}

    return_types = ["matrix"]
    if have_pandas:
        return_types.append("dataframe")

    for return_type in return_types:
        mat = dmatrix("x + y", data=data, return_type=return_type)
        assert np.array_equal(mat, [[1, 2, 20],
                                    [1, 3, 30]])
        if return_type == "dataframe":
            assert mat.index.equals(pandas.Index([1, 2]))
        assert_raises(PatsyError, dmatrix, "x + y", data=data,
                      return_type=return_type,
                      NA_action="raise")

        lmat, rmat = dmatrices("y ~ x", data=data, return_type=return_type)
        assert np.array_equal(lmat, [[20], [30]])
        assert np.array_equal(rmat, [[1, 2], [1, 3]])
        if return_type == "dataframe":
            assert lmat.index.equals(pandas.Index([1, 2]))
            assert rmat.index.equals(pandas.Index([1, 2]))
        assert_raises(PatsyError,
                      dmatrices, "y ~ x", data=data, return_type=return_type,
                      NA_action="raise")

        # Initial release for the NA handling code had problems with
        # non-data-dependent matrices like "~ 1".
        lmat, rmat = dmatrices("y ~ 1", data=data, return_type=return_type)
        assert np.array_equal(lmat, [[20], [30], [40]])
        assert np.array_equal(rmat, [[1], [1], [1]])
        if return_type == "dataframe":
            assert lmat.index.equals(pandas.Index([1, 2, 3]))
            assert rmat.index.equals(pandas.Index([1, 2, 3]))
        assert_raises(PatsyError,
                      dmatrices, "y ~ 1", data=data, return_type=return_type,
                      NA_action="raise")

def test_0d_data():
    # Use case from statsmodels/statsmodels#1881
    data_0d = {"x1": 1.1, "x2": 1.2, "a": "a1"}

    for formula, expected in [
            ("x1 + x2", [[1, 1.1, 1.2]]),
            ("C(a, levels=('a1', 'a2')) + x1", [[1, 0, 1.1]]),
            ]:
        mat = dmatrix(formula, data_0d)
        assert np.allclose(mat, expected)

        assert np.allclose(build_design_matrices([mat.design_info],
                                                 data_0d)[0],
                           expected)
        if have_pandas:
            data_series = pandas.Series(data_0d)
            assert np.allclose(dmatrix(formula, data_series), expected)

            assert np.allclose(build_design_matrices([mat.design_info],
                                                     data_series)[0],
                               expected)

def test_env_not_saved_in_builder():
    x_in_env = [1, 2, 3]
    design_matrix = dmatrix("x_in_env", {})

    x_in_env = [10, 20, 30]
    design_matrix2 = dmatrix(design_matrix.design_info, {})

    assert np.allclose(design_matrix, design_matrix2)

def test_C_and_pandas_categorical():
    if not have_pandas_categorical:
        return

    objs = [pandas_Categorical_from_codes([1, 0, 1], ["b", "a"])]
    if have_pandas_categorical_dtype:
        objs.append(pandas.Series(objs[0]))
    for obj in objs:
        d = {"obj": obj}
        assert np.allclose(dmatrix("obj", d),
                           [[1, 1],
                            [1, 0],
                            [1, 1]])

        assert np.allclose(dmatrix("C(obj)", d),
                           [[1, 1],
                            [1, 0],
                            [1, 1]])

        assert np.allclose(dmatrix("C(obj, levels=['b', 'a'])", d),
                           [[1, 1],
                            [1, 0],
                            [1, 1]])

        assert np.allclose(dmatrix("C(obj, levels=['a', 'b'])", d),
                           [[1, 0],
                            [1, 1],
                            [1, 0]])
aaronreidsmith / patsy python

Version: 0.5.1

/ test_highlevel.py

Products

About

Resources

Contact Gemfury