# Package: statsmodels, version 0.10.2 (web-page navigation chrome removed).
from statsmodels.sandbox.predict_functional import predict_functional
import numpy as np
import pandas as pd
import pytest
import statsmodels.api as sm
# If true, the output is written to a multi-page pdf file.
pdf_output = False

try:
    import matplotlib.pyplot as plt
except ImportError:
    # matplotlib is optional here; tests that actually need it are guarded
    # by the @pytest.mark.matplotlib marker, so a missing install is fine.
    pass
def pctl(q):
    """Return a summary function computing the ``q``-th quantile.

    ``q`` is a fraction in [0, 1]; the returned callable maps an array
    to its ``100 * q`` percentile via :func:`numpy.percentile`.
    """
    def _quantile(arr):
        return np.percentile(arr, 100 * q)
    return _quantile
class TestPredFunc(object):
    """Smoke/visual tests for ``predict_functional``.

    Each test fits a model, computes functional predictions along a single
    focus variable (holding the other covariates at fixed values or summary
    statistics), and plots the estimates and confidence bands.  When the
    module-level ``pdf_output`` flag is true, the figures are collected into
    a multi-page PDF for manual inspection; otherwise the ``close_figures``
    fixture disposes of them.
    """

    @classmethod
    def setup_class(cls):
        # Open the multi-page PDF only when output has been requested.
        if pdf_output:
            from matplotlib.backends.backend_pdf import PdfPages
            cls.pdf = PdfPages("predict_functional.pdf")

    @classmethod
    def teardown_class(cls):
        if pdf_output:
            cls.pdf.close()

    def close_or_save(self, fig):
        # Append the figure to the PDF when output is requested.
        if pdf_output:
            self.pdf.savefig(fig)

    @pytest.mark.matplotlib
    def test_formula(self, close_figures):
        np.random.seed(542)
        n = 500
        x1 = np.random.normal(size=n)
        x2 = np.random.normal(size=n)
        x3 = np.random.normal(size=n)
        x4 = np.random.randint(0, 5, size=n)
        x4 = np.asarray(["ABCDE"[i] for i in x4])
        x5 = np.random.normal(size=n)

        # Data-generating model includes a quadratic, a categorical shift
        # and an interaction, so the formula below is non-trivially fit.
        y = 0.3*x2**2 + (x4 == "B") + 0.1*(x4 == "B")*x2**2 + x5 + np.random.normal(size=n)

        df = pd.DataFrame({"y": y, "x1": x1, "x2": x2, "x3": x3,
                           "x4": x4, "x5": x5})

        fml = "y ~ x1 + bs(x2, df=4) + x3 + x2*x3 + I(x1**2) + C(x4) + C(x4)*bs(x2, df=4) + x5"
        model = sm.OLS.from_formula(fml, data=df)
        result = model.fit()

        # Non-focus variables are held at summary statistics (summaries)
        # or explicit values (values) while x2 varies.
        summaries = {"x1": np.mean, "x3": pctl(0.75), "x5": np.mean}

        values = {"x4": "B"}
        pr1, ci1, fvals1 = predict_functional(result, "x2", summaries, values)

        values = {"x4": "C"}
        pr2, ci2, fvals2 = predict_functional(result, "x2", summaries, values)

        plt.clf()
        fig = plt.figure()
        ax = plt.axes([0.1, 0.1, 0.7, 0.8])
        plt.plot(fvals1, pr1, '-', label='x4=B')
        plt.plot(fvals2, pr2, '-', label='x4=C')
        ha, lb = ax.get_legend_handles_labels()
        plt.figlegend(ha, lb, "center right")
        plt.xlabel("Focus variable", size=15)
        plt.ylabel("Fitted mean", size=15)
        plt.title("Linear model prediction")
        self.close_or_save(fig)

        # Same curves, with pointwise confidence bands.
        plt.clf()
        fig = plt.figure()
        ax = plt.axes([0.1, 0.1, 0.7, 0.8])
        plt.plot(fvals1, pr1, '-', label='x4=B')
        plt.fill_between(fvals1, ci1[:, 0], ci1[:, 1], color='grey')
        plt.plot(fvals2, pr2, '-', label='x4=C')
        plt.fill_between(fvals2, ci2[:, 0], ci2[:, 1], color='grey')
        ha, lb = ax.get_legend_handles_labels()
        plt.figlegend(ha, lb, "center right")
        plt.xlabel("Focus variable", size=15)
        plt.ylabel("Fitted mean", size=15)
        plt.title("Linear model prediction")
        self.close_or_save(fig)

    @pytest.mark.matplotlib
    def test_lm_contrast(self, close_figures):
        np.random.seed(542)
        n = 200
        x1 = np.random.normal(size=n)
        x2 = np.random.normal(size=n)
        x3 = np.random.normal(size=n)
        y = x1 + 2*x2 + x3 - x1*x2 + x2*x3 + np.random.normal(size=n)
        df = pd.DataFrame({"y": y, "x1": x1, "x2": x2, "x3": x3})

        fml = "y ~ x1 + x2 + x3 + x1*x2 + x2*x3"
        model = sm.OLS.from_formula(fml, data=df)
        result = model.fit()

        # Contrast between two covariate configurations, as a function of x1.
        values = {"x2": 1, "x3": 1}   # y = 4
        values2 = {"x2": 0, "x3": 0}  # y = x1
        pr, cb, fvals = predict_functional(result, "x1", values=values,
                                           values2=values2, ci_method='scheffe')

        plt.clf()
        fig = plt.figure()
        ax = plt.axes([0.1, 0.1, 0.67, 0.8])
        plt.plot(fvals, pr, '-', label="Estimate", color='orange', lw=4)
        plt.plot(fvals, 4 - fvals, '-', label="Truth", color='lime', lw=4)
        plt.fill_between(fvals, cb[:, 0], cb[:, 1], color='grey')
        ha, lb = ax.get_legend_handles_labels()
        leg = plt.figlegend(ha, lb, "center right")
        leg.draw_frame(False)
        plt.xlabel("Focus variable", size=15)
        plt.ylabel("Mean contrast", size=15)
        plt.title("Linear model contrast")
        self.close_or_save(fig)

    @pytest.mark.matplotlib
    def test_glm_formula_contrast(self, close_figures):
        np.random.seed(542)
        n = 50
        x1 = np.random.normal(size=n)
        x2 = np.random.normal(size=n)
        x3 = np.random.normal(size=n)
        mn = 5 + 0.1*x1 + 0.1*x2 + 0.1*x3 - 0.1*x1*x2
        y = np.random.poisson(np.exp(mn), size=len(mn))
        df = pd.DataFrame({"y": y, "x1": x1, "x2": x2, "x3": x3})

        fml = "y ~ x1 + x2 + x3 + x1*x2"
        model = sm.GLM.from_formula(fml, data=df, family=sm.families.Poisson())
        result = model.fit()

        # Contrast on the linear-predictor scale, with a simultaneous band.
        values = {"x2": 1, "x3": 1}   # y = 5.2
        values2 = {"x2": 0, "x3": 0}  # y = 5 + 0.1*x1
        pr, cb, fvals = predict_functional(result, "x1", values=values,
                                           values2=values2, ci_method='simultaneous')

        plt.clf()
        fig = plt.figure()
        ax = plt.axes([0.1, 0.1, 0.67, 0.8])
        plt.plot(fvals, pr, '-', label="Estimate", color='orange', lw=4)
        plt.plot(fvals, 0.2 - 0.1*fvals, '-', label="Truth", color='lime', lw=4)
        plt.fill_between(fvals, cb[:, 0], cb[:, 1], color='grey')
        ha, lb = ax.get_legend_handles_labels()
        leg = plt.figlegend(ha, lb, "center right")
        leg.draw_frame(False)
        plt.xlabel("Focus variable", size=15)
        plt.ylabel("Linear predictor contrast", size=15)
        plt.title("Poisson regression contrast")
        self.close_or_save(fig)

    @pytest.mark.matplotlib
    def test_scb(self, close_figures):
        np.random.seed(473)
        n = 100
        x = np.random.normal(size=(n, 4))
        x[:, 0] = 1

        # Compare pointwise vs. simultaneous confidence bands against the
        # known truth for three GLM families with null (noise-only) data.
        for fam_name in "poisson", "binomial", "gaussian":

            if fam_name == "poisson":
                y = np.random.poisson(20, size=n)
                fam = sm.families.Poisson()
                true_mean = 20
                true_lp = np.log(20)
            elif fam_name == "binomial":
                y = 1 * (np.random.uniform(size=n) < 0.5)
                fam = sm.families.Binomial()
                true_mean = 0.5
                true_lp = 0
            elif fam_name == "gaussian":
                y = np.random.normal(size=n)
                fam = sm.families.Gaussian()
                true_mean = 0
                true_lp = 0

            model = sm.GLM(y, x, family=fam)
            result = model.fit()

            # CB is for linear predictor or mean response
            for linear in False, True:

                true = true_lp if linear else true_mean

                values = {'const': 1, "x2": 0}
                summaries = {"x3": np.mean}
                pred1, cb1, fvals1 = predict_functional(result, "x1",
                                                        values=values, summaries=summaries, linear=linear)
                pred2, cb2, fvals2 = predict_functional(result, "x1",
                                                        values=values, summaries=summaries,
                                                        ci_method='simultaneous', linear=linear)

                plt.clf()
                fig = plt.figure()
                ax = plt.axes([0.1, 0.1, 0.58, 0.8])
                plt.plot(fvals1, pred1, '-', color='black', label='Estimate')
                plt.plot(fvals1, true * np.ones(len(pred1)), '-', color='purple',
                         label='Truth')
                plt.plot(fvals1, cb1[:, 0], color='blue', label='Pointwise CB')
                plt.plot(fvals1, cb1[:, 1], color='blue')
                plt.plot(fvals2, cb2[:, 0], color='green', label='Simultaneous CB')
                plt.plot(fvals2, cb2[:, 1], color='green')
                ha, lb = ax.get_legend_handles_labels()
                leg = plt.figlegend(ha, lb, "center right")
                leg.draw_frame(False)
                plt.xlabel("Focus variable", size=15)
                if linear:
                    plt.ylabel("Linear predictor", size=15)
                else:
                    plt.ylabel("Fitted mean", size=15)
                plt.title("%s family prediction" % fam_name.capitalize())

                self.close_or_save(fig)

    @pytest.mark.matplotlib
    def test_glm_formula(self, close_figures):
        np.random.seed(542)
        n = 500
        x1 = np.random.normal(size=n)
        x2 = np.random.normal(size=n)
        x3 = np.random.randint(0, 3, size=n)
        x3 = np.asarray(["ABC"[i] for i in x3])
        lin_pred = -1 + 0.5*x1**2 + (x3 == "B")
        prob = 1 / (1 + np.exp(-lin_pred))
        y = 1 * (np.random.uniform(size=n) < prob)
        df = pd.DataFrame({"y": y, "x1": x1, "x2": x2, "x3": x3})

        fml = "y ~ x1 + I(x1**2) + x2 + C(x3)"
        model = sm.GLM.from_formula(fml, family=sm.families.Binomial(), data=df)
        result = model.fit()

        summaries = {"x2": np.mean}

        # Exercise both the mean-response and linear-predictor scales, and
        # compare the fitted curves to the exact data-generating functions.
        for linear in False, True:

            values = {"x3": "B"}
            pr1, ci1, fvals1 = predict_functional(result, "x1", summaries, values, linear=linear)

            values = {"x3": "C"}
            pr2, ci2, fvals2 = predict_functional(result, "x1", summaries, values, linear=linear)

            exact1 = -1 + 0.5*fvals1**2 + 1
            exact2 = -1 + 0.5*fvals2**2

            if not linear:
                exact1 = 1 / (1 + np.exp(-exact1))
                exact2 = 1 / (1 + np.exp(-exact2))

            plt.clf()
            fig = plt.figure()
            ax = plt.axes([0.1, 0.1, 0.7, 0.8])
            plt.plot(fvals1, pr1, '-', label='x3=B')
            plt.plot(fvals2, pr2, '-', label='x3=C')
            plt.plot(fvals1, exact1, '-', label='x3=B (exact)')
            plt.plot(fvals2, exact2, '-', label='x3=C (exact)')
            ha, lb = ax.get_legend_handles_labels()
            plt.figlegend(ha, lb, "center right")
            plt.xlabel("Focus variable", size=15)
            if linear:
                plt.ylabel("Fitted linear predictor", size=15)
            else:
                plt.ylabel("Fitted probability", size=15)
            plt.title("Binomial GLM prediction")
            self.close_or_save(fig)

            plt.clf()
            fig = plt.figure()
            ax = plt.axes([0.1, 0.1, 0.7, 0.8])
            plt.plot(fvals1, pr1, '-', label='x3=B', color='orange')
            plt.fill_between(fvals1, ci1[:, 0], ci1[:, 1], color='grey')
            plt.plot(fvals2, pr2, '-', label='x3=C', color='lime')
            plt.fill_between(fvals2, ci2[:, 0], ci2[:, 1], color='grey')
            ha, lb = ax.get_legend_handles_labels()
            plt.figlegend(ha, lb, "center right")
            plt.xlabel("Focus variable", size=15)
            if linear:
                plt.ylabel("Fitted linear predictor", size=15)
            else:
                plt.ylabel("Fitted probability", size=15)
            plt.title("Binomial GLM prediction")
            self.close_or_save(fig)

    @pytest.mark.matplotlib
    def test_noformula_prediction(self, close_figures):
        np.random.seed(6434)
        n = 200
        x1 = np.random.normal(size=n)
        x2 = np.random.normal(size=n)
        x3 = np.random.normal(size=n)
        y = x1 - x2 + np.random.normal(size=n)

        exog = np.vstack((x1, x2, x3)).T

        model = sm.OLS(y, exog)
        result = model.fit()

        summaries = {"x3": pctl(0.75)}
        values = {"x2": 1}
        pr1, ci1, fvals1 = predict_functional(result, "x1", summaries, values)

        values = {"x2": -1}
        pr2, ci2, fvals2 = predict_functional(result, "x1", summaries, values)

        plt.clf()
        fig = plt.figure()
        ax = plt.axes([0.1, 0.1, 0.7, 0.8])
        plt.plot(fvals1, pr1, '-', label='x2=1', lw=4, alpha=0.6, color='orange')
        plt.plot(fvals2, pr2, '-', label='x2=-1', lw=4, alpha=0.6, color='lime')
        ha, lb = ax.get_legend_handles_labels()
        leg = plt.figlegend(ha, lb, "center right")
        leg.draw_frame(False)
        plt.xlabel("Focus variable", size=15)
        plt.ylabel("Fitted mean", size=15)
        plt.title("Linear model prediction")
        self.close_or_save(fig)

        plt.clf()
        fig = plt.figure()
        ax = plt.axes([0.1, 0.1, 0.7, 0.8])
        plt.plot(fvals1, pr1, '-', label='x2=1', lw=4, alpha=0.6, color='orange')
        plt.fill_between(fvals1, ci1[:, 0], ci1[:, 1], color='grey')
        # BUGFIX: the second curve previously reused fvals1 and the 'x2=1'
        # label (copy-paste from the line above); it must be plotted against
        # its own x-grid and labeled 'x2=-1', matching the first figure.
        plt.plot(fvals2, pr2, '-', label='x2=-1', lw=4, alpha=0.6, color='lime')
        plt.fill_between(fvals2, ci2[:, 0], ci2[:, 1], color='grey')
        ha, lb = ax.get_legend_handles_labels()
        plt.figlegend(ha, lb, "center right")
        plt.xlabel("Focus variable", size=15)
        plt.ylabel("Fitted mean", size=15)
        plt.title("Linear model prediction")
        self.close_or_save(fig)