Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Bower components Debian packages RPM packages NuGet packages

aaronreidsmith / pandas   python

Repository URL to install this package:

Version: 0.25.3 

/ tests / frame / test_repr_info.py

from datetime import datetime, timedelta
from io import StringIO
import re
import sys
import textwrap

import numpy as np
import pytest

from pandas.compat import PYPY

import pandas as pd
from pandas import (
    Categorical,
    DataFrame,
    Series,
    date_range,
    option_context,
    period_range,
)
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm

import pandas.io.formats.format as fmt

# Segregated collection of methods that require the BlockManager internal data
# structure


class TestDataFrameReprInfoEtc(TestData):
    def test_repr_empty(self):
        # empty
        foo = repr(self.empty)  # noqa

        # empty with index
        frame = DataFrame(index=np.arange(1000))
        foo = repr(frame)  # noqa

    def test_repr_mixed(self):
        buf = StringIO()

        # mixed
        foo = repr(self.mixed_frame)  # noqa
        self.mixed_frame.info(verbose=False, buf=buf)

    @pytest.mark.slow
    def test_repr_mixed_big(self):
        # big mixed
        biggie = DataFrame(
            {"A": np.random.randn(200), "B": tm.makeStringIndex(200)}, index=range(200)
        )
        biggie.loc[:20, "A"] = np.nan
        biggie.loc[:20, "B"] = np.nan

        foo = repr(biggie)  # noqa

    def test_repr(self):
        buf = StringIO()

        # small one
        foo = repr(self.frame)
        self.frame.info(verbose=False, buf=buf)

        # even smaller
        self.frame.reindex(columns=["A"]).info(verbose=False, buf=buf)
        self.frame.reindex(columns=["A", "B"]).info(verbose=False, buf=buf)

        # exhausting cases in DataFrame.info

        # columns but no index
        no_index = DataFrame(columns=[0, 1, 3])
        foo = repr(no_index)  # noqa

        # no columns or index
        self.empty.info(buf=buf)

        df = DataFrame(["a\n\r\tb"], columns=["a\n\r\td"], index=["a\n\r\tf"])
        assert "\t" not in repr(df)
        assert "\r" not in repr(df)
        assert "a\n" not in repr(df)

    def test_repr_dimensions(self):
        df = DataFrame([[1, 2], [3, 4]])
        with option_context("display.show_dimensions", True):
            assert "2 rows x 2 columns" in repr(df)

        with option_context("display.show_dimensions", False):
            assert "2 rows x 2 columns" not in repr(df)

        with option_context("display.show_dimensions", "truncate"):
            assert "2 rows x 2 columns" not in repr(df)

    @pytest.mark.slow
    def test_repr_big(self):
        # big one
        biggie = DataFrame(np.zeros((200, 4)), columns=range(4), index=range(200))
        repr(biggie)

    def test_repr_unsortable(self):
        # columns are not sortable
        import warnings

        warn_filters = warnings.filters
        warnings.filterwarnings("ignore", category=FutureWarning, module=".*format")

        unsortable = DataFrame(
            {
                "foo": [1] * 50,
                datetime.today(): [1] * 50,
                "bar": ["bar"] * 50,
                datetime.today() + timedelta(1): ["bar"] * 50,
            },
            index=np.arange(50),
        )
        repr(unsortable)

        fmt.set_option("display.precision", 3, "display.column_space", 10)
        repr(self.frame)

        fmt.set_option("display.max_rows", 10, "display.max_columns", 2)
        repr(self.frame)

        fmt.set_option("display.max_rows", 1000, "display.max_columns", 1000)
        repr(self.frame)

        tm.reset_display_options()

        warnings.filters = warn_filters

    def test_repr_unicode(self):
        uval = "\u03c3\u03c3\u03c3\u03c3"

        # TODO(wesm): is this supposed to be used?
        bval = uval.encode("utf-8")  # noqa

        df = DataFrame({"A": [uval, uval]})

        result = repr(df)
        ex_top = "      A"
        assert result.split("\n")[0].rstrip() == ex_top

        df = DataFrame({"A": [uval, uval]})
        result = repr(df)
        assert result.split("\n")[0].rstrip() == ex_top

    def test_unicode_string_with_unicode(self):
        df = DataFrame({"A": ["\u05d0"]})
        str(df)

    def test_str_to_bytes_raises(self):
        # GH 26447
        df = DataFrame({"A": ["abc"]})
        msg = "^'str' object cannot be interpreted as an integer$"
        with pytest.raises(TypeError, match=msg):
            bytes(df)

    def test_very_wide_info_repr(self):
        df = DataFrame(np.random.randn(10, 20), columns=tm.rands_array(10, 20))
        repr(df)

    def test_repr_column_name_unicode_truncation_bug(self):
        # #1906
        df = DataFrame(
            {
                "Id": [7117434],
                "StringCol": (
                    "Is it possible to modify drop plot code"
                    " so that the output graph is displayed "
                    "in iphone simulator, Is it possible to "
                    "modify drop plot code so that the "
                    "output graph is \xe2\x80\xa8displayed "
                    "in iphone simulator.Now we are adding "
                    "the CSV file externally. I want to Call"
                    " the File through the code.."
                ),
            }
        )

        with option_context("display.max_columns", 20):
            assert "StringCol" in repr(df)

    def test_latex_repr(self):
        result = r"""\begin{tabular}{llll}
\toprule
{} &         0 &  1 &  2 \\
\midrule
0 &  $\alpha$ &  b &  c \\
1 &         1 &  2 &  3 \\
\bottomrule
\end{tabular}
"""
        with option_context("display.latex.escape", False, "display.latex.repr", True):
            df = DataFrame([[r"$\alpha$", "b", "c"], [1, 2, 3]])
            assert result == df._repr_latex_()

        # GH 12182
        assert df._repr_latex_() is None

    def test_info(self):
        io = StringIO()
        self.frame.info(buf=io)
        self.tsframe.info(buf=io)

        frame = DataFrame(np.random.randn(5, 3))

        frame.info()
        frame.info(verbose=False)

    def test_info_memory(self):
        # https://github.com/pandas-dev/pandas/issues/21056
        df = pd.DataFrame({"a": pd.Series([1, 2], dtype="i8")})
        buf = StringIO()
        df.info(buf=buf)
        result = buf.getvalue()
        bytes = float(df.memory_usage().sum())

        expected = textwrap.dedent(
            """\
        <class 'pandas.core.frame.DataFrame'>
        RangeIndex: 2 entries, 0 to 1
        Data columns (total 1 columns):
        a    2 non-null int64
        dtypes: int64(1)
        memory usage: {} bytes
        """.format(
                bytes
            )
        )

        assert result == expected

    def test_info_wide(self):
        from pandas import set_option, reset_option

        io = StringIO()
        df = DataFrame(np.random.randn(5, 101))
        df.info(buf=io)

        io = StringIO()
        df.info(buf=io, max_cols=101)
        rs = io.getvalue()
        assert len(rs.splitlines()) > 100
        xp = rs

        set_option("display.max_info_columns", 101)
        io = StringIO()
        df.info(buf=io)
        assert rs == xp
        reset_option("display.max_info_columns")

    def test_info_duplicate_columns(self):
        io = StringIO()

        # it works!
        frame = DataFrame(np.random.randn(1500, 4), columns=["a", "a", "b", "b"])
        frame.info(buf=io)

    def test_info_duplicate_columns_shows_correct_dtypes(self):
        # GH11761
        io = StringIO()

        frame = DataFrame([[1, 2.0]], columns=["a", "a"])
        frame.info(buf=io)
        io.seek(0)
        lines = io.readlines()
        assert "a    1 non-null int64\n" == lines[3]
        assert "a    1 non-null float64\n" == lines[4]

    def test_info_shows_column_dtypes(self):
        dtypes = [
            "int64",
            "float64",
            "datetime64[ns]",
            "timedelta64[ns]",
            "complex128",
            "object",
            "bool",
        ]
        data = {}
        n = 10
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        buf = StringIO()
        df.info(buf=buf)
        res = buf.getvalue()
        for i, dtype in enumerate(dtypes):
            name = "{i:d}    {n:d} non-null {dtype}".format(i=i, n=n, dtype=dtype)
            assert name in res

    def test_info_max_cols(self):
        df = DataFrame(np.random.randn(10, 5))
        for len_, verbose in [(5, None), (5, False), (10, True)]:
            # For verbose always      ^ setting  ^ summarize ^ full output
            with option_context("max_info_columns", 4):
                buf = StringIO()
                df.info(buf=buf, verbose=verbose)
                res = buf.getvalue()
                assert len(res.strip().split("\n")) == len_

        for len_, verbose in [(10, None), (5, False), (10, True)]:

            # max_cols no exceeded
            with option_context("max_info_columns", 5):
                buf = StringIO()
                df.info(buf=buf, verbose=verbose)
                res = buf.getvalue()
                assert len(res.strip().split("\n")) == len_

        for len_, max_cols in [(10, 5), (5, 4)]:
            # setting truncates
            with option_context("max_info_columns", 4):
                buf = StringIO()
                df.info(buf=buf, max_cols=max_cols)
                res = buf.getvalue()
                assert len(res.strip().split("\n")) == len_

            # setting wouldn't truncate
            with option_context("max_info_columns", 5):
                buf = StringIO()
                df.info(buf=buf, max_cols=max_cols)
                res = buf.getvalue()
                assert len(res.strip().split("\n")) == len_

    def test_info_memory_usage(self):
        # Ensure memory usage is displayed, when asserted, on the last line
        dtypes = [
            "int64",
            "float64",
            "datetime64[ns]",
            "timedelta64[ns]",
            "complex128",
            "object",
            "bool",
        ]
        data = {}
        n = 10
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        buf = StringIO()

        # display memory usage case
        df.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        assert "memory usage: " in res[-1]

        # do not display memory usage case
        df.info(buf=buf, memory_usage=False)
        res = buf.getvalue().splitlines()
        assert "memory usage: " not in res[-1]

        df.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()

        # memory usage is a lower bound, so print it as XYZ+ MB
        assert re.match(r"memory usage: [^+]+\+", res[-1])

        df.iloc[:, :5].info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()

        # excluded column with object dtype, so estimate is accurate
        assert not re.match(r"memory usage: [^+]+\+", res[-1])

        # Test a DataFrame with duplicate columns
        dtypes = ["int64", "int64", "int64", "float64"]
        data = {}
        n = 100
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        df.columns = dtypes

        df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"])
        df_with_object_index.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        assert re.match(r"memory usage: [^+]+\+", res[-1])

        df_with_object_index.info(buf=buf, memory_usage="deep")
        res = buf.getvalue().splitlines()
        assert re.match(r"memory usage: [^+]+$", res[-1])

        # Ensure df size is as expected
        # (cols * rows * bytes) + index size
        df_size = df.memory_usage().sum()
        exp_size = len(dtypes) * n * 8 + df.index.nbytes
        assert df_size == exp_size

        # Ensure number of cols in memory_usage is the same as df
        size_df = np.size(df.columns.values) + 1  # index=True; default
        assert size_df == np.size(df.memory_usage())

        # assert deep works only on object
        assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()

        # test for validity
        DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True)
        DataFrame(1, index=["a"], columns=["A"]).index.nbytes
        df = DataFrame(
            data=1,
            index=pd.MultiIndex.from_product([["a"], range(1000)]),
            columns=["A"],
        )
        df.index.nbytes
        df.memory_usage(index=True)
        df.index.values.nbytes

        mem = df.memory_usage(deep=True).sum()
        assert mem > 0

    @pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result")
    def test_info_memory_usage_deep_not_pypy(self):
        df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"])
        assert (
            df_with_object_index.memory_usage(index=True, deep=True).sum()
            > df_with_object_index.memory_usage(index=True).sum()
        )

        df_object = pd.DataFrame({"a": ["a"]})
        assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum()

    @pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result")
    def test_info_memory_usage_deep_pypy(self):
        df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"])
        assert (
            df_with_object_index.memory_usage(index=True, deep=True).sum()
            == df_with_object_index.memory_usage(index=True).sum()
        )

        df_object = pd.DataFrame({"a": ["a"]})
        assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum()

    @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
    def test_usage_via_getsizeof(self):
        df = DataFrame(
            data=1,
            index=pd.MultiIndex.from_product([["a"], range(1000)]),
            columns=["A"],
        )
        mem = df.memory_usage(deep=True).sum()
        # sys.getsizeof will call the .memory_usage with
        # deep=True, and add on some GC overhead
        diff = mem - sys.getsizeof(df)
        assert abs(diff) < 100

    def test_info_memory_usage_qualified(self):

        buf = StringIO()
        df = DataFrame(1, columns=list("ab"), index=[1, 2, 3])
        df.info(buf=buf)
        assert "+" not in buf.getvalue()

        buf = StringIO()
        df = DataFrame(1, columns=list("ab"), index=list("ABC"))
        df.info(buf=buf)
        assert "+" in buf.getvalue()

        buf = StringIO()
        df = DataFrame(
            1,
            columns=list("ab"),
            index=pd.MultiIndex.from_product([range(3), range(3)]),
        )
        df.info(buf=buf)
        assert "+" not in buf.getvalue()

        buf = StringIO()
        df = DataFrame(
            1,
            columns=list("ab"),
            index=pd.MultiIndex.from_product([range(3), ["foo", "bar"]]),
        )
        df.info(buf=buf)
        assert "+" in buf.getvalue()

    def test_info_memory_usage_bug_on_multiindex(self):
        # GH 14308
        # memory usage introspection should not materialize .values

        from string import ascii_uppercase as uppercase

        def memory_usage(f):
            return f.memory_usage(deep=True).sum()

        N = 100
        M = len(uppercase)
        index = pd.MultiIndex.from_product(
            [list(uppercase), pd.date_range("20160101", periods=N)],
            names=["id", "date"],
        )
        df = DataFrame({"value": np.random.randn(N * M)}, index=index)

        unstacked = df.unstack("id")
        assert df.values.nbytes == unstacked.values.nbytes
        assert memory_usage(df) > memory_usage(unstacked)

        # high upper bound
        assert memory_usage(unstacked) - memory_usage(df) < 2000

    def test_info_categorical(self):
        # GH14298
        idx = pd.CategoricalIndex(["a", "b"])
        df = pd.DataFrame(np.zeros((2, 2)), index=idx, columns=idx)

        buf = StringIO()
        df.info(buf=buf)

    def test_info_categorical_column(self):

        # make sure it works
        n = 2500
        df = DataFrame({"int64": np.random.randint(100, size=n)})
        df["category"] = Series(
            np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n))
        ).astype("category")
        df.isna()
        buf = StringIO()
        df.info(buf=buf)

        df2 = df[df["category"] == "d"]
        buf = StringIO()
        df2.info(buf=buf)

    def test_repr_categorical_dates_periods(self):
        # normal DataFrame
        dt = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern")
        p = period_range("2011-01", freq="M", periods=5)
        df = DataFrame({"dt": dt, "p": p})
        exp = """                         dt        p
0 2011-01-01 09:00:00-05:00  2011-01
1 2011-01-01 10:00:00-05:00  2011-02
2 2011-01-01 11:00:00-05:00  2011-03
3 2011-01-01 12:00:00-05:00  2011-04
4 2011-01-01 13:00:00-05:00  2011-05"""

        assert repr(df) == exp

        df2 = DataFrame({"dt": Categorical(dt), "p": Categorical(p)})
        assert repr(df2) == exp

    @pytest.mark.parametrize("arg", [np.datetime64, np.timedelta64])
    @pytest.mark.parametrize(
        "box, expected",
        [[Series, "0    NaT\ndtype: object"], [DataFrame, "     0\n0  NaT"]],
    )
    def test_repr_np_nat_with_object(self, arg, box, expected):
        # GH 25445
        result = repr(box([arg("NaT")], dtype=object))
        assert result == expected