Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Debian packages RPM packages NuGet packages

Repository URL to install this package:

Details    
pandas / tests / io / parser / mangle_dupes.py
Size: Mime:
# -*- coding: utf-8 -*-

"""
Tests that duplicate columns are handled appropriately when parsed by the
CSV engine. In general, the expected result is that they are either thoroughly
de-duplicated (if mangling requested) or ignored otherwise.
"""

from pandas.compat import StringIO
from pandas import DataFrame

import pandas.util.testing as tm


class DupeColumnTests(object):
    def test_basic(self):
        # TODO: add test for condition "mangle_dupe_cols=False"
        # once it is actually supported (gh-12935)
        data = "a,a,b,b,b\n1,2,3,4,5"

        for method in ("read_csv", "read_table"):
            # Check default behavior.
            expected = ["a", "a.1", "b", "b.1", "b.2"]
            df = getattr(self, method)(StringIO(data), sep=",")
            assert list(df.columns) == expected

            df = getattr(self, method)(StringIO(data), sep=",",
                                       mangle_dupe_cols=True)
            assert list(df.columns) == expected

    def test_basic_names(self):
        # See gh-7160
        data = "a,b,a\n0,1,2\n3,4,5"
        expected = DataFrame([[0, 1, 2], [3, 4, 5]],
                             columns=["a", "b", "a.1"])

        df = self.read_csv(StringIO(data))
        tm.assert_frame_equal(df, expected)

        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
            data = "0,1,2\n3,4,5"
            df = self.read_csv(StringIO(data),
                               names=["a", "b", "a"])
            tm.assert_frame_equal(df, expected)

    def test_thorough_mangle_columns(self):
        # see gh-17060
        data = "a,a,a.1\n1,2,3"
        df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
        assert list(df.columns) == ["a", "a.1", "a.1.1"]

        data = "a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6"
        df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
        assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
                                    "a.1.1.1.1", "a.1.1.1.1.1"]

        data = "a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7"
        df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
        assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
                                    "a.2", "a.2.1", "a.3.1"]

    def test_thorough_mangle_names(self):
        # see gh-17095
        data = "a,b,b\n1,2,3"
        names = ["a.1", "a.1", "a.1.1"]

        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
            df = self.read_csv(StringIO(data), sep=",", names=names,
                               mangle_dupe_cols=True)
            assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"]

        data = "a,b,c,d,e,f\n1,2,3,4,5,6"
        names = ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"]

        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
            df = self.read_csv(StringIO(data), sep=",", names=names,
                               mangle_dupe_cols=True)
            assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
                                        "a.1.1.1.1", "a.1.1.1.1.1"]

        data = "a,b,c,d,e,f,g\n1,2,3,4,5,6,7"
        names = ["a", "a", "a.3", "a.1", "a.2", "a", "a"]

        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
            df = self.read_csv(StringIO(data), sep=",", names=names,
                               mangle_dupe_cols=True)
            assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
                                        "a.2", "a.2.1", "a.3.1"]