Why Gemfury? Push, build, and install: RubyGems, npm packages, Python packages, Maven artifacts, PHP packages, Go Modules, Debian packages, RPM packages, NuGet packages

Repository URL to install this package:

Details    
pandas / tests / io / parser / test_python_parser_only.py
Size: Mime:
# -*- coding: utf-8 -*-

"""
Tests that apply specifically to the Python parser. Unless specifically
stated as a Python-specific issue, the goal is to eventually move as many of
these tests as possible out of this module once the C parser can accept
further arguments when parsing.
"""

import csv

import pytest

import pandas.compat as compat
from pandas.compat import BytesIO, StringIO, u
from pandas.errors import ParserError

from pandas import DataFrame, Index, MultiIndex
import pandas.util.testing as tm


def test_default_separator(python_parser_only):
    # see gh-17333
    #
    # csv.Sniffer in Python treats "o" as separator, so passing
    # sep=None is expected to split every line on that character.
    raw = "aob\n1o2\n3o4"

    frame = python_parser_only.read_csv(StringIO(raw), sep=None)
    tm.assert_frame_equal(frame, DataFrame({"a": [1, 3], "b": [2, 4]}))


@pytest.mark.parametrize("skipfooter", ["foo", 1.5, True])
def test_invalid_skipfooter_non_int(python_parser_only, skipfooter):
    # see gh-15925 (comment)
    #
    # A string, a float and a bool are each expected to be
    # rejected with a ValueError.
    buf = StringIO("a\n1\n2")

    with pytest.raises(ValueError, match="skipfooter must be an integer"):
        python_parser_only.read_csv(buf, skipfooter=skipfooter)


def test_invalid_skipfooter_negative(python_parser_only):
    # see gh-15925 (comment)
    #
    # A negative footer count is expected to be rejected with a ValueError.
    buf = StringIO("a\n1\n2")

    with pytest.raises(ValueError, match="skipfooter cannot be negative"):
        python_parser_only.read_csv(buf, skipfooter=-1)


@pytest.mark.parametrize("kwargs", [
    {"sep": None},
    {"delimiter": "|"},
])
def test_sniff_delimiter(python_parser_only, kwargs):
    # The pipe delimiter is either sniffed (sep=None) or passed explicitly;
    # both parametrizations are compared against the same expected frame.
    data = """index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
    row_labels = Index(["foo", "bar", "baz"], name="index")
    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         columns=["A", "B", "C"], index=row_labels)

    result = python_parser_only.read_csv(StringIO(data), index_col=0,
                                         **kwargs)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("encoding", [None, "utf-8"])
def test_sniff_delimiter_encoding(python_parser_only, encoding):
    # The first two lines are junk and are skipped via skiprows=2; the
    # delimiter is then sniffed (sep=None) from the remaining lines.
    data = """ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""

    if encoding is None:
        # No encoding requested: feed the text straight through.
        buf = StringIO(data)
    else:
        # Hand the parser encoded bytes; on Python 3 the byte stream
        # is additionally wrapped in a text layer with the same encoding.
        buf = BytesIO(u(data).encode(encoding))

        if compat.PY3:
            from io import TextIOWrapper
            buf = TextIOWrapper(buf, encoding=encoding)

    row_labels = Index(["foo", "bar", "baz"], name="index")
    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         columns=["A", "B", "C"], index=row_labels)

    result = python_parser_only.read_csv(buf, index_col=0, sep=None,
                                         skiprows=2, encoding=encoding)
    tm.assert_frame_equal(result, expected)


def test_single_line(python_parser_only):
    # see gh-6607: sniff separator
    #
    # The input consists of a single data line and no header row.
    buf = StringIO("1,2")

    frame = python_parser_only.read_csv(buf, sep=None, header=None,
                                        names=["a", "b"])
    tm.assert_frame_equal(frame, DataFrame({"a": [1], "b": [2]}))


@pytest.mark.parametrize("kwargs", [{"skipfooter": 2}, {"nrows": 3}])
def test_skipfooter(python_parser_only, kwargs):
    # see gh-6607
    #
    # Dropping the last two lines and reading only the first three
    # rows are both expected to yield the same three-row frame.
    data = """A,B,C
1,2,3
4,5,6
7,8,9
want to skip this
also also skip this
"""
    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         columns=["A", "B", "C"])

    result = python_parser_only.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("compression,klass", [
    ("gzip", "GzipFile"),
    ("bz2", "BZ2File"),
])
def test_decompression_regex_sep(python_parser_only, csv1, compression, klass):
    # see gh-6607
    #
    # Round-trip the ``csv1`` fixture file through a compressed copy whose
    # commas were replaced by the multi-char separator "::", and check that
    # reading it back with ``sep="::"`` matches a plain read of the original.
    parser = python_parser_only

    # Resolve the compression module first so that a missing module skips
    # the test before any file is read or parsed.
    module = pytest.importorskip(compression)
    file_cls = getattr(module, klass)

    with open(csv1, "rb") as f:
        data = f.read()

    data = data.replace(b",", b"::")
    expected = parser.read_csv(csv1)

    with tm.ensure_clean() as path:
        # The context manager guarantees the compressed file is closed
        # (and therefore flushed) even if the write fails.
        with file_cls(path, mode="wb") as tmp:
            tmp.write(data)

        result = parser.read_csv(path, sep="::",
                                 compression=compression)
        tm.assert_frame_equal(result, expected)


def test_read_csv_buglet_4x_multi_index(python_parser_only):
    # see gh-6607
    #
    # The first line names five columns while each data row has nine
    # fields; the expected frame uses the four leading fields as a
    # MultiIndex whose level names come from the second line.
    data = """                      A       B       C       D        E
one two three   four
a   b   10.0032 5    -0.5109 -2.3358 -0.4645  0.05076  0.3640
a   q   20      4     0.4473  1.4152  0.2834  1.00661  0.1744
x   q   30      3    -0.6662 -0.5243 -0.3580  0.89145  2.5838"""

    row_index = MultiIndex.from_tuples(
        [("a", "b", 10.0032, 5),
         ("a", "q", 20, 4),
         ("x", "q", 30, 3)],
        names=["one", "two", "three", "four"])
    values = [[-0.5109, -2.3358, -0.4645, 0.05076, 0.3640],
              [0.4473, 1.4152, 0.2834, 1.00661, 0.1744],
              [-0.6662, -0.5243, -0.3580, 0.89145, 2.5838]]
    expected = DataFrame(values, columns=["A", "B", "C", "D", "E"],
                         index=row_index)

    result = python_parser_only.read_csv(StringIO(data), sep=r"\s+")
    tm.assert_frame_equal(result, expected)


def test_read_csv_buglet_4x_multi_index2(python_parser_only):
    # see gh-6893
    #
    # Three column names on the first line, three more names on the second
    # line, and six fields per data row.
    data = "      A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9"

    records = [(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)]
    expected = DataFrame.from_records(
        records,
        columns=["a", "b", "c", "A", "B", "C"],
        index=["a", "b", "c"])

    result = python_parser_only.read_csv(StringIO(data), sep=r"\s+")
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("add_footer", [True, False])
def test_skipfooter_with_decimal(python_parser_only, add_footer):
    # see gh-6971
    #
    # "#" serves as the decimal mark, so "1#2" is expected to parse as 1.2.
    data = "1#2\n3#4"
    kwargs = {}

    if add_footer:
        # The stray footer line should not mess with the
        # casting of the first two lines if we skip it.
        data += "\nFooter"
        kwargs["skipfooter"] = 1

    result = python_parser_only.read_csv(StringIO(data), names=["a"],
                                         decimal="#", **kwargs)
    tm.assert_frame_equal(result, DataFrame({"a": [1.2, 3.4]}))


@pytest.mark.parametrize("sep", ["::", "#####", "!!!", "123", "#1!c5",
                                 "%!c!d", "@@#4:2", "_!pd#_"])
@pytest.mark.parametrize("encoding", ["utf-16", "utf-16-be", "utf-16-le",
                                      "utf-32", "cp037"])
def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding):
    # see gh-3404
    #
    # One row of two values joined by the multi-char separator, handed
    # to the parser as bytes in the given non-UTF-8 encoding.
    payload = sep.join(["1", "2"]).encode(encoding)

    result = python_parser_only.read_csv(BytesIO(payload), sep=sep,
                                         names=["a", "b"],
                                         encoding=encoding)
    tm.assert_frame_equal(result, DataFrame({"a": [1], "b": [2]}))


@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
def test_multi_char_sep_quotes(python_parser_only, quoting):
    # see gh-13374
    #
    # The last row carries the multi-char separator inside quotes. A
    # ParserError is expected under both quoting modes, but only when
    # quoting is enabled should its message include the hint that quotes
    # are ignored with a multi-char delimiter.
    parser = python_parser_only
    data = 'a,,b\n1,,a\n2,,"2,,b"'
    msg = "ignored when a multi-char delimiter is used"

    if quoting == csv.QUOTE_NONE:
        # Inspect the raised error directly instead of catching the
        # AssertionError pytest emits on a failed ``match``: that
        # approach would also accept any unrelated assertion failure.
        with pytest.raises(ParserError) as excinfo:
            parser.read_csv(StringIO(data), sep=",,", quoting=quoting)

        assert msg not in str(excinfo.value)
    else:
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data), sep=",,", quoting=quoting)


def test_none_delimiter(python_parser_only, capsys):
    # see gh-13374 and gh-17465
    #
    # We expect the third line in the data to be skipped because it
    # is malformed (one field too many), but we do not expect any
    # errors to occur; the skip should be reported on stderr instead.
    raw = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9"

    frame = python_parser_only.read_csv(StringIO(raw), sep=None, header=0,
                                        error_bad_lines=False,
                                        warn_bad_lines=True)
    tm.assert_frame_equal(
        frame, DataFrame({"a": [0, 7], "b": [1, 8], "c": [2, 9]}))

    stderr = capsys.readouterr().err
    assert "Skipping line 3" in stderr


@pytest.mark.parametrize("data", [
    'a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz'])
@pytest.mark.parametrize("skipfooter", [0, 1])
def test_skipfooter_bad_row(python_parser_only, data, skipfooter):
    # see gh-13879 and gh-15910
    #
    # Both inputs end in a row with malformed quoting. A ParserError is
    # expected either way, but the hint about errors in the skipped footer
    # rows should only appear when a footer is actually being skipped.
    parser = python_parser_only
    msg = "parsing errors in the skipped footer rows"

    if skipfooter:
        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(data), skipfooter=skipfooter)
    else:
        # Inspect the raised error directly instead of catching the
        # AssertionError pytest emits on a failed ``match``: that
        # approach would also accept any unrelated assertion failure.
        with pytest.raises(ParserError) as excinfo:
            parser.read_csv(StringIO(data), skipfooter=skipfooter)

        assert msg not in str(excinfo.value)


def test_malformed_skipfooter(python_parser_only):
    # The row "1,2,3,4,5" has five fields under a three-column header, and
    # the read is expected to fail on it even though the trailing footer
    # line is being skipped.
    data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
footer
"""
    expected_msg = "Expected 3 fields in line 4, saw 5"

    with pytest.raises(ParserError, match=expected_msg):
        python_parser_only.read_csv(StringIO(data), header=1,
                                    comment="#", skipfooter=1)