"""
Tests that apply specifically to the Python parser. Unless specifically
stated as a Python-specific issue, the goal is to eventually move as many of
these tests as possible out of this module once the C parser can accept
further arguments when parsing.
"""
import csv
from io import BytesIO, StringIO
import pytest
from pandas.errors import ParserError
from pandas import DataFrame, Index, MultiIndex
import pandas.util.testing as tm
def test_default_separator(python_parser_only):
    """Check that ``sep=None`` lets the parser sniff the delimiter.

    See gh-17333: ``csv.Sniffer`` in Python treats "o" as the separator
    for this input.
    """
    raw = "aob\n1o2\n3o4"
    frame = python_parser_only.read_csv(StringIO(raw), sep=None)
    tm.assert_frame_equal(frame, DataFrame({"a": [1, 3], "b": [2, 4]}))
@pytest.mark.parametrize("skipfooter", ["foo", 1.5, True])
def test_invalid_skipfooter_non_int(python_parser_only, skipfooter):
    """Expect ``ValueError`` when ``skipfooter`` is not an integer.

    See gh-15925 (comment).
    """
    buffer = StringIO("a\n1\n2")
    with pytest.raises(ValueError, match="skipfooter must be an integer"):
        python_parser_only.read_csv(buffer, skipfooter=skipfooter)
def test_invalid_skipfooter_negative(python_parser_only):
    """Expect ``ValueError`` when ``skipfooter`` is negative.

    See gh-15925 (comment).
    """
    buffer = StringIO("a\n1\n2")
    with pytest.raises(ValueError, match="skipfooter cannot be negative"):
        python_parser_only.read_csv(buffer, skipfooter=-1)
@pytest.mark.parametrize("kwargs", [{"sep": None}, {"delimiter": "|"}])
def test_sniff_delimiter(python_parser_only, kwargs):
    """Read pipe-delimited data with a sniffed or an explicit delimiter.

    Both parametrizations are expected to produce the same frame, indexed
    by the first column.
    """
    raw = """index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
    row_labels = Index(["foo", "bar", "baz"], name="index")
    wanted = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        columns=["A", "B", "C"],
        index=row_labels,
    )

    got = python_parser_only.read_csv(StringIO(raw), index_col=0, **kwargs)
    tm.assert_frame_equal(got, wanted)
@pytest.mark.parametrize("encoding", [None, "utf-8"])
def test_sniff_delimiter_encoding(python_parser_only, encoding):
    """Sniff the delimiter after skipping two leading junk rows.

    With no encoding the text is read from a ``StringIO``; otherwise it is
    encoded to bytes and read back through a ``TextIOWrapper``.
    """
    raw = """ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
    if encoding is None:
        source = StringIO(raw)
    else:
        from io import TextIOWrapper

        encoded = BytesIO(raw.encode(encoding))
        source = TextIOWrapper(encoded, encoding=encoding)

    got = python_parser_only.read_csv(
        source, index_col=0, sep=None, skiprows=2, encoding=encoding
    )

    row_labels = Index(["foo", "bar", "baz"], name="index")
    wanted = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        columns=["A", "B", "C"],
        index=row_labels,
    )
    tm.assert_frame_equal(got, wanted)
def test_single_line(python_parser_only):
    """Sniff the separator from a single line of data (see gh-6607)."""
    got = python_parser_only.read_csv(
        StringIO("1,2"), names=["a", "b"], header=None, sep=None
    )
    tm.assert_frame_equal(got, DataFrame({"a": [1], "b": [2]}))
@pytest.mark.parametrize("kwargs", [{"skipfooter": 2}, {"nrows": 3}])
def test_skipfooter(python_parser_only, kwargs):
    """Drop the two trailing junk lines via ``skipfooter`` or ``nrows``.

    See gh-6607. Both parametrizations are expected to yield the same
    three-row frame.
    """
    raw = """A,B,C
1,2,3
4,5,6
7,8,9
want to skip this
also also skip this
"""
    wanted = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["A", "B", "C"]
    )
    got = python_parser_only.read_csv(StringIO(raw), **kwargs)
    tm.assert_frame_equal(got, wanted)
@pytest.mark.parametrize(
    "compression,klass", [("gzip", "GzipFile"), ("bz2", "BZ2File")]
)
def test_decompression_regex_sep(python_parser_only, csv1, compression, klass):
    """Read a compressed file that uses a multi-character separator.

    See gh-6607. The contents of ``csv1`` are rewritten with "::" in place
    of ",", compressed into a temporary file, and read back with
    ``sep="::"``. The result must equal a plain read of ``csv1``.

    Parameters
    ----------
    python_parser_only : fixture
        Parser object exposing ``read_csv``.
    csv1 : fixture
        Path of the reference CSV file (opened here in binary mode).
    compression : str
        Name of the compression module; the test is skipped if it
        cannot be imported.
    klass : str
        Name of the file class to look up on that module.
    """
    parser = python_parser_only

    with open(csv1, "rb") as f:
        data = f.read()

    data = data.replace(b",", b"::")
    expected = parser.read_csv(csv1)

    module = pytest.importorskip(compression)
    klass = getattr(module, klass)

    with tm.ensure_clean() as path:
        # Use the file object as a context manager so it is closed (and
        # the compressed stream finalized) even if the write raises.
        with klass(path, mode="wb") as tmp:
            tmp.write(data)

        result = parser.read_csv(path, sep="::", compression=compression)
        tm.assert_frame_equal(result, expected)
def test_read_csv_buglet_4x_multi_index(python_parser_only):
    """Parse whitespace-separated data into a four-level ``MultiIndex``.

    See gh-6607. The expected frame uses the first four fields of each
    data row as index levels named by the second header line.
    """
    raw = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""

    row_index = MultiIndex.from_tuples(
        [("a", "b", 10.0032, 5), ("a", "q", 20, 4), ("x", "q", 30, 3)],
        names=["one", "two", "three", "four"],
    )
    values = [
        [-0.5109, -2.3358, -0.4645, 0.05076, 0.3640],
        [0.4473, 1.4152, 0.2834, 1.00661, 0.1744],
        [-0.6662, -0.5243, -0.3580, 0.89145, 2.5838],
    ]
    wanted = DataFrame(
        values, columns=["A", "B", "C", "D", "E"], index=row_index
    )

    got = python_parser_only.read_csv(StringIO(raw), sep=r"\s+")
    tm.assert_frame_equal(got, wanted)
def test_read_csv_buglet_4x_multi_index2(python_parser_only):
    """Parse whitespace-separated data with a three-column index.

    See gh-6893. The expected frame is built from records and indexed by
    the columns "a", "b" and "c".
    """
    raw = " A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9"

    records = [(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)]
    wanted = DataFrame.from_records(
        records, columns=list("abcABC"), index=list("abc")
    )

    got = python_parser_only.read_csv(StringIO(raw), sep=r"\s+")
    tm.assert_frame_equal(got, wanted)
@pytest.mark.parametrize("add_footer", [True, False])
def test_skipfooter_with_decimal(python_parser_only, add_footer):
    """Combine a custom decimal character with an optional skipped footer.

    See gh-6971. When a stray footer line is appended and skipped, it
    should not interfere with casting the first two lines to floats.
    """
    raw = "1#2\n3#4"
    extra = {}

    if add_footer:
        raw += "\nFooter"
        extra["skipfooter"] = 1

    got = python_parser_only.read_csv(
        StringIO(raw), names=["a"], decimal="#", **extra
    )
    tm.assert_frame_equal(got, DataFrame({"a": [1.2, 3.4]}))
@pytest.mark.parametrize(
    "sep", ["::", "#####", "!!!", "123", "#1!c5", "%!c!d", "@@#4:2", "_!pd#_"]
)
@pytest.mark.parametrize(
    "encoding", ["utf-16", "utf-16-be", "utf-16-le", "utf-32", "cp037"]
)
def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding):
    """Read non-UTF-8 bytes split by a multi-character separator.

    See gh-3404. A single row "1<sep>2" is encoded with ``encoding`` and
    read back from a bytes buffer.
    """
    payload = sep.join(["1", "2"]).encode(encoding)
    got = python_parser_only.read_csv(
        BytesIO(payload), sep=sep, names=["a", "b"], encoding=encoding
    )
    tm.assert_frame_equal(got, DataFrame({"a": [1], "b": [2]}))
@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
def test_multi_char_sep_quotes(python_parser_only, quoting):
    """Check the error raised for a bad row under a multi-char separator.

    See gh-13374. The read is expected to raise ``ParserError`` in both
    quoting modes. With quoting enabled the message must carry the hint
    about quotes being ignored; with ``csv.QUOTE_NONE`` it must not.

    The expectations are asserted directly on the raised exception rather
    than by catching the ``AssertionError`` of a failed ``match``, so an
    unrelated error cannot make the negative case pass.
    """
    parser = python_parser_only
    data = 'a,,b\n1,,a\n2,,"2,,b"'
    hint = "ignored when a multi-char delimiter is used"

    if quoting == csv.QUOTE_NONE:
        # The last row splits into three fields on ",," while the header
        # has two, so the plain field-count error is expected here.
        msg = "Expected 2 fields in line 3, saw 3"
        with pytest.raises(ParserError, match=msg) as excinfo:
            parser.read_csv(StringIO(data), quoting=quoting, sep=",,")

        # No quoting was requested, so the hint must be absent.
        assert hint not in str(excinfo.value)
    else:
        with pytest.raises(ParserError, match=hint):
            parser.read_csv(StringIO(data), quoting=quoting, sep=",,")
def test_none_delimiter(python_parser_only, capsys):
    """Sniff the delimiter while skipping a malformed line.

    See gh-13374 and gh-17465. The third line of the data has an extra
    field; it is expected to be skipped with a message on stderr rather
    than raising an error.
    """
    raw = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9"

    got = python_parser_only.read_csv(
        StringIO(raw),
        header=0,
        sep=None,
        warn_bad_lines=True,
        error_bad_lines=False,
    )
    wanted = DataFrame({"a": [0, 7], "b": [1, 8], "c": [2, 9]})
    tm.assert_frame_equal(got, wanted)

    stderr_text = capsys.readouterr().err
    assert "Skipping line 3" in stderr_text
@pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz'])
@pytest.mark.parametrize("skipfooter", [0, 1])
def test_skipfooter_bad_row(python_parser_only, data, skipfooter):
    """Check the footer hint in the error raised for a bad last row.

    See gh-13879 and gh-15910. The read is expected to raise
    ``ParserError`` in every case. The hint about parsing errors in the
    skipped footer rows must be present only when ``skipfooter`` is
    non-zero.

    The message is inspected directly instead of catching the
    ``AssertionError`` of a failed ``match``, which keeps the negative
    case explicit.
    """
    hint = "parsing errors in the skipped footer rows"
    parser = python_parser_only

    with pytest.raises(ParserError) as excinfo:
        parser.read_csv(StringIO(data), skipfooter=skipfooter)

    message = str(excinfo.value)
    if skipfooter:
        assert hint in message
    else:
        # Nothing is skipped, so the error must not blame the footer.
        assert hint not in message
def test_malformed_skipfooter(python_parser_only):
    """Expect a field-count ``ParserError`` when a footer row is skipped.

    The data is read with ``header=1``, ``comment="#"`` and
    ``skipfooter=1``; the row with five fields should trigger the error.
    """
    raw = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
footer
"""
    read_options = {"header": 1, "comment": "#", "skipfooter": 1}
    with pytest.raises(ParserError, match="Expected 3 fields in line 4, saw 5"):
        python_parser_only.read_csv(StringIO(raw), **read_options)