Repository URL to install this package:
|
Version:
0.23.4 ▾
|
# -*- coding: utf-8 -*-
"""
Tests that apply specifically to the CParser. Unless specifically stated
as a CParser-specific issue, the goal is to eventually move as many of
these tests out of this module as soon as the Python parser can accept
further arguments when parsing.
"""
import os
import sys
import tarfile
import pytest
import numpy as np
import pandas as pd
import pandas.util.testing as tm
import pandas.util._test_decorators as td
from pandas import DataFrame
from pandas.compat import StringIO, range, lrange
class CParserTests(object):
def test_buffer_overflow(self):
# see gh-9205: test certain malformed input files that cause
# buffer overflows in tokenizer.c
malfw = "1\r1\r1\r 1\r 1\r" # buffer overflow in words pointer
malfs = "1\r1\r1\r 1\r 1\r11\r" # buffer overflow in stream pointer
malfl = "1\r1\r1\r 1\r 1\r11\r1\r" # buffer overflow in lines pointer
cperr = 'Buffer overflow caught - possible malformed input file.'
for malf in (malfw, malfs, malfl):
try:
self.read_table(StringIO(malf))
except Exception as err:
assert cperr in str(err)
def test_buffer_rd_bytes(self):
# see gh-12098: src->buffer in the C parser can be freed twice leading
# to a segfault if a corrupt gzip file is read with 'read_csv' and the
# buffer is filled more than once before gzip throws an exception
data = '\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09' \
'\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0' \
'\xA6\x4D' + '\x55' * 267 + \
'\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00' \
'\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO'
for i in range(100):
try:
self.read_csv(StringIO(data),
compression='gzip',
delim_whitespace=True)
except Exception:
pass
def test_delim_whitespace_custom_terminator(self):
# See gh-12912
data = """a b c~1 2 3~4 5 6~7 8 9"""
df = self.read_csv(StringIO(data), lineterminator='~',
delim_whitespace=True)
expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
columns=['a', 'b', 'c'])
tm.assert_frame_equal(df, expected)
def test_dtype_and_names_error(self):
# see gh-8833: passing both dtype and names
# resulting in an error reporting issue
data = """
1.0 1
2.0 2
3.0 3
"""
# base cases
result = self.read_csv(StringIO(data), sep=r'\s+', header=None)
expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]])
tm.assert_frame_equal(result, expected)
result = self.read_csv(StringIO(data), sep=r'\s+',
header=None, names=['a', 'b'])
expected = DataFrame(
[[1.0, 1], [2.0, 2], [3.0, 3]], columns=['a', 'b'])
tm.assert_frame_equal(result, expected)
# fallback casting
result = self.read_csv(StringIO(
data), sep=r'\s+', header=None,
names=['a', 'b'], dtype={'a': np.int32})
expected = DataFrame([[1, 1], [2, 2], [3, 3]],
columns=['a', 'b'])
expected['a'] = expected['a'].astype(np.int32)
tm.assert_frame_equal(result, expected)
data = """
1.0 1
nan 2
3.0 3
"""
# fallback casting, but not castable
with tm.assert_raises_regex(ValueError, 'cannot safely convert'):
self.read_csv(StringIO(data), sep=r'\s+', header=None,
names=['a', 'b'], dtype={'a': np.int32})
def test_unsupported_dtype(self):
df = DataFrame(np.random.rand(5, 2), columns=list(
'AB'), index=['1A', '1B', '1C', '1D', '1E'])
with tm.ensure_clean('__unsupported_dtype__.csv') as path:
df.to_csv(path)
# valid but we don't support it (date)
pytest.raises(TypeError, self.read_csv, path,
dtype={'A': 'datetime64', 'B': 'float64'},
index_col=0)
pytest.raises(TypeError, self.read_csv, path,
dtype={'A': 'datetime64', 'B': 'float64'},
index_col=0, parse_dates=['B'])
# valid but we don't support it
pytest.raises(TypeError, self.read_csv, path,
dtype={'A': 'timedelta64', 'B': 'float64'},
index_col=0)
# valid but unsupported - fixed width unicode string
pytest.raises(TypeError, self.read_csv, path,
dtype={'A': 'U8'},
index_col=0)
@td.skip_if_32bit
def test_precise_conversion(self):
from decimal import Decimal
normal_errors = []
precise_errors = []
# test numbers between 1 and 2
for num in np.linspace(1., 2., num=500):
# 25 decimal digits of precision
text = 'a\n{0:.25}'.format(num)
normal_val = float(self.read_csv(StringIO(text))['a'][0])
precise_val = float(self.read_csv(
StringIO(text), float_precision='high')['a'][0])
roundtrip_val = float(self.read_csv(
StringIO(text), float_precision='round_trip')['a'][0])
actual_val = Decimal(text[2:])
def error(val):
return abs(Decimal('{0:.100}'.format(val)) - actual_val)
normal_errors.append(error(normal_val))
precise_errors.append(error(precise_val))
# round-trip should match float()
assert roundtrip_val == float(text[2:])
assert sum(precise_errors) <= sum(normal_errors)
assert max(precise_errors) <= max(normal_errors)
def test_usecols_dtypes(self):
data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
result = self.read_csv(StringIO(data), usecols=(0, 1, 2),
names=('a', 'b', 'c'),
header=None,
converters={'a': str},
dtype={'b': int, 'c': float},
)
result2 = self.read_csv(StringIO(data), usecols=(0, 2),
names=('a', 'b', 'c'),
header=None,
converters={'a': str},
dtype={'b': int, 'c': float},
)
assert (result.dtypes == [object, np.int, np.float]).all()
assert (result2.dtypes == [object, np.float]).all()
def test_disable_bool_parsing(self):
# #2090
data = """A,B,C
Yes,No,Yes
No,Yes,Yes
Yes,,Yes
No,No,No"""
result = self.read_csv(StringIO(data), dtype=object)
assert (result.dtypes == object).all()
result = self.read_csv(StringIO(data), dtype=object, na_filter=False)
assert result['B'][2] == ''
def test_custom_lineterminator(self):
data = 'a,b,c~1,2,3~4,5,6'
result = self.read_csv(StringIO(data), lineterminator='~')
expected = self.read_csv(StringIO(data.replace('~', '\n')))
tm.assert_frame_equal(result, expected)
def test_parse_ragged_csv(self):
data = """1,2,3
1,2,3,4
1,2,3,4,5
1,2
1,2,3,4"""
nice_data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
result = self.read_csv(StringIO(data), header=None,
names=['a', 'b', 'c', 'd', 'e'])
expected = self.read_csv(StringIO(nice_data), header=None,
names=['a', 'b', 'c', 'd', 'e'])
tm.assert_frame_equal(result, expected)
# too many columns, cause segfault if not careful
data = "1,2\n3,4,5"
result = self.read_csv(StringIO(data), header=None,
names=lrange(50))
expected = self.read_csv(StringIO(data), header=None,
names=lrange(3)).reindex(columns=lrange(50))
tm.assert_frame_equal(result, expected)
def test_tokenize_CR_with_quoting(self):
# see gh-3453
data = ' a,b,c\r"a,b","e,d","f,f"'
result = self.read_csv(StringIO(data), header=None)
expected = self.read_csv(StringIO(data.replace('\r', '\n')),
header=None)
tm.assert_frame_equal(result, expected)
result = self.read_csv(StringIO(data))
expected = self.read_csv(StringIO(data.replace('\r', '\n')))
tm.assert_frame_equal(result, expected)
def test_grow_boundary_at_cap(self):
# See gh-12494
#
# Cause of error was that the C parser
# was not increasing the buffer size when
# the desired space would fill the buffer
# to capacity, which would later cause a
# buffer overflow error when checking the
# EOF terminator of the CSV stream
def test_empty_header_read(count):
s = StringIO(',' * count)
expected = DataFrame(columns=[
'Unnamed: {i}'.format(i=i)
for i in range(count + 1)])
df = self.read_csv(s)
tm.assert_frame_equal(df, expected)
for count in range(1, 101):
test_empty_header_read(count)
def test_parse_trim_buffers(self):
# This test is part of a bugfix for issue #13703. It attempts to
# to stress the system memory allocator, to cause it to move the
# stream buffer and either let the OS reclaim the region, or let
# other memory requests of parser otherwise modify the contents
# of memory space, where it was formally located.
# This test is designed to cause a `segfault` with unpatched
# `tokenizer.c`. Sometimes the test fails on `segfault`, other
# times it fails due to memory corruption, which causes the
# loaded DataFrame to differ from the expected one.
# Generate a large mixed-type CSV file on-the-fly (one record is
# approx 1.5KiB).
record_ = \
"""9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" \
"""ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" \
"""ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" \
"""99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" \
"""9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" \
"""99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" \
"""99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" \
"""ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" \
"""ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" \
"""ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" \
"""9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" \
"""999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" \
""",,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" \
""",9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \
"""999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" \
""",9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" \
"""ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" \
""",999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" \
""",,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" \
"""9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" \
""".99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" \
""",,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \
"""99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" \
"""ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" \
"""-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" \
"""ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" \
""",9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" \
""",99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" \
""".99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""
# Set the number of lines so that a call to `parser_trim_buffers`
# is triggered: after a couple of full chunks are consumed a
# relatively small 'residual' chunk would cause reallocation
# within the parser.
chunksize, n_lines = 128, 2 * 128 + 15
csv_data = "\n".join([record_] * n_lines) + "\n"
# We will use StringIO to load the CSV from this text buffer.
# pd.read_csv() will iterate over the file in chunks and will
# finally read a residual chunk of really small size.
# Generate the expected output: manually create the dataframe
# by splitting by comma and repeating the `n_lines` times.
row = tuple(val_ if val_ else np.nan
for val_ in record_.split(","))
expected = pd.DataFrame([row for _ in range(n_lines)],
dtype=object, columns=None, index=None)
# Iterate over the CSV file in chunks of `chunksize` lines
chunks_ = self.read_csv(StringIO(csv_data), header=None,
dtype=object, chunksize=chunksize)
result = pd.concat(chunks_, axis=0, ignore_index=True)
# Check for data corruption if there was no segfault
tm.assert_frame_equal(result, expected)
# This extra test was added to replicate the fault in gh-5291.
# Force 'utf-8' encoding, so that `_string_convert` would take
# a different execution branch.
chunks_ = self.read_csv(StringIO(csv_data), header=None,
dtype=object, chunksize=chunksize,
encoding='utf_8')
result = pd.concat(chunks_, axis=0, ignore_index=True)
tm.assert_frame_equal(result, expected)
def test_internal_null_byte(self):
# see gh-14012
#
# The null byte ('\x00') should not be used as a
# true line terminator, escape character, or comment
# character, only as a placeholder to indicate that
# none was specified.
#
# This test should be moved to common.py ONLY when
# Python's csv class supports parsing '\x00'.
names = ['a', 'b', 'c']
data = "1,2,3\n4,\x00,6\n7,8,9"
expected = pd.DataFrame([[1, 2.0, 3], [4, np.nan, 6],
[7, 8, 9]], columns=names)
result = self.read_csv(StringIO(data), names=names)
tm.assert_frame_equal(result, expected)
def test_read_nrows_large(self):
# gh-7626 - Read only nrows of data in for large inputs (>262144b)
header_narrow = '\t'.join(['COL_HEADER_' + str(i)
for i in range(10)]) + '\n'
data_narrow = '\t'.join(['somedatasomedatasomedata1'
for i in range(10)]) + '\n'
header_wide = '\t'.join(['COL_HEADER_' + str(i)
for i in range(15)]) + '\n'
data_wide = '\t'.join(['somedatasomedatasomedata2'
for i in range(15)]) + '\n'
test_input = (header_narrow + data_narrow * 1050 +
header_wide + data_wide * 2)
df = self.read_csv(StringIO(test_input), sep='\t', nrows=1010)
assert df.size == 1010 * 10
def test_float_precision_round_trip_with_text(self):
# gh-15140 - This should not segfault on Python 2.7+
df = self.read_csv(StringIO('a'),
float_precision='round_trip',
header=None)
tm.assert_frame_equal(df, DataFrame({0: ['a']}))
def test_large_difference_in_columns(self):
# gh-14125
count = 10000
large_row = ('X,' * count)[:-1] + '\n'
normal_row = 'XXXXXX XXXXXX,111111111111111\n'
test_input = (large_row + normal_row * 6)[:-1]
result = self.read_csv(StringIO(test_input), header=None, usecols=[0])
rows = test_input.split('\n')
expected = DataFrame([row.split(',')[0] for row in rows])
tm.assert_frame_equal(result, expected)
def test_data_after_quote(self):
# see gh-15910
data = 'a\n1\n"b"a'
result = self.read_csv(StringIO(data))
expected = DataFrame({'a': ['1', 'ba']})
tm.assert_frame_equal(result, expected)
@tm.capture_stderr
def test_comment_whitespace_delimited(self):
test_input = """\
1 2
2 2 3
3 2 3 # 3 fields
4 2 3# 3 fields
5 2 # 2 fields
6 2# 2 fields
7 # 1 field, NaN
8# 1 field, NaN
9 2 3 # skipped line
# comment"""
df = self.read_csv(StringIO(test_input), comment='#', header=None,
delimiter='\\s+', skiprows=0,
error_bad_lines=False)
error = sys.stderr.getvalue()
# skipped lines 2, 3, 4, 9
for line_num in (2, 3, 4, 9):
assert 'Skipping line {}'.format(line_num) in error, error
expected = DataFrame([[1, 2],
[5, 2],
[6, 2],
[7, np.nan],
[8, np.nan]])
tm.assert_frame_equal(df, expected)
def test_file_like_no_next(self):
# gh-16530: the file-like need not have a "next" or "__next__"
# attribute despite having an "__iter__" attribute.
#
# NOTE: This is only true for the C engine, not Python engine.
class NoNextBuffer(StringIO):
def __next__(self):
raise AttributeError("No next method")
next = __next__
data = "a\n1"
expected = pd.DataFrame({"a": [1]})
result = self.read_csv(NoNextBuffer(data))
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"])
def test_read_tarfile(self, tar_suffix):
# see gh-16530
#
# Unfortunately, Python's CSV library can't handle
# tarfile objects (expects string, not bytes when
# iterating through a file-like).
tar_path = os.path.join(self.dirpath, "tar_csv" + tar_suffix)
with tarfile.open(tar_path, "r") as tar:
data_file = tar.extractfile("tar_data.csv")
out = self.read_csv(data_file)
expected = pd.DataFrame({"a": [1]})
tm.assert_frame_equal(out, expected)
@pytest.mark.high_memory
def test_bytes_exceed_2gb(self):
"""Read from a "CSV" that has a column larger than 2GB.
GH 16798
"""
if self.low_memory:
pytest.skip("not a high_memory test")
csv = StringIO('strings\n' + '\n'.join(
['x' * (1 << 20) for _ in range(2100)]))
df = self.read_csv(csv, low_memory=False)
assert not df.empty