Why Gemfury? Push, build, and install RubyGems, npm packages, Python packages, Maven artifacts, PHP packages, Go modules, Debian packages, RPM packages, and NuGet packages.

Repository URL to install this package:

Details    
pandas / tests / io / parser / c_parser_only.py
Size: Mime:
# -*- coding: utf-8 -*-

"""
Tests that apply specifically to the CParser. Unless specifically stated
as a CParser-specific issue, the goal is to eventually move as many of
these tests out of this module as soon as the Python parser can accept
further arguments when parsing.
"""

import os
import sys
import tarfile

import pytest
import numpy as np

import pandas as pd
import pandas.util.testing as tm
import pandas.util._test_decorators as td
from pandas import DataFrame
from pandas.compat import StringIO, range, lrange


class CParserTests(object):

    def test_buffer_overflow(self):
        # see gh-9205: test certain malformed input files that cause
        # buffer overflows in tokenizer.c
        #
        # Each payload previously caused an out-of-bounds write in a
        # different internal pointer of the C tokenizer.
        malfw = "1\r1\r1\r 1\r 1\r"         # buffer overflow in words pointer
        malfs = "1\r1\r1\r 1\r 1\r11\r"     # buffer overflow in stream pointer
        malfl = "1\r1\r1\r 1\r 1\r11\r1\r"  # buffer overflow in lines pointer

        cperr = 'Buffer overflow caught - possible malformed input file.'

        for malf in (malfw, malfs, malfl):
            # The previous try/except passed silently when no exception was
            # raised at all; pytest.raises ensures the parser really errors
            # out, and the message check confirms the overflow was caught.
            with pytest.raises(Exception) as excinfo:
                self.read_table(StringIO(malf))
            assert cperr in str(excinfo.value)

    def test_buffer_rd_bytes(self):
        # see gh-12098: src->buffer in the C parser can be freed twice leading
        # to a segfault if a corrupt gzip file is read with 'read_csv' and the
        # buffer is filled more than once before gzip throws an exception
        header = ('\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09'
                  '\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0'
                  '\xA6\x4D')
        trailer = ('\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00'
                   '\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO')
        data = header + '\x55' * 267 + trailer

        # Parse the corrupt payload many times; the test only cares that no
        # attempt segfaults, so the raised errors themselves are deliberately
        # ignored.
        for _ in range(100):
            try:
                self.read_csv(StringIO(data),
                              compression='gzip',
                              delim_whitespace=True)
            except Exception:
                pass

    def test_delim_whitespace_custom_terminator(self):
        # See gh-12912: '~' acts as the record terminator while whitespace
        # splits the fields inside each record.
        data = 'a b c~1 2 3~4 5 6~7 8 9'
        expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                             columns=['a', 'b', 'c'])

        result = self.read_csv(StringIO(data), lineterminator='~',
                               delim_whitespace=True)
        tm.assert_frame_equal(result, expected)

    def test_dtype_and_names_error(self):
        # see gh-8833: passing both dtype and names
        # resulting in an error reporting issue
        data = """
1.0 1
2.0 2
3.0 3
"""
        # base cases
        result = self.read_csv(StringIO(data), sep=r'\s+', header=None)
        expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]])
        tm.assert_frame_equal(result, expected)

        result = self.read_csv(StringIO(data), sep=r'\s+',
                               header=None, names=['a', 'b'])
        expected = DataFrame(
            [[1.0, 1], [2.0, 2], [3.0, 3]], columns=['a', 'b'])
        tm.assert_frame_equal(result, expected)

        # fallback casting
        result = self.read_csv(StringIO(
            data), sep=r'\s+', header=None,
            names=['a', 'b'], dtype={'a': np.int32})
        expected = DataFrame([[1, 1], [2, 2], [3, 3]],
                             columns=['a', 'b'])
        expected['a'] = expected['a'].astype(np.int32)
        tm.assert_frame_equal(result, expected)

        data = """
1.0 1
nan 2
3.0 3
"""
        # fallback casting, but not castable
        with tm.assert_raises_regex(ValueError, 'cannot safely convert'):
            self.read_csv(StringIO(data), sep=r'\s+', header=None,
                          names=['a', 'b'], dtype={'a': np.int32})

    def test_unsupported_dtype(self):
        # dtypes that are valid numpy specs but unsupported by the C parser
        # must raise TypeError rather than silently mis-parse
        frame = DataFrame(np.random.rand(5, 2),
                          columns=list('AB'),
                          index=['1A', '1B', '1C', '1D', '1E'])

        with tm.ensure_clean('__unsupported_dtype__.csv') as path:
            frame.to_csv(path)

            # valid but we don't support it (date)
            with pytest.raises(TypeError):
                self.read_csv(path,
                              dtype={'A': 'datetime64', 'B': 'float64'},
                              index_col=0)
            with pytest.raises(TypeError):
                self.read_csv(path,
                              dtype={'A': 'datetime64', 'B': 'float64'},
                              index_col=0, parse_dates=['B'])

            # valid but we don't support it
            with pytest.raises(TypeError):
                self.read_csv(path,
                              dtype={'A': 'timedelta64', 'B': 'float64'},
                              index_col=0)

            # valid but unsupported - fixed width unicode string
            with pytest.raises(TypeError):
                self.read_csv(path, dtype={'A': 'U8'}, index_col=0)

    @td.skip_if_32bit
    def test_precise_conversion(self):
        from decimal import Decimal

        def abs_error(parsed, exact):
            # absolute error of a parsed float against the exact decimal
            return abs(Decimal('{0:.100}'.format(parsed)) - exact)

        normal_errors = []
        precise_errors = []

        # test numbers between 1 and 2
        for num in np.linspace(1., 2., num=500):
            # 25 decimal digits of precision
            text = 'a\n{0:.25}'.format(num)
            exact = Decimal(text[2:])

            normal_val = float(self.read_csv(StringIO(text))['a'][0])
            precise_val = float(self.read_csv(
                StringIO(text), float_precision='high')['a'][0])
            roundtrip_val = float(self.read_csv(
                StringIO(text), float_precision='round_trip')['a'][0])

            normal_errors.append(abs_error(normal_val, exact))
            precise_errors.append(abs_error(precise_val, exact))

            # round-trip should match float()
            assert roundtrip_val == float(text[2:])

        # 'high' precision must never be worse than the default parser
        assert sum(precise_errors) <= sum(normal_errors)
        assert max(precise_errors) <= max(normal_errors)

    def test_usecols_dtypes(self):
        data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""

        result = self.read_csv(StringIO(data), usecols=(0, 1, 2),
                               names=('a', 'b', 'c'),
                               header=None,
                               converters={'a': str},
                               dtype={'b': int, 'c': float},
                               )
        result2 = self.read_csv(StringIO(data), usecols=(0, 2),
                                names=('a', 'b', 'c'),
                                header=None,
                                converters={'a': str},
                                dtype={'b': int, 'c': float},
                                )
        assert (result.dtypes == [object, np.int, np.float]).all()
        assert (result2.dtypes == [object, np.float]).all()

    def test_disable_bool_parsing(self):
        # #2090

        data = """A,B,C
Yes,No,Yes
No,Yes,Yes
Yes,,Yes
No,No,No"""

        result = self.read_csv(StringIO(data), dtype=object)
        assert (result.dtypes == object).all()

        result = self.read_csv(StringIO(data), dtype=object, na_filter=False)
        assert result['B'][2] == ''

    def test_custom_lineterminator(self):
        # parsing with '~' as the record separator must agree with the
        # same payload parsed using ordinary newlines
        data = 'a,b,c~1,2,3~4,5,6'
        expected = self.read_csv(StringIO(data.replace('~', '\n')))

        result = self.read_csv(StringIO(data), lineterminator='~')
        tm.assert_frame_equal(result, expected)

    def test_parse_ragged_csv(self):
        data = """1,2,3
1,2,3,4
1,2,3,4,5
1,2
1,2,3,4"""

        nice_data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
        result = self.read_csv(StringIO(data), header=None,
                               names=['a', 'b', 'c', 'd', 'e'])

        expected = self.read_csv(StringIO(nice_data), header=None,
                                 names=['a', 'b', 'c', 'd', 'e'])

        tm.assert_frame_equal(result, expected)

        # too many columns, cause segfault if not careful
        data = "1,2\n3,4,5"

        result = self.read_csv(StringIO(data), header=None,
                               names=lrange(50))
        expected = self.read_csv(StringIO(data), header=None,
                                 names=lrange(3)).reindex(columns=lrange(50))

        tm.assert_frame_equal(result, expected)

    def test_tokenize_CR_with_quoting(self):
        # see gh-3453: '\r' line endings combined with quoted fields that
        # contain commas must tokenize the same way '\n' endings do
        data = ' a,b,c\r"a,b","e,d","f,f"'
        newline_data = data.replace('\r', '\n')

        # header suppressed
        result = self.read_csv(StringIO(data), header=None)
        expected = self.read_csv(StringIO(newline_data), header=None)
        tm.assert_frame_equal(result, expected)

        # header inferred from the first row
        result = self.read_csv(StringIO(data))
        expected = self.read_csv(StringIO(newline_data))
        tm.assert_frame_equal(result, expected)

    def test_grow_boundary_at_cap(self):
        # See gh-12494
        #
        # Cause of error was that the C parser
        # was not increasing the buffer size when
        # the desired space would fill the buffer
        # to capacity, which would later cause a
        # buffer overflow error when checking the
        # EOF terminator of the CSV stream
        def check_empty_header(count):
            # a header of `count` commas yields `count + 1` unnamed columns
            expected = DataFrame(
                columns=['Unnamed: {i}'.format(i=i)
                         for i in range(count + 1)])
            result = self.read_csv(StringIO(',' * count))
            tm.assert_frame_equal(result, expected)

        for n in range(1, 101):
            check_empty_header(n)

    def test_parse_trim_buffers(self):
        """Stress `parser_trim_buffers` in tokenizer.c (see gh-13703).

        The record literal below must stay byte-for-byte intact: the bug
        only reproduces with this exact mix of field widths and chunk
        boundaries.
        """
        # This test is part of a bugfix for issue #13703. It attempts
        # to stress the system memory allocator, to cause it to move the
        # stream buffer and either let the OS reclaim the region, or let
        # other memory requests of parser otherwise modify the contents
        # of memory space, where it was formally located.
        # This test is designed to cause a `segfault` with unpatched
        # `tokenizer.c`. Sometimes the test fails on `segfault`, other
        # times it fails due to memory corruption, which causes the
        # loaded DataFrame to differ from the expected one.

        # Generate a large mixed-type CSV file on-the-fly (one record is
        # approx 1.5KiB).
        record_ = \
            """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" \
            """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" \
            """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" \
            """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" \
            """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" \
            """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" \
            """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" \
            """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" \
            """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" \
            """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" \
            """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" \
            """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" \
            """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" \
            """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \
            """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" \
            """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" \
            """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" \
            """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" \
            """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" \
            """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" \
            """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" \
            """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \
            """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" \
            """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" \
            """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" \
            """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" \
            """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" \
            """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" \
            """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""

        # Set the number of lines so that a call to `parser_trim_buffers`
        # is triggered: after a couple of full chunks are consumed a
        # relatively small 'residual' chunk would cause reallocation
        # within the parser.
        chunksize, n_lines = 128, 2 * 128 + 15
        csv_data = "\n".join([record_] * n_lines) + "\n"

        # We will use StringIO to load the CSV from this text buffer.
        # pd.read_csv() will iterate over the file in chunks and will
        # finally read a residual chunk of really small size.

        # Generate the expected output: manually create the dataframe
        # by splitting by comma and repeating the `n_lines` times.
        # Empty CSV fields become NaN here, mirroring the parser output.
        row = tuple(val_ if val_ else np.nan
                    for val_ in record_.split(","))
        expected = pd.DataFrame([row for _ in range(n_lines)],
                                dtype=object, columns=None, index=None)

        # Iterate over the CSV file in chunks of `chunksize` lines
        chunks_ = self.read_csv(StringIO(csv_data), header=None,
                                dtype=object, chunksize=chunksize)
        result = pd.concat(chunks_, axis=0, ignore_index=True)

        # Check for data corruption if there was no segfault
        tm.assert_frame_equal(result, expected)

        # This extra test was added to replicate the fault in gh-5291.
        # Force 'utf-8' encoding, so that `_string_convert` would take
        # a different execution branch.
        chunks_ = self.read_csv(StringIO(csv_data), header=None,
                                dtype=object, chunksize=chunksize,
                                encoding='utf_8')
        result = pd.concat(chunks_, axis=0, ignore_index=True)
        tm.assert_frame_equal(result, expected)

    def test_internal_null_byte(self):
        # see gh-14012
        #
        # The null byte ('\x00') should not be used as a
        # true line terminator, escape character, or comment
        # character, only as a placeholder to indicate that
        # none was specified.
        #
        # This test should be moved to common.py ONLY when
        # Python's csv class supports parsing '\x00'.
        cols = ['a', 'b', 'c']
        data = "1,2,3\n4,\x00,6\n7,8,9"

        # the embedded null field parses as missing, so 'b' becomes float
        expected = pd.DataFrame([[1, 2.0, 3], [4, np.nan, 6], [7, 8, 9]],
                                columns=cols)
        result = self.read_csv(StringIO(data), names=cols)
        tm.assert_frame_equal(result, expected)

    def test_read_nrows_large(self):
        # gh-7626 - Read only nrows of data in for large inputs (>262144b)
        def tsv_line(cells):
            return '\t'.join(cells) + '\n'

        header_narrow = tsv_line(['COL_HEADER_' + str(i)
                                  for i in range(10)])
        data_narrow = tsv_line(['somedatasomedatasomedata1'] * 10)
        header_wide = tsv_line(['COL_HEADER_' + str(i)
                                for i in range(15)])
        data_wide = tsv_line(['somedatasomedatasomedata2'] * 15)

        # a narrow section (1050 rows) followed by a wider section that
        # nrows=1010 must never reach
        test_input = (header_narrow + data_narrow * 1050 +
                      header_wide + data_wide * 2)

        df = self.read_csv(StringIO(test_input), sep='\t', nrows=1010)

        assert df.size == 1010 * 10

    def test_float_precision_round_trip_with_text(self):
        # gh-15140 - This should not segfault on Python 2.7+
        result = self.read_csv(StringIO('a'), header=None,
                               float_precision='round_trip')
        tm.assert_frame_equal(result, DataFrame({0: ['a']}))

    def test_large_difference_in_columns(self):
        # gh-14125: a very wide first row followed by much narrower rows
        # used to corrupt the parser's column bookkeeping
        width = 10000
        large_row = ('X,' * width)[:-1] + '\n'
        normal_row = 'XXXXXX XXXXXX,111111111111111\n'
        test_input = (large_row + normal_row * 6)[:-1]

        result = self.read_csv(StringIO(test_input), header=None, usecols=[0])
        expected = DataFrame([line.split(',')[0]
                              for line in test_input.split('\n')])

        tm.assert_frame_equal(result, expected)

    def test_data_after_quote(self):
        # see gh-15910: characters trailing a closing quote are appended
        # to the quoted field rather than dropped
        data = 'a\n1\n"b"a'
        expected = DataFrame({'a': ['1', 'ba']})

        result = self.read_csv(StringIO(data))
        tm.assert_frame_equal(result, expected)

    @tm.capture_stderr
    def test_comment_whitespace_delimited(self):
        test_input = """\
1 2
2 2 3
3 2 3 # 3 fields
4 2 3# 3 fields
5 2 # 2 fields
6 2# 2 fields
7 # 1 field, NaN
8# 1 field, NaN
9 2 3 # skipped line
# comment"""
        df = self.read_csv(StringIO(test_input), comment='#', header=None,
                           delimiter='\\s+', skiprows=0,
                           error_bad_lines=False)
        error = sys.stderr.getvalue()
        # skipped lines 2, 3, 4, 9
        for line_num in (2, 3, 4, 9):
            assert 'Skipping line {}'.format(line_num) in error, error
        expected = DataFrame([[1, 2],
                              [5, 2],
                              [6, 2],
                              [7, np.nan],
                              [8, np.nan]])
        tm.assert_frame_equal(df, expected)

    def test_file_like_no_next(self):
        # gh-16530: the file-like need not have a "next" or "__next__"
        # attribute despite having an "__iter__" attribute.
        #
        # NOTE: This is only true for the C engine, not Python engine.
        class NoNextBuffer(StringIO):
            # both the py3 and py2 iteration hooks fail on purpose
            def __next__(self):
                raise AttributeError("No next method")

            next = __next__

        expected = pd.DataFrame({"a": [1]})
        result = self.read_csv(NoNextBuffer("a\n1"))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"])
    def test_read_tarfile(self, tar_suffix):
        # see gh-16530
        #
        # Unfortunately, Python's CSV library can't handle
        # tarfile objects (expects string, not bytes when
        # iterating through a file-like).
        archive = os.path.join(self.dirpath, "tar_csv" + tar_suffix)

        with tarfile.open(archive, "r") as tar:
            member = tar.extractfile("tar_data.csv")

            result = self.read_csv(member)
            expected = pd.DataFrame({"a": [1]})
            tm.assert_frame_equal(result, expected)

    @pytest.mark.high_memory
    def test_bytes_exceed_2gb(self):
        """Read from a "CSV" that has a column larger than 2GB.

        GH 16798
        """
        if self.low_memory:
            pytest.skip("not a high_memory test")

        # 2100 rows of 1 MiB strings -> a single ~2.1 GB column
        rows = ['x' * (1 << 20)] * 2100
        payload = StringIO('strings\n' + '\n'.join(rows))

        df = self.read_csv(payload, low_memory=False)
        assert not df.empty