diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py
deleted file mode 100644
index 88db1080642c5..0000000000000
--- a/pandas/tests/io/parser/c_parser_only.py
+++ /dev/null
@@ -1,494 +0,0 @@
-# -*- coding: utf-8 -*-
-
-"""
-Tests that apply specifically to the CParser. Unless specifically stated
-as a CParser-specific issue, the goal is to eventually move as many of
-these tests out of this module as soon as the Python parser can accept
-further arguments when parsing.
-"""
-
-from io import TextIOWrapper
-import os
-import sys
-import tarfile
-
-import numpy as np
-import pytest
-
-from pandas.compat import PY3, BytesIO, StringIO, lrange, range
-import pandas.util._test_decorators as td
-
-import pandas as pd
-from pandas import DataFrame
-import pandas.util.testing as tm
-
-
-class CParserTests(object):
-
-    @pytest.mark.parametrize(
-        'malf',
-        ['1\r1\r1\r 1\r 1\r',
-         '1\r1\r1\r 1\r 1\r11\r',
-         '1\r1\r1\r 1\r 1\r11\r1\r'],
-        ids=['words pointer', 'stream pointer', 'lines pointer'])
-    def test_buffer_overflow(self, malf):
-        # see gh-9205: test certain malformed input files that cause
-        # buffer overflows in tokenizer.c
-        cperr = 'Buffer overflow caught - possible malformed input file.'
-        with pytest.raises(pd.errors.ParserError, match=cperr):
-            self.read_table(StringIO(malf))
-
-    def test_buffer_rd_bytes(self):
-        # see gh-12098: src->buffer in the C parser can be freed twice leading
-        # to a segfault if a corrupt gzip file is read with 'read_csv' and the
-        # buffer is filled more than once before gzip throws an exception
-
-        data = '\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09' \
-               '\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0' \
-               '\xA6\x4D' + '\x55' * 267 + \
-               '\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00' \
-               '\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO'
-        for i in range(100):
-            try:
-                self.read_csv(StringIO(data),
-                              compression='gzip',
-                              delim_whitespace=True)
-            except Exception:
-                pass
-
-    def test_delim_whitespace_custom_terminator(self):
-        # See gh-12912
-        data = """a b c~1 2 3~4 5 6~7 8 9"""
-        df = self.read_csv(StringIO(data), lineterminator='~',
-                           delim_whitespace=True)
-        expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
-                             columns=['a', 'b', 'c'])
-        tm.assert_frame_equal(df, expected)
-
-    def test_dtype_and_names_error(self):
-        # see gh-8833: passing both dtype and names
-        # resulting in an error reporting issue
-        data = """
-1.0 1
-2.0 2
-3.0 3
-"""
-        # base cases
-        result = self.read_csv(StringIO(data), sep=r'\s+', header=None)
-        expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]])
-        tm.assert_frame_equal(result, expected)
-
-        result = self.read_csv(StringIO(data), sep=r'\s+',
-                               header=None, names=['a', 'b'])
-        expected = DataFrame(
-            [[1.0, 1], [2.0, 2], [3.0, 3]], columns=['a', 'b'])
-        tm.assert_frame_equal(result, expected)
-
-        # fallback casting
-        result = self.read_csv(StringIO(
-            data), sep=r'\s+', header=None,
-            names=['a', 'b'], dtype={'a': np.int32})
-        expected = DataFrame([[1, 1], [2, 2], [3, 3]],
-                             columns=['a', 'b'])
-        expected['a'] = expected['a'].astype(np.int32)
-        tm.assert_frame_equal(result, expected)
-
-        data = """
-1.0 1
-nan 2
-3.0 3
-"""
-        # fallback casting, but not castable
-        with pytest.raises(ValueError, match='cannot safely convert'):
-            self.read_csv(StringIO(data), sep=r'\s+', header=None,
-                          names=['a', 'b'], dtype={'a': np.int32})
-
-    def test_unsupported_dtype(self):
-        df = DataFrame(np.random.rand(5, 2), columns=list(
-            'AB'), index=['1A', '1B', '1C', '1D', '1E'])
-
-        with tm.ensure_clean('__unsupported_dtype__.csv') as path:
-            df.to_csv(path)
-
-            # valid but we don't support it (date)
-            pytest.raises(TypeError, self.read_csv, path,
-                          dtype={'A': 'datetime64', 'B': 'float64'},
-                          index_col=0)
-            pytest.raises(TypeError, self.read_csv, path,
-                          dtype={'A': 'datetime64', 'B': 'float64'},
-                          index_col=0, parse_dates=['B'])
-
-            # valid but we don't support it
-            pytest.raises(TypeError, self.read_csv, path,
-                          dtype={'A': 'timedelta64', 'B': 'float64'},
-                          index_col=0)
-
-            # valid but unsupported - fixed width unicode string
-            pytest.raises(TypeError, self.read_csv, path,
-                          dtype={'A': 'U8'},
-                          index_col=0)
-
-    @td.skip_if_32bit
-    def test_precise_conversion(self):
-        from decimal import Decimal
-
-        normal_errors = []
-        precise_errors = []
-
-        # test numbers between 1 and 2
-        for num in np.linspace(1., 2., num=500):
-            # 25 decimal digits of precision
-            text = 'a\n{0:.25}'.format(num)
-
-            normal_val = float(self.read_csv(StringIO(text))['a'][0])
-            precise_val = float(self.read_csv(
-                StringIO(text), float_precision='high')['a'][0])
-            roundtrip_val = float(self.read_csv(
-                StringIO(text), float_precision='round_trip')['a'][0])
-            actual_val = Decimal(text[2:])
-
-            def error(val):
-                return abs(Decimal('{0:.100}'.format(val)) - actual_val)
-
-            normal_errors.append(error(normal_val))
-            precise_errors.append(error(precise_val))
-
-            # round-trip should match float()
-            assert roundtrip_val == float(text[2:])
-
-        assert sum(precise_errors) <= sum(normal_errors)
-        assert max(precise_errors) <= max(normal_errors)
-
-    def test_usecols_dtypes(self):
-        data = """\
-1,2,3
-4,5,6
-7,8,9
-10,11,12"""
-
-        result = self.read_csv(StringIO(data), usecols=(0, 1, 2),
-                               names=('a', 'b', 'c'),
-                               header=None,
-                               converters={'a': str},
-                               dtype={'b': int, 'c': float},
-                               )
-        result2 = self.read_csv(StringIO(data), usecols=(0, 2),
-                                names=('a', 'b', 'c'),
-                                header=None,
-                                converters={'a': str},
-                                dtype={'b': int, 'c': float},
-                                )
-        assert (result.dtypes == [object, np.int, np.float]).all()
-        assert (result2.dtypes == [object, np.float]).all()
-
-    def test_disable_bool_parsing(self):
-        # #2090
-
-        data = """A,B,C
-Yes,No,Yes
-No,Yes,Yes
-Yes,,Yes
-No,No,No"""
-
-        result = self.read_csv(StringIO(data), dtype=object)
-        assert (result.dtypes == object).all()
-
-        result = self.read_csv(StringIO(data), dtype=object, na_filter=False)
-        assert result['B'][2] == ''
-
-    def test_custom_lineterminator(self):
-        data = 'a,b,c~1,2,3~4,5,6'
-
-        result = self.read_csv(StringIO(data), lineterminator='~')
-        expected = self.read_csv(StringIO(data.replace('~', '\n')))
-
-        tm.assert_frame_equal(result, expected)
-
-    def test_parse_ragged_csv(self):
-        data = """1,2,3
-1,2,3,4
-1,2,3,4,5
-1,2
-1,2,3,4"""
-
-        nice_data = """1,2,3,,
-1,2,3,4,
-1,2,3,4,5
-1,2,,,
-1,2,3,4,"""
-        result = self.read_csv(StringIO(data), header=None,
-                               names=['a', 'b', 'c', 'd', 'e'])
-
-        expected = self.read_csv(StringIO(nice_data), header=None,
-                                 names=['a', 'b', 'c', 'd', 'e'])
-
-        tm.assert_frame_equal(result, expected)
-
-        # too many columns, cause segfault if not careful
-        data = "1,2\n3,4,5"
-
-        result = self.read_csv(StringIO(data), header=None,
-                               names=lrange(50))
-        expected = self.read_csv(StringIO(data), header=None,
-                                 names=lrange(3)).reindex(columns=lrange(50))
-
-        tm.assert_frame_equal(result, expected)
-
-    def test_tokenize_CR_with_quoting(self):
-        # see gh-3453
-
-        data = ' a,b,c\r"a,b","e,d","f,f"'
-
-        result = self.read_csv(StringIO(data), header=None)
-        expected = self.read_csv(StringIO(data.replace('\r', '\n')),
-                                 header=None)
-        tm.assert_frame_equal(result, expected)
-
-        result = self.read_csv(StringIO(data))
-        expected = self.read_csv(StringIO(data.replace('\r', '\n')))
-        tm.assert_frame_equal(result, expected)
-
-    def test_grow_boundary_at_cap(self):
-        # See gh-12494
-        #
-        # Cause of error was that the C parser
-        # was not increasing the buffer size when
-        # the desired space would fill the buffer
-        # to capacity, which would later cause a
-        # buffer overflow error when checking the
-        # EOF terminator of the CSV stream
-        def test_empty_header_read(count):
-            s = StringIO(',' * count)
-            expected = DataFrame(columns=[
-                'Unnamed: {i}'.format(i=i)
-                for i in range(count + 1)])
-            df = self.read_csv(s)
-            tm.assert_frame_equal(df, expected)
-
-        for count in range(1, 101):
-            test_empty_header_read(count)
-
-    def test_parse_trim_buffers(self):
-        # This test is part of a bugfix for issue #13703. It attempts to
-        # to stress the system memory allocator, to cause it to move the
-        # stream buffer and either let the OS reclaim the region, or let
-        # other memory requests of parser otherwise modify the contents
-        # of memory space, where it was formally located.
-        # This test is designed to cause a `segfault` with unpatched
-        # `tokenizer.c`. Sometimes the test fails on `segfault`, other
-        # times it fails due to memory corruption, which causes the
-        # loaded DataFrame to differ from the expected one.
-
-        # Generate a large mixed-type CSV file on-the-fly (one record is
-        # approx 1.5KiB).
-        record_ = \
-            """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" \
-            """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" \
-            """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" \
-            """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" \
-            """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" \
-            """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" \
-            """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" \
-            """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" \
-            """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" \
-            """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" \
-            """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" \
-            """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" \
-            """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" \
-            """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \
-            """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" \
-            """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" \
-            """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" \
-            """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" \
-            """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" \
-            """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" \
-            """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" \
-            """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \
-            """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" \
-            """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" \
-            """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" \
-            """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" \
-            """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" \
-            """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" \
-            """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""
-
-        # Set the number of lines so that a call to `parser_trim_buffers`
-        # is triggered: after a couple of full chunks are consumed a
-        # relatively small 'residual' chunk would cause reallocation
-        # within the parser.
-        chunksize, n_lines = 128, 2 * 128 + 15
-        csv_data = "\n".join([record_] * n_lines) + "\n"
-
-        # We will use StringIO to load the CSV from this text buffer.
-        # pd.read_csv() will iterate over the file in chunks and will
-        # finally read a residual chunk of really small size.
-
-        # Generate the expected output: manually create the dataframe
-        # by splitting by comma and repeating the `n_lines` times.
-        row = tuple(val_ if val_ else np.nan
-                    for val_ in record_.split(","))
-        expected = pd.DataFrame([row for _ in range(n_lines)],
-                                dtype=object, columns=None, index=None)
-
-        # Iterate over the CSV file in chunks of `chunksize` lines
-        chunks_ = self.read_csv(StringIO(csv_data), header=None,
-                                dtype=object, chunksize=chunksize)
-        result = pd.concat(chunks_, axis=0, ignore_index=True)
-
-        # Check for data corruption if there was no segfault
-        tm.assert_frame_equal(result, expected)
-
-        # This extra test was added to replicate the fault in gh-5291.
-        # Force 'utf-8' encoding, so that `_string_convert` would take
-        # a different execution branch.
-        chunks_ = self.read_csv(StringIO(csv_data), header=None,
-                                dtype=object, chunksize=chunksize,
-                                encoding='utf_8')
-        result = pd.concat(chunks_, axis=0, ignore_index=True)
-        tm.assert_frame_equal(result, expected)
-
-    def test_internal_null_byte(self):
-        # see gh-14012
-        #
-        # The null byte ('\x00') should not be used as a
-        # true line terminator, escape character, or comment
-        # character, only as a placeholder to indicate that
-        # none was specified.
-        #
-        # This test should be moved to common.py ONLY when
-        # Python's csv class supports parsing '\x00'.
-        names = ['a', 'b', 'c']
-        data = "1,2,3\n4,\x00,6\n7,8,9"
-        expected = pd.DataFrame([[1, 2.0, 3], [4, np.nan, 6],
-                                 [7, 8, 9]], columns=names)
-
-        result = self.read_csv(StringIO(data), names=names)
-        tm.assert_frame_equal(result, expected)
-
-    def test_read_nrows_large(self):
-        # gh-7626 - Read only nrows of data in for large inputs (>262144b)
-        header_narrow = '\t'.join(['COL_HEADER_' + str(i)
-                                   for i in range(10)]) + '\n'
-        data_narrow = '\t'.join(['somedatasomedatasomedata1'
-                                 for i in range(10)]) + '\n'
-        header_wide = '\t'.join(['COL_HEADER_' + str(i)
-                                 for i in range(15)]) + '\n'
-        data_wide = '\t'.join(['somedatasomedatasomedata2'
-                               for i in range(15)]) + '\n'
-        test_input = (header_narrow + data_narrow * 1050 +
-                      header_wide + data_wide * 2)
-
-        df = self.read_csv(StringIO(test_input), sep='\t', nrows=1010)
-
-        assert df.size == 1010 * 10
-
-    def test_float_precision_round_trip_with_text(self):
-        # gh-15140 - This should not segfault on Python 2.7+
-        df = self.read_csv(StringIO('a'),
-                           float_precision='round_trip',
-                           header=None)
-        tm.assert_frame_equal(df, DataFrame({0: ['a']}))
-
-    def test_large_difference_in_columns(self):
-        # gh-14125
-        count = 10000
-        large_row = ('X,' * count)[:-1] + '\n'
-        normal_row = 'XXXXXX XXXXXX,111111111111111\n'
-        test_input = (large_row + normal_row * 6)[:-1]
-        result = self.read_csv(StringIO(test_input), header=None, usecols=[0])
-        rows = test_input.split('\n')
-        expected = DataFrame([row.split(',')[0] for row in rows])
-
-        tm.assert_frame_equal(result, expected)
-
-    def test_data_after_quote(self):
-        # see gh-15910
-
-        data = 'a\n1\n"b"a'
-        result = self.read_csv(StringIO(data))
-        expected = DataFrame({'a': ['1', 'ba']})
-
-        tm.assert_frame_equal(result, expected)
-
-    @tm.capture_stderr
-    def test_comment_whitespace_delimited(self):
-        test_input = """\
-1 2
-2 2 3
-3 2 3 # 3 fields
-4 2 3# 3 fields
-5 2 # 2 fields
-6 2# 2 fields
-7 # 1 field, NaN
-8# 1 field, NaN
-9 2 3 # skipped line
-# comment"""
-        df = self.read_csv(StringIO(test_input), comment='#', header=None,
-                           delimiter='\\s+', skiprows=0,
-                           error_bad_lines=False)
-        error = sys.stderr.getvalue()
-        # skipped lines 2, 3, 4, 9
-        for line_num in (2, 3, 4, 9):
-            assert 'Skipping line {}'.format(line_num) in error, error
-        expected = DataFrame([[1, 2],
-                              [5, 2],
-                              [6, 2],
-                              [7, np.nan],
-                              [8, np.nan]])
-        tm.assert_frame_equal(df, expected)
-
-    def test_file_like_no_next(self):
-        # gh-16530: the file-like need not have a "next" or "__next__"
-        # attribute despite having an "__iter__" attribute.
-        #
-        # NOTE: This is only true for the C engine, not Python engine.
-        class NoNextBuffer(StringIO):
-            def __next__(self):
-                raise AttributeError("No next method")
-
-            next = __next__
-
-        data = "a\n1"
-
-        expected = pd.DataFrame({"a": [1]})
-        result = self.read_csv(NoNextBuffer(data))
-
-        tm.assert_frame_equal(result, expected)
-
-    def test_buffer_rd_bytes_bad_unicode(self):
-        # see gh-22748
-        t = BytesIO(b"\xB0")
-        if PY3:
-            t = TextIOWrapper(t, encoding='ascii', errors='surrogateescape')
-        with pytest.raises(UnicodeError):
-            self.read_csv(t, encoding='UTF-8')
-
-    @pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"])
-    def test_read_tarfile(self, tar_suffix):
-        # see gh-16530
-        #
-        # Unfortunately, Python's CSV library can't handle
-        # tarfile objects (expects string, not bytes when
-        # iterating through a file-like).
-        tar_path = os.path.join(self.dirpath, "tar_csv" + tar_suffix)
-
-        with tarfile.open(tar_path, "r") as tar:
-            data_file = tar.extractfile("tar_data.csv")
-
-            out = self.read_csv(data_file)
-            expected = pd.DataFrame({"a": [1]})
-            tm.assert_frame_equal(out, expected)
-
-    @pytest.mark.high_memory
-    def test_bytes_exceed_2gb(self):
-        """Read from a "CSV" that has a column larger than 2GB.
-
-        GH 16798
-        """
-        if self.low_memory:
-            pytest.skip("not a high_memory test")
-
-        csv = StringIO('strings\n' + '\n'.join(
-            ['x' * (1 << 20) for _ in range(2100)]))
-        df = self.read_csv(csv, low_memory=False)
-        assert not df.empty
diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py
new file mode 100644
index 0000000000000..a82e3e47c6931
--- /dev/null
+++ b/pandas/tests/io/parser/conftest.py
@@ -0,0 +1,80 @@
+import pytest
+
+from pandas import read_csv, read_table
+import pandas.util.testing as tm
+
+
+class BaseParser(object):
+    engine = None
+    low_memory = True
+    float_precision_choices = []
+
+    def update_kwargs(self, kwargs):
+        kwargs = kwargs.copy()
+        kwargs.update(dict(engine=self.engine,
+                           low_memory=self.low_memory))
+
+        return kwargs
+
+    def read_csv(self, *args, **kwargs):
+        kwargs = self.update_kwargs(kwargs)
+        return read_csv(*args, **kwargs)
+
+    def read_table(self, *args, **kwargs):
+        kwargs = self.update_kwargs(kwargs)
+        with tm.assert_produces_warning(FutureWarning):
+            return read_table(*args, **kwargs)
+
+
+class CParser(BaseParser):
+    engine = "c"
+    float_precision_choices = [None, "high", "round_trip"]
+
+
+class CParserHighMemory(CParser):
+    low_memory = False
+
+
+class CParserLowMemory(CParser):
+    low_memory = True
+
+
+class PythonParser(BaseParser):
+    engine = "python"
+    float_precision_choices = []
+
+
+@pytest.fixture
+def csv_dir_path(datapath):
+    return datapath("io", "parser", "data")
+
+
+_cParserHighMemory = CParserHighMemory()
+_cParserLowMemory = CParserLowMemory()
+_pythonParser = PythonParser()
+
+_py_parsers_only = [_pythonParser]
+_c_parsers_only = [_cParserHighMemory, _cParserLowMemory]
+_all_parsers = _c_parsers_only + _py_parsers_only
+
+_py_parser_ids = ["python"]
+_c_parser_ids = ["c_high", "c_low"]
+_all_parser_ids = _c_parser_ids + _py_parser_ids
+
+
+@pytest.fixture(params=_all_parsers,
+                ids=_all_parser_ids)
+def all_parsers(request):
+    return request.param
+
+
+@pytest.fixture(params=_c_parsers_only,
+                ids=_c_parser_ids)
+def c_parser_only(request):
+    return request.param
+
+
+@pytest.fixture(params=_py_parsers_only,
+                ids=_py_parser_ids)
+def python_parser_only(request):
+    return request.param
diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py
new file mode 100644
index 0000000000000..570ecd80b00c0
--- /dev/null
+++ b/pandas/tests/io/parser/test_c_parser_only.py
@@ -0,0 +1,546 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that apply specifically to the CParser. Unless specifically stated
+as a CParser-specific issue, the goal is to eventually move as many of
+these tests out of this module as soon as the Python parser can accept
+further arguments when parsing.
+"""
+
+from io import TextIOWrapper
+import os
+import sys
+import tarfile
+
+import numpy as np
+import pytest
+
+from pandas.compat import PY3, BytesIO, StringIO, lrange, range
+from pandas.errors import ParserError
+import pandas.util._test_decorators as td
+
+from pandas import DataFrame, concat
+import pandas.util.testing as tm
+
+
+@pytest.mark.parametrize(
+    "malformed",
+    ["1\r1\r1\r 1\r 1\r",
+     "1\r1\r1\r 1\r 1\r11\r",
+     "1\r1\r1\r 1\r 1\r11\r1\r"],
+    ids=["words pointer", "stream pointer", "lines pointer"])
+def test_buffer_overflow(c_parser_only, malformed):
+    # see gh-9205: test certain malformed input files that cause
+    # buffer overflows in tokenizer.c
+    msg = "Buffer overflow caught - possible malformed input file."
+    parser = c_parser_only
+
+    with pytest.raises(ParserError, match=msg):
+        parser.read_csv(StringIO(malformed))
+
+
+def test_buffer_rd_bytes(c_parser_only):
+    # see gh-12098: src->buffer in the C parser can be freed twice leading
+    # to a segfault if a corrupt gzip file is read with 'read_csv', and the
+    # buffer is filled more than once before gzip raises an Exception.
+
+    data = "\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09" \
+           "\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0" \
+           "\xA6\x4D" + "\x55" * 267 + \
+           "\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00" \
+           "\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO"
+    parser = c_parser_only
+
+    for _ in range(100):
+        try:
+            parser.read_csv(StringIO(data), compression="gzip",
+                            delim_whitespace=True)
+        except Exception:
+            pass
+
+
+def test_delim_whitespace_custom_terminator(c_parser_only):
+    # See gh-12912
+    data = "a b c~1 2 3~4 5 6~7 8 9"
+    parser = c_parser_only
+
+    df = parser.read_csv(StringIO(data), lineterminator="~",
+                         delim_whitespace=True)
+    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+                         columns=["a", "b", "c"])
+    tm.assert_frame_equal(df, expected)
+
+
+def test_dtype_and_names_error(c_parser_only):
+    # see gh-8833: passing both dtype and names
+    # resulting in an error reporting issue
+    parser = c_parser_only
+    data = """
+1.0 1
+2.0 2
+3.0 3
+"""
+    # base cases
+    result = parser.read_csv(StringIO(data), sep=r"\s+", header=None)
+    expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]])
+    tm.assert_frame_equal(result, expected)
+
+    result = parser.read_csv(StringIO(data), sep=r"\s+",
+                             header=None, names=["a", "b"])
+    expected = DataFrame(
+        [[1.0, 1], [2.0, 2], [3.0, 3]], columns=["a", "b"])
+    tm.assert_frame_equal(result, expected)
+
+    # fallback casting
+    result = parser.read_csv(StringIO(
+        data), sep=r"\s+", header=None,
+        names=["a", "b"], dtype={"a": np.int32})
+    expected = DataFrame([[1, 1], [2, 2], [3, 3]],
+                         columns=["a", "b"])
+    expected["a"] = expected["a"].astype(np.int32)
+    tm.assert_frame_equal(result, expected)
+
+    data = """
+1.0 1
+nan 2
+3.0 3
+"""
+    # fallback casting, but not castable
+    with pytest.raises(ValueError, match="cannot safely convert"):
+        parser.read_csv(StringIO(data), sep=r"\s+", header=None,
+                        names=["a", "b"], dtype={"a": np.int32})
+
+
+def test_unsupported_dtype(c_parser_only):
+    parser = c_parser_only
+    df = DataFrame(np.random.rand(5, 2), columns=list(
+        "AB"), index=["1A", "1B", "1C", "1D", "1E"])
+
+    with tm.ensure_clean("__unsupported_dtype__.csv") as path:
+        df.to_csv(path)
+
+        # valid but we don't support it (date)
+        pytest.raises(TypeError, parser.read_csv, path,
+                      dtype={"A": "datetime64", "B": "float64"},
+                      index_col=0)
+        pytest.raises(TypeError, parser.read_csv, path,
+                      dtype={"A": "datetime64", "B": "float64"},
+                      index_col=0, parse_dates=["B"])
+
+        # valid but we don't support it
+        pytest.raises(TypeError, parser.read_csv, path,
+                      dtype={"A": "timedelta64", "B": "float64"},
+                      index_col=0)
+
+        # valid but unsupported - fixed width unicode string
+        pytest.raises(TypeError, parser.read_csv, path,
+                      dtype={"A": "U8"},
+                      index_col=0)
+
+
+@td.skip_if_32bit
+def test_precise_conversion(c_parser_only):
+    from decimal import Decimal
+    parser = c_parser_only
+
+    normal_errors = []
+    precise_errors = []
+
+    # test numbers between 1 and 2
+    for num in np.linspace(1., 2., num=500):
+        # 25 decimal digits of precision
+        text = "a\n{0:.25}".format(num)
+
+        normal_val = float(parser.read_csv(StringIO(text))["a"][0])
+        precise_val = float(parser.read_csv(
+            StringIO(text), float_precision="high")["a"][0])
+        roundtrip_val = float(parser.read_csv(
+            StringIO(text), float_precision="round_trip")["a"][0])
+        actual_val = Decimal(text[2:])
+
+        def error(val):
+            return abs(Decimal("{0:.100}".format(val)) - actual_val)
+
+        normal_errors.append(error(normal_val))
+        precise_errors.append(error(precise_val))
+
+        # round-trip should match float()
+        assert roundtrip_val == float(text[2:])
+
+    assert sum(precise_errors) <= sum(normal_errors)
+    assert max(precise_errors) <= max(normal_errors)
+
+
+def test_usecols_dtypes(c_parser_only):
+    parser = c_parser_only
+    data = """\
+1,2,3
+4,5,6
+7,8,9
+10,11,12"""
+
+    result = parser.read_csv(StringIO(data), usecols=(0, 1, 2),
+                             names=("a", "b", "c"),
+                             header=None,
+                             converters={"a": str},
+                             dtype={"b": int, "c": float})
+    result2 = parser.read_csv(StringIO(data), usecols=(0, 2),
+                              names=("a", "b", "c"),
+                              header=None,
+                              converters={"a": str},
+                              dtype={"b": int, "c": float})
+
+    assert (result.dtypes == [object, np.int, np.float]).all()
+    assert (result2.dtypes == [object, np.float]).all()
+
+
+def test_disable_bool_parsing(c_parser_only):
+    # see gh-2090
+
+    parser = c_parser_only
+    data = """A,B,C
+Yes,No,Yes
+No,Yes,Yes
+Yes,,Yes
+No,No,No"""
+
+    result = parser.read_csv(StringIO(data), dtype=object)
+    assert (result.dtypes == object).all()
+
+    result = parser.read_csv(StringIO(data), dtype=object, na_filter=False)
+    assert result["B"][2] == ""
+
+
+def test_custom_lineterminator(c_parser_only):
+    parser = c_parser_only
+    data = "a,b,c~1,2,3~4,5,6"
+
+    result = parser.read_csv(StringIO(data), lineterminator="~")
+    expected = parser.read_csv(StringIO(data.replace("~", "\n")))
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_parse_ragged_csv(c_parser_only):
+    parser = c_parser_only
+    data = """1,2,3
+1,2,3,4
+1,2,3,4,5
+1,2
+1,2,3,4"""
+
+    nice_data = """1,2,3,,
+1,2,3,4,
+1,2,3,4,5
+1,2,,,
+1,2,3,4,"""
+    result = parser.read_csv(StringIO(data), header=None,
+                             names=["a", "b", "c", "d", "e"])
+
+    expected = parser.read_csv(StringIO(nice_data), header=None,
+                               names=["a", "b", "c", "d", "e"])
+
+    tm.assert_frame_equal(result, expected)
+
+    # too many columns, cause segfault if not careful
+    data = "1,2\n3,4,5"
+
+    result = parser.read_csv(StringIO(data), header=None,
+                             names=lrange(50))
+    expected = parser.read_csv(StringIO(data), header=None,
+                               names=lrange(3)).reindex(columns=lrange(50))
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_tokenize_CR_with_quoting(c_parser_only):
+    # see gh-3453
+    parser = c_parser_only
+    data = " a,b,c\r\"a,b\",\"e,d\",\"f,f\""
+
+    result = parser.read_csv(StringIO(data), header=None)
+    expected = parser.read_csv(StringIO(data.replace("\r", "\n")),
+                               header=None)
+    tm.assert_frame_equal(result, expected)
+
+    result = parser.read_csv(StringIO(data))
+    expected = parser.read_csv(StringIO(data.replace("\r", "\n")))
+    tm.assert_frame_equal(result, expected)
+
+
+def test_grow_boundary_at_cap(c_parser_only):
+    # See gh-12494
+    #
+    # Cause of error was that the C parser
+    # was not increasing the buffer size when
+    # the desired space would fill the buffer
+    # to capacity, which would later cause a
+    # buffer overflow error when checking the
+    # EOF terminator of the CSV stream.
+    parser = c_parser_only
+
+    def test_empty_header_read(count):
+        s = StringIO("," * count)
+        expected = DataFrame(columns=[
+            "Unnamed: {i}".format(i=i)
+            for i in range(count + 1)])
+        df = parser.read_csv(s)
+        tm.assert_frame_equal(df, expected)
+
+    for cnt in range(1, 101):
+        test_empty_header_read(cnt)
+
+
+def test_parse_trim_buffers(c_parser_only):
+    # This test is part of a bugfix for gh-13703. It attempts to
+    # to stress the system memory allocator, to cause it to move the
+    # stream buffer and either let the OS reclaim the region, or let
+    # other memory requests of parser otherwise modify the contents
+    # of memory space, where it was formally located.
+    # This test is designed to cause a `segfault` with unpatched
+    # `tokenizer.c`. Sometimes the test fails on `segfault`, other
+    # times it fails due to memory corruption, which causes the
+    # loaded DataFrame to differ from the expected one.
+
+    parser = c_parser_only
+
+    # Generate a large mixed-type CSV file on-the-fly (one record is
+    # approx 1.5KiB).
+    record_ = \
+        """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" \
+        """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" \
+        """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" \
+        """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" \
+        """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" \
+        """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" \
+        """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" \
+        """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" \
+        """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" \
+        """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" \
+        """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" \
+        """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" \
+        """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" \
+        """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \
+        """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" \
+        """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" \
+        """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" \
+        """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" \
+        """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" \
+        """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" \
+        """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" \
+        """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \
+        """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" \
+        """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" \
+        """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" \
+        """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" \
+        """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" \
+        """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" \
+        """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""
+
+    # Set the number of lines so that a call to `parser_trim_buffers`
+    # is triggered: after a couple of full chunks are consumed a
+    # relatively small 'residual' chunk would cause reallocation
+    # within the parser.
+    chunksize, n_lines = 128, 2 * 128 + 15
+    csv_data = "\n".join([record_] * n_lines) + "\n"
+
+    # We will use StringIO to load the CSV from this text buffer.
+    # pd.read_csv() will iterate over the file in chunks and will
+    # finally read a residual chunk of really small size.
+
+    # Generate the expected output: manually create the dataframe
+    # by splitting by comma and repeating the `n_lines` times.
+    row = tuple(val_ if val_ else np.nan
+                for val_ in record_.split(","))
+    expected = DataFrame([row for _ in range(n_lines)],
+                         dtype=object, columns=None, index=None)
+
+    # Iterate over the CSV file in chunks of `chunksize` lines
+    chunks_ = parser.read_csv(StringIO(csv_data), header=None,
+                              dtype=object, chunksize=chunksize)
+    result = concat(chunks_, axis=0, ignore_index=True)
+
+    # Check for data corruption if there was no segfault
+    tm.assert_frame_equal(result, expected)
+
+    # This extra test was added to replicate the fault in gh-5291.
+    # Force 'utf-8' encoding, so that `_string_convert` would take
+    # a different execution branch.
+    chunks_ = parser.read_csv(StringIO(csv_data), header=None,
+                              dtype=object, chunksize=chunksize,
+                              encoding="utf_8")
+    result = concat(chunks_, axis=0, ignore_index=True)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_internal_null_byte(c_parser_only):
+    # see gh-14012
+    #
+    # The null byte ('\x00') should not be used as a
+    # true line terminator, escape character, or comment
+    # character, only as a placeholder to indicate that
+    # none was specified.
+    #
+    # This test should be moved to common.py ONLY when
+    # Python's csv class supports parsing '\x00'.
+    parser = c_parser_only
+
+    names = ["a", "b", "c"]
+    data = "1,2,3\n4,\x00,6\n7,8,9"
+    expected = DataFrame([[1, 2.0, 3], [4, np.nan, 6],
+                          [7, 8, 9]], columns=names)
+
+    result = parser.read_csv(StringIO(data), names=names)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_read_nrows_large(c_parser_only):
+    # gh-7626 - Read only nrows of data in for large inputs (>262144b)
+    parser = c_parser_only
+    header_narrow = "\t".join(["COL_HEADER_" + str(i)
+                               for i in range(10)]) + "\n"
+    data_narrow = "\t".join(["somedatasomedatasomedata1"
+                             for _ in range(10)]) + "\n"
+    header_wide = "\t".join(["COL_HEADER_" + str(i)
+                             for i in range(15)]) + "\n"
+    data_wide = "\t".join(["somedatasomedatasomedata2"
+                           for _ in range(15)]) + "\n"
+    test_input = (header_narrow + data_narrow * 1050 +
+                  header_wide + data_wide * 2)
+
+    df = parser.read_csv(StringIO(test_input), sep="\t", nrows=1010)
+
+    assert df.size == 1010 * 10
+
+
+def test_float_precision_round_trip_with_text(c_parser_only):
+    # see gh-15140 - This should not segfault on Python 2.7+
+    parser = c_parser_only
+    df = parser.read_csv(StringIO("a"), header=None,
+                         float_precision="round_trip")
+    tm.assert_frame_equal(df, DataFrame({0: ["a"]}))
+
+
+def test_large_difference_in_columns(c_parser_only):
+    # see gh-14125
+    parser = c_parser_only
+
+    count = 10000
+    large_row = ("X," * count)[:-1] + "\n"
+    normal_row = "XXXXXX XXXXXX,111111111111111\n"
+    test_input = (large_row + normal_row * 6)[:-1]
+
+    result = parser.read_csv(StringIO(test_input), header=None, usecols=[0])
+    rows = test_input.split("\n")
+
+    expected = DataFrame([row.split(",")[0] for row in rows])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_data_after_quote(c_parser_only):
+    # see gh-15910
+    parser = c_parser_only
+
+    data = "a\n1\n\"b\"a"
+    result = parser.read_csv(StringIO(data))
+
+    expected = DataFrame({"a": ["1", "ba"]})
+    tm.assert_frame_equal(result, expected)
+
+
+@tm.capture_stderr
+def test_comment_whitespace_delimited(c_parser_only):
+    parser = c_parser_only
+    test_input = """\
+1 2
+2 2 3
+3 2 3 # 3 fields
+4 2 3# 3 fields
+5 2 # 2 fields
+6 2# 2 fields
+7 # 1 field, NaN
+8# 1 field, NaN
+9 2 3 # skipped line
+# comment"""
+    df = parser.read_csv(StringIO(test_input), comment="#", header=None,
+                         delimiter="\\s+", skiprows=0,
+                         error_bad_lines=False)
+    error = sys.stderr.getvalue()
+    # skipped lines 2, 3, 4, 9
+    for line_num in (2, 3, 4, 9):
+        assert "Skipping line {}".format(line_num) in error, error
+    expected = DataFrame([[1, 2],
+                          [5, 2],
+                          [6, 2],
+                          [7, np.nan],
+                          [8, np.nan]])
+    tm.assert_frame_equal(df, expected)
+
+
+def test_file_like_no_next(c_parser_only):
+    # gh-16530: the file-like need not have a "next" or "__next__"
+    # attribute despite having an "__iter__" attribute.
+    #
+    # NOTE: This is only true for the C engine, not Python engine.
+    class NoNextBuffer(StringIO):
+        def __next__(self):
+            raise AttributeError("No next method")
+
+        next = __next__
+
+    parser = c_parser_only
+    data = "a\n1"
+
+    expected = DataFrame({"a": [1]})
+    result = parser.read_csv(NoNextBuffer(data))
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_buffer_rd_bytes_bad_unicode(c_parser_only):
+    # see gh-22748
+    parser = c_parser_only
+    t = BytesIO(b"\xB0")
+
+    if PY3:
+        msg = "'utf-8' codec can't encode character"
+        t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape")
+    else:
+        msg = "'utf8' codec can't decode byte"
+
+    with pytest.raises(UnicodeError, match=msg):
+        parser.read_csv(t, encoding="UTF-8")
+
+
+@pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"])
+def test_read_tarfile(c_parser_only, csv_dir_path, tar_suffix):
+    # see gh-16530
+    #
+    # Unfortunately, Python's CSV library can't handle
+    # tarfile objects (expects string, not bytes when
+    # iterating through a file-like).
+    parser = c_parser_only
+    tar_path = os.path.join(csv_dir_path, "tar_csv" + tar_suffix)
+
+    with tarfile.open(tar_path, "r") as tar:
+        data_file = tar.extractfile("tar_data.csv")
+
+        out = parser.read_csv(data_file)
+        expected = DataFrame({"a": [1]})
+        tm.assert_frame_equal(out, expected)
+
+
+@pytest.mark.high_memory
+def test_bytes_exceed_2gb(c_parser_only):
+    # see gh-16798
+    #
+    # Read from a "CSV" that has a column larger than 2GB.
+    parser = c_parser_only
+
+    if parser.low_memory:
+        pytest.skip("not a high_memory test")
+
+    csv = StringIO("strings\n" + "\n".join(
+        ["x" * (1 << 20) for _ in range(2100)]))
+    df = parser.read_csv(csv)
+    assert not df.empty
diff --git a/pandas/tests/io/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py
index 21286e9b82323..13704e2f542ab 100644
--- a/pandas/tests/io/parser/test_parsers.py
+++ b/pandas/tests/io/parser/test_parsers.py
@@ -11,7 +11,6 @@
 from pandas import DataFrame, read_csv, read_table
 import pandas.util.testing as tm
 
-from .c_parser_only import CParserTests
 from .comment import CommentTests
 from .common import ParserTests
 from .compression import CompressionTests
@@ -57,7 +56,7 @@ def setup_method(self, datapath):
         self.csv_shiftjs = os.path.join(self.dirpath, 'sauron.SHIFT_JIS.csv')
 
 
-class TestCParserHighMemory(BaseParser, CParserTests):
+class TestCParserHighMemory(BaseParser):
     engine = 'c'
     low_memory = False
     float_precision_choices = [None, 'high', 'round_trip']
@@ -77,7 +76,7 @@ def read_table(self, *args, **kwds):
         return df
 
 
-class TestCParserLowMemory(BaseParser, CParserTests):
+class TestCParserLowMemory(BaseParser):
    engine = 'c'
    low_memory = True
    float_precision_choices = [None, 'high', 'round_trip']
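
Usage sketch (illustrative, not part of the patch): after this refactor, a parser test is a plain module-level function that requests one of the conftest.py fixtures (`all_parsers`, `c_parser_only`, or `python_parser_only`), and pytest runs it once per parser variant (ids "c_high", "c_low", "python"). The fixtures and the kwarg injection in BaseParser.update_kwargs() come from the diff above; the test name and data below are hypothetical:

    import pandas.util.testing as tm
    from pandas import DataFrame
    from pandas.compat import StringIO


    def test_roundtrip_sketch(all_parsers):
        # Collected three times, once per parser object the fixture yields.
        # BaseParser.read_csv() injects engine= and low_memory= into the
        # kwargs before delegating to pandas.read_csv, so the test body
        # stays engine-agnostic.
        parser = all_parsers
        result = parser.read_csv(StringIO("a,b\n1,2"))
        tm.assert_frame_equal(result, DataFrame({"a": [1], "b": [2]}))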