From d3c5f282adff487fc4b23c7e41f10119a35cd811 Mon Sep 17 00:00:00 2001
From: Jeff Carey
Date: Tue, 29 Nov 2016 21:53:57 -0800
Subject: [PATCH] BUG: Fixed incorrect stream size check (#14125)

---
 doc/source/whatsnew/v0.20.0.txt         |  1 +
 pandas/io/tests/parser/c_parser_only.py | 12 ++++++++++++
 pandas/src/parser/tokenizer.c           |  8 +++-----
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 79ca5a5e13d02..da41aaa03bfd5 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -383,6 +383,7 @@ Bug Fixes
 - Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
 - Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`)
 - Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`)
+- Bug in ``pd.read_csv()`` in which a file containing a row with many columns followed by rows with fewer columns would cause a crash (:issue:`14125`)
 - Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`)
 - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)
 

diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
index 6df3c513faf4a..73edda90720af 100644
--- a/pandas/io/tests/parser/c_parser_only.py
+++ b/pandas/io/tests/parser/c_parser_only.py
@@ -395,3 +395,15 @@ def test_float_precision_round_trip_with_text(self):
                            float_precision='round_trip',
                            header=None)
         tm.assert_frame_equal(df, DataFrame({0: ['a']}))
+
+    def test_large_difference_in_columns(self):
+        # gh-14125
+        count = 10000
+        large_row = ('X,' * count)[:-1] + '\n'
+        normal_row = 'XXXXXX XXXXXX,111111111111111\n'
+        test_input = (large_row + normal_row * 6)[:-1]
+        result = self.read_csv(StringIO(test_input), header=None, usecols=[0])
+        rows = test_input.split('\n')
+        expected = DataFrame([row.split(',')[0] for row in rows])
+
+        tm.assert_frame_equal(result, expected)

diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
index 7cddefd40cfc5..b6428c8b76743 100644
--- a/pandas/src/parser/tokenizer.c
+++ b/pandas/src/parser/tokenizer.c
@@ -592,9 +592,9 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
     TRACE(                                                                    \
         ("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \
          c, slen, self->stream_cap, self->stream_len))                        \
-    if (slen >= maxstreamsize) {                                              \
-        TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= maxstreamsize(%d)\n", slen,   \
-               maxstreamsize))                                                \
+    if (slen >= self->stream_cap) {                                           \
+        TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen,      \
+               self->stream_cap))                                             \
         int bufsize = 100;                                                    \
         self->error_msg = (char *)malloc(bufsize);                            \
         snprintf(self->error_msg, bufsize,                                    \
@@ -711,7 +711,6 @@ int skip_this_line(parser_t *self, int64_t rownum) {
 int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) {
     int i, slen;
     int should_skip;
-    long maxstreamsize;
     char c;
     char *stream;
     char *buf = self->data + self->datapos;
@@ -723,7 +722,6 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) {
 
     stream = self->stream + self->stream_len;
     slen = self->stream_len;
-    maxstreamsize = self->stream_cap;
 
     TRACE(("%s\n", buf));
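
Note (commentary, not part of the patch): the crash in #14125 comes from
checking the stream length against a capacity value copied into a local
(`maxstreamsize`) before the tokenizing loop. The stream buffer can be
reallocated while tokenizing, so after a very wide row the buffer may be
shrunk again while the cached local still reports the old, larger
capacity; PUSH_CHAR's guard then passes and the write lands past the end
of the smaller buffer. Reading the live `self->stream_cap` closes that
window. Below is a minimal, self-contained C sketch of the stale-bound
pattern; the `buf_t` struct and `push_char_*` helpers are hypothetical
stand-ins, not pandas APIs.

    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical miniature of the parser's character stream. */
    typedef struct {
        char *stream;
        size_t stream_cap;
        size_t stream_len;
    } buf_t;

    /* Buggy pattern: bound checked against a capacity snapshot, as the
     * old tokenize_bytes() did with its local `maxstreamsize`. */
    static int push_char_buggy(buf_t *b, size_t cached_cap, char c) {
        if (b->stream_len >= cached_cap)    /* stale bound */
            return -1;
        b->stream[b->stream_len++] = c;     /* overflows if the buffer
                                             * shrank after the snapshot */
        return 0;
    }

    /* Fixed pattern: bound checked against the live capacity, matching
     * the patch's `slen >= self->stream_cap`. */
    static int push_char_fixed(buf_t *b, char c) {
        if (b->stream_len >= b->stream_cap)
            return -1;
        b->stream[b->stream_len++] = c;
        return 0;
    }

    int main(void) {
        buf_t b = { malloc(64), 64, 0 };
        size_t cached_cap = b.stream_cap;   /* snapshot, like maxstreamsize */

        /* A wide row forces growth, then trimming shrinks the buffer;
         * only the live stream_cap reflects the new size. */
        b.stream = realloc(b.stream, 8);
        b.stream_cap = 8;
        b.stream_len = 8;                   /* buffer is now exactly full */

        /* push_char_buggy(&b, cached_cap, 'x') would pass its check
         * (8 >= 64 is false) and write out of bounds. */
        (void)cached_cap;
        if (push_char_fixed(&b, 'x') == -1)
            puts("live-capacity check correctly reports a full buffer");

        free(b.stream);
        return 0;
    }

Compiled and run, the sketch prints the message from the fixed check;
uncommenting the buggy call reproduces the out-of-bounds write (visible
under AddressSanitizer). The same reasoning explains why the patch also
deletes the `maxstreamsize` local entirely: once the guard reads the live
field, the snapshot has no remaining use.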