
Commit 64d7670

jeffcarey authored and jreback committed
BUG: Fixed incorrect stream size check (#14125)
closes #14125

Previously, self->stream_cap was copied into a local variable called maxstreamsize each time tokenize_bytes ran, and this copy was then checked in the PUSH_CHAR macro. However, there is one other place in the file where make_stream_space() is called (in end_line()), and when this happens self->stream_cap is increased but maxstreamsize is not updated, making the check incorrect. In rare circumstances (see the original issue or the test case) this could cause a crash. The resolution is to check self->stream_cap directly.

Author: Jeff Carey <[email protected]>

Closes #15195 from jeffcarey/fix/14125 and squashes the following commits:

d3c5f28 [Jeff Carey] BUG: Fixed incorrect stream size check (#14125)
1 parent 9309eba · commit 64d7670
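To make the failure mode concrete, here is a minimal sketch of the pattern the commit removes. It is not the pandas code itself: toy_parser_t and grow_stream are illustrative stand-ins for parser_t and make_stream_space(). The point is that a capacity cached in a local variable goes stale the moment another routine grows the buffer.

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for parser_t with only the fields needed here. */
typedef struct {
    char  *stream;
    size_t stream_cap;
    size_t stream_len;
} toy_parser_t;

/* Stand-in for make_stream_space(): may reallocate and raise the cap. */
static int grow_stream(toy_parser_t *self, size_t needed) {
    while (self->stream_cap < self->stream_len + needed) {
        size_t new_cap = self->stream_cap * 2;
        char *tmp = realloc(self->stream, new_cap);
        if (tmp == NULL) return -1;
        self->stream = tmp;
        self->stream_cap = new_cap;
    }
    return 0;
}

int main(void) {
    toy_parser_t p = {malloc(4), 4, 0};

    /* Old pattern: capacity copied once per call, like maxstreamsize. */
    size_t maxstreamsize = p.stream_cap;

    /* Something akin to end_line() grows the buffer mid-call... */
    grow_stream(&p, 16);

    /* ...so the cached copy and the live field now disagree, and any
       bounds check against the cached copy is wrong from here on. */
    printf("cached cap: %zu, live cap: %zu\n", maxstreamsize, p.stream_cap);

    /* The fix in this commit: compare against the live field instead. */
    if (p.stream_len >= p.stream_cap) {
        fprintf(stderr, "stream full\n");
    }

    free(p.stream);
    return 0;
}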

3 files changed: +16 −5 lines changed

doc/source/whatsnew/v0.20.0.txt (+1)

@@ -387,6 +387,7 @@ Bug Fixes
 - Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
 - Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`)
 - Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`)
+- Bug in ``pd.read_csv()`` in which a file containing a row with many columns followed by rows with fewer columns would cause a crash (:issue:`14125`)
 - Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`)
 
 - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)

pandas/io/tests/parser/c_parser_only.py (+12)

@@ -395,3 +395,15 @@ def test_float_precision_round_trip_with_text(self):
                            float_precision='round_trip',
                            header=None)
         tm.assert_frame_equal(df, DataFrame({0: ['a']}))
+
+    def test_large_difference_in_columns(self):
+        # gh-14125
+        count = 10000
+        large_row = ('X,' * count)[:-1] + '\n'
+        normal_row = 'XXXXXX XXXXXX,111111111111111\n'
+        test_input = (large_row + normal_row * 6)[:-1]
+        result = self.read_csv(StringIO(test_input), header=None, usecols=[0])
+        rows = test_input.split('\n')
+        expected = DataFrame([row.split(',')[0] for row in rows])
+
+        tm.assert_frame_equal(result, expected)
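The new regression test mirrors the shape reported in gh-14125: a single row with 10,000 columns followed by several much narrower rows. Per the commit message, this drives the path where end_line() calls make_stream_space() and grows self->stream_cap after tokenize_bytes has already cached it.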

pandas/src/parser/tokenizer.c (+3 −5)

@@ -592,9 +592,9 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
     TRACE(                                                                 \
         ("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \
          c, slen, self->stream_cap, self->stream_len))                    \
-    if (slen >= maxstreamsize) {                                           \
-        TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= maxstreamsize(%d)\n", slen, \
-               maxstreamsize))                                             \
+    if (slen >= self->stream_cap) {                                        \
+        TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen,   \
+               self->stream_cap))                                          \
         int bufsize = 100;                                                 \
         self->error_msg = (char *)malloc(bufsize);                         \
         snprintf(self->error_msg, bufsize,                                 \

@@ -711,7 +711,6 @@ int skip_this_line(parser_t *self, int64_t rownum) {
 int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) {
     int i, slen;
     int should_skip;
-    long maxstreamsize;
     char c;
     char *stream;
     char *buf = self->data + self->datapos;

@@ -723,7 +722,6 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) {
 
     stream = self->stream + self->stream_len;
     slen = self->stream_len;
-    maxstreamsize = self->stream_cap;
 
     TRACE(("%s\n", buf));
 
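For contrast, here is a distilled sketch of the corrected idiom, using illustrative names (buf_t, PUSH_CHAR_LIVE, fill) rather than the real pandas macro: locals may still cache the write cursor and length, but the overflow guard consults the live self->stream_cap field on every push, so it stays correct no matter which helper grew the buffer in the meantime.

#include <stdio.h>
#include <stdlib.h>

typedef struct {
    char  *stream;
    size_t stream_cap;
    size_t stream_len;
} buf_t;

/* Guard reads the live capacity field on every push, mirroring the fix;
   on overflow it reports and bails out of the enclosing function. */
#define PUSH_CHAR_LIVE(self, stream, slen, c)                  \
    do {                                                       \
        if ((size_t)(slen) >= (self)->stream_cap) {            \
            fprintf(stderr, "buffer overflow caught\n");       \
            return -1;                                         \
        }                                                      \
        *(stream)++ = (c);                                     \
        (slen)++;                                              \
    } while (0)

static int fill(buf_t *self, const char *src) {
    /* Locals cache the cursor and length, as tokenize_bytes does... */
    char *stream = self->stream + self->stream_len;
    size_t slen = self->stream_len;

    for (; *src != '\0'; src++) {
        /* ...but the capacity check always consults the struct. */
        PUSH_CHAR_LIVE(self, stream, slen, *src);
    }
    self->stream_len = slen;
    return 0;
}

int main(void) {
    buf_t b = {malloc(8), 8, 0};
    if (fill(&b, "a,b,c") == 0)
        printf("stored %zu bytes\n", b.stream_len);
    free(b.stream);
    return 0;
}

Dropping the cached capacity entirely, as the commit does, is simpler than the alternative of refreshing the local at every site that can call make_stream_space().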
