
Commit 64d7670

jeffcarey authored and jreback committed
BUG: Fixed incorrect stream size check (#14125)
closes #14125

Previously, self->stream_cap was copied into a local variable called maxstreamsize each time tokenize_bytes ran, and this copy was then checked in the PUSH_CHAR macro. However, there is one other place in the file where make_stream_space() is called (in end_line()), and when this happens self->stream_cap is increased but maxstreamsize is not updated, making the check incorrect. In rare circumstances (see the original issue or the test case) this could cause a crash. The resolution is to check self->stream_cap directly.

Author: Jeff Carey <[email protected]>

Closes #15195 from jeffcarey/fix/14125 and squashes the following commits:

d3c5f28 [Jeff Carey] BUG: Fixed incorrect stream size check (#14125)
1 parent 9309eba · commit 64d7670
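To make the failure mode concrete, here is a minimal sketch of the pattern the commit removes. It is not the pandas code itself: toy_parser_t and grow_stream are illustrative stand-ins for parser_t and make_stream_space(). The point is that a capacity cached in a local variable goes stale the moment another routine grows the buffer.

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for parser_t with only the fields needed here. */
typedef struct {
    char  *stream;
    size_t stream_cap;
    size_t stream_len;
} toy_parser_t;

/* Stand-in for make_stream_space(): may reallocate and raise the cap. */
static int grow_stream(toy_parser_t *self, size_t needed) {
    while (self->stream_cap < self->stream_len + needed) {
        size_t new_cap = self->stream_cap * 2;
        char *tmp = realloc(self->stream, new_cap);
        if (tmp == NULL) return -1;
        self->stream = tmp;
        self->stream_cap = new_cap;
    }
    return 0;
}

int main(void) {
    toy_parser_t p = {malloc(4), 4, 0};

    /* Old pattern: capacity copied once per call, like maxstreamsize. */
    size_t maxstreamsize = p.stream_cap;

    /* Something akin to end_line() grows the buffer mid-call... */
    grow_stream(&p, 16);

    /* ...so the cached copy and the live field now disagree, and any
       bounds check against the cached copy is wrong from here on. */
    printf("cached cap: %zu, live cap: %zu\n", maxstreamsize, p.stream_cap);

    /* The fix in this commit: compare against the live field instead. */
    if (p.stream_len >= p.stream_cap) {
        fprintf(stderr, "stream full\n");
    }

    free(p.stream);
    return 0;
}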

3 files changed: +16 −5 lines changed

doc/source/whatsnew/v0.20.0.txt (+1)

@@ -387,6 +387,7 @@ Bug Fixes
 - Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
 - Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`)
 - Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`)
+- Bug in ``pd.read_csv()`` in which a file containing a row with many columns followed by rows with fewer columns would cause a crash (:issue:`14125`)
 - Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`)
 
 - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)

pandas/io/tests/parser/c_parser_only.py (+12)

@@ -395,3 +395,15 @@ def test_float_precision_round_trip_with_text(self):
                            float_precision='round_trip',
                            header=None)
         tm.assert_frame_equal(df, DataFrame({0: ['a']}))
+
+    def test_large_difference_in_columns(self):
+        # gh-14125
+        count = 10000
+        large_row = ('X,' * count)[:-1] + '\n'
+        normal_row = 'XXXXXX XXXXXX,111111111111111\n'
+        test_input = (large_row + normal_row * 6)[:-1]
+        result = self.read_csv(StringIO(test_input), header=None, usecols=[0])
+        rows = test_input.split('\n')
+        expected = DataFrame([row.split(',')[0] for row in rows])
+
+        tm.assert_frame_equal(result, expected)
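The new regression test mirrors the shape reported in gh-14125: a single row with 10,000 columns followed by several much narrower rows. Per the commit message, this drives the path where end_line() calls make_stream_space() and grows self->stream_cap after tokenize_bytes has already cached it.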

pandas/src/parser/tokenizer.c (+3 −5)

@@ -592,9 +592,9 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
     TRACE(                                                                 \
         ("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \
          c, slen, self->stream_cap, self->stream_len))                    \
-    if (slen >= maxstreamsize) {                                           \
-        TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= maxstreamsize(%d)\n", slen, \
-               maxstreamsize))                                             \
+    if (slen >= self->stream_cap) {                                        \
+        TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen,   \
+               self->stream_cap))                                          \
         int bufsize = 100;                                                 \
         self->error_msg = (char *)malloc(bufsize);                         \
         snprintf(self->error_msg, bufsize,                                 \

@@ -711,7 +711,6 @@ int skip_this_line(parser_t *self, int64_t rownum) {
 int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) {
     int i, slen;
     int should_skip;
-    long maxstreamsize;
     char c;
     char *stream;
     char *buf = self->data + self->datapos;

@@ -723,7 +722,6 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) {
 
     stream = self->stream + self->stream_len;
     slen = self->stream_len;
-    maxstreamsize = self->stream_cap;
 
     TRACE(("%s\n", buf));
 
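For contrast, here is a distilled sketch of the corrected idiom, using illustrative names (buf_t, PUSH_CHAR_LIVE, fill) rather than the real pandas macro: locals may still cache the write cursor and length, but the overflow guard consults the live self->stream_cap field on every push, so it stays correct no matter which helper grew the buffer in the meantime.

#include <stdio.h>
#include <stdlib.h>

typedef struct {
    char  *stream;
    size_t stream_cap;
    size_t stream_len;
} buf_t;

/* Guard reads the live capacity field on every push, mirroring the fix;
   on overflow it reports and bails out of the enclosing function. */
#define PUSH_CHAR_LIVE(self, stream, slen, c)                  \
    do {                                                       \
        if ((size_t)(slen) >= (self)->stream_cap) {            \
            fprintf(stderr, "buffer overflow caught\n");       \
            return -1;                                         \
        }                                                      \
        *(stream)++ = (c);                                     \
        (slen)++;                                              \
    } while (0)

static int fill(buf_t *self, const char *src) {
    /* Locals cache the cursor and length, as tokenize_bytes does... */
    char *stream = self->stream + self->stream_len;
    size_t slen = self->stream_len;

    for (; *src != '\0'; src++) {
        /* ...but the capacity check always consults the struct. */
        PUSH_CHAR_LIVE(self, stream, slen, *src);
    }
    self->stream_len = slen;
    return 0;
}

int main(void) {
    buf_t b = {malloc(8), 8, 0};
    if (fill(&b, "a,b,c") == 0)
        printf("stored %zu bytes\n", b.stream_len);
    free(b.stream);
    return 0;
}

Dropping the cached capacity entirely, as the commit does, is simpler than the alternative of refreshing the local at every site that can call make_stream_space().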
