Skip to content

Commit c6c9c0b

Browse files
Scott E Lasley, jreback
Scott E Lasley
authored and jreback committed
BUG: Fix buffer overflows in tokenizer.c with certain malformed input files. GH9205
1 parent b8e9590 commit c6c9c0b

File tree

5 files changed

+233
-131
lines changed

5 files changed

+233
-131
lines changed

doc/source/whatsnew/v0.16.0.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -157,8 +157,6 @@ Bug Fixes
157157
- Bug in the returned ``Series.dt.components`` index was reset to the default index (:issue:`9247`)
158158

159159

160-
161-
162160
- Fixed bug in ``to_sql`` when mapping a Timestamp object column (datetime
163161
column with timezone info) to the according sqlalchemy type (:issue:`9085`).
164162
- Fixed bug in ``to_sql`` ``dtype`` argument not accepting an instantiated
@@ -237,3 +235,5 @@ Bug Fixes
237235

238236

239237
- ``SparseSeries`` and ``SparsePanel`` now accept zero argument constructors (same as their non-sparse counterparts) (:issue:`9272`).
238+
239+
- Bug in ``read_csv`` with buffer overflows with certain malformed input files (:issue:`9205`)

pandas/io/tests/test_parsers.py

+26
Original file line numberDiff line numberDiff line change
@@ -3258,6 +3258,19 @@ def test_fallback_to_python(self):
32583258
self.read_table(StringIO(data), engine='c', skip_footer=1)
32593259

32603260

3261+
def test_buffer_overflow(self):
3262+
# GH9205
3263+
# test certain malformed input files that cause buffer overflows in
3264+
# tokenizer.c
3265+
malfw = "1\r1\r1\r 1\r 1\r" # buffer overflow in words pointer
3266+
malfs = "1\r1\r1\r 1\r 1\r11\r" # buffer overflow in stream pointer
3267+
malfl = "1\r1\r1\r 1\r 1\r11\r1\r" # buffer overflow in lines pointer
3268+
for malf in (malfw, malfs, malfl):
3269+
try:
3270+
df = self.read_table(StringIO(malf))
3271+
except Exception as cperr:
3272+
self.assertIn('Buffer overflow caught - possible malformed input file.', str(cperr))
3273+
32613274
class TestCParserLowMemory(ParserTests, tm.TestCase):
32623275

32633276
def read_csv(self, *args, **kwds):
@@ -3666,6 +3679,19 @@ def test_raise_on_sep_with_delim_whitespace(self):
36663679
self.read_table(StringIO(data), sep='\s', delim_whitespace=True)
36673680

36683681

3682+
def test_buffer_overflow(self):
3683+
# GH9205
3684+
# test certain malformed input files that cause buffer overflows in
3685+
# tokenizer.c
3686+
malfw = "1\r1\r1\r 1\r 1\r" # buffer overflow in words pointer
3687+
malfs = "1\r1\r1\r 1\r 1\r11\r" # buffer overflow in stream pointer
3688+
malfl = "1\r1\r1\r 1\r 1\r11\r1\r" # buffer overflow in lines pointer
3689+
for malf in (malfw, malfs, malfl):
3690+
try:
3691+
df = self.read_table(StringIO(malf))
3692+
except Exception as cperr:
3693+
self.assertIn('Buffer overflow caught - possible malformed input file.', str(cperr))
3694+
36693695
class TestMiscellaneous(tm.TestCase):
36703696

36713697
# for tests that don't fit into any of the other classes, e.g. those that

pandas/parser.pyx

+11-9
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ cdef extern from "parser/tokenizer.h":
137137
int quoting # style of quoting to write */
138138

139139
# hmm =/
140-
int numeric_field
140+
# int numeric_field
141141

142142
char commentchar
143143
int allow_embedded_newline
@@ -198,7 +198,7 @@ cdef extern from "parser/tokenizer.h":
198198

199199
int64_t str_to_int64(char *p_item, int64_t int_min,
200200
int64_t int_max, int *error, char tsep)
201-
uint64_t str_to_uint64(char *p_item, uint64_t uint_max, int *error)
201+
# uint64_t str_to_uint64(char *p_item, uint64_t uint_max, int *error)
202202

203203
double xstrtod(const char *p, char **q, char decimal, char sci,
204204
char tsep, int skip_trailing)
@@ -207,12 +207,12 @@ cdef extern from "parser/tokenizer.h":
207207
double round_trip(const char *p, char **q, char decimal, char sci,
208208
char tsep, int skip_trailing)
209209

210-
inline int to_complex(char *item, double *p_real,
211-
double *p_imag, char sci, char decimal)
210+
# inline int to_complex(char *item, double *p_real,
211+
# double *p_imag, char sci, char decimal)
212212
inline int to_longlong(char *item, long long *p_value)
213-
inline int to_longlong_thousands(char *item, long long *p_value,
214-
char tsep)
215-
inline int to_boolean(char *item, uint8_t *val)
213+
# inline int to_longlong_thousands(char *item, long long *p_value,
214+
# char tsep)
215+
int to_boolean(char *item, uint8_t *val)
216216

217217

218218
cdef extern from "parser/io.h":
@@ -1055,7 +1055,8 @@ cdef class TextReader:
10551055
bint user_dtype,
10561056
kh_str_t *na_hashset,
10571057
object na_flist):
1058-
cdef kh_str_t *true_set, *false_set
1058+
cdef kh_str_t *true_set
1059+
cdef kh_str_t *false_set
10591060

10601061
if dtype[1] == 'i' or dtype[1] == 'u':
10611062
result, na_count = _try_int64(self.parser, i, start, end,
@@ -1443,7 +1444,8 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start,
14431444
int error
14441445
Py_ssize_t i, j
14451446
coliter_t it
1446-
char *word, *data
1447+
char *word
1448+
char *data
14471449
ndarray result
14481450

14491451
result = np.empty(line_end - line_start, dtype='|S%d' % width)

0 commit comments

Comments
 (0)