Skip to content

BUG: Fix buffer overflows in tokenizer.c that caused python to segfault with certain #9360

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions doc/source/whatsnew/v0.16.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,6 @@ Bug Fixes
- Bug in the returned ``Series.dt.components`` index was reset to the default index (:issue:`9247`)




- Fixed bug in ``to_sql`` when mapping a Timestamp object column (datetime
column with timezone info) to the according sqlalchemy type (:issue:`9085`).
- Fixed bug in ``to_sql`` ``dtype`` argument not accepting an instantiated
Expand Down Expand Up @@ -210,3 +208,6 @@ Bug Fixes
- Fixes issue with ``index_col=False`` when ``usecols`` is also specified in ``read_csv``. (:issue:`9082`)
- Bug where ``wide_to_long`` would modify the input stubnames list (:issue:`9204`)
- Bug in to_sql not storing float64 values using double precision. (:issue:`9009`)


- Bug in ``read_csv`` causing buffer overflows with certain malformed input files (:issue:`9205`)
26 changes: 26 additions & 0 deletions pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3258,6 +3258,19 @@ def test_fallback_to_python(self):
self.read_table(StringIO(data), engine='c', skip_footer=1)


def test_buffer_overflow(self):
# GH9205
# test certain malformed input files that cause buffer overflows in
# tokenizer.c
malfw = "1\r1\r1\r 1\r 1\r" # buffer overflow in words pointer
malfs = "1\r1\r1\r 1\r 1\r11\r" # buffer overflow in stream pointer
malfl = "1\r1\r1\r 1\r 1\r11\r1\r" # buffer overflow in lines pointer
for malf in (malfw, malfs, malfl):
try:
df = self.read_table(StringIO(malf))
except Exception as cperr:
self.assertIn('Buffer overflow caught - possible malformed input file.', str(cperr))

class TestCParserLowMemory(ParserTests, tm.TestCase):

def read_csv(self, *args, **kwds):
Expand Down Expand Up @@ -3666,6 +3679,19 @@ def test_raise_on_sep_with_delim_whitespace(self):
self.read_table(StringIO(data), sep='\s', delim_whitespace=True)


def test_buffer_overflow(self):
# GH9205
# test certain malformed input files that cause buffer overflows in
# tokenizer.c
malfw = "1\r1\r1\r 1\r 1\r" # buffer overflow in words pointer
malfs = "1\r1\r1\r 1\r 1\r11\r" # buffer overflow in stream pointer
malfl = "1\r1\r1\r 1\r 1\r11\r1\r" # buffer overflow in lines pointer
for malf in (malfw, malfs, malfl):
try:
df = self.read_table(StringIO(malf))
except Exception as cperr:
self.assertIn('Buffer overflow caught - possible malformed input file.', str(cperr))

class TestMiscellaneous(tm.TestCase):

# for tests that don't fit into any of the other classes, e.g. those that
Expand Down
20 changes: 11 additions & 9 deletions pandas/parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ cdef extern from "parser/tokenizer.h":
int quoting # style of quoting to write */

# hmm =/
int numeric_field
# int numeric_field

char commentchar
int allow_embedded_newline
Expand Down Expand Up @@ -198,7 +198,7 @@ cdef extern from "parser/tokenizer.h":

int64_t str_to_int64(char *p_item, int64_t int_min,
int64_t int_max, int *error, char tsep)
uint64_t str_to_uint64(char *p_item, uint64_t uint_max, int *error)
# uint64_t str_to_uint64(char *p_item, uint64_t uint_max, int *error)

double xstrtod(const char *p, char **q, char decimal, char sci,
char tsep, int skip_trailing)
Expand All @@ -207,12 +207,12 @@ cdef extern from "parser/tokenizer.h":
double round_trip(const char *p, char **q, char decimal, char sci,
char tsep, int skip_trailing)

inline int to_complex(char *item, double *p_real,
double *p_imag, char sci, char decimal)
# inline int to_complex(char *item, double *p_real,
# double *p_imag, char sci, char decimal)
inline int to_longlong(char *item, long long *p_value)
inline int to_longlong_thousands(char *item, long long *p_value,
char tsep)
inline int to_boolean(char *item, uint8_t *val)
# inline int to_longlong_thousands(char *item, long long *p_value,
# char tsep)
int to_boolean(char *item, uint8_t *val)


cdef extern from "parser/io.h":
Expand Down Expand Up @@ -1055,7 +1055,8 @@ cdef class TextReader:
bint user_dtype,
kh_str_t *na_hashset,
object na_flist):
cdef kh_str_t *true_set, *false_set
cdef kh_str_t *true_set
cdef kh_str_t *false_set

if dtype[1] == 'i' or dtype[1] == 'u':
result, na_count = _try_int64(self.parser, i, start, end,
Expand Down Expand Up @@ -1443,7 +1444,8 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start,
int error
Py_ssize_t i, j
coliter_t it
char *word, *data
char *word
char *data
ndarray result

result = np.empty(line_end - line_start, dtype='|S%d' % width)
Expand Down
Loading