Skip to content

Commit 6f6b0df

Browse files
committed
Merge pull request #5268 from guyrt/issue-5156-segfault
BUG: Fixed issue #5156: segfault on read_csv
2 parents 8dbaafe + 7fd6eab commit 6f6b0df

File tree

5 files changed

+31
-15
lines changed

5 files changed

+31
-15
lines changed

doc/source/release.rst

+3
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,7 @@ Improvements to existing features
183183
- ``Series`` now supports a ``to_frame`` method to convert it to a single-column DataFrame (:issue:`5164`)
184184
- DatetimeIndex (and date_range) can now be constructed in a left- or
185185
right-open fashion using the ``closed`` parameter (:issue:`4579`)
186+
- Python csv parser now supports usecols (:issue:`4335`)
186187

187188
API Changes
188189
~~~~~~~~~~~
@@ -625,6 +626,8 @@ Bug Fixes
625626
- Fixed bug in Excel writers where frames with duplicate column names weren't
626627
written correctly. (:issue:`5235`)
627628
- Fixed issue with ``drop`` and a non-unique index on Series (:issue:`5248`)
629+
- Fixed seg fault in C parser caused by passing more names than columns in
630+
the file. (:issue:`5156`)
628631

629632
pandas 0.12.0
630633
-------------

pandas/io/tests/test_parsers.py

+9
Original file line numberDiff line numberDiff line change
@@ -1955,6 +1955,15 @@ def test_integer_overflow_bug(self):
19551955
result = self.read_csv(StringIO(data), header=None, sep='\s+')
19561956
self.assertTrue(result[0].dtype == np.float64)
19571957

1958+
def test_catch_too_many_names(self):
1959+
# Issue 5156
1960+
data = """\
1961+
1,2,3
1962+
4,,6
1963+
7,8,9
1964+
10,11,12\n"""
1965+
tm.assertRaises(Exception, read_csv, StringIO(data), header=0, names=['a', 'b', 'c', 'd'])
1966+
19581967

19591968
class TestPythonParser(ParserTests, unittest.TestCase):
19601969
def test_negative_skipfooter_raises(self):

pandas/parser.pyx

+18-13
Original file line numberDiff line numberDiff line change
@@ -801,7 +801,6 @@ cdef class TextReader:
801801
raise StopIteration
802802
self._end_clock('Tokenization')
803803

804-
805804
self._start_clock()
806805
columns = self._convert_column_data(rows=rows,
807806
footer=footer,
@@ -840,11 +839,12 @@ cdef class TextReader:
840839

841840
def _convert_column_data(self, rows=None, upcast_na=False, footer=0):
842841
cdef:
843-
Py_ssize_t i, nused, ncols
842+
Py_ssize_t i, nused
844843
kh_str_t *na_hashset = NULL
845844
int start, end
846845
object name, na_flist
847846
bint na_filter = 0
847+
Py_ssize_t num_cols
848848

849849
start = self.parser_start
850850

@@ -857,6 +857,22 @@ cdef class TextReader:
857857
# if footer > 0:
858858
# end -= footer
859859

860+
#print >> sys.stderr, self.table_width
861+
#print >> sys.stderr, self.leading_cols
862+
#print >> sys.stderr, self.parser.lines
863+
#print >> sys.stderr, start
864+
#print >> sys.stderr, end
865+
#print >> sys.stderr, self.header
866+
#print >> sys.stderr, "index"
867+
num_cols = -1
868+
for i in range(self.parser.lines):
869+
num_cols = (num_cols < self.parser.line_fields[i]) * self.parser.line_fields[i] +\
870+
(num_cols >= self.parser.line_fields[i]) * num_cols
871+
872+
if self.table_width - self.leading_cols > num_cols:
873+
raise CParserError("Too many columns specified: expected %s and found %s" %
874+
(self.table_width - self.leading_cols, num_cols))
875+
860876
results = {}
861877
nused = 0
862878
for i in range(self.table_width):
@@ -1446,7 +1462,6 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
14461462
if na_filter:
14471463
for i in range(lines):
14481464
word = COLITER_NEXT(it)
1449-
14501465
k = kh_get_str(na_hashset, word)
14511466
# in the hash table
14521467
if k != na_hashset.n_buckets:
@@ -1828,16 +1843,6 @@ cdef _apply_converter(object f, parser_t *parser, int col,
18281843

18291844
return lib.maybe_convert_objects(result)
18301845

1831-
# if issubclass(values.dtype.type, (np.number, np.bool_)):
1832-
# return values
1833-
1834-
# # XXX
1835-
# na_values = set([''])
1836-
# try:
1837-
# return lib.maybe_convert_numeric(values, na_values, False)
1838-
# except Exception:
1839-
# na_count = lib.sanitize_objects(values, na_values, False)
1840-
# return result
18411846

18421847
def _to_structured_array(dict columns, object names):
18431848
cdef:

pandas/src/parser/tokenizer.c

-1
Original file line numberDiff line numberDiff line change
@@ -709,7 +709,6 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
709709
if (c == '\n') {
710710
END_FIELD();
711711
END_LINE();
712-
/* self->state = START_RECORD; */
713712
} else if (c == '\r') {
714713
END_FIELD();
715714
self->state = EAT_CRNL;

pandas/src/parser/tokenizer.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ typedef struct parser_t {
161161

162162
int *line_start; // position in words for start of line
163163
int *line_fields; // Number of fields in each line
164-
int lines; // Number of (good) lines observedb
164+
int lines; // Number of (good) lines observed
165165
int file_lines; // Number of file lines observed (including bad or skipped)
166166
int lines_cap; // Vector capacity
167167

0 commit comments

Comments
 (0)