Skip to content

Commit e05a3d0

Browse files
committed
BUG: parser-tokenizer was left in an incorrect state by the trailing-field handling logic (the stream pointer/length resync was placed after a goto and is now done before it). Closes #2668
1 parent 5da8df7 commit e05a3d0

File tree

4 files changed

+21
-4
lines changed

4 files changed

+21
-4
lines changed

RELEASE.rst

+3
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ pandas 0.10.1
7070
- handle correctly ``Term`` passed types (e.g. ``index<1000``, when index
7171
is ``Int64``), (closes GH512_)
7272
- handle Timestamp correctly in data_columns (closes GH2637_)
73+
7374
- Fix DataFrame.info bug with UTF8-encoded columns. (GH2576_)
7475
- Fix DatetimeIndex handling of FixedOffset tz (GH2604_)
7576
- More robust detection of being in IPython session for wide DataFrame
@@ -86,6 +87,7 @@ pandas 0.10.1
8687
- Don't bork Series containing datetime64 values with to_datetime (GH2699_)
8788
- Fix DataFrame.from_records corner case when passed columns, index column,
8889
but empty record list (GH2633_)
90+
- Fix C parser-tokenizer bug with trailing fields. (GH2668_)
8991

9092
**API Changes**
9193

@@ -108,6 +110,7 @@ pandas 0.10.1
108110
.. _GH2631: https://github.com/pydata/pandas/issues/2631
109111
.. _GH2633: https://github.com/pydata/pandas/issues/2633
110112
.. _GH2637: https://github.com/pydata/pandas/issues/2637
113+
.. _GH2668: https://github.com/pydata/pandas/issues/2668
111114
.. _GH2690: https://github.com/pydata/pandas/issues/2690
112115
.. _GH2692: https://github.com/pydata/pandas/issues/2692
113116
.. _GH2699: https://github.com/pydata/pandas/issues/2699

pandas/io/tests/test_parsers.py

+9
Original file line numberDiff line numberDiff line change
@@ -2017,6 +2017,15 @@ def test_raise_on_passed_int_dtype_with_nas(self):
20172017
skipinitialspace=True,
20182018
dtype={'DOY': np.int64})
20192019

2020+
def test_na_trailing_columns(self):
2021+
data = """Date,Currenncy,Symbol,Type,Units,UnitPrice,Cost,Tax
2022+
2012-03-14,USD,AAPL,BUY,1000
2023+
2012-05-12,USD,SBUX,SELL,500"""
2024+
2025+
result = self.read_csv(StringIO(data))
2026+
self.assertEquals(result['Date'][1], '2012-05-12')
2027+
self.assertTrue(result['UnitPrice'].isnull().all())
2028+
20202029

20212030
class TestParseSQL(unittest.TestCase):
20222031

pandas/src/parser/tokenizer.c

+8-3
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,7 @@ static int end_line(parser_t *self) {
494494
if (self->lines >= self->header + 1 && self->lines > 0) {
495495
while (fields < ex_fields){
496496
end_field(self);
497+
/* printf("Prior word: %s\n", self->words[self->words_len - 2]); */
497498
fields++;
498499
}
499500
}
@@ -503,6 +504,10 @@ static int end_line(parser_t *self) {
503504

504505
self->lines++;
505506

507+
/* coliter_t it; */
508+
/* coliter_setup(&it, self, 5, self->lines - 1); */
509+
/* printf("word at column 5: %s\n", COLITER_NEXT(it)); */
510+
506511
// good line, set new start point
507512
self->line_start[self->lines] = (self->line_start[self->lines - 1] +
508513
fields);
@@ -601,13 +606,13 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
601606
if (end_line(self) < 0) { \
602607
goto parsingerror; \
603608
} \
609+
stream = self->stream + self->stream_len; \
610+
slen = self->stream_len; \
604611
self->state = STATE; \
605612
if (line_limit > 0 && self->lines == start_lines + line_limit) { \
606613
goto linelimit; \
607614
\
608-
} \
609-
stream = self->stream + self->stream_len; \
610-
slen = self->stream_len;
615+
}
611616

612617
#define END_LINE_AND_FIELD_STATE(STATE) \
613618
self->stream_len = slen; \

pandas/src/parser/tokenizer.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ See LICENSE for the license
8888
#define ERROR_NO_DATA 23
8989

9090

91-
/* #define VERBOSE */
91+
// #define VERBOSE
9292

9393
#if defined(VERBOSE)
9494
#define TRACE(X) printf X;

0 commit comments

Comments
 (0)