Skip to content

Commit b8dae94

Browse files
committed
BUG: fix issues with \r-delimited files in C tokenization code. close #2296
1 parent a28a5cc commit b8dae94

File tree

4 files changed

+98
-22
lines changed

4 files changed

+98
-22
lines changed

pandas/io/tests/test_cparser.py

+34
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,40 @@ def _make_reader(**kwds):
265265
self.assertTrue((result[1] == exp[1]).all())
266266
self.assertTrue((result[2] == exp[2]).all())
267267

268+
def test_cr_delimited(self):
269+
def _test(text, **kwargs):
270+
nice_text = text.replace('\r', '\r\n')
271+
result = TextReader(StringIO(text), **kwargs).read()
272+
expected = TextReader(StringIO(nice_text), **kwargs).read()
273+
assert_array_dicts_equal(result, expected)
274+
275+
data = 'a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12'
276+
_test(data, delimiter=',')
277+
278+
data = 'a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12'
279+
_test(data, delim_whitespace=True)
280+
281+
data = 'a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12'
282+
_test(data, delimiter=',')
283+
284+
sample = ('A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r'
285+
'AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r'
286+
',BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0')
287+
_test(sample, delimiter=',')
288+
289+
data = 'A B C\r 2 3\r4 5 6'
290+
_test(data, delim_whitespace=True)
291+
292+
def test_empty_field_eof(self):
293+
data = 'a,b,c\n1,2,3\n4,,'
294+
295+
result = TextReader(StringIO(data), delimiter=',').read()
296+
297+
expected = {0: np.array([1, 4]),
298+
1: np.array(['2', ''], dtype=object),
299+
2: np.array(['3', ''], dtype=object)}
300+
assert_array_dicts_equal(result, expected)
301+
268302

269303
def assert_array_dicts_equal(left, right):
270304
for k, v in left.iteritems():

pandas/src/parse_helper.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ int to_double(char *item, double *p_value, char sci, char decimal)
1717
#define PyBytes_AS_STRING PyString_AS_STRING
1818
#endif
1919

20-
PANDAS_INLINE int floatify(PyObject* str, double *result) {
20+
int floatify(PyObject* str, double *result) {
2121
int status;
2222
char *data;
2323
PyObject* tmp = NULL;

pandas/src/parser/tokenizer.c

+62-20
Original file line numberDiff line numberDiff line change
@@ -377,7 +377,10 @@ int P_INLINE end_field(parser_t *self) {
377377
// set pointer and metadata
378378
self->words[self->words_len] = self->pword_start;
379379

380-
TRACE(("Saw word %s at: %d\n", self->pword_start, self->word_start))
380+
TRACE(("Char diff: %d\n", self->pword_start - self->words[0]));
381+
382+
TRACE(("Saw word %s at: %d. Total: %d\n",
383+
self->pword_start, self->word_start, self->words_len + 1))
381384

382385
self->word_starts[self->words_len] = self->word_start;
383386
self->words_len++;
@@ -399,6 +402,9 @@ int P_INLINE end_line(parser_t *self) {
399402

400403
fields = self->line_fields[self->lines];
401404

405+
TRACE(("Line end, nfields: %d\n", fields));
406+
407+
402408
if (self->lines > 0) {
403409
ex_fields = self->line_fields[self->lines - 1];
404410
}
@@ -524,9 +530,18 @@ int parser_buffer_bytes(parser_t *self, size_t nbytes) {
524530

525531
// printf("pushing %c\n", c);
526532

533+
#if defined(VERBOSE)
534+
#define PUSH_CHAR(c) \
535+
printf("Pushing %c, slen now: %d\n", c, slen); \
536+
*stream++ = c; \
537+
slen++;
538+
#else
527539
#define PUSH_CHAR(c) \
528540
*stream++ = c; \
529541
slen++;
542+
#endif
543+
544+
530545

531546
// This is a little bit of a hack but works for now
532547

@@ -538,19 +553,37 @@ int parser_buffer_bytes(parser_t *self, size_t nbytes) {
538553
stream = self->stream + self->stream_len; \
539554
slen = self->stream_len;
540555

541-
#define END_LINE() \
556+
#define END_LINE_STATE(STATE) \
542557
self->stream_len = slen; \
543558
if (end_line(self) < 0) { \
544559
goto parsingerror; \
545560
} \
546-
self->state = START_RECORD; \
561+
self->state = STATE; \
547562
if (line_limit > 0 && self->lines == start_lines + line_limit) { \
548563
goto linelimit; \
549564
\
550565
} \
551566
stream = self->stream + self->stream_len; \
552567
slen = self->stream_len;
553568

569+
#define END_LINE_AND_FIELD_STATE(STATE) \
570+
self->stream_len = slen; \
571+
if (end_line(self) < 0) { \
572+
goto parsingerror; \
573+
} \
574+
if (end_field(self) < 0) { \
575+
goto parsingerror; \
576+
} \
577+
stream = self->stream + self->stream_len; \
578+
slen = self->stream_len; \
579+
self->state = STATE; \
580+
if (line_limit > 0 && self->lines == start_lines + line_limit) { \
581+
goto linelimit; \
582+
\
583+
}
584+
585+
#define END_LINE() END_LINE_STATE(START_RECORD)
586+
554587
#define IS_WHITESPACE(c) ((c == ' ' || c == '\t'))
555588

556589
typedef int (*parser_op)(parser_t *self, size_t line_limit);
@@ -747,14 +780,15 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
747780
if (c == '\n') {
748781
END_LINE();
749782
/* self->state = START_RECORD; */
783+
} else if (c == self->delimiter){
784+
// Handle \r-delimited files
785+
END_LINE_AND_FIELD_STATE(START_FIELD);
750786
} else {
751-
/* self->error_msg = ("new-line character seen in" */
752-
/* " unquoted field - do you need" */
753-
/* " to open the file in " */
754-
/* "universal-newline mode?"); */
755-
goto parsingerror;
787+
PUSH_CHAR(c);
788+
END_LINE_STATE(IN_FIELD);
756789
}
757790
break;
791+
758792
default:
759793
break;
760794

@@ -804,8 +838,9 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
804838
// Next character in file
805839
c = *buf++;
806840

807-
TRACE(("Iter: %d Char: %c Line %d field_count %d\n",
808-
i, c, self->file_lines + 1, self->line_fields[self->lines]));
841+
TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n",
842+
i, c, self->file_lines + 1, self->line_fields[self->lines],
843+
self->state));
809844

810845
switch(self->state) {
811846

@@ -828,10 +863,14 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
828863
} else if (c == '\r') {
829864
self->state = EAT_CRNL;
830865
break;
866+
} else if (IS_WHITESPACE(c)) {
867+
END_FIELD();
868+
self->state = EAT_WHITESPACE;
869+
break;
870+
} else {
871+
/* normal character - handle as START_FIELD */
872+
self->state = START_FIELD;
831873
}
832-
833-
/* normal character - handle as START_FIELD */
834-
self->state = START_FIELD;
835874
/* fallthru */
836875
case START_FIELD:
837876
/* expecting field */
@@ -972,14 +1011,15 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
9721011
if (c == '\n') {
9731012
END_LINE();
9741013
/* self->state = START_RECORD; */
1014+
} else if (IS_WHITESPACE(c)){
1015+
// Handle \r-delimited files
1016+
END_LINE_AND_FIELD_STATE(EAT_WHITESPACE);
9751017
} else {
976-
/* self->error_msg = ("new-line character seen in" */
977-
/* " unquoted field - do you need" */
978-
/* " to open the file in " */
979-
/* "universal-newline mode?"); */
980-
goto parsingerror;
1018+
PUSH_CHAR(c);
1019+
END_LINE_STATE(IN_FIELD);
9811020
}
9821021
break;
1022+
9831023
default:
9841024
break;
9851025

@@ -1009,13 +1049,13 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
10091049

10101050

10111051
int parser_handle_eof(parser_t *self) {
1012-
TRACE(("handling eof, datalen: %d\n", self->datalen))
1052+
TRACE(("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state))
10131053
if (self->datalen == 0 && (self->state != START_RECORD)) {
10141054
// test cases needed here
10151055
// TODO: empty field at end of line
10161056
TRACE(("handling eof\n"));
10171057

1018-
if (self->state == IN_FIELD) {
1058+
if (self->state == IN_FIELD || self->state == START_FIELD) {
10191059
if (end_field(self) < 0)
10201060
return -1;
10211061
} else if (self->state == QUOTE_IN_QUOTED_FIELD) {
@@ -1213,6 +1253,8 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {
12131253

12141254
status = tokenize_bytes(self, nrows);
12151255

1256+
/* debug_print_parser(self); */
1257+
12161258
if (status < 0) {
12171259
// XXX
12181260
TRACE(("Status %d returned from tokenize_bytes, breaking\n",

pandas/src/parser/tokenizer.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ See LICENSE for the license
4343
#if defined(__GNUC__)
4444
#define P_INLINE __inline__
4545
#elif defined(_MSC_VER)
46-
#define P_INLINE
46+
#define P_INLINE
4747
#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
4848
#define P_INLINE inline
4949
#else

0 commit comments

Comments (0)