@@ -377,7 +377,10 @@ int P_INLINE end_field(parser_t *self) {
377
377
// set pointer and metadata
378
378
self -> words [self -> words_len ] = self -> pword_start ;
379
379
380
- TRACE (("Saw word %s at: %d\n" , self -> pword_start , self -> word_start ))
380
+ TRACE (("Char diff: %d\n" , self -> pword_start - self -> words [0 ]));
381
+
382
+ TRACE (("Saw word %s at: %d. Total: %d\n" ,
383
+ self -> pword_start , self -> word_start , self -> words_len + 1 ))
381
384
382
385
self -> word_starts [self -> words_len ] = self -> word_start ;
383
386
self -> words_len ++ ;
@@ -399,6 +402,9 @@ int P_INLINE end_line(parser_t *self) {
399
402
400
403
fields = self -> line_fields [self -> lines ];
401
404
405
+ TRACE (("Line end, nfields: %d\n" , fields ));
406
+
407
+
402
408
if (self -> lines > 0 ) {
403
409
ex_fields = self -> line_fields [self -> lines - 1 ];
404
410
}
@@ -524,9 +530,18 @@ int parser_buffer_bytes(parser_t *self, size_t nbytes) {
524
530
525
531
// printf("pushing %c\n", c);
526
532
533
/*
 * PUSH_CHAR(c): append one character to the output stream.
 *
 * Writes `c` through the `stream` cursor, advances the cursor, and
 * increments `slen`. Both `stream` and `slen` are names taken from the
 * expanding scope, so the macro can only be used where they are declared.
 *
 * When VERBOSE is defined, a trace line is printed first. The printf runs
 * before `slen++`, so the value shown as "slen now" is the length before
 * this character is counted.
 * NOTE(review): the trace uses %d for `slen`; its declared type is not
 * visible here - confirm it is an int.
 *
 * NOTE(review): the expansion is a bare sequence of statements (no
 * do { ... } while (0) wrapper), so it must only be used inside a braced
 * block; after an unbraced `if` only the first statement would be guarded.
 */
+ #if defined(VERBOSE )
534
+ #define PUSH_CHAR (c ) \
535
+ printf("Pushing %c, slen now: %d\n", c, slen); \
536
+ *stream++ = c; \
537
+ slen++;
538
+ #else
527
539
#define PUSH_CHAR (c ) \
528
540
*stream++ = c; \
529
541
slen++;
542
+ #endif
543
+
544
+
530
545
531
546
// This is a little bit of a hack but works for now
532
547
@@ -538,19 +553,37 @@ int parser_buffer_bytes(parser_t *self, size_t nbytes) {
538
553
stream = self->stream + self->stream_len; \
539
554
slen = self->stream_len;
540
555
541
/*
 * END_LINE_STATE(STATE): finish the current line and continue in STATE.
 *
 * Steps, in order:
 *   1. store the local `slen` into self->stream_len;
 *   2. call end_line(self); a negative result jumps to `parsingerror`;
 *   3. set self->state to STATE;
 *   4. if `line_limit` is positive and self->lines has reached
 *      start_lines + line_limit, jump to `linelimit`;
 *   5. otherwise reload the local `stream` / `slen` from self.
 *
 * The local reload in step 5 is skipped when the macro leaves through
 * `linelimit`.
 *
 * Requires `self`, `stream`, `slen`, `line_limit`, `start_lines` and the
 * labels `parsingerror` and `linelimit` in the expanding function.
 *
 * NOTE(review): the expansion is a bare sequence of statements (no
 * do { ... } while (0) wrapper); use it only inside a braced block.
 */
- #define END_LINE () \
556
+ #define END_LINE_STATE ( STATE ) \
542
557
self->stream_len = slen; \
543
558
if (end_line(self) < 0) { \
544
559
goto parsingerror; \
545
560
} \
546
- self->state = START_RECORD; \
561
+ self->state = STATE; \
547
562
if (line_limit > 0 && self->lines == start_lines + line_limit) { \
548
563
goto linelimit; \
549
564
\
550
565
} \
551
566
stream = self->stream + self->stream_len; \
552
567
slen = self->stream_len;
553
568
569
/*
 * END_LINE_AND_FIELD_STATE(STATE): finish the current line, then finish a
 * field, and continue in STATE.
 *
 * Steps, in order:
 *   1. store the local `slen` into self->stream_len;
 *   2. call end_line(self), then end_field(self); a negative result from
 *      either jumps to `parsingerror`;
 *   3. reload the local `stream` / `slen` from self;
 *   4. set self->state to STATE;
 *   5. if `line_limit` is positive and self->lines has reached
 *      start_lines + line_limit, jump to `linelimit`.
 *
 * The locals are reloaded before the line-limit check, so they are already
 * in sync with self when control reaches `linelimit`.
 *
 * NOTE(review): end_line runs before end_field, so the field closed here is
 * presumably an empty first field of the following line - confirm against
 * the call sites.
 *
 * Requires `self`, `stream`, `slen`, `line_limit`, `start_lines` and the
 * labels `parsingerror` and `linelimit` in the expanding function.
 *
 * NOTE(review): the expansion is a bare sequence of statements (no
 * do { ... } while (0) wrapper); use it only inside a braced block.
 */
+ #define END_LINE_AND_FIELD_STATE (STATE ) \
570
+ self->stream_len = slen; \
571
+ if (end_line(self) < 0) { \
572
+ goto parsingerror; \
573
+ } \
574
+ if (end_field(self) < 0) { \
575
+ goto parsingerror; \
576
+ } \
577
+ stream = self->stream + self->stream_len; \
578
+ slen = self->stream_len; \
579
+ self->state = STATE; \
580
+ if (line_limit > 0 && self->lines == start_lines + line_limit) { \
581
+ goto linelimit; \
582
+ \
583
+ }
584
+
585
/* END_LINE(): finish the current line and resume parsing in START_RECORD. */
+ #define END_LINE () END_LINE_STATE(START_RECORD)
586
+
554
587
/*
 * IS_WHITESPACE(c): true when `c` is a space or a horizontal tab.
 * NOTE(review): `c` is not parenthesized and is evaluated twice, so pass a
 * plain variable with no side effects.
 */
#define IS_WHITESPACE (c ) ((c == ' ' || c == '\t'))
555
588
556
589
typedef int (* parser_op )(parser_t * self , size_t line_limit );
@@ -747,14 +780,15 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
747
780
if (c == '\n' ) {
748
781
END_LINE ();
749
782
/* self->state = START_RECORD; */
783
+ } else if (c == self -> delimiter ){
784
+ // Handle \r-delimited files
785
+ END_LINE_AND_FIELD_STATE (START_FIELD );
750
786
} else {
751
- /* self->error_msg = ("new-line character seen in" */
752
- /* " unquoted field - do you need" */
753
- /* " to open the file in " */
754
- /* "universal-newline mode?"); */
755
- goto parsingerror ;
787
+ PUSH_CHAR (c );
788
+ END_LINE_STATE (IN_FIELD );
756
789
}
757
790
break ;
791
+
758
792
default :
759
793
break ;
760
794
@@ -804,8 +838,9 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
804
838
// Next character in file
805
839
c = * buf ++ ;
806
840
807
- TRACE (("Iter: %d Char: %c Line %d field_count %d\n" ,
808
- i , c , self -> file_lines + 1 , self -> line_fields [self -> lines ]));
841
+ TRACE (("Iter: %d Char: %c Line %d field_count %d, state %d\n" ,
842
+ i , c , self -> file_lines + 1 , self -> line_fields [self -> lines ],
843
+ self -> state ));
809
844
810
845
switch (self -> state ) {
811
846
@@ -828,10 +863,14 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
828
863
} else if (c == '\r' ) {
829
864
self -> state = EAT_CRNL ;
830
865
break ;
866
+ } else if (IS_WHITESPACE (c )) {
867
+ END_FIELD ();
868
+ self -> state = EAT_WHITESPACE ;
869
+ break ;
870
+ } else {
871
+ /* normal character - handle as START_FIELD */
872
+ self -> state = START_FIELD ;
831
873
}
832
-
833
- /* normal character - handle as START_FIELD */
834
- self -> state = START_FIELD ;
835
874
/* fallthru */
836
875
case START_FIELD :
837
876
/* expecting field */
@@ -972,14 +1011,15 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
972
1011
if (c == '\n' ) {
973
1012
END_LINE ();
974
1013
/* self->state = START_RECORD; */
1014
+ } else if (IS_WHITESPACE (c )){
1015
+ // Handle \r-delimited files
1016
+ END_LINE_AND_FIELD_STATE (EAT_WHITESPACE );
975
1017
} else {
976
- /* self->error_msg = ("new-line character seen in" */
977
- /* " unquoted field - do you need" */
978
- /* " to open the file in " */
979
- /* "universal-newline mode?"); */
980
- goto parsingerror ;
1018
+ PUSH_CHAR (c );
1019
+ END_LINE_STATE (IN_FIELD );
981
1020
}
982
1021
break ;
1022
+
983
1023
default :
984
1024
break ;
985
1025
@@ -1009,13 +1049,13 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
1009
1049
1010
1050
1011
1051
int parser_handle_eof (parser_t * self ) {
1012
- TRACE (("handling eof, datalen: %d\n" , self -> datalen ))
1052
+ TRACE (("handling eof, datalen: %d, pstate: %d \n" , self -> datalen , self -> state ))
1013
1053
if (self -> datalen == 0 && (self -> state != START_RECORD )) {
1014
1054
// test cases needed here
1015
1055
// TODO: empty field at end of line
1016
1056
TRACE (("handling eof\n" ));
1017
1057
1018
- if (self -> state == IN_FIELD ) {
1058
+ if (self -> state == IN_FIELD || self -> state == START_FIELD ) {
1019
1059
if (end_field (self ) < 0 )
1020
1060
return -1 ;
1021
1061
} else if (self -> state == QUOTE_IN_QUOTED_FIELD ) {
@@ -1213,6 +1253,8 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {
1213
1253
1214
1254
status = tokenize_bytes (self , nrows );
1215
1255
1256
+ /* debug_print_parser(self); */
1257
+
1216
1258
if (status < 0 ) {
1217
1259
// XXX
1218
1260
TRACE (("Status %d returned from tokenize_bytes, breaking\n" ,
0 commit comments