@@ -143,6 +143,8 @@ cdef extern from "parser/tokenizer.h":
143
143
char thousands
144
144
145
145
int header # Boolean: 1: has header, 0: no header
146
+ int header_start # header row start
147
+ int header_end # header row end
146
148
147
149
void * skipset
148
150
int skip_footer
@@ -242,7 +244,7 @@ cdef class TextReader:
242
244
object na_values, true_values, false_values
243
245
object memory_map
244
246
object as_recarray
245
- object header, names
247
+ object header, names, header_start, header_end
246
248
object low_memory
247
249
object skiprows
248
250
object compact_ints, use_unsigned
@@ -256,6 +258,8 @@ cdef class TextReader:
256
258
delimiter = b' ,' ,
257
259
258
260
header = 0 ,
261
+ header_start = 0 ,
262
+ header_end = 0 ,
259
263
names = None ,
260
264
261
265
memory_map = False ,
@@ -435,11 +439,28 @@ cdef class TextReader:
435
439
# TODO: no header vs. header is not the first row
436
440
if header is None :
437
441
# sentinel value
442
+ self .parser.header_start = - 1
443
+ self .parser.header_end = - 1
438
444
self .parser.header = - 1
439
445
self .parser_start = 0
446
+ self .header = []
440
447
else :
441
- self .parser.header = header
442
- self .parser_start = header + 1
448
+ if isinstance (header, list ) and len (header):
449
+ # need to artificially skip the final line
450
+ # which is still a header line
451
+ header.append(header[- 1 ]+ 1 )
452
+
453
+ self .parser.header_start = header[0 ]
454
+ self .parser.header_end = header[- 1 ]
455
+ self .parser.header = header[0 ]
456
+ self .parser_start = header[- 1 ] + 1
457
+ self .header = header
458
+ else :
459
+ self .parser.header_start = header
460
+ self .parser.header_end = header
461
+ self .parser.header = header
462
+ self .parser_start = header + 1
463
+ self .header = [ header ]
443
464
444
465
self .names = names
445
466
self .header, self .table_width = self ._get_header()
@@ -534,8 +555,10 @@ cdef class TextReader:
534
555
' got %s type' % type (source))
535
556
536
557
cdef _get_header(self ):
558
+ # header is now a list of lists, so field_count should use header[0]
559
+
537
560
cdef:
538
- size_t i, start, data_line, field_count, passed_count
561
+ size_t i, start, data_line, field_count, passed_count, hr
539
562
char * word
540
563
object name
541
564
int status
@@ -544,49 +567,53 @@ cdef class TextReader:
544
567
545
568
header = []
546
569
547
- if self .parser.header >= 0 :
548
- # Header is in the file
570
+ if self .parser.header_start >= 0 :
549
571
550
- if self .parser.lines < self .parser.header + 1 :
551
- self ._tokenize_rows(self .parser.header + 2 )
552
-
553
- # e.g., if header=3 and file only has 2 lines
554
- if self .parser.lines < self .parser.header + 1 :
555
- raise CParserError(' Passed header=%d but only %d lines in file'
556
- % (self .parser.header, self .parser.lines))
572
+ # Header is in the file
573
+ for hr in self .header:
557
574
558
- field_count = self .parser.line_fields[self .parser.header]
559
- start = self .parser.line_start[self .parser.header]
575
+ this_header = []
560
576
561
- # TODO: Py3 vs. Py2
562
- counts = {}
563
- for i in range (field_count):
564
- word = self .parser.words[start + i]
577
+ if self .parser.lines < hr + 1 :
578
+ self ._tokenize_rows(hr + 2 )
565
579
566
- if self .c_encoding == NULL and not PY3:
567
- name = PyBytes_FromString(word)
568
- else :
569
- if self .c_encoding == NULL or self .c_encoding == b' utf-8' :
570
- name = PyUnicode_FromString(word)
571
- else :
572
- name = PyUnicode_Decode(word, strlen(word),
573
- self .c_encoding, errors)
580
+ # e.g., if header=3 and file only has 2 lines
581
+ if self .parser.lines < hr + 1 :
582
+ raise CParserError(' Passed header=%d but only %d lines in file'
583
+ % (self .parser.header, self .parser.lines))
574
584
575
- if name == ' ' :
576
- name = ' Unnamed: %d ' % i
585
+ field_count = self .parser.line_fields[hr]
586
+ start = self .parser.line_start[hr]
577
587
588
+ # TODO: Py3 vs. Py2
589
+ counts = {}
590
+ for i in range (field_count):
591
+ word = self .parser.words[start + i]
578
592
579
- count = counts.get(name, 0 )
580
- if count > 0 and self .mangle_dupe_cols:
581
- header.append(' %s .%d ' % (name, count))
582
- else :
583
- header.append(name)
584
- counts[name] = count + 1
593
+ if self .c_encoding == NULL and not PY3:
594
+ name = PyBytes_FromString(word)
595
+ else :
596
+ if self .c_encoding == NULL or self .c_encoding == b' utf-8' :
597
+ name = PyUnicode_FromString(word)
598
+ else :
599
+ name = PyUnicode_Decode(word, strlen(word),
600
+ self .c_encoding, errors)
601
+
602
+ if name == ' ' :
603
+ name = ' Unnamed: %d ' % i
604
+
605
+ count = counts.get(name, 0 )
606
+ if count > 0 and self .mangle_dupe_cols:
607
+ this_header.append(' %s .%d ' % (name, count))
608
+ else :
609
+ this_header.append(name)
610
+ counts[name] = count + 1
585
611
586
- data_line = self .parser.header + 1
612
+ data_line = hr + 1
613
+ header.append(this_header)
587
614
588
615
if self .names is not None :
589
- header = self .names
616
+ header = [ self .names ]
590
617
591
618
elif self .names is not None :
592
619
# Enforce this unless usecols
@@ -597,11 +624,11 @@ cdef class TextReader:
597
624
if self .parser.lines < 1 :
598
625
self ._tokenize_rows(1 )
599
626
600
- header = self .names
627
+ header = [ self .names ]
601
628
data_line = 0
602
629
603
630
if self .parser.lines < 1 :
604
- field_count = len (header)
631
+ field_count = len (header[ 0 ] )
605
632
else :
606
633
field_count = self .parser.line_fields[data_line]
607
634
else :
@@ -613,7 +640,7 @@ cdef class TextReader:
613
640
614
641
# Corner case, not enough lines in the file
615
642
if self .parser.lines < data_line + 1 :
616
- field_count = len (header)
643
+ field_count = len (header[ 0 ] )
617
644
else : # not self.has_usecols:
618
645
619
646
field_count = self .parser.line_fields[data_line]
@@ -622,7 +649,7 @@ cdef class TextReader:
622
649
if self .names is not None :
623
650
field_count = max (field_count, len (self .names))
624
651
625
- passed_count = len (header)
652
+ passed_count = len (header[ 0 ] )
626
653
627
654
# if passed_count > field_count:
628
655
# raise CParserError('Column names have %d fields, '
@@ -1038,10 +1065,10 @@ cdef class TextReader:
1038
1065
if self .header is not None :
1039
1066
j = i - self .leading_cols
1040
1067
# hack for #2442
1041
- if j == len (self .header):
1068
+ if j == len (self .header[ 0 ] ):
1042
1069
return j
1043
1070
else :
1044
- return self .header[j]
1071
+ return self .header[0 ][ j]
1045
1072
else :
1046
1073
return None
1047
1074
0 commit comments