Skip to content

Commit cc93d61

Browse files
committed
ENH: Allow read_csv to handle multi-index in columns
GH3571, GH1651, GH3141
1 parent de27eef commit cc93d61

File tree

7 files changed

+162
-95
lines changed

7 files changed

+162
-95
lines changed

pandas/core/format.py

+35-34
Original file line numberDiff line numberDiff line change
@@ -963,48 +963,49 @@ def _save_header(self):
963963
encoded_labels = []
964964

965965
has_aliases = isinstance(header, (tuple, list, np.ndarray))
966-
if has_aliases or self.header:
966+
if not (has_aliases or self.header):
967+
return
967968

968-
if self.index:
969-
# should write something for index label
970-
if index_label is not False:
971-
if index_label is None:
972-
if isinstance(obj.index, MultiIndex):
973-
index_label = []
974-
for i, name in enumerate(obj.index.names):
975-
if name is None:
976-
name = ''
977-
index_label.append(name)
969+
if self.index:
970+
# should write something for index label
971+
if index_label is not False:
972+
if index_label is None:
973+
if isinstance(obj.index, MultiIndex):
974+
index_label = []
975+
for i, name in enumerate(obj.index.names):
976+
if name is None:
977+
name = ''
978+
index_label.append(name)
979+
else:
980+
index_label = obj.index.name
981+
if index_label is None:
982+
index_label = ['']
978983
else:
979-
index_label = obj.index.name
980-
if index_label is None:
981-
index_label = ['']
982-
else:
983-
index_label = [index_label]
984-
elif not isinstance(index_label, (list, tuple, np.ndarray)):
985-
# given a string for a DF with Index
986-
index_label = [index_label]
984+
index_label = [index_label]
985+
elif not isinstance(index_label, (list, tuple, np.ndarray)):
986+
# given a string for a DF with Index
987+
index_label = [index_label]
987988

988-
encoded_labels = list(index_label)
989-
else:
990-
encoded_labels = []
989+
encoded_labels = list(index_label)
990+
else:
991+
encoded_labels = []
991992

992-
if has_aliases:
993-
if len(header) != len(cols):
994-
raise ValueError(('Writing %d cols but got %d aliases'
995-
% (len(cols), len(header))))
996-
else:
997-
write_cols = header
993+
if has_aliases:
994+
if len(header) != len(cols):
995+
raise ValueError(('Writing %d cols but got %d aliases'
996+
% (len(cols), len(header))))
998997
else:
999-
write_cols = cols
998+
write_cols = header
999+
else:
1000+
write_cols = cols
10001001

1001-
if not has_mi_columns:
1002-
encoded_labels += list(write_cols)
1002+
if not has_mi_columns:
1003+
encoded_labels += list(write_cols)
10031004

1004-
else:
1005+
else:
10051006

1006-
if not has_mi_columns:
1007-
encoded_labels += list(cols)
1007+
if not has_mi_columns:
1008+
encoded_labels += list(cols)
10081009

10091010
# write out the mi
10101011
if has_mi_columns:

pandas/io/parsers.py

+42-11
Original file line numberDiff line numberDiff line change
@@ -677,19 +677,18 @@ def read(self, nrows=None):
677677
if self.options.get('as_recarray'):
678678
return ret
679679

680-
index, columns, col_dict = ret
681-
682680
# May alter columns / col_dict
683-
# index, columns, col_dict = self._create_index(col_dict, columns)
681+
index, columns, col_dict = self._create_index(ret)
684682

685683
df = DataFrame(col_dict, columns=columns, index=index)
686684

687685
if self.squeeze and len(df.columns) == 1:
688686
return df[df.columns[0]]
689687
return df
690688

691-
def _create_index(self, col_dict, columns):
692-
pass
689+
def _create_index(self, ret):
690+
index, columns, col_dict = ret
691+
return index, columns, col_dict
693692

694693
def get_chunk(self, size=None):
695694
if size is None:
@@ -709,6 +708,7 @@ def __init__(self, kwds):
709708

710709
self.index_col = kwds.pop('index_col', None)
711710
self.index_names = None
711+
self.col_names = None
712712

713713
self.parse_dates = kwds.pop('parse_dates', False)
714714
self.date_parser = kwds.pop('date_parser', None)
@@ -942,7 +942,32 @@ def __init__(self, src, **kwds):
942942
if self._reader.header is None:
943943
self.names = None
944944
else:
945-
self.names = list(self._reader.header)
945+
if len(self._reader.header) > 1:
946+
# the names are the tuples of the header that are not the index cols
947+
# 0 is the name of the index, assuming index_col is a list of column
948+
# numbers
949+
if (self._reader.leading_cols == 0 and
950+
_is_index_col(self.index_col)):
951+
ic = self.index_col
952+
if not isinstance(ic, (list,tuple,np.ndarray)):
953+
ic = [ ic ]
954+
sic = set(ic)
955+
956+
header = list(self._reader.header)
957+
index_names = header.pop(-1)
958+
self.index_names = [ index_names[i] for i in ic ]
959+
field_count = len(header[0])
960+
961+
def extract(r):
962+
return tuple([ r[i] for i in range(field_count) if i not in sic ])
963+
964+
self.names = ic + zip(*[ extract(r) for r in header ])
965+
self.col_names = [ r[0] if len(r[0]) else None for r in header ]
966+
passed_names = True
967+
else:
968+
raise Exception("must have an index_col when have a multi-index specified")
969+
else:
970+
self.names = list(self._reader.header[0])
946971

947972
if self.names is None:
948973
if self.prefix:
@@ -958,12 +983,14 @@ def __init__(self, src, **kwds):
958983

959984
if not self._has_complex_date_col:
960985
if (self._reader.leading_cols == 0 and
961-
_is_index_col(self.index_col)):
986+
_is_index_col(self.index_col)):
962987

963988
self._name_processed = True
964-
(self.index_names, self.names,
965-
self.index_col) = _clean_index_names(self.names,
966-
self.index_col)
989+
(index_names, self.names,
990+
self.index_col) = _clean_index_names(self.names, self.index_col)
991+
992+
if self.index_names is None:
993+
self.index_names = index_names
967994

968995
if self._reader.header is None and not passed_names:
969996
self.index_names = [None] * len(self.index_names)
@@ -1051,6 +1078,10 @@ def read(self, nrows=None):
10511078
names, data = self._do_date_conversions(names, data)
10521079
index = self._make_index(data, alldata, names)
10531080

1081+
# possibly create a column mi here
1082+
if all([ isinstance(c,tuple) for c in names]):
1083+
names = MultiIndex.from_tuples(names,names=self.col_names)
1084+
10541085
return index, names, data
10551086

10561087
def _filter_usecols(self, names):
@@ -1061,7 +1092,7 @@ def _filter_usecols(self, names):
10611092
return names
10621093

10631094
def _get_index_names(self):
1064-
names = list(self._reader.header)
1095+
names = list(self._reader.header[0])
10651096
idx_names = None
10661097

10671098
if self._reader.leading_cols == 0 and self.index_col is not None:

pandas/io/tests/test_cparser.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ def test_header_not_enough_lines(self):
179179
reader = TextReader(StringIO(data), delimiter=',', header=2,
180180
as_recarray=True)
181181
header = reader.header
182-
expected = ['a', 'b', 'c']
182+
expected = [['a', 'b', 'c']]
183183
self.assertEquals(header, expected)
184184

185185
recs = reader.read()

pandas/src/parser.pyx

+70-43
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,8 @@ cdef extern from "parser/tokenizer.h":
143143
char thousands
144144

145145
int header # Boolean: 1: has header, 0: no header
146+
int header_start # header row start
147+
int header_end # header row end
146148

147149
void *skipset
148150
int skip_footer
@@ -242,7 +244,7 @@ cdef class TextReader:
242244
object na_values, true_values, false_values
243245
object memory_map
244246
object as_recarray
245-
object header, names
247+
object header, names, header_start, header_end
246248
object low_memory
247249
object skiprows
248250
object compact_ints, use_unsigned
@@ -256,6 +258,8 @@ cdef class TextReader:
256258
delimiter=b',',
257259

258260
header=0,
261+
header_start=0,
262+
header_end=0,
259263
names=None,
260264

261265
memory_map=False,
@@ -435,11 +439,28 @@ cdef class TextReader:
435439
# TODO: no header vs. header is not the first row
436440
if header is None:
437441
# sentinel value
442+
self.parser.header_start = -1
443+
self.parser.header_end = -1
438444
self.parser.header = -1
439445
self.parser_start = 0
446+
self.header = []
440447
else:
441-
self.parser.header = header
442-
self.parser_start = header + 1
448+
if isinstance(header, list) and len(header):
449+
# need to artificially skip the final line
450+
# which is still a header line
451+
header.append(header[-1]+1)
452+
453+
self.parser.header_start = header[0]
454+
self.parser.header_end = header[-1]
455+
self.parser.header = header[0]
456+
self.parser_start = header[-1] + 1
457+
self.header = header
458+
else:
459+
self.parser.header_start = header
460+
self.parser.header_end = header
461+
self.parser.header = header
462+
self.parser_start = header + 1
463+
self.header = [ header ]
443464

444465
self.names = names
445466
self.header, self.table_width = self._get_header()
@@ -534,8 +555,10 @@ cdef class TextReader:
534555
' got %s type' % type(source))
535556

536557
cdef _get_header(self):
558+
# header is now a list of lists, so field_count should use header[0]
559+
537560
cdef:
538-
size_t i, start, data_line, field_count, passed_count
561+
size_t i, start, data_line, field_count, passed_count, hr
539562
char *word
540563
object name
541564
int status
@@ -544,49 +567,53 @@ cdef class TextReader:
544567

545568
header = []
546569

547-
if self.parser.header >= 0:
548-
# Header is in the file
570+
if self.parser.header_start >= 0:
549571

550-
if self.parser.lines < self.parser.header + 1:
551-
self._tokenize_rows(self.parser.header + 2)
552-
553-
# e.g., if header=3 and file only has 2 lines
554-
if self.parser.lines < self.parser.header + 1:
555-
raise CParserError('Passed header=%d but only %d lines in file'
556-
% (self.parser.header, self.parser.lines))
572+
# Header is in the file
573+
for hr in self.header:
557574

558-
field_count = self.parser.line_fields[self.parser.header]
559-
start = self.parser.line_start[self.parser.header]
575+
this_header = []
560576

561-
# TODO: Py3 vs. Py2
562-
counts = {}
563-
for i in range(field_count):
564-
word = self.parser.words[start + i]
577+
if self.parser.lines < hr + 1:
578+
self._tokenize_rows(hr + 2)
565579

566-
if self.c_encoding == NULL and not PY3:
567-
name = PyBytes_FromString(word)
568-
else:
569-
if self.c_encoding == NULL or self.c_encoding == b'utf-8':
570-
name = PyUnicode_FromString(word)
571-
else:
572-
name = PyUnicode_Decode(word, strlen(word),
573-
self.c_encoding, errors)
580+
# e.g., if header=3 and file only has 2 lines
581+
if self.parser.lines < hr + 1:
582+
raise CParserError('Passed header=%d but only %d lines in file'
583+
% (self.parser.header, self.parser.lines))
574584

575-
if name == '':
576-
name = 'Unnamed: %d' % i
585+
field_count = self.parser.line_fields[hr]
586+
start = self.parser.line_start[hr]
577587

588+
# TODO: Py3 vs. Py2
589+
counts = {}
590+
for i in range(field_count):
591+
word = self.parser.words[start + i]
578592

579-
count = counts.get(name, 0)
580-
if count > 0 and self.mangle_dupe_cols:
581-
header.append('%s.%d' % (name, count))
582-
else:
583-
header.append(name)
584-
counts[name] = count + 1
593+
if self.c_encoding == NULL and not PY3:
594+
name = PyBytes_FromString(word)
595+
else:
596+
if self.c_encoding == NULL or self.c_encoding == b'utf-8':
597+
name = PyUnicode_FromString(word)
598+
else:
599+
name = PyUnicode_Decode(word, strlen(word),
600+
self.c_encoding, errors)
601+
602+
if name == '':
603+
name = 'Unnamed: %d' % i
604+
605+
count = counts.get(name, 0)
606+
if count > 0 and self.mangle_dupe_cols:
607+
this_header.append('%s.%d' % (name, count))
608+
else:
609+
this_header.append(name)
610+
counts[name] = count + 1
585611

586-
data_line = self.parser.header + 1
612+
data_line = hr + 1
613+
header.append(this_header)
587614

588615
if self.names is not None:
589-
header = self.names
616+
header = [ self.names ]
590617

591618
elif self.names is not None:
592619
# Enforce this unless usecols
@@ -597,11 +624,11 @@ cdef class TextReader:
597624
if self.parser.lines < 1:
598625
self._tokenize_rows(1)
599626

600-
header = self.names
627+
header = [ self.names ]
601628
data_line = 0
602629

603630
if self.parser.lines < 1:
604-
field_count = len(header)
631+
field_count = len(header[0])
605632
else:
606633
field_count = self.parser.line_fields[data_line]
607634
else:
@@ -613,7 +640,7 @@ cdef class TextReader:
613640

614641
# Corner case, not enough lines in the file
615642
if self.parser.lines < data_line + 1:
616-
field_count = len(header)
643+
field_count = len(header[0])
617644
else: # not self.has_usecols:
618645

619646
field_count = self.parser.line_fields[data_line]
@@ -622,7 +649,7 @@ cdef class TextReader:
622649
if self.names is not None:
623650
field_count = max(field_count, len(self.names))
624651

625-
passed_count = len(header)
652+
passed_count = len(header[0])
626653

627654
# if passed_count > field_count:
628655
# raise CParserError('Column names have %d fields, '
@@ -1038,10 +1065,10 @@ cdef class TextReader:
10381065
if self.header is not None:
10391066
j = i - self.leading_cols
10401067
# hack for #2442
1041-
if j == len(self.header):
1068+
if j == len(self.header[0]):
10421069
return j
10431070
else:
1044-
return self.header[j]
1071+
return self.header[0][j]
10451072
else:
10461073
return None
10471074

0 commit comments

Comments
 (0)