@@ -755,7 +755,42 @@ def _should_parse_dates(self, i):
755
755
else :
756
756
return (j in self .parse_dates ) or (name in self .parse_dates )
757
757
758
- def _make_index (self , data , alldata , columns ):
758
+
759
+ def _extract_multi_indexer_columns (self , header , index_names , col_names , passed_names = False ):
760
+ """ extract and return the names, index_names, col_names
761
+ header is a list-of-lists returned from the parsers """
762
+ if len (header ) < 2 :
763
+ return header [0 ], index_names , col_names , passed_names
764
+
765
+ # the names are the tuples of the header that are not the index cols
766
+ # 0 is the name of the index, assuming index_col is a list of column
767
+ # numbers
768
+ ic = self .index_col
769
+ if not isinstance (ic , (list ,tuple ,np .ndarray )):
770
+ ic = [ ic ]
771
+ sic = set (ic )
772
+
773
+ orig_header = list (header )
774
+ index_names = header .pop (- 1 )
775
+ index_names = [ index_names [i ] for i in ic ]
776
+ field_count = len (header [0 ])
777
+
778
+ def extract (r ):
779
+ return tuple ([ r [i ] for i in range (field_count ) if i not in sic ])
780
+
781
+ names = ic + zip (* [ extract (r ) for r in header ])
782
+ col_names = [ r [0 ] if len (r [0 ]) else None for r in header ]
783
+ passed_names = True
784
+
785
+ return names , index_names , col_names , passed_names
786
+
787
+ def _maybe_make_multi_index_columns (self , columns , col_names = None ):
788
+ # possibly create a column mi here
789
+ if len (columns ) and not isinstance (columns , MultiIndex ) and all ([ isinstance (c ,tuple ) for c in columns ]):
790
+ columns = MultiIndex .from_tuples (columns ,names = col_names )
791
+ return columns
792
+
793
+ def _make_index (self , data , alldata , columns , indexnamerow = False ):
759
794
if not _is_index_col (self .index_col ) or len (self .index_col ) == 0 :
760
795
index = None
761
796
@@ -772,7 +807,15 @@ def _make_index(self, data, alldata, columns):
772
807
index = self ._get_complex_date_index (data , columns )
773
808
index = self ._agg_index (index , try_parse_dates = False )
774
809
775
- return index
810
+ # add names for the index
811
+ if indexnamerow :
812
+ coffset = len (indexnamerow ) - len (columns )
813
+ index .names = indexnamerow [:coffset ]
814
+
815
+ # maybe create a mi on the columns
816
+ columns = self ._maybe_make_multi_index_columns (columns , self .col_names )
817
+
818
+ return index , columns
776
819
777
820
_implicit_index = False
778
821
@@ -955,27 +998,11 @@ def __init__(self, src, **kwds):
955
998
self .names = None
956
999
else :
957
1000
if len (self ._reader .header ) > 1 :
958
- # the names are the tuples of the header that are not the index cols
959
- # 0 is the name of the index, assuming index_col is a list of column
960
- # numbers
1001
+ # we have a multi index in the columns
961
1002
if (self ._reader .leading_cols == 0 and
962
1003
_is_index_col (self .index_col )):
963
- ic = self .index_col
964
- if not isinstance (ic , (list ,tuple ,np .ndarray )):
965
- ic = [ ic ]
966
- sic = set (ic )
967
-
968
- header = list (self ._reader .header )
969
- index_names = header .pop (- 1 )
970
- self .index_names = [ index_names [i ] for i in ic ]
971
- field_count = len (header [0 ])
972
-
973
- def extract (r ):
974
- return tuple ([ r [i ] for i in range (field_count ) if i not in sic ])
975
-
976
- self .names = ic + zip (* [ extract (r ) for r in header ])
977
- self .col_names = [ r [0 ] if len (r [0 ]) else None for r in header ]
978
- passed_names = True
1004
+ self .names , self .index_names , self .col_names , passed_names = self ._extract_multi_indexer_columns (
1005
+ self ._reader .header , self .index_names , self .col_names , passed_names )
979
1006
else :
980
1007
raise Exception ("must have an index_col when have a multi-index "
981
1008
"header is specified" )
@@ -1089,11 +1116,10 @@ def read(self, nrows=None):
1089
1116
data = dict ((k , v ) for k , (i , v ) in zip (names , data ))
1090
1117
1091
1118
names , data = self ._do_date_conversions (names , data )
1092
- index = self ._make_index (data , alldata , names )
1119
+ index , names = self ._make_index (data , alldata , names )
1093
1120
1094
- # possibly create a column mi here
1095
- if all ([ isinstance (c ,tuple ) for c in names ]):
1096
- names = MultiIndex .from_tuples (names ,names = self .col_names )
1121
+ # maybe create a mi on the columns
1122
+ names = self ._maybe_make_multi_index_columns (names , self .col_names )
1097
1123
1098
1124
return index , names , data
1099
1125
@@ -1252,16 +1278,25 @@ def __init__(self, f, **kwds):
1252
1278
self .data = f
1253
1279
self .columns = self ._infer_columns ()
1254
1280
1281
+ # we are processing a multi index column
1282
+ if len (self .columns ) > 1 :
1283
+ self .columns , self .index_names , self .col_names , _ = self ._extract_multi_indexer_columns (
1284
+ self .columns , self .index_names , self .col_names )
1285
+ else :
1286
+ self .columns = self .columns [0 ]
1287
+
1255
1288
# get popped off for index
1256
1289
self .orig_names = list (self .columns )
1257
1290
1258
1291
# needs to be cleaned/refactored
1259
1292
# multiple date column thing turning into a real spaghetti factory
1260
1293
1261
1294
if not self ._has_complex_date_col :
1262
- (self . index_names ,
1295
+ (index_names ,
1263
1296
self .orig_names , _ ) = self ._get_index_name (self .columns )
1264
1297
self ._name_processed = True
1298
+ if self .index_names is None :
1299
+ self .index_names = index_names
1265
1300
self ._first_chunk = True
1266
1301
1267
1302
def _make_reader (self , f ):
@@ -1365,10 +1400,7 @@ def read(self, rows=None):
1365
1400
columns , data = self ._do_date_conversions (self .columns , data )
1366
1401
1367
1402
data = self ._convert_data (data )
1368
- index = self ._make_index (data , alldata , columns )
1369
- if indexnamerow :
1370
- coffset = len (indexnamerow ) - len (columns )
1371
- index .names = indexnamerow [:coffset ]
1403
+ index , columns = self ._make_index (data , alldata , columns , indexnamerow )
1372
1404
1373
1405
return index , columns , data
1374
1406
@@ -1394,39 +1426,52 @@ def _infer_columns(self):
1394
1426
names = self .names
1395
1427
1396
1428
if self .header is not None :
1397
- if isinstance (self .header ,(list ,tuple ,np .ndarray )):
1398
- raise Exception ("PythonParser does not support a multi-index header" )
1429
+ header = self .header
1399
1430
1400
- if len (self .buf ) > 0 :
1401
- line = self .buf [0 ]
1431
+ # we have a mi columns, so read an extra line
1432
+ if isinstance (header ,(list ,tuple ,np .ndarray )):
1433
+ header = list (header ) + [header [- 1 ]+ 1 ]
1402
1434
else :
1403
- line = self ._next_line ()
1404
-
1405
- while self .pos <= self .header :
1406
- line = self ._next_line ()
1435
+ header = [ header ]
1407
1436
1408
1437
columns = []
1409
- for i , c in enumerate (line ):
1410
- if c == '' :
1411
- columns .append ('Unnamed: %d' % i )
1438
+ for hr in header :
1439
+
1440
+ if len (self .buf ) > 0 :
1441
+ line = self .buf [0 ]
1412
1442
else :
1413
- columns . append ( c )
1443
+ line = self . _next_line ( )
1414
1444
1415
- if self .mangle_dupe_cols :
1416
- counts = {}
1417
- for i , col in enumerate (columns ):
1418
- cur_count = counts .get (col , 0 )
1419
- if cur_count > 0 :
1420
- columns [i ] = '%s.%d' % (col , cur_count )
1421
- counts [col ] = cur_count + 1
1445
+ while self .pos <= hr :
1446
+ line = self ._next_line ()
1447
+
1448
+ this_columns = []
1449
+ for i , c in enumerate (line ):
1450
+ if c == '' :
1451
+ this_columns .append ('Unnamed: %d' % i )
1452
+ else :
1453
+ this_columns .append (c )
1454
+
1455
+ if self .mangle_dupe_cols :
1456
+ counts = {}
1457
+ for i , col in enumerate (this_columns ):
1458
+ cur_count = counts .get (col , 0 )
1459
+ if cur_count > 0 :
1460
+ this_columns [i ] = '%s.%d' % (col , cur_count )
1461
+ counts [col ] = cur_count + 1
1462
+
1463
+ columns .append (this_columns )
1422
1464
1423
1465
self ._clear_buffer ()
1424
1466
1425
1467
if names is not None :
1426
- if len (names ) != len (columns ):
1468
+ if len (names ) != len (columns [ 0 ] ):
1427
1469
raise Exception ('Number of passed names did not match '
1428
1470
'number of header fields in the file' )
1429
- columns = names
1471
+ if len (columns ) > 1 :
1472
+ raise Exception ('Cannot pass names with multi-index columns' )
1473
+ columns = [ names ]
1474
+
1430
1475
else :
1431
1476
if len (self .buf ) > 0 :
1432
1477
line = self .buf [0 ]
@@ -1436,11 +1481,11 @@ def _infer_columns(self):
1436
1481
ncols = len (line )
1437
1482
if not names :
1438
1483
if self .prefix :
1439
- columns = ['X%d' % i for i in range (ncols )]
1484
+ columns = [ [ 'X%d' % i for i in range (ncols )] ]
1440
1485
else :
1441
- columns = range (ncols )
1486
+ columns = [ range (ncols ) ]
1442
1487
else :
1443
- columns = names
1488
+ columns = [ names ]
1444
1489
1445
1490
return columns
1446
1491
0 commit comments