52
52
dialect : string or csv.Dialect instance, default None
53
53
If None defaults to Excel dialect. Ignored if sep longer than 1 char
54
54
See csv.Dialect documentation for more details
55
- header : int, default 0 if names parameter not specified,
55
+ header : int, default 0 if names parameter not specified and None otherwise
56
56
Row to use for the column labels of the parsed DataFrame. Specify None if
57
57
there is no header row. Can be a list of integers that specify row
58
58
locations for a multi-index on the columns, e.g. [0,1,3]. Intervening
@@ -917,22 +917,6 @@ def _do_date_conversions(self, names, data):
917
917
918
918
return names , data
919
919
920
- def _exclude_implicit_index (self , alldata ):
921
-
922
- if self ._implicit_index :
923
- excl_indices = self .index_col
924
-
925
- data = {}
926
- offset = 0
927
- for i , col in enumerate (self .orig_names ):
928
- while i + offset in excl_indices :
929
- offset += 1
930
- data [col ] = alldata [i + offset ]
931
- else :
932
- data = dict ((k , v ) for k , v in zip (self .orig_names , alldata ))
933
-
934
- return data
935
-
936
920
937
921
class CParserWrapper (ParserBase ):
938
922
"""
@@ -1173,22 +1157,6 @@ def TextParser(*args, **kwds):
1173
1157
return TextFileReader (* args , ** kwds )
1174
1158
1175
1159
1176
- # delimiter=None, dialect=None, names=None, header=0,
1177
- # index_col=None,
1178
- # na_values=None,
1179
- # na_filter=True,
1180
- # thousands=None,
1181
- # quotechar='"',
1182
- # escapechar=None,
1183
- # doublequote=True,
1184
- # skipinitialspace=False,
1185
- # quoting=csv.QUOTE_MINIMAL,
1186
- # comment=None, parse_dates=False, keep_date_col=False,
1187
- # date_parser=None, dayfirst=False,
1188
- # chunksize=None, skiprows=None, skip_footer=0, converters=None,
1189
- # verbose=False, encoding=None, squeeze=False):
1190
-
1191
-
1192
1160
def count_empty_vals(vals):
    """Return how many entries of ``vals`` are an empty string or ``None``."""
    n_empty = 0
    for entry in vals:
        if entry == '' or entry is None:
            n_empty += 1
    return n_empty
1194
1162
@@ -1242,10 +1210,6 @@ def __init__(self, f, **kwds):
1242
1210
self .buf = []
1243
1211
self .pos = 0
1244
1212
1245
- if kwds ['usecols' ] is not None :
1246
- raise Exception ("usecols not supported with engine='python'"
1247
- " or multicharacter separators (yet)." )
1248
-
1249
1213
self .encoding = kwds ['encoding' ]
1250
1214
self .compression = kwds ['compression' ]
1251
1215
self .skiprows = kwds ['skiprows' ]
@@ -1259,7 +1223,10 @@ def __init__(self, f, **kwds):
1259
1223
self .skipinitialspace = kwds ['skipinitialspace' ]
1260
1224
self .lineterminator = kwds ['lineterminator' ]
1261
1225
self .quoting = kwds ['quoting' ]
1262
- self .mangle_dupe_cols = kwds .get ('mangle_dupe_cols' ,True )
1226
+ self .mangle_dupe_cols = kwds .get ('mangle_dupe_cols' , True )
1227
+ self .usecols = kwds ['usecols' ]
1228
+
1229
+ self .names_passed = kwds ['names' ] or None
1263
1230
1264
1231
self .has_index_names = False
1265
1232
if 'has_index_names' in kwds :
@@ -1283,17 +1250,25 @@ def __init__(self, f, **kwds):
1283
1250
1284
1251
f = TextIOWrapper (f , encoding = self .encoding )
1285
1252
1253
+ # Set self.data to something that can read lines.
1286
1254
if hasattr (f , 'readline' ):
1287
1255
self ._make_reader (f )
1288
1256
else :
1289
1257
self .data = f
1290
1258
1291
- self .columns = self ._infer_columns ()
1259
+ # Get columns in two steps: infer from data, then
1260
+ # infer column indices from self.usecols if it is specified.
1261
+ self ._col_indices = None
1262
+ self .columns , self .num_original_columns = self ._infer_columns ()
1292
1263
1293
- # we are processing a multi index column
1264
+ # Now self.columns has the set of columns that we will process.
1265
+ # The number of original columns is stored in self.num_original_columns.
1294
1266
if len (self .columns ) > 1 :
1267
+ # we are processing a multi index column
1295
1268
self .columns , self .index_names , self .col_names , _ = self ._extract_multi_indexer_columns (
1296
1269
self .columns , self .index_names , self .col_names )
1270
+ # Update list of original names to include all indices.
1271
+ self .num_original_columns = len (self .columns )
1297
1272
else :
1298
1273
self .columns = self .columns [0 ]
1299
1274
@@ -1304,7 +1279,7 @@ def __init__(self, f, **kwds):
1304
1279
# multiple date column thing turning into a real spaghetti factory
1305
1280
if not self ._has_complex_date_col :
1306
1281
(index_names ,
1307
- self .orig_names , _ ) = self ._get_index_name (self .columns )
1282
+ self .orig_names , columns_ ) = self ._get_index_name (self .columns )
1308
1283
self ._name_processed = True
1309
1284
if self .index_names is None :
1310
1285
self .index_names = index_names
@@ -1442,6 +1417,22 @@ def read(self, rows=None):
1442
1417
1443
1418
return index , columns , data
1444
1419
1420
+ def _exclude_implicit_index (self , alldata ):
1421
+
1422
+ if self ._implicit_index :
1423
+ excl_indices = self .index_col
1424
+
1425
+ data = {}
1426
+ offset = 0
1427
+ for i , col in enumerate (self .orig_names ):
1428
+ while i + offset in excl_indices :
1429
+ offset += 1
1430
+ data [col ] = alldata [i + offset ]
1431
+ else :
1432
+ data = dict ((k , v ) for k , v in zip (self .orig_names , alldata ))
1433
+
1434
+ return data
1435
+
1445
1436
# legacy
1446
1437
def get_chunk (self , size = None ):
1447
1438
if size is None :
@@ -1462,7 +1453,7 @@ def _convert_data(self, data):
1462
1453
1463
1454
def _infer_columns (self ):
1464
1455
names = self .names
1465
-
1456
+ num_original_columns = 0
1466
1457
if self .header is not None :
1467
1458
header = self .header
1468
1459
@@ -1476,10 +1467,7 @@ def _infer_columns(self):
1476
1467
1477
1468
columns = []
1478
1469
for level , hr in enumerate (header ):
1479
- if len (self .buf ) > 0 :
1480
- line = self .buf [0 ]
1481
- else :
1482
- line = self ._next_line ()
1470
+ line = self ._buffered_line ()
1483
1471
1484
1472
while self .pos <= hr :
1485
1473
line = self ._next_line ()
@@ -1488,51 +1476,103 @@ def _infer_columns(self):
1488
1476
for i , c in enumerate (line ):
1489
1477
if c == '' :
1490
1478
if have_mi_columns :
1491
- this_columns .append ('Unnamed: %d_level_%d' % (i ,level ))
1479
+ this_columns .append ('Unnamed: %d_level_%d' % (i , level ))
1492
1480
else :
1493
1481
this_columns .append ('Unnamed: %d' % i )
1494
1482
else :
1495
1483
this_columns .append (c )
1496
1484
1497
- if not have_mi_columns :
1498
- if self .mangle_dupe_cols :
1499
- counts = {}
1500
- for i , col in enumerate (this_columns ):
1501
- cur_count = counts .get (col , 0 )
1502
- if cur_count > 0 :
1503
- this_columns [i ] = '%s.%d' % (col , cur_count )
1504
- counts [col ] = cur_count + 1
1485
+ if not have_mi_columns and self .mangle_dupe_cols :
1486
+ counts = {}
1487
+ for i , col in enumerate (this_columns ):
1488
+ cur_count = counts .get (col , 0 )
1489
+ if cur_count > 0 :
1490
+ this_columns [i ] = '%s.%d' % (col , cur_count )
1491
+ counts [col ] = cur_count + 1
1505
1492
1506
1493
columns .append (this_columns )
1494
+ if len (columns ) == 1 :
1495
+ num_original_columns = len (this_columns )
1507
1496
1508
1497
self ._clear_buffer ()
1509
1498
1510
1499
if names is not None :
1511
- if len (names ) != len (columns [0 ]):
1500
+ if (self .usecols is not None and len (names ) != len (self .usecols )) \
1501
+ or (self .usecols is None and len (names ) != len (columns [0 ])):
1502
+
1512
1503
raise ValueError ('Number of passed names did not match '
1513
- 'number of header fields in the file' )
1504
+ 'number of header fields in the file' )
1514
1505
if len (columns ) > 1 :
1515
1506
raise TypeError ('Cannot pass names with multi-index '
1516
1507
'columns' )
1517
- columns = [ names ]
1518
1508
1519
- else :
1520
- if len (self .buf ) > 0 :
1521
- line = self .buf [0 ]
1509
+ if self .usecols is not None :
1510
+ # Set self._col_indices. We don't store columns because they are overwritten.
1511
+ self ._handle_usecols (columns , names )
1512
+ else :
1513
+ self ._col_indices = None
1514
+ num_original_columns = len (names )
1515
+ columns = [names ]
1522
1516
else :
1523
- line = self ._next_line ()
1524
-
1517
+ columns = self ._handle_usecols (columns , columns [0 ])
1518
+ else :
1519
+ # header is None
1520
+ line = self ._buffered_line ()
1525
1521
ncols = len (line )
1522
+ num_original_columns = ncols
1526
1523
if not names :
1527
1524
if self .prefix :
1528
1525
columns = [ ['X%d' % i for i in range (ncols )] ]
1529
1526
else :
1530
1527
columns = [ lrange (ncols ) ]
1528
+ columns = self ._handle_usecols (columns , columns [0 ])
1531
1529
else :
1532
- columns = [ names ]
1530
+ if self .usecols is None or len (names ) == num_original_columns :
1531
+ columns = self ._handle_usecols ([names ], names )
1532
+ num_original_columns = len (names )
1533
+ else :
1534
+ if self .usecols and len (names ) != len (self .usecols ):
1535
+ raise ValueError ('Number of passed names did not match '
1536
+ 'number of header fields in the file' )
1537
+ # Ignore output but set used columns.
1538
+ self ._handle_usecols ([names ], names )
1539
+ columns = [names ]
1540
+ num_original_columns = ncols
1533
1541
1542
+ return columns , num_original_columns
1543
+
1544
def _handle_usecols(self, columns, usecols_key):
    """
    Restrict ``columns`` to the positions selected by ``self.usecols``
    and record those positions in ``self._col_indices``.

    Parameters
    ----------
    columns : list of lists
        One list of column labels per header row.
    usecols_key : list
        Labels used to translate string entries of ``self.usecols``
        into positional indices (via ``list.index``).

    Returns
    -------
    list of lists
        ``columns`` unchanged when ``self.usecols`` is None (in which
        case ``self._col_indices`` is not touched); otherwise each inner
        list filtered down to the selected positions.

    Raises
    ------
    ValueError
        If ``self.usecols`` contains strings while ``columns`` has more
        than one header row, or if a string entry is not present in
        ``usecols_key``.
    """
    if self.usecols is None:
        return columns

    # Use string_types for both checks: the original mixed it with
    # ``basestring``, which is undefined on Python 3.
    if any(isinstance(u, string_types) for u in self.usecols):
        if len(columns) > 1:
            raise ValueError("If using multiple headers, usecols must be integers.")
        col_indices = [usecols_key.index(u) if isinstance(u, string_types) else u
                       for u in self.usecols]
    else:
        col_indices = self.usecols

    columns = [[n for i, n in enumerate(column) if i in col_indices]
               for column in columns]
    self._col_indices = col_indices
    return columns
1535
1566
1567
def _buffered_line(self):
    """
    Return the first line held in ``self.buf`` (leaving it in place);
    when the buffer is empty, return ``self._next_line()`` instead.
    """
    if len(self.buf) == 0:
        return self._next_line()
    return self.buf[0]
1575
+
1536
1576
def _next_line (self ):
1537
1577
if isinstance (self .data , list ):
1538
1578
while self .pos in self .skiprows :
@@ -1598,6 +1638,17 @@ def _clear_buffer(self):
1598
1638
_implicit_index = False
1599
1639
1600
1640
def _get_index_name (self , columns ):
1641
+ """
1642
+ Try several cases to determine the index names:
1643
+
1644
+ 0) There are headers on row 0 and row 1 and their
1645
+ total summed lengths equals the length of the next line.
1646
+ Treat row 0 as columns and row 1 as indices
1647
+ 1) Look for implicit index: there are more columns
1648
+ on row 1 than row 0. If this is true, assume that row
1649
+ 1 lists index columns and row 0 lists normal columns.
1650
+ 2) Get index from the columns if it was listed.
1651
+ """
1601
1652
orig_names = list (columns )
1602
1653
columns = list (columns )
1603
1654
@@ -1615,29 +1666,34 @@ def _get_index_name(self, columns):
1615
1666
implicit_first_cols = 0
1616
1667
if line is not None :
1617
1668
# leave it 0, #2442
1669
+ # Case 1
1618
1670
if self .index_col is not False :
1619
- implicit_first_cols = len (line ) - len ( columns )
1671
+ implicit_first_cols = len (line ) - self . num_original_columns
1620
1672
1673
+ # Case 0
1621
1674
if next_line is not None :
1622
- if len (next_line ) == len (line ) + len ( columns ) :
1675
+ if len (next_line ) == len (line ) + self . num_original_columns :
1623
1676
# column and index names on diff rows
1624
- implicit_first_cols = 0
1625
-
1626
1677
self .index_col = lrange (len (line ))
1627
1678
self .buf = self .buf [1 :]
1628
1679
1629
1680
for c in reversed (line ):
1630
1681
columns .insert (0 , c )
1631
1682
1683
+ # Update list of original names to include all indices.
1684
+ self .num_original_columns = len (next_line )
1632
1685
return line , columns , orig_names
1633
1686
1634
1687
if implicit_first_cols > 0 :
1688
+ # Case 1
1635
1689
self ._implicit_index = True
1636
1690
if self .index_col is None :
1637
1691
self .index_col = lrange (implicit_first_cols )
1692
+
1638
1693
index_name = None
1639
1694
1640
1695
else :
1696
+ # Case 2
1641
1697
(index_name , columns ,
1642
1698
self .index_col ) = _clean_index_names (columns , self .index_col )
1643
1699
@@ -1646,7 +1702,7 @@ def _get_index_name(self, columns):
1646
1702
def _rows_to_cols (self , content ):
1647
1703
zipped_content = list (lib .to_object_array (content ).T )
1648
1704
1649
- col_len = len ( self .orig_names )
1705
+ col_len = self .num_original_columns
1650
1706
zip_len = len (zipped_content )
1651
1707
1652
1708
if self ._implicit_index :
@@ -1655,6 +1711,7 @@ def _rows_to_cols(self, content):
1655
1711
if self .skip_footer < 0 :
1656
1712
raise ValueError ('skip footer cannot be negative' )
1657
1713
1714
+ # Loop through rows to verify lengths are correct.
1658
1715
if col_len != zip_len and self .index_col is not False :
1659
1716
i = 0
1660
1717
for (i , l ) in enumerate (content ):
@@ -1671,6 +1728,11 @@ def _rows_to_cols(self, content):
1671
1728
(col_len , row_num + 1 , zip_len ))
1672
1729
raise ValueError (msg )
1673
1730
1731
+ if self .usecols :
1732
+ if self ._implicit_index :
1733
+ zipped_content = [a for i , a in enumerate (zipped_content ) if i < len (self .index_col ) or i - len (self .index_col ) in self ._col_indices ]
1734
+ else :
1735
+ zipped_content = [a for i , a in enumerate (zipped_content ) if i in self ._col_indices ]
1674
1736
return zipped_content
1675
1737
1676
1738
def _get_lines (self , rows = None ):
0 commit comments