Skip to content

Commit 237123e

Browse files
committed
ENH: Add usecols option to python parser.
Closes pandas-dev#4335
1 parent d52ee75 commit 237123e

File tree

2 files changed

+197
-131
lines changed

2 files changed

+197
-131
lines changed

pandas/io/parsers.py

+131-69
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252
dialect : string or csv.Dialect instance, default None
5353
If None defaults to Excel dialect. Ignored if sep longer than 1 char
5454
See csv.Dialect documentation for more details
55-
header : int, default 0 if names parameter not specified,
55+
header : int, default None if names parameter not specified and 0 otherwise
5656
Row to use for the column labels of the parsed DataFrame. Specify None if
5757
there is no header row. Can be a list of integers that specify row
5858
locations for a multi-index on the columns E.g. [0,1,3]. Intervening
@@ -917,22 +917,6 @@ def _do_date_conversions(self, names, data):
917917

918918
return names, data
919919

920-
def _exclude_implicit_index(self, alldata):
921-
922-
if self._implicit_index:
923-
excl_indices = self.index_col
924-
925-
data = {}
926-
offset = 0
927-
for i, col in enumerate(self.orig_names):
928-
while i + offset in excl_indices:
929-
offset += 1
930-
data[col] = alldata[i + offset]
931-
else:
932-
data = dict((k, v) for k, v in zip(self.orig_names, alldata))
933-
934-
return data
935-
936920

937921
class CParserWrapper(ParserBase):
938922
"""
@@ -1173,22 +1157,6 @@ def TextParser(*args, **kwds):
11731157
return TextFileReader(*args, **kwds)
11741158

11751159

1176-
# delimiter=None, dialect=None, names=None, header=0,
1177-
# index_col=None,
1178-
# na_values=None,
1179-
# na_filter=True,
1180-
# thousands=None,
1181-
# quotechar='"',
1182-
# escapechar=None,
1183-
# doublequote=True,
1184-
# skipinitialspace=False,
1185-
# quoting=csv.QUOTE_MINIMAL,
1186-
# comment=None, parse_dates=False, keep_date_col=False,
1187-
# date_parser=None, dayfirst=False,
1188-
# chunksize=None, skiprows=None, skip_footer=0, converters=None,
1189-
# verbose=False, encoding=None, squeeze=False):
1190-
1191-
11921160
def count_empty_vals(vals):
11931161
return sum([1 for v in vals if v == '' or v is None])
11941162

@@ -1242,10 +1210,6 @@ def __init__(self, f, **kwds):
12421210
self.buf = []
12431211
self.pos = 0
12441212

1245-
if kwds['usecols'] is not None:
1246-
raise Exception("usecols not supported with engine='python'"
1247-
" or multicharacter separators (yet).")
1248-
12491213
self.encoding = kwds['encoding']
12501214
self.compression = kwds['compression']
12511215
self.skiprows = kwds['skiprows']
@@ -1259,7 +1223,10 @@ def __init__(self, f, **kwds):
12591223
self.skipinitialspace = kwds['skipinitialspace']
12601224
self.lineterminator = kwds['lineterminator']
12611225
self.quoting = kwds['quoting']
1262-
self.mangle_dupe_cols = kwds.get('mangle_dupe_cols',True)
1226+
self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
1227+
self.usecols = kwds['usecols']
1228+
1229+
self.names_passed = kwds['names'] or None
12631230

12641231
self.has_index_names = False
12651232
if 'has_index_names' in kwds:
@@ -1283,17 +1250,25 @@ def __init__(self, f, **kwds):
12831250

12841251
f = TextIOWrapper(f, encoding=self.encoding)
12851252

1253+
# Set self.data to something that can read lines.
12861254
if hasattr(f, 'readline'):
12871255
self._make_reader(f)
12881256
else:
12891257
self.data = f
12901258

1291-
self.columns = self._infer_columns()
1259+
# Get columns in two steps: infer from data, then
1260+
# infer column indices from self.usecols if it is specified.
1261+
self._col_indices = None
1262+
self.columns, self.num_original_columns = self._infer_columns()
12921263

1293-
# we are processing a multi index column
1264+
# Now self.columns has the set of columns that we will process.
1265+
# The number of original columns is stored in self.num_original_columns.
12941266
if len(self.columns) > 1:
1267+
# we are processing a multi index column
12951268
self.columns, self.index_names, self.col_names, _ = self._extract_multi_indexer_columns(
12961269
self.columns, self.index_names, self.col_names)
1270+
# Update list of original names to include all indices.
1271+
self.num_original_columns = len(self.columns)
12971272
else:
12981273
self.columns = self.columns[0]
12991274

@@ -1304,7 +1279,7 @@ def __init__(self, f, **kwds):
13041279
# multiple date column thing turning into a real spaghetti factory
13051280
if not self._has_complex_date_col:
13061281
(index_names,
1307-
self.orig_names, _) = self._get_index_name(self.columns)
1282+
self.orig_names, columns_) = self._get_index_name(self.columns)
13081283
self._name_processed = True
13091284
if self.index_names is None:
13101285
self.index_names = index_names
@@ -1442,6 +1417,22 @@ def read(self, rows=None):
14421417

14431418
return index, columns, data
14441419

1420+
def _exclude_implicit_index(self, alldata):
1421+
1422+
if self._implicit_index:
1423+
excl_indices = self.index_col
1424+
1425+
data = {}
1426+
offset = 0
1427+
for i, col in enumerate(self.orig_names):
1428+
while i + offset in excl_indices:
1429+
offset += 1
1430+
data[col] = alldata[i + offset]
1431+
else:
1432+
data = dict((k, v) for k, v in zip(self.orig_names, alldata))
1433+
1434+
return data
1435+
14451436
# legacy
14461437
def get_chunk(self, size=None):
14471438
if size is None:
@@ -1462,7 +1453,7 @@ def _convert_data(self, data):
14621453

14631454
def _infer_columns(self):
14641455
names = self.names
1465-
1456+
num_original_columns = 0
14661457
if self.header is not None:
14671458
header = self.header
14681459

@@ -1476,10 +1467,7 @@ def _infer_columns(self):
14761467

14771468
columns = []
14781469
for level, hr in enumerate(header):
1479-
if len(self.buf) > 0:
1480-
line = self.buf[0]
1481-
else:
1482-
line = self._next_line()
1470+
line = self._buffered_line()
14831471

14841472
while self.pos <= hr:
14851473
line = self._next_line()
@@ -1488,51 +1476,103 @@ def _infer_columns(self):
14881476
for i, c in enumerate(line):
14891477
if c == '':
14901478
if have_mi_columns:
1491-
this_columns.append('Unnamed: %d_level_%d' % (i,level))
1479+
this_columns.append('Unnamed: %d_level_%d' % (i, level))
14921480
else:
14931481
this_columns.append('Unnamed: %d' % i)
14941482
else:
14951483
this_columns.append(c)
14961484

1497-
if not have_mi_columns:
1498-
if self.mangle_dupe_cols:
1499-
counts = {}
1500-
for i, col in enumerate(this_columns):
1501-
cur_count = counts.get(col, 0)
1502-
if cur_count > 0:
1503-
this_columns[i] = '%s.%d' % (col, cur_count)
1504-
counts[col] = cur_count + 1
1485+
if not have_mi_columns and self.mangle_dupe_cols:
1486+
counts = {}
1487+
for i, col in enumerate(this_columns):
1488+
cur_count = counts.get(col, 0)
1489+
if cur_count > 0:
1490+
this_columns[i] = '%s.%d' % (col, cur_count)
1491+
counts[col] = cur_count + 1
15051492

15061493
columns.append(this_columns)
1494+
if len(columns) == 1:
1495+
num_original_columns = len(this_columns)
15071496

15081497
self._clear_buffer()
15091498

15101499
if names is not None:
1511-
if len(names) != len(columns[0]):
1500+
if (self.usecols is not None and len(names) != len(self.usecols)) \
1501+
or (self.usecols is None and len(names) != len(columns[0])):
1502+
15121503
raise ValueError('Number of passed names did not match '
1513-
'number of header fields in the file')
1504+
'number of header fields in the file')
15141505
if len(columns) > 1:
15151506
raise TypeError('Cannot pass names with multi-index '
15161507
'columns')
1517-
columns = [ names ]
15181508

1519-
else:
1520-
if len(self.buf) > 0:
1521-
line = self.buf[0]
1509+
if self.usecols is not None:
1510+
# Set self._col_indices. We don't store the returned columns because they are overwritten.
1511+
self._handle_usecols(columns, names)
1512+
else:
1513+
self._col_indices = None
1514+
num_original_columns = len(names)
1515+
columns = [names]
15221516
else:
1523-
line = self._next_line()
1524-
1517+
columns = self._handle_usecols(columns, columns[0])
1518+
else:
1519+
# header is None
1520+
line = self._buffered_line()
15251521
ncols = len(line)
1522+
num_original_columns = ncols
15261523
if not names:
15271524
if self.prefix:
15281525
columns = [ ['X%d' % i for i in range(ncols)] ]
15291526
else:
15301527
columns = [ lrange(ncols) ]
1528+
columns = self._handle_usecols(columns, columns[0])
15311529
else:
1532-
columns = [ names ]
1530+
if self.usecols is None or len(names) == num_original_columns:
1531+
columns = self._handle_usecols([names], names)
1532+
num_original_columns = len(names)
1533+
else:
1534+
if self.usecols and len(names) != len(self.usecols):
1535+
raise ValueError('Number of passed names did not match '
1536+
'number of header fields in the file')
1537+
# Ignore output but set used columns.
1538+
self._handle_usecols([names], names)
1539+
columns = [names]
1540+
num_original_columns = ncols
15331541

1542+
return columns, num_original_columns
1543+
1544+
def _handle_usecols(self, columns, usecols_key):
1545+
"""
1546+
Sets self._col_indices
1547+
1548+
usecols_key is used if there are string usecols.
1549+
"""
1550+
if self.usecols is not None:
1551+
if any([isinstance(u, basestring) for u in self.usecols]):
1552+
if len(columns) > 1:
1553+
raise ValueError("If using multiple headers, usecols must be integers.")
1554+
col_indices = []
1555+
for u in self.usecols:
1556+
if isinstance(u, string_types):
1557+
col_indices.append(usecols_key.index(u))
1558+
else:
1559+
col_indices.append(u)
1560+
else:
1561+
col_indices = self.usecols
1562+
1563+
columns = [[n for i, n in enumerate(column) if i in col_indices] for column in columns]
1564+
self._col_indices = col_indices
15341565
return columns
15351566

1567+
def _buffered_line(self):
1568+
"""
1569+
Return a line from buffer, filling buffer if required.
1570+
"""
1571+
if len(self.buf) > 0:
1572+
return self.buf[0]
1573+
else:
1574+
return self._next_line()
1575+
15361576
def _next_line(self):
15371577
if isinstance(self.data, list):
15381578
while self.pos in self.skiprows:
@@ -1598,6 +1638,17 @@ def _clear_buffer(self):
15981638
_implicit_index = False
15991639

16001640
def _get_index_name(self, columns):
1641+
"""
1642+
Try several cases to get lines:
1643+
1644+
0) There are headers on row 0 and row 1 and their
1645+
total summed lengths equals the length of the next line.
1646+
Treat row 0 as columns and row 1 as indices
1647+
1) Look for implicit index: there are more columns
1648+
on row 1 than row 0. If this is true, assume that row
1649+
1 lists index columns and row 0 lists normal columns.
1650+
2) Get index from the columns if it was listed.
1651+
"""
16011652
orig_names = list(columns)
16021653
columns = list(columns)
16031654

@@ -1615,29 +1666,34 @@ def _get_index_name(self, columns):
16151666
implicit_first_cols = 0
16161667
if line is not None:
16171668
# leave it 0, #2442
1669+
# Case 1
16181670
if self.index_col is not False:
1619-
implicit_first_cols = len(line) - len(columns)
1671+
implicit_first_cols = len(line) - self.num_original_columns
16201672

1673+
# Case 0
16211674
if next_line is not None:
1622-
if len(next_line) == len(line) + len(columns):
1675+
if len(next_line) == len(line) + self.num_original_columns:
16231676
# column and index names on diff rows
1624-
implicit_first_cols = 0
1625-
16261677
self.index_col = lrange(len(line))
16271678
self.buf = self.buf[1:]
16281679

16291680
for c in reversed(line):
16301681
columns.insert(0, c)
16311682

1683+
# Update list of original names to include all indices.
1684+
self.num_original_columns = len(next_line)
16321685
return line, columns, orig_names
16331686

16341687
if implicit_first_cols > 0:
1688+
# Case 1
16351689
self._implicit_index = True
16361690
if self.index_col is None:
16371691
self.index_col = lrange(implicit_first_cols)
1692+
16381693
index_name = None
16391694

16401695
else:
1696+
# Case 2
16411697
(index_name, columns,
16421698
self.index_col) = _clean_index_names(columns, self.index_col)
16431699

@@ -1646,7 +1702,7 @@ def _get_index_name(self, columns):
16461702
def _rows_to_cols(self, content):
16471703
zipped_content = list(lib.to_object_array(content).T)
16481704

1649-
col_len = len(self.orig_names)
1705+
col_len = self.num_original_columns
16501706
zip_len = len(zipped_content)
16511707

16521708
if self._implicit_index:
@@ -1655,6 +1711,7 @@ def _rows_to_cols(self, content):
16551711
if self.skip_footer < 0:
16561712
raise ValueError('skip footer cannot be negative')
16571713

1714+
# Loop through rows to verify lengths are correct.
16581715
if col_len != zip_len and self.index_col is not False:
16591716
i = 0
16601717
for (i, l) in enumerate(content):
@@ -1671,6 +1728,11 @@ def _rows_to_cols(self, content):
16711728
(col_len, row_num + 1, zip_len))
16721729
raise ValueError(msg)
16731730

1731+
if self.usecols:
1732+
if self._implicit_index:
1733+
zipped_content = [a for i, a in enumerate(zipped_content) if i < len(self.index_col) or i - len(self.index_col) in self._col_indices]
1734+
else:
1735+
zipped_content = [a for i, a in enumerate(zipped_content) if i in self._col_indices]
16741736
return zipped_content
16751737

16761738
def _get_lines(self, rows=None):

0 commit comments

Comments
 (0)