|
34 | 34 | from pandas.util.decorators import Appender
|
35 | 35 |
|
36 | 36 | import pandas.lib as lib
|
| 37 | +import pandas.core.common as com |
37 | 38 | import pandas.parser as _parser
|
38 | 39 |
|
39 | 40 |
|
|
86 | 87 | MultiIndex is used. If you have a malformed file with delimiters at the end
|
87 | 88 | of each line, you might consider index_col=False to force pandas to _not_
|
88 | 89 | use the first column as the index (row names)
|
89 |
| -usecols : array-like, default None |
90 |
| - Return a subset of the columns. All elements in this array must either |
| 90 | +usecols : array-like or callable, default None |
| 91 | + Return a subset of the columns. If array-like, all elements must either |
91 | 92 | be positional (i.e. integer indices into the document columns) or strings
|
92 | 93 | that correspond to column names provided either by the user in `names` or
|
93 | 94 | inferred from the document header row(s). For example, a valid `usecols`
|
94 |
| - parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter |
95 |
| - results in much faster parsing time and lower memory usage. |
| 95 | + parameter would be [0, 1, 2], ['foo', 'bar', 'baz'] or lambda x: x.upper() |
| 96 | + in ['AAA', 'BBB', 'DDD']. Using this parameter results in much faster |
| 97 | + parsing time and lower memory usage. |
96 | 98 | as_recarray : boolean, default False
|
97 | 99 | DEPRECATED: this argument will be removed in a future version. Please call
|
98 | 100 | `pd.read_csv(...).to_records()` instead.
|
@@ -976,17 +978,26 @@ def _is_index_col(col):
|
976 | 978 | return col is not None and col is not False
|
977 | 979 |
|
978 | 980 |
|
| 981 | +def _evaluate_usecols(usecols, names): |
| 982 | + if callable(usecols): |
| 983 | + return set([i for i, name in enumerate(names) |
| 984 | + if com._apply_if_callable(usecols, name)]) |
| 985 | + return usecols |
| 986 | + |
| 987 | + |
979 | 988 | def _validate_usecols_arg(usecols):
|
980 | 989 | """
|
981 | 990 | Check whether or not the 'usecols' parameter
|
982 |
| - contains all integers (column selection by index) |
983 |
| - or strings (column by name). Raises a ValueError |
984 |
| - if that is not the case. |
| 991 | + contains all integers (column selection by index), |
| 992 | + strings (column by name) or is a callable. Raises |
| 993 | + a ValueError if that is not the case. |
985 | 994 | """
|
986 |
| - msg = ("The elements of 'usecols' must " |
987 |
| - "either be all strings, all unicode, or all integers") |
| 995 | + msg = ("'usecols' must either be all strings, all unicode, " |
| 996 | + "all integers or a callable") |
988 | 997 |
|
989 | 998 | if usecols is not None:
|
| 999 | + if callable(usecols): |
| 1000 | + return usecols |
990 | 1001 | usecols_dtype = lib.infer_dtype(usecols)
|
991 | 1002 | if usecols_dtype not in ('empty', 'integer',
|
992 | 1003 | 'string', 'unicode'):
|
@@ -1426,11 +1437,12 @@ def __init__(self, src, **kwds):
|
1426 | 1437 | self.orig_names = self.names[:]
|
1427 | 1438 |
|
1428 | 1439 | if self.usecols:
|
1429 |
| - if len(self.names) > len(self.usecols): |
| 1440 | + usecols = _evaluate_usecols(self.usecols, self.orig_names) |
| 1441 | + if len(self.names) > len(usecols): |
1430 | 1442 | self.names = [n for i, n in enumerate(self.names)
|
1431 |
| - if (i in self.usecols or n in self.usecols)] |
| 1443 | + if (i in usecols or n in usecols)] |
1432 | 1444 |
|
1433 |
| - if len(self.names) < len(self.usecols): |
| 1445 | + if len(self.names) < len(usecols): |
1434 | 1446 | raise ValueError("Usecols do not match names.")
|
1435 | 1447 |
|
1436 | 1448 | self._set_noconvert_columns()
|
@@ -1592,9 +1604,10 @@ def read(self, nrows=None):
|
1592 | 1604 |
|
def _filter_usecols(self, names):
    """
    Restrict 'names' to the columns selected by 'self.usecols'.

    'self.usecols' is first passed through '_evaluate_usecols' together
    with 'names'. When the result is None, or already has the same
    length as 'names', the input is returned untouched. Otherwise a new
    list is built that keeps each entry whose position or whose name
    appears in the evaluated 'usecols'.
    """
    # hackish: a length comparison decides whether filtering is needed
    selected = _evaluate_usecols(self.usecols, names)
    if selected is None or len(names) == len(selected):
        return names
    return [label for position, label in enumerate(names)
            if position in selected or label in selected]
|
1599 | 1612 |
|
1600 | 1613 | def _get_index_names(self):
|
@@ -2207,7 +2220,9 @@ def _handle_usecols(self, columns, usecols_key):
|
2207 | 2220 | usecols_key is used if there are string usecols.
|
2208 | 2221 | """
|
2209 | 2222 | if self.usecols is not None:
|
2210 |
| - if any([isinstance(col, string_types) for col in self.usecols]): |
| 2223 | + if callable(self.usecols): |
| 2224 | + col_indices = _evaluate_usecols(self.usecols, usecols_key) |
| 2225 | + elif any([isinstance(u, string_types) for u in self.usecols]): |
2211 | 2226 | if len(columns) > 1:
|
2212 | 2227 | raise ValueError("If using multiple headers, usecols must "
|
2213 | 2228 | "be integers.")
|
|
0 commit comments