diff --git a/doc/source/io.rst b/doc/source/io.rst index f22374553e9c3..75f36c5274cd2 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -126,13 +126,23 @@ index_col : int or sequence or ``False``, default ``None`` MultiIndex is used. If you have a malformed file with delimiters at the end of each line, you might consider ``index_col=False`` to force pandas to *not* use the first column as the index (row names). -usecols : array-like, default ``None`` - Return a subset of the columns. All elements in this array must either +usecols : array-like or callable, default ``None`` + Return a subset of the columns. If array-like, all elements must either be positional (i.e. integer indices into the document columns) or strings that correspond to column names provided either by the user in `names` or - inferred from the document header row(s). For example, a valid `usecols` - parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter - results in much faster parsing time and lower memory usage. + inferred from the document header row(s). For example, a valid array-like + `usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. + + If callable, the callable function will be evaluated against the column names, + returning names where the callable function evaluates to True: + + .. ipython:: python + + data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + pd.read_csv(StringIO(data)) + pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['COL1', 'COL3']) + + Using this parameter results in much faster parsing time and lower memory usage. as_recarray : boolean, default ``False`` DEPRECATED: this argument will be removed in a future version. Please call ``pd.read_csv(...).to_records()`` instead. 
@@ -617,7 +627,9 @@ Filtering columns (``usecols``) +++++++++++++++++++++++++++++++ The ``usecols`` argument allows you to select any subset of the columns in a -file, either using the column names or position numbers: +file, either using the column names, position numbers or a callable: + +.. versionadded:: 0.20.0 support for callable `usecols` arguments .. ipython:: python @@ -625,6 +637,7 @@ file, either using the column names or position numbers: pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), usecols=['b', 'd']) pd.read_csv(StringIO(data), usecols=[0, 2, 3]) + pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['A', 'C']) Comments and Empty Lines '''''''''''''''''''''''' diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 9774c3ec9cc7f..0bfd755aae40c 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -52,6 +52,7 @@ Other enhancements - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) - ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (:issue:`14714`) - ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`) +- The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`) .. _whatsnew_0200.api_breaking: @@ -106,4 +107,4 @@ Performance Improvements .. _whatsnew_0200.bug_fixes: Bug Fixes -~~~~~~~~~ \ No newline at end of file +~~~~~~~~~ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index ef839297c80d3..30443f894a64d 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -90,13 +90,18 @@ MultiIndex is used. If you have a malformed file with delimiters at the end of each line, you might consider index_col=False to force pandas to _not_ use the first column as the index (row names) -usecols : array-like, default None - Return a subset of the columns.
All elements in this array must either +usecols : array-like or callable, default None + Return a subset of the columns. If array-like, all elements must either be positional (i.e. integer indices into the document columns) or strings that correspond to column names provided either by the user in `names` or - inferred from the document header row(s). For example, a valid `usecols` - parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter - results in much faster parsing time and lower memory usage. + inferred from the document header row(s). For example, a valid array-like + `usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. + + If callable, the callable function will be evaluated against the column + names, returning names where the callable function evaluates to True. An + example of a valid callable argument would be ``lambda x: x.upper() in + ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster + parsing time and lower memory usage. as_recarray : boolean, default False DEPRECATED: this argument will be removed in a future version. Please call `pd.read_csv(...).to_records()` instead. @@ -977,17 +982,33 @@ def _is_index_col(col): return col is not None and col is not False +def _evaluate_usecols(usecols, names): + """ + Check whether or not the 'usecols' parameter + is a callable. If so, enumerates the 'names' + parameter and returns a set of indices for + each entry in 'names' that evaluates to True. + If not a callable, returns 'usecols'. + """ + if callable(usecols): + return set([i for i, name in enumerate(names) + if usecols(name)]) + return usecols + + def _validate_usecols_arg(usecols): """ Check whether or not the 'usecols' parameter - contains all integers (column selection by index) - or strings (column by name). Raises a ValueError - if that is not the case. + contains all integers (column selection by index), + strings (column by name) or is a callable. Raises + a ValueError if that is not the case. 
""" - msg = ("The elements of 'usecols' must " - "either be all strings, all unicode, or all integers") + msg = ("'usecols' must either be all strings, all unicode, " + "all integers or a callable") if usecols is not None: + if callable(usecols): + return usecols usecols_dtype = lib.infer_dtype(usecols) if usecols_dtype not in ('empty', 'integer', 'string', 'unicode'): @@ -1499,11 +1520,12 @@ def __init__(self, src, **kwds): self.orig_names = self.names[:] if self.usecols: - if len(self.names) > len(self.usecols): + usecols = _evaluate_usecols(self.usecols, self.orig_names) + if len(self.names) > len(usecols): self.names = [n for i, n in enumerate(self.names) - if (i in self.usecols or n in self.usecols)] + if (i in usecols or n in usecols)] - if len(self.names) < len(self.usecols): + if len(self.names) < len(usecols): raise ValueError("Usecols do not match names.") self._set_noconvert_columns() @@ -1665,9 +1687,10 @@ def read(self, nrows=None): def _filter_usecols(self, names): # hackish - if self.usecols is not None and len(names) != len(self.usecols): + usecols = _evaluate_usecols(self.usecols, names) + if usecols is not None and len(names) != len(usecols): names = [name for i, name in enumerate(names) - if i in self.usecols or name in self.usecols] + if i in usecols or name in usecols] return names def _get_index_names(self): @@ -2291,7 +2314,9 @@ def _handle_usecols(self, columns, usecols_key): usecols_key is used if there are string usecols. 
""" if self.usecols is not None: - if any([isinstance(col, string_types) for col in self.usecols]): + if callable(self.usecols): + col_indices = _evaluate_usecols(self.usecols, usecols_key) + elif any([isinstance(u, string_types) for u in self.usecols]): if len(columns) > 1: raise ValueError("If using multiple headers, usecols must " "be integers.") diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py index 5051171ccb8f0..26b4b5b8ec7d1 100644 --- a/pandas/io/tests/parser/usecols.py +++ b/pandas/io/tests/parser/usecols.py @@ -23,8 +23,9 @@ def test_raise_on_mixed_dtype_usecols(self): 1000,2000,3000 4000,5000,6000 """ - msg = ("The elements of 'usecols' must " - "either be all strings, all unicode, or all integers") + + msg = ("'usecols' must either be all strings, all unicode, " + "all integers or a callable") usecols = [0, 'b', 2] with tm.assertRaisesRegexp(ValueError, msg): @@ -302,8 +303,8 @@ def test_usecols_with_mixed_encoding_strings(self): 3.568935038,7,False,a ''' - msg = ("The elements of 'usecols' must " - "either be all strings, all unicode, or all integers") + msg = ("'usecols' must either be all strings, all unicode, " + "all integers or a callable") with tm.assertRaisesRegexp(ValueError, msg): self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB']) @@ -366,3 +367,31 @@ def test_np_array_usecols(self): expected = DataFrame([[1, 2]], columns=usecols) result = self.read_csv(StringIO(data), usecols=usecols) tm.assert_frame_equal(result, expected) + + def test_callable_usecols(self): + # See gh-14154 + s = '''AaA,bBb,CCC,ddd + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a + ''' + + data = { + 'AaA': { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002 + }, + 'bBb': {0: 8, 1: 2, 2: 7}, + 'ddd': {0: 'a', 1: 'b', 2: 'a'} + } + expected = DataFrame(data) + df = self.read_csv(StringIO(s), usecols=lambda x: + x.upper() in ['AAA', 'BBB', 'DDD']) + tm.assert_frame_equal(df, expected) + + # 
Check that a callable returning only False returns + # an empty DataFrame + expected = DataFrame() + df = self.read_csv(StringIO(s), usecols=lambda x: False) + tm.assert_frame_equal(df, expected) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 6760e822960f1..d94a4ef278dee 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -300,8 +300,9 @@ cdef class TextReader: object compression object mangle_dupe_cols object tupleize_cols + object usecols list dtype_cast_order - set noconvert, usecols + set noconvert def __cinit__(self, source, delimiter=b',', @@ -437,7 +438,10 @@ cdef class TextReader: # suboptimal if usecols is not None: self.has_usecols = 1 - self.usecols = set(usecols) + if callable(usecols): + self.usecols = usecols + else: + self.usecols = set(usecols) # XXX if skipfooter > 0: @@ -701,7 +705,6 @@ cdef class TextReader: cdef StringPath path = _string_path(self.c_encoding) header = [] - if self.parser.header_start >= 0: # Header is in the file @@ -821,7 +824,8 @@ cdef class TextReader: # 'data has %d fields' # % (passed_count, field_count)) - if self.has_usecols and self.allow_leading_cols: + if self.has_usecols and self.allow_leading_cols and \ + not callable(self.usecols): nuse = len(self.usecols) if nuse == passed_count: self.leading_cols = 0 @@ -1019,13 +1023,20 @@ cdef class TextReader: if i < self.leading_cols: # Pass through leading columns always name = i - elif self.usecols and nused == len(self.usecols): + elif self.usecols and not callable(self.usecols) and \ + nused == len(self.usecols): # Once we've gathered all requested columns, stop. GH5766 break else: name = self._get_column_name(i, nused) - if self.has_usecols and not (i in self.usecols or - name in self.usecols): + usecols = set() + if callable(self.usecols): + if self.usecols(name): + usecols = set([i]) + else: + usecols = self.usecols + if self.has_usecols and not (i in usecols or + name in usecols): continue nused += 1