ENH: Allow usecols to accept callable (GH14154) (#14234)

WillAyd · jorisvandenbossche · commit 8cdc09ef197f · 2016-12-06T12:38:06.000+01:00
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -126,13 +126,23 @@ index_col :  int or sequence or ``False``, default ``None``
   MultiIndex is used. If you have a malformed file with delimiters at the end of
   each line, you might consider ``index_col=False`` to force pandas to *not* use
   the first column as the index (row names).
-usecols : array-like, default ``None``
-  Return a subset of the columns. All elements in this array must either
+usecols : array-like or callable, default ``None``
+  Return a subset of the columns. If array-like, all elements must either
   be positional (i.e. integer indices into the document columns) or strings
   that correspond to column names provided either by the user in `names` or
-  inferred from the document header row(s). For example, a valid `usecols`
-  parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter
-  results in much faster parsing time and lower memory usage.
+  inferred from the document header row(s). For example, a valid array-like
+  `usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz'].
+
+  If callable, the function is called with each column name, and only the
+  columns for which it returns True are included in the result:
+
+  .. ipython:: python
+
+     data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
+     pd.read_csv(StringIO(data))
+     pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['COL1', 'COL3'])
+
+  Using this parameter results in much faster parsing time and lower memory usage.
 as_recarray : boolean, default ``False``
   DEPRECATED: this argument will be removed in a future version. Please call
   ``pd.read_csv(...).to_records()`` instead.
@@ -617,14 +627,17 @@ Filtering columns (``usecols``)
 +++++++++++++++++++++++++++++++
 
 The ``usecols`` argument allows you to select any subset of the columns in a
-file, either using the column names or position numbers:
+file, either using the column names, position numbers or a callable:
+
+.. versionadded:: 0.20.0 support for callable `usecols` arguments
 
 .. ipython:: python
 
     data = 'a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz'
     pd.read_csv(StringIO(data))
     pd.read_csv(StringIO(data), usecols=['b', 'd'])
     pd.read_csv(StringIO(data), usecols=[0, 2, 3])
+    pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['A', 'C'])
 
 Comments and Empty Lines
 ''''''''''''''''''''''''
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -52,6 +52,7 @@ Other enhancements
 - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`)
 - ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (issue:`14714`)
 - ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`)
+- The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`)
 
 .. _whatsnew_0200.api_breaking:
 
@@ -106,4 +107,4 @@ Performance Improvements
 .. _whatsnew_0200.bug_fixes:
 
 Bug Fixes
-~~~~~~~~~
+~~~~~~~~~
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -90,13 +90,18 @@
     MultiIndex is used. If you have a malformed file with delimiters at the end
     of each line, you might consider index_col=False to force pandas to _not_
     use the first column as the index (row names)
-usecols : array-like, default None
-    Return a subset of the columns. All elements in this array must either
+usecols : array-like or callable, default None
+    Return a subset of the columns. If array-like, all elements must either
     be positional (i.e. integer indices into the document columns) or strings
     that correspond to column names provided either by the user in `names` or
-    inferred from the document header row(s). For example, a valid `usecols`
-    parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter
-    results in much faster parsing time and lower memory usage.
+    inferred from the document header row(s). For example, a valid array-like
+    `usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz'].
+
+    If callable, the function is called with each column name, and only
+    the columns for which it returns True are included in the result. An
+    example of a valid callable argument would be ``lambda x: x.upper() in
+    ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
+    parsing time and lower memory usage.
 as_recarray : boolean, default False
     DEPRECATED: this argument will be removed in a future version. Please call
     `pd.read_csv(...).to_records()` instead.
@@ -977,17 +982,33 @@ def _is_index_col(col):
     return col is not None and col is not False
 
 
+def _evaluate_usecols(usecols, names):
+    """
+    Check whether or not the 'usecols' parameter
+    is a callable.  If so, enumerates the 'names'
+    parameter and returns a set of indices for
+    each entry in 'names' for which 'usecols' returns a truthy value.
+    If not a callable, returns 'usecols'.
+    """
+    if callable(usecols):
+        return set([i for i, name in enumerate(names)
+                    if usecols(name)])
+    return usecols
+
+
 def _validate_usecols_arg(usecols):
     """
     Check whether or not the 'usecols' parameter
-    contains all integers (column selection by index)
-    or strings (column by name). Raises a ValueError
-    if that is not the case.
+    contains all integers (column selection by index),
+    strings (column by name) or is a callable. Raises
+    a ValueError if that is not the case.
     """
-    msg = ("The elements of 'usecols' must "
-           "either be all strings, all unicode, or all integers")
+    msg = ("'usecols' must either be all strings, all unicode, "
+           "all integers or a callable")
 
     if usecols is not None:
+        if callable(usecols):
+            return usecols
         usecols_dtype = lib.infer_dtype(usecols)
         if usecols_dtype not in ('empty', 'integer',
                                  'string', 'unicode'):
@@ -1499,11 +1520,12 @@ def __init__(self, src, **kwds):
         self.orig_names = self.names[:]
 
         if self.usecols:
-            if len(self.names) > len(self.usecols):
+            usecols = _evaluate_usecols(self.usecols, self.orig_names)
+            if len(self.names) > len(usecols):
                 self.names = [n for i, n in enumerate(self.names)
-                              if (i in self.usecols or n in self.usecols)]
+                              if (i in usecols or n in usecols)]
 
-            if len(self.names) < len(self.usecols):
+            if len(self.names) < len(usecols):
                 raise ValueError("Usecols do not match names.")
 
         self._set_noconvert_columns()
@@ -1665,9 +1687,10 @@ def read(self, nrows=None):
 
     def _filter_usecols(self, names):
         # hackish
-        if self.usecols is not None and len(names) != len(self.usecols):
+        usecols = _evaluate_usecols(self.usecols, names)
+        if usecols is not None and len(names) != len(usecols):
             names = [name for i, name in enumerate(names)
-                     if i in self.usecols or name in self.usecols]
+                     if i in usecols or name in usecols]
         return names
 
     def _get_index_names(self):
@@ -2291,7 +2314,9 @@ def _handle_usecols(self, columns, usecols_key):
         usecols_key is used if there are string usecols.
         """
         if self.usecols is not None:
-            if any([isinstance(col, string_types) for col in self.usecols]):
+            if callable(self.usecols):
+                col_indices = _evaluate_usecols(self.usecols, usecols_key)
+            elif any([isinstance(u, string_types) for u in self.usecols]):
                 if len(columns) > 1:
                     raise ValueError("If using multiple headers, usecols must "
                                      "be integers.")
diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py
@@ -23,8 +23,9 @@ def test_raise_on_mixed_dtype_usecols(self):
         1000,2000,3000
         4000,5000,6000
         """
-        msg = ("The elements of 'usecols' must "
-               "either be all strings, all unicode, or all integers")
+
+        msg = ("'usecols' must either be all strings, all unicode, "
+               "all integers or a callable")
         usecols = [0, 'b', 2]
 
         with tm.assertRaisesRegexp(ValueError, msg):
@@ -302,8 +303,8 @@ def test_usecols_with_mixed_encoding_strings(self):
         3.568935038,7,False,a
         '''
 
-        msg = ("The elements of 'usecols' must "
-               "either be all strings, all unicode, or all integers")
+        msg = ("'usecols' must either be all strings, all unicode, "
+               "all integers or a callable")
 
         with tm.assertRaisesRegexp(ValueError, msg):
             self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB'])
@@ -366,3 +367,31 @@ def test_np_array_usecols(self):
         expected = DataFrame([[1, 2]], columns=usecols)
         result = self.read_csv(StringIO(data), usecols=usecols)
         tm.assert_frame_equal(result, expected)
+
+    def test_callable_usecols(self):
+        # See gh-14154
+        s = '''AaA,bBb,CCC,ddd
+        0.056674973,8,True,a
+        2.613230982,2,False,b
+        3.568935038,7,False,a
+        '''
+
+        data = {
+            'AaA': {
+                0: 0.056674972999999997,
+                1: 2.6132309819999997,
+                2: 3.5689350380000002
+            },
+            'bBb': {0: 8, 1: 2, 2: 7},
+            'ddd': {0: 'a', 1: 'b', 2: 'a'}
+        }
+        expected = DataFrame(data)
+        df = self.read_csv(StringIO(s), usecols=lambda x:
+                           x.upper() in ['AAA', 'BBB', 'DDD'])
+        tm.assert_frame_equal(df, expected)
+
+        # Check that a callable returning only False returns
+        # an empty DataFrame
+        expected = DataFrame()
+        df = self.read_csv(StringIO(s), usecols=lambda x: False)
+        tm.assert_frame_equal(df, expected)
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -300,8 +300,9 @@ cdef class TextReader:
         object compression
         object mangle_dupe_cols
         object tupleize_cols
+        object usecols
         list dtype_cast_order
-        set noconvert, usecols
+        set noconvert
 
     def __cinit__(self, source,
                   delimiter=b',',
@@ -437,7 +438,10 @@ cdef class TextReader:
         # suboptimal
         if usecols is not None:
             self.has_usecols = 1
-            self.usecols = set(usecols)
+            if callable(usecols):
+                self.usecols = usecols
+            else:
+                self.usecols = set(usecols)
 
         # XXX
         if skipfooter > 0:
@@ -701,7 +705,6 @@ cdef class TextReader:
             cdef StringPath path = _string_path(self.c_encoding)
 
         header = []
-
         if self.parser.header_start >= 0:
 
             # Header is in the file
@@ -821,7 +824,8 @@ cdef class TextReader:
             #                        'data has %d fields'
             #                        % (passed_count, field_count))
 
-            if self.has_usecols and self.allow_leading_cols:
+            if self.has_usecols and self.allow_leading_cols and \
+                    not callable(self.usecols):
                 nuse = len(self.usecols)
                 if nuse == passed_count:
                     self.leading_cols = 0
@@ -1019,13 +1023,20 @@ cdef class TextReader:
             if i < self.leading_cols:
                 # Pass through leading columns always
                 name = i
-            elif self.usecols and nused == len(self.usecols):
+            elif self.usecols and not callable(self.usecols) and \
+                    nused == len(self.usecols):
                 # Once we've gathered all requested columns, stop. GH5766
                 break
             else:
                 name = self._get_column_name(i, nused)
-                if self.has_usecols and not (i in self.usecols or
-                                             name in self.usecols):
+                usecols = set()
+                if callable(self.usecols):
+                    if self.usecols(name):
+                        usecols = set([i])
+                else:
+                    usecols = self.usecols
+                if self.has_usecols and not (i in usecols or
+                                             name in usecols):
                     continue
                 nused += 1