ENH: Accept callable for skiprows

gfyoung · gfyoung · commit bca4a6313e89 · 2017-01-04T22:13:07.000-08:00
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -187,6 +187,16 @@ skipinitialspace : boolean, default ``False``
 skiprows : list-like or integer, default ``None``
   Line numbers to skip (0-indexed) or number of lines to skip (int) at the start
   of the file.
+
+  If callable, the callable function will be evaluated against each row
+  index, and the rows for which it evaluates to True will be skipped:
+
+  .. ipython:: python
+
+     data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
+     pd.read_csv(StringIO(data))
+     pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0)
+
 skipfooter : int, default ``0``
   Number of lines at bottom of file to skip (unsupported with engine='c').
 skip_footer : int, default ``0``
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -110,6 +110,7 @@ Other enhancements
 - ``pd.qcut`` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`7751`)
 - ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`)
 - The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value  (:issue:`14154`)
+- The ``skiprows`` argument in ``pd.read_csv`` now accepts a callable function as a value  (:issue:`10882`)
 - ``pd.DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`)
 - ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`)
 - ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -132,9 +132,13 @@
     Values to consider as False
 skipinitialspace : boolean, default False
     Skip spaces after delimiter.
-skiprows : list-like or integer, default None
+skiprows : list-like or integer or callable, default None
     Line numbers to skip (0-indexed) or number of lines to skip (int)
-    at the start of the file
+    at the start of the file.
+
+    If callable, the callable function will be evaluated against each row
+    index, and the rows for which it evaluates to True will be skipped.
+    An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
 skipfooter : int, default 0
     Number of lines at bottom of file to skip (Unsupported with engine='c')
 skip_footer : int, default 0
@@ -919,7 +923,10 @@ def _clean_options(self, options, engine):
         if engine != 'c':
             if is_integer(skiprows):
                 skiprows = lrange(skiprows)
-            skiprows = set() if skiprows is None else set(skiprows)
+            if skiprows is None:
+                skiprows = set()
+            elif not callable(skiprows):
+                skiprows = set(skiprows)
 
         # put stuff back
         result['names'] = names
@@ -1840,6 +1847,11 @@ def __init__(self, f, **kwds):
         self.memory_map = kwds['memory_map']
         self.skiprows = kwds['skiprows']
 
+        if callable(self.skiprows):
+            self.skipfunc = self.skiprows
+        else:
+            self.skipfunc = lambda x: x in self.skiprows
+
         self.skipfooter = kwds['skipfooter']
         self.delimiter = kwds['delimiter']
 
@@ -1995,7 +2007,7 @@ class MyDialect(csv.Dialect):
             # attempt to sniff the delimiter
             if sniff_sep:
                 line = f.readline()
-                while self.pos in self.skiprows:
+                while self.skipfunc(self.pos):
                     self.pos += 1
                     line = f.readline()
 
@@ -2402,7 +2414,7 @@ def _empty(self, line):
 
     def _next_line(self):
         if isinstance(self.data, list):
-            while self.pos in self.skiprows:
+            while self.skipfunc(self.pos):
                 self.pos += 1
 
             while True:
@@ -2421,7 +2433,7 @@ def _next_line(self):
                 except IndexError:
                     raise StopIteration
         else:
-            while self.pos in self.skiprows:
+            while self.skipfunc(self.pos):
                 self.pos += 1
                 next(self.data)
 
@@ -2673,7 +2685,7 @@ def _get_lines(self, rows=None):
                 # Check for stop rows. n.b.: self.skiprows is a set.
                 if self.skiprows:
                     new_rows = [row for i, row in enumerate(new_rows)
-                                if i + self.pos not in self.skiprows]
+                                if not self.skipfunc(i + self.pos)]
 
                 lines.extend(new_rows)
                 self.pos = new_pos
@@ -2701,7 +2713,7 @@ def _get_lines(self, rows=None):
                 except StopIteration:
                     if self.skiprows:
                         new_rows = [row for i, row in enumerate(new_rows)
-                                    if self.pos + i not in self.skiprows]
+                                    if not self.skipfunc(i + self.pos)]
                     lines.extend(new_rows)
                     if len(lines) == 0:
                         raise
diff --git a/pandas/io/tests/parser/skiprows.py b/pandas/io/tests/parser/skiprows.py
@@ -12,6 +12,7 @@
 import pandas.util.testing as tm
 
 from pandas import DataFrame
+from pandas.io.common import EmptyDataError
 from pandas.compat import StringIO, range, lrange
 
 
@@ -198,3 +199,21 @@ def test_skiprows_infield_quote(self):
 
         df = self.read_csv(StringIO(data), skiprows=2)
         tm.assert_frame_equal(df, expected)
+
+    def test_skiprows_callable(self):
+        data = 'a\n1\n2\n3\n4\n5'
+
+        skiprows = lambda x: x % 2 == 0
+        expected = DataFrame({'1': [3, 5]})
+        df = self.read_csv(StringIO(data), skiprows=skiprows)
+        tm.assert_frame_equal(df, expected)
+
+        expected = DataFrame({'foo': [3, 5]})
+        df = self.read_csv(StringIO(data), skiprows=skiprows,
+                           header=0, names=['foo'])
+        tm.assert_frame_equal(df, expected)
+
+        skiprows = lambda x: True
+        msg = "No columns to parse from file"
+        with tm.assertRaisesRegexp(EmptyDataError, msg):
+            self.read_csv(StringIO(data), skiprows=skiprows)
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -178,6 +178,7 @@ cdef extern from "parser/tokenizer.h":
         int header_end # header row end
 
         void *skipset
+        PyObject *skipfunc
         int64_t skip_first_N_rows
         int skipfooter
         double (*converter)(const char *, char **, char, char, char, int) nogil
@@ -606,9 +607,11 @@ cdef class TextReader:
     cdef _make_skiprow_set(self):
         if isinstance(self.skiprows, (int, np.integer)):
             parser_set_skipfirstnrows(self.parser, self.skiprows)
-        else:
+        elif not callable(self.skiprows):
             for i in self.skiprows:
                 parser_add_skiprow(self.parser, i)
+        else:
+            self.parser.skipfunc = <PyObject *> self.skiprows
 
     cdef _setup_parser_source(self, source):
         cdef:
diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
@@ -124,6 +124,7 @@ void parser_set_default_options(parser_t *self) {
     self->thousands = '\0';
 
     self->skipset = NULL;
+    self->skipfunc = NULL;
     self->skip_first_N_rows = -1;
     self->skip_footer = 0;
 }
@@ -679,7 +680,30 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
     }
 
 int skip_this_line(parser_t *self, int64_t rownum) {
-    if (self->skipset != NULL) {
+    int should_skip;
+    PyObject *result;
+    PyGILState_STATE state;
+
+    if (self->skipfunc != NULL) {
+        // The callable is a Python object, so the GIL must be held
+        // while it is invoked and while its result is inspected.
+        state = PyGILState_Ensure();
+        result = PyObject_CallFunction(self->skipfunc, "L",
+                                       (long long)rownum);
+
+        // A NULL result means the callable raised; report -1 (the same
+        // error value PyObject_IsTrue uses) instead of dereferencing NULL.
+        if (result == NULL) {
+            should_skip = -1;
+        } else {
+            should_skip = PyObject_IsTrue(result);
+        }
+
+        // PyObject_CallFunction returns a new reference; release it.
+        Py_XDECREF(result);
+        PyGILState_Release(state);
+        return should_skip;
+    } else if (self->skipset != NULL) {
         return (kh_get_int64((kh_int64_t *)self->skipset, self->file_lines) !=
                 ((kh_int64_t *)self->skipset)->n_buckets);
     } else {
diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h
@@ -198,6 +198,7 @@ typedef struct parser_t {
     int header_end;    // header row end
 
     void *skipset;
+    PyObject *skipfunc;
     int64_t skip_first_N_rows;
     int skip_footer;
     double (*converter)(const char *, char **, char, char, char, int);