ENH: Accept callable for skiprows in read_csv

gfyoung · jreback · commit 7ad6c65c3ce6 · 2017-01-14T12:03:08.000-05:00
Title is self-explanatory. xref #10882. Author: gfyoung <gfyoung17@gmail.com> Closes #15059 from gfyoung/skiprows-callable and squashes the following commits: d15e3a3 [gfyoung] ENH: Accept callable for skiprows
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -187,6 +187,16 @@ skipinitialspace : boolean, default ``False``
 skiprows : list-like or integer, default ``None``
   Line numbers to skip (0-indexed) or number of lines to skip (int) at the start
   of the file.
+
+  If callable, the function is called with each row index (0-indexed) and
+  should return True if the row should be skipped and False otherwise:
+
+  .. ipython:: python
+
+     data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
+     pd.read_csv(StringIO(data))
+     pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0)
+
 skipfooter : int, default ``0``
   Number of lines at bottom of file to skip (unsupported with engine='c').
 skip_footer : int, default ``0``
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -110,6 +110,7 @@ Other enhancements
 - ``pd.qcut`` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`7751`)
 - ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`)
 - The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value  (:issue:`14154`)
+- The ``skiprows`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`10882`)
 - ``pd.DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`)
 - ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`)
 - ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -132,9 +132,13 @@
     Values to consider as False
 skipinitialspace : boolean, default False
     Skip spaces after delimiter.
-skiprows : list-like or integer, default None
+skiprows : list-like or integer or callable, default None
     Line numbers to skip (0-indexed) or number of lines to skip (int)
-    at the start of the file
+    at the start of the file.
+
+    If callable, the function is called with each row index (0-indexed) and
+    should return True if the row should be skipped and False otherwise. For
+    example, ``lambda x: x in [0, 2]`` skips the lines at indices 0 and 2.
 skipfooter : int, default 0
     Number of lines at bottom of file to skip (Unsupported with engine='c')
 skip_footer : int, default 0
@@ -930,7 +934,10 @@ def _clean_options(self, options, engine):
         if engine != 'c':
             if is_integer(skiprows):
                 skiprows = lrange(skiprows)
-            skiprows = set() if skiprows is None else set(skiprows)
+            if skiprows is None:
+                skiprows = set()
+            elif not callable(skiprows):
+                skiprows = set(skiprows)
 
         # put stuff back
         result['names'] = names
@@ -1851,6 +1858,11 @@ def __init__(self, f, **kwds):
         self.memory_map = kwds['memory_map']
         self.skiprows = kwds['skiprows']
 
+        if callable(self.skiprows):
+            self.skipfunc = self.skiprows
+        else:
+            self.skipfunc = lambda x: x in self.skiprows
+
         self.skipfooter = kwds['skipfooter']
         self.delimiter = kwds['delimiter']
 
@@ -2006,7 +2018,7 @@ class MyDialect(csv.Dialect):
             # attempt to sniff the delimiter
             if sniff_sep:
                 line = f.readline()
-                while self.pos in self.skiprows:
+                while self.skipfunc(self.pos):
                     self.pos += 1
                     line = f.readline()
 
@@ -2414,7 +2426,7 @@ def _empty(self, line):
 
     def _next_line(self):
         if isinstance(self.data, list):
-            while self.pos in self.skiprows:
+            while self.skipfunc(self.pos):
                 self.pos += 1
 
             while True:
@@ -2433,7 +2445,7 @@ def _next_line(self):
                 except IndexError:
                     raise StopIteration
         else:
-            while self.pos in self.skiprows:
+            while self.skipfunc(self.pos):
                 self.pos += 1
                 next(self.data)
 
@@ -2685,7 +2697,7 @@ def _get_lines(self, rows=None):
                 # Check for stop rows. n.b.: self.skiprows is a set.
                 if self.skiprows:
                     new_rows = [row for i, row in enumerate(new_rows)
-                                if i + self.pos not in self.skiprows]
+                                if not self.skipfunc(i + self.pos)]
 
                 lines.extend(new_rows)
                 self.pos = new_pos
@@ -2713,7 +2725,7 @@ def _get_lines(self, rows=None):
                 except StopIteration:
                     if self.skiprows:
                         new_rows = [row for i, row in enumerate(new_rows)
-                                    if self.pos + i not in self.skiprows]
+                                    if not self.skipfunc(i + self.pos)]
                     lines.extend(new_rows)
                     if len(lines) == 0:
                         raise
diff --git a/pandas/io/tests/parser/skiprows.py b/pandas/io/tests/parser/skiprows.py
@@ -12,6 +12,7 @@
 import pandas.util.testing as tm
 
 from pandas import DataFrame
+from pandas.io.common import EmptyDataError
 from pandas.compat import StringIO, range, lrange
 
 
@@ -198,3 +199,27 @@ def test_skiprows_infield_quote(self):
 
         df = self.read_csv(StringIO(data), skiprows=2)
         tm.assert_frame_equal(df, expected)
+
+    def test_skiprows_callable(self):
+        data = 'a\n1\n2\n3\n4\n5'
+
+        skiprows = lambda x: x % 2 == 0
+        expected = DataFrame({'1': [3, 5]})
+        df = self.read_csv(StringIO(data), skiprows=skiprows)
+        tm.assert_frame_equal(df, expected)
+
+        expected = DataFrame({'foo': [3, 5]})
+        df = self.read_csv(StringIO(data), skiprows=skiprows,
+                           header=0, names=['foo'])
+        tm.assert_frame_equal(df, expected)
+
+        skiprows = lambda x: True
+        msg = "No columns to parse from file"
+        with tm.assertRaisesRegexp(EmptyDataError, msg):
+            self.read_csv(StringIO(data), skiprows=skiprows)
+
+        # This is a bad callable and should raise.
+        msg = "by zero"
+        skiprows = lambda x: 1 / 0
+        with tm.assertRaisesRegexp(ZeroDivisionError, msg):
+            self.read_csv(StringIO(data), skiprows=skiprows)
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -178,6 +178,7 @@ cdef extern from "parser/tokenizer.h":
         int header_end # header row end
 
         void *skipset
+        PyObject *skipfunc
         int64_t skip_first_N_rows
         int skipfooter
         double (*converter)(const char *, char **, char, char, char, int) nogil
@@ -606,9 +607,11 @@ cdef class TextReader:
     cdef _make_skiprow_set(self):
         if isinstance(self.skiprows, (int, np.integer)):
             parser_set_skipfirstnrows(self.parser, self.skiprows)
-        else:
+        elif not callable(self.skiprows):
             for i in self.skiprows:
                 parser_add_skiprow(self.parser, i)
+        else:
+            self.parser.skipfunc = <PyObject *> self.skiprows
 
     cdef _setup_parser_source(self, source):
         cdef:
@@ -2115,18 +2118,33 @@ cdef kh_float64_t* kset_float64_from_list(values) except NULL:
 cdef raise_parser_error(object base, parser_t *parser):
     cdef:
         object old_exc
+        object exc_type
         PyObject *type
         PyObject *value
         PyObject *traceback
 
     if PyErr_Occurred():
-        PyErr_Fetch(&type, &value, &traceback);
-        Py_XDECREF(type)
+        PyErr_Fetch(&type, &value, &traceback)
         Py_XDECREF(traceback)
+
         if value != NULL:
             old_exc = <object> value
             Py_XDECREF(value)
-            raise old_exc
+
+            # PyErr_Fetch only returned the error message in *value,
+            # so the Exception class must be extracted from *type.
+            if isinstance(old_exc, compat.string_types):
+                if type != NULL:
+                    exc_type = <object> type
+                else:
+                    exc_type = ParserError
+
+                Py_XDECREF(type)
+                raise exc_type(old_exc)
+            else:
+                Py_XDECREF(type)
+                raise old_exc
+
     message = '%s. C error: ' % base
     if parser.error_msg != NULL:
         if PY3:
diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
@@ -124,6 +124,7 @@ void parser_set_default_options(parser_t *self) {
     self->thousands = '\0';
 
     self->skipset = NULL;
+    self->skipfunc = NULL;
     self->skip_first_N_rows = -1;
     self->skip_footer = 0;
 }
@@ -679,7 +680,27 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
     }
 
 int skip_this_line(parser_t *self, int64_t rownum) {
-    if (self->skipset != NULL) {
+    int should_skip;
+    PyObject *result;
+    PyGILState_STATE state;
+
+    if (self->skipfunc != NULL) {
+        state = PyGILState_Ensure();
+        result = PyObject_CallFunction(self->skipfunc, "i", rownum);
+
+        // Error occurred. It will be processed
+        // and caught at the Cython level.
+        if (result == NULL) {
+            should_skip = -1;
+        } else {
+            should_skip = PyObject_IsTrue(result);
+        }
+
+        Py_XDECREF(result);
+        PyGILState_Release(state);
+
+        return should_skip;
+    } else if (self->skipset != NULL) {
         return (kh_get_int64((kh_int64_t *)self->skipset, self->file_lines) !=
                 ((kh_int64_t *)self->skipset)->n_buckets);
     } else {
@@ -689,6 +710,7 @@ int skip_this_line(parser_t *self, int64_t rownum) {
 
 int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) {
     int i, slen;
+    int should_skip;
     long maxstreamsize;
     char c;
     char *stream;
@@ -818,7 +840,11 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) {
 
             case START_RECORD:
                 // start of record
-                if (skip_this_line(self, self->file_lines)) {
+                should_skip = skip_this_line(self, self->file_lines);
+
+                if (should_skip == -1) {
+                    goto parsingerror;
+                } else if (should_skip) {
                     if (IS_QUOTE(c)) {
                         self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
                     } else {
diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h
@@ -198,6 +198,7 @@ typedef struct parser_t {
     int header_end;    // header row end
 
     void *skipset;
+    PyObject *skipfunc;
     int64_t skip_first_N_rows;
     int skip_footer;
     double (*converter)(const char *, char **, char, char, char, int);