Skip to content

Commit 7ad6c65

Browse files
gfyoung authored and jreback committed
ENH: Accept callable for skiprows in read_csv
Title is self-explanatory. xref #10882. Author: gfyoung <[email protected]> Closes #15059 from gfyoung/skiprows-callable and squashes the following commits: d15e3a3 [gfyoung] ENH: Accept callable for skiprows
1 parent 7892077 commit 7ad6c65

File tree

7 files changed

+107
-14
lines changed

7 files changed

+107
-14
lines changed

doc/source/io.rst

+10
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,16 @@ skipinitialspace : boolean, default ``False``
187187
skiprows : list-like or integer or callable, default ``None``
188188
Line numbers to skip (0-indexed) or number of lines to skip (int) at the start
189189
of the file.
190+
191+
If callable, the callable function will be evaluated against the row
192+
indices, returning True if the row should be skipped and False otherwise:
193+
194+
.. ipython:: python
195+
196+
data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
197+
pd.read_csv(StringIO(data))
198+
pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0)
199+
190200
skipfooter : int, default ``0``
191201
Number of lines at bottom of file to skip (unsupported with engine='c').
192202
skip_footer : int, default ``0``

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ Other enhancements
110110
- ``pd.qcut`` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`7751`)
111111
- ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`)
112112
- The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`)
113+
- The ``skiprows`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`10882`)
113114
- ``pd.DataFrame.plot`` now prints a title above each subplot if ``subplots=True`` and ``title`` is a list of strings (:issue:`14753`)
114115
- ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`)
115116
- ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)

pandas/io/parsers.py

+20-8
Original file line numberDiff line numberDiff line change
@@ -132,9 +132,13 @@
132132
Values to consider as False
133133
skipinitialspace : boolean, default False
134134
Skip spaces after delimiter.
135-
skiprows : list-like or integer, default None
135+
skiprows : list-like or integer or callable, default None
136136
Line numbers to skip (0-indexed) or number of lines to skip (int)
137-
at the start of the file
137+
at the start of the file.
138+
139+
If callable, the callable function will be evaluated against the row
140+
indices, returning True if the row should be skipped and False otherwise.
141+
An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
138142
skipfooter : int, default 0
139143
Number of lines at bottom of file to skip (Unsupported with engine='c')
140144
skip_footer : int, default 0
@@ -930,7 +934,10 @@ def _clean_options(self, options, engine):
930934
if engine != 'c':
931935
if is_integer(skiprows):
932936
skiprows = lrange(skiprows)
933-
skiprows = set() if skiprows is None else set(skiprows)
937+
if skiprows is None:
938+
skiprows = set()
939+
elif not callable(skiprows):
940+
skiprows = set(skiprows)
934941

935942
# put stuff back
936943
result['names'] = names
@@ -1851,6 +1858,11 @@ def __init__(self, f, **kwds):
18511858
self.memory_map = kwds['memory_map']
18521859
self.skiprows = kwds['skiprows']
18531860

1861+
if callable(self.skiprows):
1862+
self.skipfunc = self.skiprows
1863+
else:
1864+
self.skipfunc = lambda x: x in self.skiprows
1865+
18541866
self.skipfooter = kwds['skipfooter']
18551867
self.delimiter = kwds['delimiter']
18561868

@@ -2006,7 +2018,7 @@ class MyDialect(csv.Dialect):
20062018
# attempt to sniff the delimiter
20072019
if sniff_sep:
20082020
line = f.readline()
2009-
while self.pos in self.skiprows:
2021+
while self.skipfunc(self.pos):
20102022
self.pos += 1
20112023
line = f.readline()
20122024

@@ -2414,7 +2426,7 @@ def _empty(self, line):
24142426

24152427
def _next_line(self):
24162428
if isinstance(self.data, list):
2417-
while self.pos in self.skiprows:
2429+
while self.skipfunc(self.pos):
24182430
self.pos += 1
24192431

24202432
while True:
@@ -2433,7 +2445,7 @@ def _next_line(self):
24332445
except IndexError:
24342446
raise StopIteration
24352447
else:
2436-
while self.pos in self.skiprows:
2448+
while self.skipfunc(self.pos):
24372449
self.pos += 1
24382450
next(self.data)
24392451

@@ -2685,7 +2697,7 @@ def _get_lines(self, rows=None):
26852697
# Check for stop rows. n.b.: self.skiprows is a set or a callable.
26862698
if self.skiprows:
26872699
new_rows = [row for i, row in enumerate(new_rows)
2688-
if i + self.pos not in self.skiprows]
2700+
if not self.skipfunc(i + self.pos)]
26892701

26902702
lines.extend(new_rows)
26912703
self.pos = new_pos
@@ -2713,7 +2725,7 @@ def _get_lines(self, rows=None):
27132725
except StopIteration:
27142726
if self.skiprows:
27152727
new_rows = [row for i, row in enumerate(new_rows)
2716-
if self.pos + i not in self.skiprows]
2728+
if not self.skipfunc(i + self.pos)]
27172729
lines.extend(new_rows)
27182730
if len(lines) == 0:
27192731
raise

pandas/io/tests/parser/skiprows.py

+25
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import pandas.util.testing as tm
1313

1414
from pandas import DataFrame
15+
from pandas.io.common import EmptyDataError
1516
from pandas.compat import StringIO, range, lrange
1617

1718

@@ -198,3 +199,27 @@ def test_skiprows_infield_quote(self):
198199

199200
df = self.read_csv(StringIO(data), skiprows=2)
200201
tm.assert_frame_equal(df, expected)
202+
203+
def test_skiprows_callable(self):
204+
data = 'a\n1\n2\n3\n4\n5'
205+
206+
skiprows = lambda x: x % 2 == 0
207+
expected = DataFrame({'1': [3, 5]})
208+
df = self.read_csv(StringIO(data), skiprows=skiprows)
209+
tm.assert_frame_equal(df, expected)
210+
211+
expected = DataFrame({'foo': [3, 5]})
212+
df = self.read_csv(StringIO(data), skiprows=skiprows,
213+
header=0, names=['foo'])
214+
tm.assert_frame_equal(df, expected)
215+
216+
skiprows = lambda x: True
217+
msg = "No columns to parse from file"
218+
with tm.assertRaisesRegexp(EmptyDataError, msg):
219+
self.read_csv(StringIO(data), skiprows=skiprows)
220+
221+
# This is a bad callable and should raise.
222+
msg = "by zero"
223+
skiprows = lambda x: 1 / 0
224+
with tm.assertRaisesRegexp(ZeroDivisionError, msg):
225+
self.read_csv(StringIO(data), skiprows=skiprows)

pandas/parser.pyx

+22-4
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@ cdef extern from "parser/tokenizer.h":
178178
int header_end # header row end
179179

180180
void *skipset
181+
PyObject *skipfunc
181182
int64_t skip_first_N_rows
182183
int skipfooter
183184
double (*converter)(const char *, char **, char, char, char, int) nogil
@@ -606,9 +607,11 @@ cdef class TextReader:
606607
cdef _make_skiprow_set(self):
607608
if isinstance(self.skiprows, (int, np.integer)):
608609
parser_set_skipfirstnrows(self.parser, self.skiprows)
609-
else:
610+
elif not callable(self.skiprows):
610611
for i in self.skiprows:
611612
parser_add_skiprow(self.parser, i)
613+
else:
614+
self.parser.skipfunc = <PyObject *> self.skiprows
612615

613616
cdef _setup_parser_source(self, source):
614617
cdef:
@@ -2115,18 +2118,33 @@ cdef kh_float64_t* kset_float64_from_list(values) except NULL:
21152118
cdef raise_parser_error(object base, parser_t *parser):
21162119
cdef:
21172120
object old_exc
2121+
object exc_type
21182122
PyObject *type
21192123
PyObject *value
21202124
PyObject *traceback
21212125

21222126
if PyErr_Occurred():
2123-
PyErr_Fetch(&type, &value, &traceback);
2124-
Py_XDECREF(type)
2127+
PyErr_Fetch(&type, &value, &traceback)
21252128
Py_XDECREF(traceback)
2129+
21262130
if value != NULL:
21272131
old_exc = <object> value
21282132
Py_XDECREF(value)
2129-
raise old_exc
2133+
2134+
# PyErr_Fetch only returned the error message in *value,
2135+
# so the Exception class must be extracted from *type.
2136+
if isinstance(old_exc, compat.string_types):
2137+
if type != NULL:
2138+
exc_type = <object> type
2139+
else:
2140+
exc_type = ParserError
2141+
2142+
Py_XDECREF(type)
2143+
raise exc_type(old_exc)
2144+
else:
2145+
Py_XDECREF(type)
2146+
raise old_exc
2147+
21302148
message = '%s. C error: ' % base
21312149
if parser.error_msg != NULL:
21322150
if PY3:

pandas/src/parser/tokenizer.c

+28-2
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ void parser_set_default_options(parser_t *self) {
124124
self->thousands = '\0';
125125

126126
self->skipset = NULL;
127+
self->skipfunc = NULL;
127128
self->skip_first_N_rows = -1;
128129
self->skip_footer = 0;
129130
}
@@ -679,7 +680,27 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
679680
}
680681

681682
int skip_this_line(parser_t *self, int64_t rownum) {
682-
if (self->skipset != NULL) {
683+
int should_skip;
684+
PyObject *result;
685+
PyGILState_STATE state;
686+
687+
if (self->skipfunc != NULL) {
688+
state = PyGILState_Ensure();
689+
result = PyObject_CallFunction(self->skipfunc, "i", rownum);
690+
691+
// Error occurred. It will be processed
692+
// and caught at the Cython level.
693+
if (result == NULL) {
694+
should_skip = -1;
695+
} else {
696+
should_skip = PyObject_IsTrue(result);
697+
}
698+
699+
Py_XDECREF(result);
700+
PyGILState_Release(state);
701+
702+
return should_skip;
703+
} else if (self->skipset != NULL) {
683704
return (kh_get_int64((kh_int64_t *)self->skipset, self->file_lines) !=
684705
((kh_int64_t *)self->skipset)->n_buckets);
685706
} else {
@@ -689,6 +710,7 @@ int skip_this_line(parser_t *self, int64_t rownum) {
689710

690711
int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) {
691712
int i, slen;
713+
int should_skip;
692714
long maxstreamsize;
693715
char c;
694716
char *stream;
@@ -818,7 +840,11 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) {
818840

819841
case START_RECORD:
820842
// start of record
821-
if (skip_this_line(self, self->file_lines)) {
843+
should_skip = skip_this_line(self, self->file_lines);
844+
845+
if (should_skip == -1) {
846+
goto parsingerror;
847+
} else if (should_skip) {
822848
if (IS_QUOTE(c)) {
823849
self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
824850
} else {

pandas/src/parser/tokenizer.h

+1
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ typedef struct parser_t {
198198
int header_end; // header row end
199199

200200
void *skipset;
201+
PyObject *skipfunc;
201202
int64_t skip_first_N_rows;
202203
int skip_footer;
203204
double (*converter)(const char *, char **, char, char, char, int);

0 commit comments

Comments
 (0)