Skip to content

Commit bca4a63

Browse files
committed
ENH: Accept callable for skiprows
1 parent 4de5cdc commit bca4a63

File tree

7 files changed

+66
-10
lines changed

7 files changed

+66
-10
lines changed

doc/source/io.rst

+10
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,16 @@ skipinitialspace : boolean, default ``False``
187187
skiprows : list-like or integer, default ``None``
188188
Line numbers to skip (0-indexed) or number of lines to skip (int) at the start
189189
of the file.
190+
191+
If callable, the callable function will be evaluated against the row
192+
indices, skipping each row for which the callable function evaluates to True:
193+
194+
.. ipython:: python
195+
196+
data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
197+
pd.read_csv(StringIO(data))
198+
pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0)
199+
190200
skipfooter : int, default ``0``
191201
Number of lines at bottom of file to skip (unsupported with engine='c').
192202
skip_footer : int, default ``0``

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ Other enhancements
110110
- ``pd.qcut`` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`7751`)
111111
- ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`)
112112
- The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`)
113+
- The ``skiprows`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`10882`)
113114
- ``pd.DataFrame.plot`` now prints a title above each subplot if ``subplots=True`` and ``title`` is a list of strings (:issue:`14753`)
114115
- ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`)
115116
- ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)

pandas/io/parsers.py

+20-8
Original file line numberDiff line numberDiff line change
@@ -132,9 +132,13 @@
132132
Values to consider as False
133133
skipinitialspace : boolean, default False
134134
Skip spaces after delimiter.
135-
skiprows : list-like or integer, default None
135+
skiprows : list-like or integer or callable, default None
136136
Line numbers to skip (0-indexed) or number of lines to skip (int)
137-
at the start of the file
137+
at the start of the file.
138+
139+
If callable, the callable function will be evaluated against the row
140+
indices, skipping each row for which the callable function evaluates to True.
141+
An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
138142
skipfooter : int, default 0
139143
Number of lines at bottom of file to skip (Unsupported with engine='c')
140144
skip_footer : int, default 0
@@ -919,7 +923,10 @@ def _clean_options(self, options, engine):
919923
if engine != 'c':
920924
if is_integer(skiprows):
921925
skiprows = lrange(skiprows)
922-
skiprows = set() if skiprows is None else set(skiprows)
926+
if skiprows is None:
927+
skiprows = set()
928+
elif not callable(skiprows):
929+
skiprows = set(skiprows)
923930

924931
# put stuff back
925932
result['names'] = names
@@ -1840,6 +1847,11 @@ def __init__(self, f, **kwds):
18401847
self.memory_map = kwds['memory_map']
18411848
self.skiprows = kwds['skiprows']
18421849

1850+
if callable(self.skiprows):
1851+
self.skipfunc = self.skiprows
1852+
else:
1853+
self.skipfunc = lambda x: x in self.skiprows
1854+
18431855
self.skipfooter = kwds['skipfooter']
18441856
self.delimiter = kwds['delimiter']
18451857

@@ -1995,7 +2007,7 @@ class MyDialect(csv.Dialect):
19952007
# attempt to sniff the delimiter
19962008
if sniff_sep:
19972009
line = f.readline()
1998-
while self.pos in self.skiprows:
2010+
while self.skipfunc(self.pos):
19992011
self.pos += 1
20002012
line = f.readline()
20012013

@@ -2402,7 +2414,7 @@ def _empty(self, line):
24022414

24032415
def _next_line(self):
24042416
if isinstance(self.data, list):
2405-
while self.pos in self.skiprows:
2417+
while self.skipfunc(self.pos):
24062418
self.pos += 1
24072419

24082420
while True:
@@ -2421,7 +2433,7 @@ def _next_line(self):
24212433
except IndexError:
24222434
raise StopIteration
24232435
else:
2424-
while self.pos in self.skiprows:
2436+
while self.skipfunc(self.pos):
24252437
self.pos += 1
24262438
next(self.data)
24272439

@@ -2673,7 +2685,7 @@ def _get_lines(self, rows=None):
26732685
# Check for stop rows. n.b.: self.skiprows is a set or a callable.
26742686
if self.skiprows:
26752687
new_rows = [row for i, row in enumerate(new_rows)
2676-
if i + self.pos not in self.skiprows]
2688+
if not self.skipfunc(i + self.pos)]
26772689

26782690
lines.extend(new_rows)
26792691
self.pos = new_pos
@@ -2701,7 +2713,7 @@ def _get_lines(self, rows=None):
27012713
except StopIteration:
27022714
if self.skiprows:
27032715
new_rows = [row for i, row in enumerate(new_rows)
2704-
if self.pos + i not in self.skiprows]
2716+
if not self.skipfunc(i + self.pos)]
27052717
lines.extend(new_rows)
27062718
if len(lines) == 0:
27072719
raise

pandas/io/tests/parser/skiprows.py

+19
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import pandas.util.testing as tm
1313

1414
from pandas import DataFrame
15+
from pandas.io.common import EmptyDataError
1516
from pandas.compat import StringIO, range, lrange
1617

1718

@@ -198,3 +199,21 @@ def test_skiprows_infield_quote(self):
198199

199200
df = self.read_csv(StringIO(data), skiprows=2)
200201
tm.assert_frame_equal(df, expected)
202+
203+
def test_skiprows_callable(self):
    """Check that ``skiprows`` accepts a callable applied to row indices."""
    data = 'a\n1\n2\n3\n4\n5'

    def skip_even(x):
        return x % 2 == 0

    # Rows 0, 2 and 4 are dropped, so the line '1' is read as the header.
    result = self.read_csv(StringIO(data), skiprows=skip_even)
    tm.assert_frame_equal(result, DataFrame({'1': [3, 5]}))

    # Same skipping, but the header row is replaced by explicit names.
    result = self.read_csv(StringIO(data), skiprows=skip_even,
                           header=0, names=['foo'])
    tm.assert_frame_equal(result, DataFrame({'foo': [3, 5]}))

    # A callable that skips every row must raise EmptyDataError.
    def skip_all(x):
        return True

    msg = "No columns to parse from file"
    with tm.assertRaisesRegexp(EmptyDataError, msg):
        self.read_csv(StringIO(data), skiprows=skip_all)

pandas/parser.pyx

+4-1
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@ cdef extern from "parser/tokenizer.h":
178178
int header_end # header row end
179179

180180
void *skipset
181+
PyObject *skipfunc
181182
int64_t skip_first_N_rows
182183
int skipfooter
183184
double (*converter)(const char *, char **, char, char, char, int) nogil
@@ -606,9 +607,11 @@ cdef class TextReader:
606607
cdef _make_skiprow_set(self):
607608
if isinstance(self.skiprows, (int, np.integer)):
608609
parser_set_skipfirstnrows(self.parser, self.skiprows)
609-
else:
610+
elif not callable(self.skiprows):
610611
for i in self.skiprows:
611612
parser_add_skiprow(self.parser, i)
613+
else:
614+
self.parser.skipfunc = <PyObject *> self.skiprows
612615

613616
cdef _setup_parser_source(self, source):
614617
cdef:

pandas/src/parser/tokenizer.c

+11-1
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ void parser_set_default_options(parser_t *self) {
124124
self->thousands = '\0';
125125

126126
self->skipset = NULL;
127+
self->skipfunc = NULL;
127128
self->skip_first_N_rows = -1;
128129
self->skip_footer = 0;
129130
}
@@ -679,7 +680,16 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
679680
}
680681

681682
int skip_this_line(parser_t *self, int64_t rownum) {
682-
if (self->skipset != NULL) {
683+
int should_skip;
684+
PyGILState_STATE state;
685+
686+
if (self->skipfunc != NULL) {
687+
state = PyGILState_Ensure();
688+
should_skip = PyObject_IsTrue(PyObject_CallFunction(
689+
self->skipfunc, "i", rownum));
690+
PyGILState_Release(state);
691+
return should_skip;
692+
} else if (self->skipset != NULL) {
683693
return (kh_get_int64((kh_int64_t *)self->skipset, self->file_lines) !=
684694
((kh_int64_t *)self->skipset)->n_buckets);
685695
} else {

pandas/src/parser/tokenizer.h

+1
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ typedef struct parser_t {
198198
int header_end; // header row end
199199

200200
void *skipset;
201+
PyObject *skipfunc;
201202
int64_t skip_first_N_rows;
202203
int skip_footer;
203204
double (*converter)(const char *, char **, char, char, char, int);

0 commit comments

Comments
 (0)