Skip to content

Commit a1a05ca

Browse files
committed
ENH: Allow usecols to accept callable (GH14154)
1 parent b1d9599 commit a1a05ca

File tree

5 files changed

+108
-33
lines changed

5 files changed

+108
-33
lines changed

doc/source/io.rst

+13-6
Original file line numberDiff line numberDiff line change
@@ -126,13 +126,17 @@ index_col : int or sequence or ``False``, default ``None``
126126
MultiIndex is used. If you have a malformed file with delimiters at the end of
127127
each line, you might consider ``index_col=False`` to force pandas to *not* use
128128
the first column as the index (row names).
129-
usecols : array-like, default ``None``
130-
Return a subset of the columns. All elements in this array must either
129+
usecols : array-like or callable, default ``None``
130+
Return a subset of the columns. If array-like, all elements must either
131131
be positional (i.e. integer indices into the document columns) or strings
132132
that correspond to column names provided either by the user in `names` or
133-
inferred from the document header row(s). For example, a valid `usecols`
134-
parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter
135-
results in much faster parsing time and lower memory usage.
133+
inferred from the document header row(s). For example, a valid array-like
134+
`usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. If callable,
135+
the callable function will be evaluated against the column names, returning
136+
names where the callable function evaluates to True. An example of a valid
137+
callable argument would be ``lambda x: x.upper() in ['AAA', 'BBB', 'DDD']``.
138+
Using this parameter results in much faster parsing time and lower memory
139+
usage.
136140
as_recarray : boolean, default ``False``
137141
DEPRECATED: this argument will be removed in a future version. Please call
138142
``pd.read_csv(...).to_records()`` instead.
@@ -615,14 +619,17 @@ Filtering columns (``usecols``)
615619
+++++++++++++++++++++++++++++++
616620

617621
The ``usecols`` argument allows you to select any subset of the columns in a
618-
file, either using the column names or position numbers:
622+
file, using column names, position numbers or a callable:
623+
624+
.. versionadded:: 0.20.0 support for callable `usecols` arguments
619625

620626
.. ipython:: python
621627
622628
data = 'a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz'
623629
pd.read_csv(StringIO(data))
624630
pd.read_csv(StringIO(data), usecols=['b', 'd'])
625631
pd.read_csv(StringIO(data), usecols=[0, 2, 3])
632+
pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['A', 'C'])
626633
627634
Comments and Empty Lines
628635
''''''''''''''''''''''''

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ Other enhancements
3131
^^^^^^^^^^^^^^^^^^
3232

3333
- ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`)
34+
- The ``usecols`` argument in ``pd.read_csv`` now accepts a callable, which is evaluated against the column names to select the columns to keep (:issue:`14154`)
3435

3536

3637
.. _whatsnew_0200.api_breaking:

pandas/io/parsers.py

+33-16
Original file line numberDiff line numberDiff line change
@@ -86,13 +86,17 @@
8686
MultiIndex is used. If you have a malformed file with delimiters at the end
8787
of each line, you might consider index_col=False to force pandas to _not_
8888
use the first column as the index (row names)
89-
usecols : array-like, default None
90-
Return a subset of the columns. All elements in this array must either
89+
usecols : array-like or callable, default None
90+
Return a subset of the columns. If array-like, all elements must either
9191
be positional (i.e. integer indices into the document columns) or strings
9292
that correspond to column names provided either by the user in `names` or
93-
inferred from the document header row(s). For example, a valid `usecols`
94-
parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter
95-
results in much faster parsing time and lower memory usage.
93+
inferred from the document header row(s). For example, a valid array-like
94+
`usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. If
95+
callable, the callable function will be evaluated against the column names,
96+
returning names where the callable function evaluates to True. An example
97+
of a valid callable argument would be ``lambda x: x.upper() in ['AAA',
98+
'BBB', 'DDD']``. Using this parameter results in much faster parsing
99+
time and lower memory usage.
96100
as_recarray : boolean, default False
97101
DEPRECATED: this argument will be removed in a future version. Please call
98102
`pd.read_csv(...).to_records()` instead.
@@ -976,17 +980,26 @@ def _is_index_col(col):
976980
return col is not None and col is not False
977981

978982

983+
def _evaluate_usecols(usecols, names):
984+
if callable(usecols):
985+
return set([i for i, name in enumerate(names)
986+
if usecols(name)])
987+
return usecols
988+
989+
979990
def _validate_usecols_arg(usecols):
980991
"""
981992
Check whether or not the 'usecols' parameter
982-
contains all integers (column selection by index)
983-
or strings (column by name). Raises a ValueError
984-
if that is not the case.
993+
contains all integers (column selection by index),
994+
strings (column by name) or is a callable. Raises
995+
a ValueError if that is not the case.
985996
"""
986-
msg = ("The elements of 'usecols' must "
987-
"either be all strings, all unicode, or all integers")
997+
msg = ("'usecols' must either be all strings, all unicode, "
998+
"all integers or a callable")
988999

9891000
if usecols is not None:
1001+
if callable(usecols):
1002+
return usecols
9901003
usecols_dtype = lib.infer_dtype(usecols)
9911004
if usecols_dtype not in ('empty', 'integer',
9921005
'string', 'unicode'):
@@ -1426,11 +1439,12 @@ def __init__(self, src, **kwds):
14261439
self.orig_names = self.names[:]
14271440

14281441
if self.usecols:
1429-
if len(self.names) > len(self.usecols):
1442+
usecols = _evaluate_usecols(self.usecols, self.orig_names)
1443+
if len(self.names) > len(usecols):
14301444
self.names = [n for i, n in enumerate(self.names)
1431-
if (i in self.usecols or n in self.usecols)]
1445+
if (i in usecols or n in usecols)]
14321446

1433-
if len(self.names) < len(self.usecols):
1447+
if len(self.names) < len(usecols):
14341448
raise ValueError("Usecols do not match names.")
14351449

14361450
self._set_noconvert_columns()
@@ -1592,9 +1606,10 @@ def read(self, nrows=None):
15921606

15931607
def _filter_usecols(self, names):
15941608
# hackish
1595-
if self.usecols is not None and len(names) != len(self.usecols):
1609+
usecols = _evaluate_usecols(self.usecols, names)
1610+
if usecols is not None and len(names) != len(usecols):
15961611
names = [name for i, name in enumerate(names)
1597-
if i in self.usecols or name in self.usecols]
1612+
if i in usecols or name in usecols]
15981613
return names
15991614

16001615
def _get_index_names(self):
@@ -2207,7 +2222,9 @@ def _handle_usecols(self, columns, usecols_key):
22072222
usecols_key is used if there are string usecols.
22082223
"""
22092224
if self.usecols is not None:
2210-
if any([isinstance(col, string_types) for col in self.usecols]):
2225+
if callable(self.usecols):
2226+
col_indices = _evaluate_usecols(self.usecols, usecols_key)
2227+
elif any([isinstance(u, string_types) for u in self.usecols]):
22112228
if len(columns) > 1:
22122229
raise ValueError("If using multiple headers, usecols must "
22132230
"be integers.")

pandas/io/tests/parser/usecols.py

+42-4
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,9 @@ def test_raise_on_mixed_dtype_usecols(self):
2323
1000,2000,3000
2424
4000,5000,6000
2525
"""
26-
msg = ("The elements of 'usecols' must "
27-
"either be all strings, all unicode, or all integers")
26+
27+
msg = ("'usecols' must either be all strings, all unicode, "
28+
"all integers or a callable")
2829
usecols = [0, 'b', 2]
2930

3031
with tm.assertRaisesRegexp(ValueError, msg):
@@ -302,8 +303,8 @@ def test_usecols_with_mixed_encoding_strings(self):
302303
3.568935038,7,False,a
303304
'''
304305

305-
msg = ("The elements of 'usecols' must "
306-
"either be all strings, all unicode, or all integers")
306+
msg = ("'usecols' must either be all strings, all unicode, "
307+
"all integers or a callable")
307308

308309
with tm.assertRaisesRegexp(ValueError, msg):
309310
self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB'])
@@ -366,3 +367,40 @@ def test_np_array_usecols(self):
366367
expected = DataFrame([[1, 2]], columns=usecols)
367368
result = self.read_csv(StringIO(data), usecols=usecols)
368369
tm.assert_frame_equal(result, expected)
370+
371+
def test_callable_usecols(self):
    # See gh-14154
    # A callable 'usecols' should keep exactly the columns whose name
    # it accepts; here the match is case-insensitive and skips 'CCC'.
    csv_text = '''AaA,bBb,CCC,ddd
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''

    def keep_column(name):
        return name.upper() in ['AAA', 'BBB', 'DDD']

    expected = DataFrame({
        'AaA': {
            0: 0.056674972999999997,
            1: 2.6132309819999997,
            2: 3.5689350380000002
        },
        'bBb': {0: 8, 1: 2, 2: 7},
        'ddd': {0: 'a', 1: 'b', 2: 'a'}
    })

    result = self.read_csv(StringIO(csv_text), usecols=keep_column)
    tm.assert_frame_equal(result, expected)
393+
394+
def test_callable_usecols_with_false_callable(self):
    # See gh-14154
    # A callable 'usecols' that rejects every column name is expected
    # to produce an empty frame.
    csv_text = '''AaA,bBb,CCC,ddd
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''

    def reject_all(name):
        return False

    result = self.read_csv(StringIO(csv_text), usecols=reject_all)

    tm.assert_frame_equal(result, DataFrame())

pandas/parser.pyx

+19-7
Original file line numberDiff line numberDiff line change
@@ -300,8 +300,9 @@ cdef class TextReader:
300300
object compression
301301
object mangle_dupe_cols
302302
object tupleize_cols
303+
object usecols
303304
list dtype_cast_order
304-
set noconvert, usecols
305+
set noconvert
305306

306307
def __cinit__(self, source,
307308
delimiter=b',',
@@ -437,7 +438,10 @@ cdef class TextReader:
437438
# suboptimal
438439
if usecols is not None:
439440
self.has_usecols = 1
440-
self.usecols = set(usecols)
441+
if callable(usecols):
442+
self.usecols = usecols
443+
else:
444+
self.usecols = set(usecols)
441445

442446
# XXX
443447
if skipfooter > 0:
@@ -701,7 +705,6 @@ cdef class TextReader:
701705
cdef StringPath path = _string_path(self.c_encoding)
702706

703707
header = []
704-
705708
if self.parser.header_start >= 0:
706709

707710
# Header is in the file
@@ -821,7 +824,8 @@ cdef class TextReader:
821824
# 'data has %d fields'
822825
# % (passed_count, field_count))
823826

824-
if self.has_usecols and self.allow_leading_cols:
827+
if self.has_usecols and self.allow_leading_cols and \
828+
not callable(self.usecols):
825829
nuse = len(self.usecols)
826830
if nuse == passed_count:
827831
self.leading_cols = 0
@@ -1019,13 +1023,20 @@ cdef class TextReader:
10191023
if i < self.leading_cols:
10201024
# Pass through leading columns always
10211025
name = i
1022-
elif self.usecols and nused == len(self.usecols):
1026+
elif self.usecols and not callable(self.usecols) and \
1027+
nused == len(self.usecols):
10231028
# Once we've gathered all requested columns, stop. GH5766
10241029
break
10251030
else:
10261031
name = self._get_column_name(i, nused)
1027-
if self.has_usecols and not (i in self.usecols or
1028-
name in self.usecols):
1032+
usecols = set()
1033+
if callable(self.usecols):
1034+
if self.usecols(name):
1035+
usecols = set([i])
1036+
else:
1037+
usecols = self.usecols
1038+
if self.has_usecols and not (i in usecols or
1039+
name in usecols):
10291040
continue
10301041
nused += 1
10311042

@@ -1341,6 +1352,7 @@ def _maybe_upcast(arr):
13411352

13421353
return arr
13431354

1355+
13441356
cdef enum StringPath:
13451357
CSTRING
13461358
UTF8

0 commit comments

Comments
 (0)