Skip to content

Commit 8cdc09e

Browse files
WillAyd authored and jorisvandenbossche committed
ENH: Allow usecols to accept callable (GH14154) (#14234)
1 parent 6e514da commit 8cdc09e

File tree

5 files changed

+113
-34
lines changed

5 files changed

+113
-34
lines changed

doc/source/io.rst

+19-6
Original file line numberDiff line numberDiff line change
@@ -126,13 +126,23 @@ index_col : int or sequence or ``False``, default ``None``
126126
MultiIndex is used. If you have a malformed file with delimiters at the end of
127127
each line, you might consider ``index_col=False`` to force pandas to *not* use
128128
the first column as the index (row names).
129-
usecols : array-like, default ``None``
130-
Return a subset of the columns. All elements in this array must either
129+
usecols : array-like or callable, default ``None``
130+
Return a subset of the columns. If array-like, all elements must either
131131
be positional (i.e. integer indices into the document columns) or strings
132132
that correspond to column names provided either by the user in `names` or
133-
inferred from the document header row(s). For example, a valid `usecols`
134-
parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter
135-
results in much faster parsing time and lower memory usage.
133+
inferred from the document header row(s). For example, a valid array-like
134+
`usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz'].
135+
136+
If callable, the callable function will be evaluated against the column names,
137+
returning names where the callable function evaluates to True:
138+
139+
.. ipython:: python
140+
141+
data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
142+
pd.read_csv(StringIO(data))
143+
pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['COL1', 'COL3'])
144+
145+
Using this parameter results in much faster parsing time and lower memory usage.
136146
as_recarray : boolean, default ``False``
137147
DEPRECATED: this argument will be removed in a future version. Please call
138148
``pd.read_csv(...).to_records()`` instead.
@@ -617,14 +627,17 @@ Filtering columns (``usecols``)
617627
+++++++++++++++++++++++++++++++
618628

619629
The ``usecols`` argument allows you to select any subset of the columns in a
620-
file, either using the column names or position numbers:
630+
file, either using the column names, position numbers or a callable:
631+
632+
.. versionadded:: 0.20.0 support for callable `usecols` arguments
621633

622634
.. ipython:: python
623635
624636
data = 'a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz'
625637
pd.read_csv(StringIO(data))
626638
pd.read_csv(StringIO(data), usecols=['b', 'd'])
627639
pd.read_csv(StringIO(data), usecols=[0, 2, 3])
640+
pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['A', 'C'])
628641
629642
Comments and Empty Lines
630643
''''''''''''''''''''''''

doc/source/whatsnew/v0.20.0.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ Other enhancements
5252
- ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`)
5353
- ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (:issue:`14714`)
5454
- ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`)
55+
- The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`)
5556

5657
.. _whatsnew_0200.api_breaking:
5758

@@ -106,4 +107,4 @@ Performance Improvements
106107
.. _whatsnew_0200.bug_fixes:
107108

108109
Bug Fixes
109-
~~~~~~~~~
110+
~~~~~~~~~

pandas/io/parsers.py

+41-16
Original file line numberDiff line numberDiff line change
@@ -90,13 +90,18 @@
9090
MultiIndex is used. If you have a malformed file with delimiters at the end
9191
of each line, you might consider index_col=False to force pandas to _not_
9292
use the first column as the index (row names)
93-
usecols : array-like, default None
94-
Return a subset of the columns. All elements in this array must either
93+
usecols : array-like or callable, default None
94+
Return a subset of the columns. If array-like, all elements must either
9595
be positional (i.e. integer indices into the document columns) or strings
9696
that correspond to column names provided either by the user in `names` or
97-
inferred from the document header row(s). For example, a valid `usecols`
98-
parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter
99-
results in much faster parsing time and lower memory usage.
97+
inferred from the document header row(s). For example, a valid array-like
98+
`usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz'].
99+
100+
If callable, the callable function will be evaluated against the column
101+
names, returning names where the callable function evaluates to True. An
102+
example of a valid callable argument would be ``lambda x: x.upper() in
103+
['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
104+
parsing time and lower memory usage.
100105
as_recarray : boolean, default False
101106
DEPRECATED: this argument will be removed in a future version. Please call
102107
`pd.read_csv(...).to_records()` instead.
@@ -977,17 +982,33 @@ def _is_index_col(col):
977982
return col is not None and col is not False
978983

979984

985+
def _evaluate_usecols(usecols, names):
986+
"""
987+
Check whether or not the 'usecols' parameter
988+
is a callable. If so, enumerates the 'names'
989+
parameter and returns a set of indices for
990+
each entry in 'names' for which the callable returns True.
991+
If not a callable, returns 'usecols'.
992+
"""
993+
if callable(usecols):
994+
return set([i for i, name in enumerate(names)
995+
if usecols(name)])
996+
return usecols
997+
998+
980999
def _validate_usecols_arg(usecols):
9811000
"""
9821001
Check whether or not the 'usecols' parameter
983-
contains all integers (column selection by index)
984-
or strings (column by name). Raises a ValueError
985-
if that is not the case.
1002+
contains all integers (column selection by index),
1003+
strings (column by name) or is a callable. Raises
1004+
a ValueError if that is not the case.
9861005
"""
987-
msg = ("The elements of 'usecols' must "
988-
"either be all strings, all unicode, or all integers")
1006+
msg = ("'usecols' must either be all strings, all unicode, "
1007+
"all integers or a callable")
9891008

9901009
if usecols is not None:
1010+
if callable(usecols):
1011+
return usecols
9911012
usecols_dtype = lib.infer_dtype(usecols)
9921013
if usecols_dtype not in ('empty', 'integer',
9931014
'string', 'unicode'):
@@ -1499,11 +1520,12 @@ def __init__(self, src, **kwds):
14991520
self.orig_names = self.names[:]
15001521

15011522
if self.usecols:
1502-
if len(self.names) > len(self.usecols):
1523+
usecols = _evaluate_usecols(self.usecols, self.orig_names)
1524+
if len(self.names) > len(usecols):
15031525
self.names = [n for i, n in enumerate(self.names)
1504-
if (i in self.usecols or n in self.usecols)]
1526+
if (i in usecols or n in usecols)]
15051527

1506-
if len(self.names) < len(self.usecols):
1528+
if len(self.names) < len(usecols):
15071529
raise ValueError("Usecols do not match names.")
15081530

15091531
self._set_noconvert_columns()
@@ -1665,9 +1687,10 @@ def read(self, nrows=None):
16651687

16661688
def _filter_usecols(self, names):
16671689
# hackish
1668-
if self.usecols is not None and len(names) != len(self.usecols):
1690+
usecols = _evaluate_usecols(self.usecols, names)
1691+
if usecols is not None and len(names) != len(usecols):
16691692
names = [name for i, name in enumerate(names)
1670-
if i in self.usecols or name in self.usecols]
1693+
if i in usecols or name in usecols]
16711694
return names
16721695

16731696
def _get_index_names(self):
@@ -2291,7 +2314,9 @@ def _handle_usecols(self, columns, usecols_key):
22912314
usecols_key is used if there are string usecols.
22922315
"""
22932316
if self.usecols is not None:
2294-
if any([isinstance(col, string_types) for col in self.usecols]):
2317+
if callable(self.usecols):
2318+
col_indices = _evaluate_usecols(self.usecols, usecols_key)
2319+
elif any([isinstance(u, string_types) for u in self.usecols]):
22952320
if len(columns) > 1:
22962321
raise ValueError("If using multiple headers, usecols must "
22972322
"be integers.")

pandas/io/tests/parser/usecols.py

+33-4
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,9 @@ def test_raise_on_mixed_dtype_usecols(self):
2323
1000,2000,3000
2424
4000,5000,6000
2525
"""
26-
msg = ("The elements of 'usecols' must "
27-
"either be all strings, all unicode, or all integers")
26+
27+
msg = ("'usecols' must either be all strings, all unicode, "
28+
"all integers or a callable")
2829
usecols = [0, 'b', 2]
2930

3031
with tm.assertRaisesRegexp(ValueError, msg):
@@ -302,8 +303,8 @@ def test_usecols_with_mixed_encoding_strings(self):
302303
3.568935038,7,False,a
303304
'''
304305

305-
msg = ("The elements of 'usecols' must "
306-
"either be all strings, all unicode, or all integers")
306+
msg = ("'usecols' must either be all strings, all unicode, "
307+
"all integers or a callable")
307308

308309
with tm.assertRaisesRegexp(ValueError, msg):
309310
self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB'])
@@ -366,3 +367,31 @@ def test_np_array_usecols(self):
366367
expected = DataFrame([[1, 2]], columns=usecols)
367368
result = self.read_csv(StringIO(data), usecols=usecols)
368369
tm.assert_frame_equal(result, expected)
370+
371+
def test_callable_usecols(self):
372+
# See gh-14154
373+
s = '''AaA,bBb,CCC,ddd
374+
0.056674973,8,True,a
375+
2.613230982,2,False,b
376+
3.568935038,7,False,a
377+
'''
378+
379+
data = {
380+
'AaA': {
381+
0: 0.056674972999999997,
382+
1: 2.6132309819999997,
383+
2: 3.5689350380000002
384+
},
385+
'bBb': {0: 8, 1: 2, 2: 7},
386+
'ddd': {0: 'a', 1: 'b', 2: 'a'}
387+
}
388+
expected = DataFrame(data)
389+
df = self.read_csv(StringIO(s), usecols=lambda x:
390+
x.upper() in ['AAA', 'BBB', 'DDD'])
391+
tm.assert_frame_equal(df, expected)
392+
393+
# Check that a callable returning only False returns
394+
# an empty DataFrame
395+
expected = DataFrame()
396+
df = self.read_csv(StringIO(s), usecols=lambda x: False)
397+
tm.assert_frame_equal(df, expected)

pandas/parser.pyx

+18-7
Original file line numberDiff line numberDiff line change
@@ -300,8 +300,9 @@ cdef class TextReader:
300300
object compression
301301
object mangle_dupe_cols
302302
object tupleize_cols
303+
object usecols
303304
list dtype_cast_order
304-
set noconvert, usecols
305+
set noconvert
305306

306307
def __cinit__(self, source,
307308
delimiter=b',',
@@ -437,7 +438,10 @@ cdef class TextReader:
437438
# suboptimal
438439
if usecols is not None:
439440
self.has_usecols = 1
440-
self.usecols = set(usecols)
441+
if callable(usecols):
442+
self.usecols = usecols
443+
else:
444+
self.usecols = set(usecols)
441445

442446
# XXX
443447
if skipfooter > 0:
@@ -701,7 +705,6 @@ cdef class TextReader:
701705
cdef StringPath path = _string_path(self.c_encoding)
702706

703707
header = []
704-
705708
if self.parser.header_start >= 0:
706709

707710
# Header is in the file
@@ -821,7 +824,8 @@ cdef class TextReader:
821824
# 'data has %d fields'
822825
# % (passed_count, field_count))
823826

824-
if self.has_usecols and self.allow_leading_cols:
827+
if self.has_usecols and self.allow_leading_cols and \
828+
not callable(self.usecols):
825829
nuse = len(self.usecols)
826830
if nuse == passed_count:
827831
self.leading_cols = 0
@@ -1019,13 +1023,20 @@ cdef class TextReader:
10191023
if i < self.leading_cols:
10201024
# Pass through leading columns always
10211025
name = i
1022-
elif self.usecols and nused == len(self.usecols):
1026+
elif self.usecols and not callable(self.usecols) and \
1027+
nused == len(self.usecols):
10231028
# Once we've gathered all requested columns, stop. GH5766
10241029
break
10251030
else:
10261031
name = self._get_column_name(i, nused)
1027-
if self.has_usecols and not (i in self.usecols or
1028-
name in self.usecols):
1032+
usecols = set()
1033+
if callable(self.usecols):
1034+
if self.usecols(name):
1035+
usecols = set([i])
1036+
else:
1037+
usecols = self.usecols
1038+
if self.has_usecols and not (i in usecols or
1039+
name in usecols):
10291040
continue
10301041
nused += 1
10311042

0 commit comments

Comments
 (0)