Skip to content

Commit 5f17d1d

Browse files
committed
ENH: Allow usecols to accept callable (GH14154)
1 parent 5e2f9da commit 5f17d1d

File tree

4 files changed

+76
-22
lines changed

4 files changed

+76
-22
lines changed

doc/source/whatsnew/v0.19.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -515,6 +515,7 @@ Other enhancements
515515
- ``astype()`` will now accept a dict of column name to data types mapping as the ``dtype`` argument. (:issue:`12086`)
516516
- ``pd.read_json`` and ``DataFrame.to_json`` have gained support for reading and writing JSON Lines data via the ``lines`` option; see :ref:`Line delimited json <io.jsonl>` (:issue:`9180`)
517517
- :func:`read_excel` now supports the ``true_values`` and ``false_values`` keyword arguments (:issue:`13347`)
518+
- The ``usecols`` argument now also accepts a callable; it is evaluated against each column name, and the columns for which it returns a true value are kept (:issue:`14154`)
518519

519520
.. _whatsnew_0190.api:
520521

pandas/io/parsers.py

+25-10
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from pandas.util.decorators import Appender
3434

3535
import pandas.lib as lib
36+
import pandas.core.common as com
3637
import pandas.parser as _parser
3738

3839

@@ -85,8 +86,8 @@
8586
MultiIndex is used. If you have a malformed file with delimiters at the end
8687
of each line, you might consider index_col=False to force pandas to _not_
8788
use the first column as the index (row names)
88-
usecols : array-like, default None
89-
Return a subset of the columns. All elements in this array must either
89+
usecols : array-like or callable, default None
90+
Return a subset of the columns. If array-like, all elements must either
9091
be positional (i.e. integer indices into the document columns) or strings
9192
that correspond to column names provided either by the user in `names` or
9293
inferred from the document header row(s). For example, a valid `usecols`
@@ -975,17 +976,27 @@ def _is_index_col(col):
975976
return col is not None and col is not False
976977

977978

979+
def _evaluate_usecols(usecols, names):
980+
if callable(usecols):
981+
return set([i for i, name in enumerate(names)
982+
if com._apply_if_callable(usecols, name)])
983+
else:
984+
return usecols
985+
986+
978987
def _validate_usecols_arg(usecols):
979988
"""
980989
Check whether or not the 'usecols' parameter
981990
contains all integers (column selection by index)
982991
or strings (column by name). Raises a ValueError
983992
if that is not the case.
984993
"""
985-
msg = ("The elements of 'usecols' must "
986-
"either be all strings, all unicode, or all integers")
994+
msg = ("'usecols' must either be all strings, all unicode, "
995+
"all integers or callable")
987996

988997
if usecols is not None:
998+
if callable(usecols):
999+
return usecols
9891000
usecols_dtype = lib.infer_dtype(usecols)
9901001
if usecols_dtype not in ('empty', 'integer',
9911002
'string', 'unicode'):
@@ -1425,11 +1436,12 @@ def __init__(self, src, **kwds):
14251436
self.orig_names = self.names[:]
14261437

14271438
if self.usecols:
1428-
if len(self.names) > len(self.usecols):
1439+
usecols = _evaluate_usecols(self.usecols, self.orig_names)
1440+
if len(self.names) > len(usecols):
14291441
self.names = [n for i, n in enumerate(self.names)
1430-
if (i in self.usecols or n in self.usecols)]
1442+
if (i in usecols or n in usecols)]
14311443

1432-
if len(self.names) < len(self.usecols):
1444+
if len(self.names) < len(usecols):
14331445
raise ValueError("Usecols do not match names.")
14341446

14351447
self._set_noconvert_columns()
@@ -1588,9 +1600,10 @@ def read(self, nrows=None):
15881600

15891601
def _filter_usecols(self, names):
15901602
# hackish
1591-
if self.usecols is not None and len(names) != len(self.usecols):
1603+
usecols = _evaluate_usecols(self.usecols, names)
1604+
if usecols is not None and len(names) != len(usecols):
15921605
names = [name for i, name in enumerate(names)
1593-
if i in self.usecols or name in self.usecols]
1606+
if i in usecols or name in usecols]
15941607
return names
15951608

15961609
def _get_index_names(self):
@@ -2191,7 +2204,9 @@ def _handle_usecols(self, columns, usecols_key):
21912204
usecols_key is used if there are string usecols.
21922205
"""
21932206
if self.usecols is not None:
2194-
if any([isinstance(u, string_types) for u in self.usecols]):
2207+
if callable(self.usecols):
2208+
col_indices = _evaluate_usecols(self.usecols, usecols_key)
2209+
elif any([isinstance(u, string_types) for u in self.usecols]):
21952210
if len(columns) > 1:
21962211
raise ValueError("If using multiple headers, usecols must "
21972212
"be integers.")

pandas/io/tests/parser/usecols.py

+27-4
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,9 @@ def test_raise_on_mixed_dtype_usecols(self):
2323
1000,2000,3000
2424
4000,5000,6000
2525
"""
26-
msg = ("The elements of 'usecols' must "
27-
"either be all strings, all unicode, or all integers")
26+
27+
msg = ("'usecols' must either be all strings, all unicode, "
28+
"all integers or callable")
2829
usecols = [0, 'b', 2]
2930

3031
with tm.assertRaisesRegexp(ValueError, msg):
@@ -302,8 +303,8 @@ def test_usecols_with_mixed_encoding_strings(self):
302303
3.568935038,7,False,a
303304
'''
304305

305-
msg = ("The elements of 'usecols' must "
306-
"either be all strings, all unicode, or all integers")
306+
msg = ("'usecols' must either be all strings, all unicode, "
307+
"all integers or callable")
307308

308309
with tm.assertRaisesRegexp(ValueError, msg):
309310
self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB'])
@@ -366,3 +367,25 @@ def test_np_array_usecols(self):
366367
expected = DataFrame([[1, 2]], columns=usecols)
367368
result = self.read_csv(StringIO(data), usecols=usecols)
368369
tm.assert_frame_equal(result, expected)
370+
371+
def test_callable_usecols(self):
372+
s = '''AaA,bBb,CCC,ddd
373+
0.056674973,8,True,a
374+
2.613230982,2,False,b
375+
3.568935038,7,False,a
376+
'''
377+
378+
data = {
379+
'AaA': {
380+
0: 0.056674972999999997,
381+
1: 2.6132309819999997,
382+
2: 3.5689350380000002
383+
},
384+
'bBb': {0: 8, 1: 2, 2: 7},
385+
'ddd': {0: 'a', 1: 'b', 2: 'a'}
386+
}
387+
expected = DataFrame(data)
388+
389+
df = self.read_csv(StringIO(s), usecols=lambda x:
390+
x.upper() in ['AAA', 'BBB', 'DDD'])
391+
tm.assert_frame_equal(df, expected)

pandas/parser.pyx

+23-8
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ cimport util
3535

3636
import pandas.lib as lib
3737
import pandas.compat as compat
38+
import pandas.core.common as com
3839
from pandas.types.common import (is_categorical_dtype, CategoricalDtype,
3940
is_integer_dtype, is_float_dtype,
4041
is_bool_dtype, is_object_dtype,
@@ -273,7 +274,7 @@ cdef class TextReader:
273274
object file_handle, na_fvalues
274275
object true_values, false_values
275276
object dsource
276-
bint na_filter, verbose, has_usecols, has_mi_columns
277+
bint na_filter, verbose, has_usecols, has_mi_columns, callable_usecols
277278
int parser_start
278279
list clocks
279280
char *c_encoding
@@ -297,8 +298,10 @@ cdef class TextReader:
297298
object compression
298299
object mangle_dupe_cols
299300
object tupleize_cols
301+
object usecols
300302
list dtype_cast_order
301-
set noconvert, usecols
303+
set noconvert
304+
302305

303306
def __cinit__(self, source,
304307
delimiter=b',',
@@ -434,7 +437,11 @@ cdef class TextReader:
434437
# suboptimal
435438
if usecols is not None:
436439
self.has_usecols = 1
437-
self.usecols = set(usecols)
440+
if callable(usecols):
441+
self.callable_usecols = 1
442+
self.usecols = usecols
443+
else:
444+
self.usecols = set(usecols)
438445

439446
# XXX
440447
if skipfooter > 0:
@@ -698,7 +705,6 @@ cdef class TextReader:
698705
cdef StringPath path = _string_path(self.c_encoding)
699706

700707
header = []
701-
702708
if self.parser.header_start >= 0:
703709

704710
# Header is in the file
@@ -816,7 +822,7 @@ cdef class TextReader:
816822
# 'data has %d fields'
817823
# % (passed_count, field_count))
818824

819-
if self.has_usecols and self.allow_leading_cols:
825+
if self.has_usecols and self.allow_leading_cols and not self.callable_usecols:
820826
nuse = len(self.usecols)
821827
if nuse == passed_count:
822828
self.leading_cols = 0
@@ -1010,17 +1016,25 @@ cdef class TextReader:
10101016

10111017
results = {}
10121018
nused = 0
1019+
10131020
for i in range(self.table_width):
1021+
10141022
if i < self.leading_cols:
10151023
# Pass through leading columns always
10161024
name = i
1017-
elif self.usecols and nused == len(self.usecols):
1025+
elif self.usecols and not self.callable_usecols and nused == len(self.usecols):
10181026
# Once we've gathered all requested columns, stop. GH5766
10191027
break
10201028
else:
10211029
name = self._get_column_name(i, nused)
1022-
if self.has_usecols and not (i in self.usecols or
1023-
name in self.usecols):
1030+
usecols = set()
1031+
if self.callable_usecols:
1032+
if com._apply_if_callable(self.usecols, name):
1033+
usecols = set([i])
1034+
else:
1035+
usecols = self.usecols
1036+
if self.has_usecols and not (i in usecols or
1037+
name in usecols):
10241038
continue
10251039
nused += 1
10261040

@@ -1336,6 +1350,7 @@ def _maybe_upcast(arr):
13361350

13371351
return arr
13381352

1353+
13391354
cdef enum StringPath:
13401355
CSTRING
13411356
UTF8

0 commit comments

Comments
 (0)