Skip to content

Commit bcbc7af

Browse files
committed
Merge pull request #7434 from cpcloud/with-types
ENH: select_dtypes impl
2 parents 4c79a11 + 4fc5ae7 commit bcbc7af

File tree

6 files changed

+379
-3
lines changed

6 files changed

+379
-3
lines changed

doc/source/api.rst

+1
Original file line numberDiff line numberDiff line change
@@ -524,6 +524,7 @@ Attributes and underlying data
524524
DataFrame.ftypes
525525
DataFrame.get_dtype_counts
526526
DataFrame.get_ftype_counts
527+
DataFrame.select_dtypes
527528
DataFrame.values
528529
DataFrame.axes
529530
DataFrame.ndim

doc/source/basics.rst

+81
Original file line numberDiff line numberDiff line change
@@ -1552,3 +1552,84 @@ While float dtypes are unchanged.
15521552
casted = dfa[df2>0]
15531553
casted
15541554
casted.dtypes
1555+
1556+
Selecting columns based on ``dtype``
1557+
------------------------------------
1558+
1559+
.. _basics.selectdtypes:
1560+
1561+
.. versionadded:: 0.14.1
1562+
1563+
The :meth:`~pandas.DataFrame.select_dtypes` method implements subsetting of columns
1564+
based on their ``dtype``.
1565+
1566+
First, let's create a :class:`~pandas.DataFrame` with a slew of different
1567+
dtypes:
1568+
1569+
.. ipython:: python
1570+
1571+
df = DataFrame({'string': list('abc'),
1572+
'int64': list(range(1, 4)),
1573+
'uint8': np.arange(3, 6).astype('u1'),
1574+
'float64': np.arange(4.0, 7.0),
1575+
'bool1': [True, False, True],
1576+
'bool2': [False, True, False],
1577+
'dates': pd.date_range('now', periods=3).values})
1578+
df['tdeltas'] = df.dates.diff()
1579+
df['uint64'] = np.arange(3, 6).astype('u8')
1580+
df['other_dates'] = pd.date_range('20130101', periods=3).values
1581+
df
1582+
1583+
1584+
``select_dtypes`` has two parameters ``include`` and ``exclude`` that allow you to
1585+
say "give me the columns WITH these dtypes" (``include``) and/or "give me the
1586+
columns WITHOUT these dtypes" (``exclude``).
1587+
1588+
For example, to select ``bool`` columns
1589+
1590+
.. ipython:: python
1591+
1592+
df.select_dtypes(include=[bool])
1593+
1594+
You can also pass the name of a dtype in the `numpy dtype hierarchy
1595+
<http://docs.scipy.org/doc/numpy/reference/arrays.scalars.html>`__:
1596+
1597+
.. ipython:: python
1598+
1599+
df.select_dtypes(include=['bool'])
1600+
1601+
:meth:`~pandas.DataFrame.select_dtypes` also works with generic dtypes.
1602+
1603+
For example, to select all numeric and boolean columns while excluding unsigned
1604+
integers
1605+
1606+
.. ipython:: python
1607+
1608+
df.select_dtypes(include=['number', 'bool'], exclude=['unsignedinteger'])
1609+
1610+
To select string columns you must use the ``object`` dtype:
1611+
1612+
.. ipython:: python
1613+
1614+
df.select_dtypes(include=['object'])
1615+
1616+
To see all the child dtypes of a generic ``dtype`` like ``numpy.number`` you
1617+
can define a function that returns a tree of child dtypes:
1618+
1619+
.. ipython:: python
1620+
1621+
def subdtypes(dtype):
1622+
subs = dtype.__subclasses__()
1623+
if not subs:
1624+
return dtype
1625+
return [dtype, [subdtypes(dt) for dt in subs]]
1626+
1627+
All numpy dtypes are subclasses of ``numpy.generic``:
1628+
1629+
.. ipython:: python
1630+
1631+
subdtypes(np.generic)
1632+
1633+
.. note::
1634+
1635+
The ``include`` and ``exclude`` parameters must be non-string sequences.

doc/source/v0.14.1.txt

+2
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,8 @@ Enhancements
9191

9292

9393

94+
- Add :meth:`~pandas.DataFrame.select_dtypes` method to allow selection of
95+
columns based on dtype (:issue:`7316`). See :ref:`the docs <basics.selectdtypes>`.
9496

9597

9698

pandas/core/common.py

+62
Original file line numberDiff line numberDiff line change
@@ -1603,6 +1603,66 @@ def _get_fill_func(method):
16031603
#----------------------------------------------------------------------
16041604
# Lots of little utilities
16051605

1606+
def _validate_date_like_dtype(dtype):
1607+
try:
1608+
typ = np.datetime_data(dtype)[0]
1609+
except ValueError as e:
1610+
raise TypeError('%s' % e)
1611+
if typ != 'generic' and typ != 'ns':
1612+
raise ValueError('%r is too specific of a frequency, try passing %r'
1613+
% (dtype.name, dtype.type.__name__))
1614+
1615+
1616+
def _invalidate_string_dtypes(dtype_set):
    """Reject string-like dtypes for ``DataFrame.select_dtypes()``.

    Nothing is converted: the set is only checked, and an error is raised
    when it contains a string dtype.

    Parameters
    ----------
    dtype_set : set or frozenset
        The dtype-type objects to check.

    Raises
    ------
    TypeError
        If `dtype_set` shares at least one element with ``_string_dtypes``.
    """
    if not dtype_set.isdisjoint(_string_dtypes):
        raise TypeError("string dtypes are not allowed, use 'object' instead")
1621+
1622+
1623+
def _get_dtype_from_object(dtype):
    """Get a numpy dtype.type-style object.

    Parameters
    ----------
    dtype : type, numpy.dtype, string or object accepted by ``numpy.dtype``
        The dtype specification to normalize.

    Returns
    -------
    type
        `dtype` itself when it is already a subclass of ``numpy.generic``,
        otherwise the ``.type`` attribute of the resolved ``numpy.dtype``.

    Raises
    ------
    ValueError
        Propagated from ``_validate_date_like_dtype`` when a datetime-like
        dtype has a unit other than ``'generic'`` or ``'ns'``.

    Notes
    -----
    Anything that is not a numpy scalar type, a ``numpy.dtype`` or a string
    naming a usable numpy attribute is handed to ``numpy.dtype``; whatever
    that constructor returns or raises determines the result.
    """
    # type object from a dtype
    if isinstance(dtype, type) and issubclass(dtype, np.generic):
        return dtype
    elif isinstance(dtype, np.dtype):  # dtype object
        try:
            _validate_date_like_dtype(dtype)
        except TypeError:
            # should still pass if we don't have a datelike
            pass
        return dtype.type
    elif isinstance(dtype, compat.string_types):
        if dtype == 'datetime' or dtype == 'timedelta':
            dtype += '64'
        try:
            return _get_dtype_from_object(getattr(np, dtype))
        except (AttributeError, TypeError):
            # AttributeError: numpy has no attribute with this name.
            # TypeError: the name exists on numpy but is not a dtype (for
            # example 'e' is the constant numpy.e), so the recursive call
            # could not interpret it.  In both cases let numpy.dtype parse
            # the string itself below.
            pass
    return _get_dtype_from_object(np.dtype(dtype))
1651+
1652+
1653+
# dtype-type objects that the bytes and text string types resolve to; used by
# _invalidate_string_dtypes to detect string dtypes
_string_dtypes = frozenset(_get_dtype_from_object(string_type)
                           for string_type in (compat.binary_type,
                                               compat.text_type))
1655+
1656+
1657+
def _get_info_slice(obj, indexer):
1658+
"""Slice the info axis of `obj` with `indexer`."""
1659+
if not hasattr(obj, '_info_axis_number'):
1660+
raise TypeError('object of type %r has no info axis' %
1661+
type(obj).__name__)
1662+
slices = [slice(None)] * obj.ndim
1663+
slices[obj._info_axis_number] = indexer
1664+
return tuple(slices)
1665+
16061666

16071667
def _maybe_box(indexer, values, obj, key):
16081668

@@ -1613,6 +1673,7 @@ def _maybe_box(indexer, values, obj, key):
16131673
# return the value
16141674
return values
16151675

1676+
16161677
def _maybe_box_datetimelike(value):
16171678
# turn a datetime like into a Timestamp/timedelta as needed
16181679

@@ -1797,6 +1858,7 @@ def _possibly_cast_to_datetime(value, dtype, coerce=False):
17971858

17981859
return value
17991860

1861+
18001862
def _possibly_infer_to_datetimelike(value):
18011863
# we might have a array (or single object) that is datetime like,
18021864
# and no dtype is passed don't change the value unless we find a

pandas/core/frame.py

+115-3
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# pylint: disable=E1101,E1103
1313
# pylint: disable=W0212,W0231,W0703,W0622
1414

15+
import functools
1516
import collections
1617
import itertools
1718
import sys
@@ -25,19 +26,18 @@
2526
from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
2627
_default_index, _maybe_upcast, _is_sequence,
2728
_infer_dtype_from_scalar, _values_from_object,
28-
is_list_like)
29+
is_list_like, _get_dtype)
2930
from pandas.core.generic import NDFrame, _shared_docs
3031
from pandas.core.index import Index, MultiIndex, _ensure_index
3132
from pandas.core.indexing import (_maybe_droplevels,
3233
_convert_to_index_sliceable,
33-
_check_bool_indexer, _maybe_convert_indices)
34+
_check_bool_indexer)
3435
from pandas.core.internals import (BlockManager,
3536
create_block_manager_from_arrays,
3637
create_block_manager_from_blocks)
3738
from pandas.core.series import Series
3839
import pandas.computation.expressions as expressions
3940
from pandas.computation.eval import eval as _eval
40-
from pandas.computation.scope import _ensure_scope
4141
from numpy import percentile as _quantile
4242
from pandas.compat import(range, zip, lrange, lmap, lzip, StringIO, u,
4343
OrderedDict, raise_with_traceback)
@@ -1867,6 +1867,118 @@ def eval(self, expr, **kwargs):
18671867
kwargs['resolvers'] = kwargs.get('resolvers', ()) + resolvers
18681868
return _eval(expr, **kwargs)
18691869

1870+
def select_dtypes(self, include=None, exclude=None):
    """Return a subset of a DataFrame including/excluding columns based on
    their ``dtype``.

    Parameters
    ----------
    include, exclude : list-like
        A list of dtypes or strings to be included/excluded. You must pass
        in a non-empty sequence for at least one of these.

    Raises
    ------
    ValueError
        * If both of ``include`` and ``exclude`` are empty
        * If ``include`` and ``exclude`` have overlapping elements
    TypeError
        * If either of ``include`` or ``exclude`` is not a sequence
        * If any kind of string dtype is passed in

    Returns
    -------
    subset : DataFrame
        The subset of the frame including the dtypes in ``include`` and
        excluding the dtypes in ``exclude``.

    Notes
    -----
    * To select all *numeric* types use the numpy dtype ``numpy.number``
    * To select strings you must use the ``object`` dtype, but note that
      this will return *all* object dtype columns
    * See the `numpy dtype hierarchy
      <http://docs.scipy.org/doc/numpy/reference/arrays.scalars.html>`__

    Examples
    --------
    >>> df = pd.DataFrame({'a': np.random.randn(6).astype('f4'),
    ...                    'b': [True, False] * 3,
    ...                    'c': [1.0, 2.0] * 3})
    >>> df
            a      b  c
    0  0.3962   True  1
    1  0.1459  False  2
    2  0.2623   True  1
    3  0.0764  False  2
    4 -0.9703   True  1
    5 -1.2094  False  2
    >>> df.select_dtypes(include=['float64'])
       c
    0  1
    1  2
    2  1
    3  2
    4  1
    5  2
    >>> df.select_dtypes(exclude=['floating'])
           b
    0   True
    1  False
    2   True
    3  False
    4   True
    5  False
    """
    include, exclude = include or (), exclude or ()
    if not (com.is_list_like(include) and com.is_list_like(exclude)):
        raise TypeError('include and exclude must both be non-string'
                        ' sequences')
    selection = tuple(map(frozenset, (include, exclude)))

    if not any(selection):
        raise ValueError('at least one of include or exclude must be '
                         'nonempty')

    # convert the myriad valid dtypes object to a single representation
    include, exclude = (frozenset(map(com._get_dtype_from_object, dtypes))
                        for dtypes in selection)
    for dtypes in (include, exclude):
        com._invalidate_string_dtypes(dtypes)

    # can't both include AND exclude!
    if not include.isdisjoint(exclude):
        raise ValueError('include and exclude overlap on %s'
                         % (include & exclude))

    # Each mask says which columns pass that filter.  An empty filter places
    # no constraint, so its mask starts (and stays) all True; a non-empty
    # filter starts all False and every entry is overwritten in the loop
    # below.  A column is kept only when it passes both filters.
    include_these = Series(not bool(include), index=self.columns)
    exclude_these = Series(not bool(exclude), index=self.columns)

    for column, dtype in self.dtypes.iteritems():
        # is the column's scalar type a subclass of a requested type?
        is_subtype_of = functools.partial(issubclass, dtype.type)
        if include:
            include_these[column] = any(map(is_subtype_of, include))
        if exclude:
            exclude_these[column] = not any(map(is_subtype_of, exclude))

    dtype_indexer = include_these & exclude_these
    return self.loc[com._get_info_slice(self, dtype_indexer)]
1981+
18701982
def _box_item_values(self, key, values):
18711983
items = self.columns[self.columns.get_loc(key)]
18721984
if values.ndim == 2:

0 commit comments

Comments
 (0)