diff --git a/doc/source/api.rst b/doc/source/api.rst
index bc257ffa0ad6c..c3cccca3251e4 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -524,6 +524,7 @@ Attributes and underlying data
    DataFrame.ftypes
    DataFrame.get_dtype_counts
    DataFrame.get_ftype_counts
+   DataFrame.select_dtypes
    DataFrame.values
    DataFrame.axes
    DataFrame.ndim
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
index 1979b180b71b9..ec8456089f452 100644
--- a/doc/source/basics.rst
+++ b/doc/source/basics.rst
@@ -1552,3 +1552,84 @@ While float dtypes are unchanged.
    casted = dfa[df2>0]
    casted
    casted.dtypes
+
+.. _basics.selectdtypes:
+
+Selecting columns based on ``dtype``
+------------------------------------
+
+.. versionadded:: 0.14.1
+
+The :meth:`~pandas.DataFrame.select_dtypes` method implements subsetting of columns
+based on their ``dtype``.
+
+First, let's create a :class:`~pandas.DataFrame` with a slew of different
+dtypes:
+
+.. ipython:: python
+
+   df = DataFrame({'string': list('abc'),
+                   'int64': list(range(1, 4)),
+                   'uint8': np.arange(3, 6).astype('u1'),
+                   'float64': np.arange(4.0, 7.0),
+                   'bool1': [True, False, True],
+                   'bool2': [False, True, False],
+                   'dates': pd.date_range('now', periods=3).values})
+   df['tdeltas'] = df.dates.diff()
+   df['uint64'] = np.arange(3, 6).astype('u8')
+   df['other_dates'] = pd.date_range('20130101', periods=3).values
+   df
+
+``select_dtypes`` has two parameters, ``include`` and ``exclude``, that allow you
+to say "give me the columns WITH these dtypes" (``include``) and/or "give me the
+columns WITHOUT these dtypes" (``exclude``).
+
+For example, to select ``bool`` columns:
+
+.. ipython:: python
+
+   df.select_dtypes(include=[bool])
+
+You can also pass the name of a dtype in the `numpy dtype hierarchy
+<http://docs.scipy.org/doc/numpy/reference/arrays.scalars.html>`__:
+
+.. ipython:: python
+
+   df.select_dtypes(include=['bool'])
+
+:meth:`~pandas.DataFrame.select_dtypes` also works with generic dtypes.
+
+For example, to select all numeric and boolean columns while excluding unsigned
+integers:
+
+.. ipython:: python
+
+   df.select_dtypes(include=['number', 'bool'], exclude=['unsignedinteger'])
+
+To select string columns you must use the ``object`` dtype:
+
+.. ipython:: python
+
+   df.select_dtypes(include=['object'])
+
+To see all the child dtypes of a generic ``dtype`` like ``numpy.number`` you
+can define a function that returns a tree of child dtypes:
+
+.. ipython:: python
+
+   def subdtypes(dtype):
+       subs = dtype.__subclasses__()
+       if not subs:
+           return dtype
+       return [dtype, [subdtypes(dt) for dt in subs]]
+
+All numpy dtypes are subclasses of ``numpy.generic``:
+
+.. ipython:: python
+
+   subdtypes(np.generic)
+
+.. note::
+
+   The ``include`` and ``exclude`` parameters must be non-string sequences.
diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt
index 0226e5041639a..79f6a918a0e93 100644
--- a/doc/source/v0.14.1.txt
+++ b/doc/source/v0.14.1.txt
@@ -91,6 +91,8 @@ Enhancements
 
+- Add :meth:`~pandas.DataFrame.select_dtypes` method to allow selection of
+  columns based on dtype (:issue:`7316`). See :ref:`the docs <basics.selectdtypes>`.
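In short, once this patch is applied the method behaves as follows (a quick standalone sketch; requires pandas >= 0.14.1, and the frame here is a trimmed version of the docs example above):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'string': list('abc'),
                       'int64': list(range(1, 4)),
                       'uint8': np.arange(3, 6).astype('u1'),
                       'float64': np.arange(4.0, 7.0),
                       'bool1': [True, False, True]})

    print(df.select_dtypes(include=['bool']).columns.tolist())
    # ['bool1']
    print(df.select_dtypes(include=['number'], exclude=['unsignedinteger']).columns.tolist())
    # ['int64', 'float64']   -- bool is not a numpy.number subclass, uint8 is excluded
    print(df.select_dtypes(include=['object']).columns.tolist())
    # ['string']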
diff --git a/pandas/core/common.py b/pandas/core/common.py index 8791dcc124a6e..bb7f43511e905 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -1603,6 +1603,66 @@ def _get_fill_func(method): #---------------------------------------------------------------------- # Lots of little utilities +def _validate_date_like_dtype(dtype): + try: + typ = np.datetime_data(dtype)[0] + except ValueError as e: + raise TypeError('%s' % e) + if typ != 'generic' and typ != 'ns': + raise ValueError('%r is too specific of a frequency, try passing %r' + % (dtype.name, dtype.type.__name__)) + + +def _invalidate_string_dtypes(dtype_set): + """Change string like dtypes to object for ``DataFrame.select_dtypes()``.""" + non_string_dtypes = dtype_set - _string_dtypes + if non_string_dtypes != dtype_set: + raise TypeError("string dtypes are not allowed, use 'object' instead") + + +def _get_dtype_from_object(dtype): + """Get a numpy dtype.type-style object. + + Notes + ----- + If nothing can be found, returns ``object``. + """ + # type object from a dtype + if isinstance(dtype, type) and issubclass(dtype, np.generic): + return dtype + elif isinstance(dtype, np.dtype): # dtype object + try: + _validate_date_like_dtype(dtype) + except TypeError: + # should still pass if we don't have a datelike + pass + return dtype.type + elif isinstance(dtype, compat.string_types): + if dtype == 'datetime' or dtype == 'timedelta': + dtype += '64' + try: + return _get_dtype_from_object(getattr(np, dtype)) + except AttributeError: + # handles cases like _get_dtype(int) + # i.e., python objects that are valid dtypes (unlike user-defined + # types, in general) + pass + return _get_dtype_from_object(np.dtype(dtype)) + + +_string_dtypes = frozenset(map(_get_dtype_from_object, (compat.binary_type, + compat.text_type))) + + +def _get_info_slice(obj, indexer): + """Slice the info axis of `obj` with `indexer`.""" + if not hasattr(obj, '_info_axis_number'): + raise TypeError('object of type %r has no info axis' % + type(obj).__name__) + slices = [slice(None)] * obj.ndim + slices[obj._info_axis_number] = indexer + return tuple(slices) + def _maybe_box(indexer, values, obj, key): @@ -1613,6 +1673,7 @@ def _maybe_box(indexer, values, obj, key): # return the value return values + def _maybe_box_datetimelike(value): # turn a datetime like into a Timestamp/timedelta as needed @@ -1797,6 +1858,7 @@ def _possibly_cast_to_datetime(value, dtype, coerce=False): return value + def _possibly_infer_to_datetimelike(value): # we might have a array (or single object) that is datetime like, # and no dtype is passed don't change the value unless we find a diff --git a/pandas/core/frame.py b/pandas/core/frame.py index da9fb44f80b09..413f3daa52a52 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -12,6 +12,7 @@ # pylint: disable=E1101,E1103 # pylint: disable=W0212,W0231,W0703,W0622 +import functools import collections import itertools import sys @@ -25,19 +26,18 @@ from pandas.core.common import (isnull, notnull, PandasError, _try_sort, _default_index, _maybe_upcast, _is_sequence, _infer_dtype_from_scalar, _values_from_object, - is_list_like) + is_list_like, _get_dtype) from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import (_maybe_droplevels, _convert_to_index_sliceable, - _check_bool_indexer, _maybe_convert_indices) + _check_bool_indexer) from pandas.core.internals import (BlockManager, create_block_manager_from_arrays, 
create_block_manager_from_blocks) from pandas.core.series import Series import pandas.computation.expressions as expressions from pandas.computation.eval import eval as _eval -from pandas.computation.scope import _ensure_scope from numpy import percentile as _quantile from pandas.compat import(range, zip, lrange, lmap, lzip, StringIO, u, OrderedDict, raise_with_traceback) @@ -1867,6 +1867,118 @@ def eval(self, expr, **kwargs): kwargs['resolvers'] = kwargs.get('resolvers', ()) + resolvers return _eval(expr, **kwargs) + def select_dtypes(self, include=None, exclude=None): + """Return a subset of a DataFrame including/excluding columns based on + their ``dtype``. + + Parameters + ---------- + include, exclude : list-like + A list of dtypes or strings to be included/excluded. You must pass + in a non-empty sequence for at least one of these. + + Raises + ------ + ValueError + * If both of ``include`` and ``exclude`` are empty + * If ``include`` and ``exclude`` have overlapping elements + * If any kind of string dtype is passed in. + TypeError + * If either of ``include`` or ``exclude`` is not a sequence + + Returns + ------- + subset : DataFrame + The subset of the frame including the dtypes in ``include`` and + excluding the dtypes in ``exclude``. + + Notes + ----- + * To select all *numeric* types use the numpy dtype ``numpy.number`` + * To select strings you must use the ``object`` dtype, but note that + this will return *all* object dtype columns + * See the `numpy dtype hierarchy + `__ + + Examples + -------- + >>> df = pd.DataFrame({'a': np.random.randn(6).astype('f4'), + ... 'b': [True, False] * 3, + ... 'c': [1.0, 2.0] * 3}) + >>> df + a b c + 0 0.3962 True 1 + 1 0.1459 False 2 + 2 0.2623 True 1 + 3 0.0764 False 2 + 4 -0.9703 True 1 + 5 -1.2094 False 2 + >>> df.select_dtypes(include=['float64']) + c + 0 1 + 1 2 + 2 1 + 3 2 + 4 1 + 5 2 + >>> df.select_dtypes(exclude=['floating']) + b + 0 True + 1 False + 2 True + 3 False + 4 True + 5 False + """ + include, exclude = include or (), exclude or () + if not (com.is_list_like(include) and com.is_list_like(exclude)): + raise TypeError('include and exclude must both be non-string' + ' sequences') + selection = tuple(map(frozenset, (include, exclude))) + + if not any(selection): + raise ValueError('at least one of include or exclude must be ' + 'nonempty') + + # convert the myriad valid dtypes object to a single representation + include, exclude = map(lambda x: + frozenset(map(com._get_dtype_from_object, x)), + selection) + for dtypes in (include, exclude): + com._invalidate_string_dtypes(dtypes) + + # can't both include AND exclude! + if not include.isdisjoint(exclude): + raise ValueError('include and exclude overlap on %s' + % (include & exclude)) + + # empty include/exclude -> defaults to True + # three cases (we've already raised if both are empty) + # case 1: empty include, nonempty exclude + # we have True, True, ... 
True for include, same for exclude + # in the loop below we get the excluded + # and when we call '&' below we get only the excluded + # case 2: nonempty include, empty exclude + # same as case 1, but with include + # case 3: both nonempty + # the "union" of the logic of case 1 and case 2: + # we get the included and excluded, and return their logical and + include_these = Series(not bool(include), index=self.columns) + exclude_these = Series(not bool(exclude), index=self.columns) + + def is_dtype_instance_mapper(column, dtype): + return column, functools.partial(issubclass, dtype.type) + + for column, f in itertools.starmap(is_dtype_instance_mapper, + self.dtypes.iteritems()): + if include: # checks for the case of empty include or exclude + include_these[column] = any(map(f, include)) + if exclude: + exclude_these[column] = not any(map(f, exclude)) + + dtype_indexer = include_these & exclude_these + return self.loc[com._get_info_slice(self, dtype_indexer)] + def _box_item_values(self, key, values): items = self.columns[self.columns.get_loc(key)] if values.ndim == 2: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index d7f8d235d4229..dab61af2f6de7 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -12996,6 +12996,123 @@ def test_set_index_names(self): # Check equality tm.assert_index_equal(df.set_index([df.index, df.index]).index, mi2) + def test_select_dtypes_include(self): + df = DataFrame({'a': list('abc'), + 'b': list(range(1, 4)), + 'c': np.arange(3, 6).astype('u1'), + 'd': np.arange(4.0, 7.0), + 'e': [True, False, True]}) + ri = df.select_dtypes(include=[np.number]) + ei = df[['b', 'c', 'd']] + tm.assert_frame_equal(ri, ei) + + def test_select_dtypes_exclude(self): + df = DataFrame({'a': list('abc'), + 'b': list(range(1, 4)), + 'c': np.arange(3, 6).astype('u1'), + 'd': np.arange(4.0, 7.0), + 'e': [True, False, True]}) + re = df.select_dtypes(exclude=[np.number]) + ee = df[['a', 'e']] + tm.assert_frame_equal(re, ee) + + def test_select_dtypes_exclude_include(self): + df = DataFrame({'a': list('abc'), + 'b': list(range(1, 4)), + 'c': np.arange(3, 6).astype('u1'), + 'd': np.arange(4.0, 7.0), + 'e': [True, False, True], + 'f': pd.date_range('now', periods=3).values}) + exclude = np.datetime64, + include = np.bool_, 'integer' + r = df.select_dtypes(include=include, exclude=exclude) + e = df[['b', 'c', 'e']] + tm.assert_frame_equal(r, e) + + exclude = 'datetime', + include = 'bool', 'int' + r = df.select_dtypes(include=include, exclude=exclude) + e = df[['b', 'e']] + tm.assert_frame_equal(r, e) + + def test_select_dtypes_not_an_attr_but_still_valid_dtype(self): + df = DataFrame({'a': list('abc'), + 'b': list(range(1, 4)), + 'c': np.arange(3, 6).astype('u1'), + 'd': np.arange(4.0, 7.0), + 'e': [True, False, True], + 'f': pd.date_range('now', periods=3).values}) + df['g'] = df.f.diff() + assert not hasattr(np, 'u8') + r = df.select_dtypes(include=['i8', 'O'], exclude=['timedelta']) + e = df[['a', 'b']] + tm.assert_frame_equal(r, e) + + r = df.select_dtypes(include=['i8', 'O', 'timedelta64[ns]']) + e = df[['a', 'b', 'g']] + tm.assert_frame_equal(r, e) + + def test_select_dtypes_empty(self): + df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))}) + with tm.assertRaisesRegexp(ValueError, 'at least one of include or ' + 'exclude must be nonempty'): + df.select_dtypes() + + def test_select_dtypes_raises_on_string(self): + df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))}) + with tm.assertRaisesRegexp(TypeError, 'include and exclude .+ 
non-'): + df.select_dtypes(include='object') + with tm.assertRaisesRegexp(TypeError, 'include and exclude .+ non-'): + df.select_dtypes(exclude='object') + with tm.assertRaisesRegexp(TypeError, 'include and exclude .+ non-'): + df.select_dtypes(include=int, exclude='object') + + def test_select_dtypes_bad_datetime64(self): + df = DataFrame({'a': list('abc'), + 'b': list(range(1, 4)), + 'c': np.arange(3, 6).astype('u1'), + 'd': np.arange(4.0, 7.0), + 'e': [True, False, True], + 'f': pd.date_range('now', periods=3).values}) + with tm.assertRaisesRegexp(ValueError, '.+ is too specific'): + df.select_dtypes(include=['datetime64[D]']) + + with tm.assertRaisesRegexp(ValueError, '.+ is too specific'): + df.select_dtypes(exclude=['datetime64[as]']) + + def test_select_dtypes_str_raises(self): + df = DataFrame({'a': list('abc'), + 'g': list(u('abc')), + 'b': list(range(1, 4)), + 'c': np.arange(3, 6).astype('u1'), + 'd': np.arange(4.0, 7.0), + 'e': [True, False, True], + 'f': pd.date_range('now', periods=3).values}) + string_dtypes = set((str, 'str', np.string_, 'S1', + 'unicode', np.unicode_, 'U1')) + try: + string_dtypes.add(unicode) + except NameError: + pass + for dt in string_dtypes: + with tm.assertRaisesRegexp(TypeError, + 'string dtypes are not allowed'): + df.select_dtypes(include=[dt]) + with tm.assertRaisesRegexp(TypeError, + 'string dtypes are not allowed'): + df.select_dtypes(exclude=[dt]) + + def test_select_dtypes_bad_arg_raises(self): + df = DataFrame({'a': list('abc'), + 'g': list(u('abc')), + 'b': list(range(1, 4)), + 'c': np.arange(3, 6).astype('u1'), + 'd': np.arange(4.0, 7.0), + 'e': [True, False, True], + 'f': pd.date_range('now', periods=3).values}) + with tm.assertRaisesRegexp(TypeError, 'data type.*not understood'): + df.select_dtypes(['blargy, blarg, blarg']) + def skip_if_no_ne(engine='numexpr'): if engine == 'numexpr': @@ -13931,6 +14048,7 @@ def test_query_string_scalar_variable(self): for parser, engine in product(['pandas'], ENGINES): yield self.check_query_string_scalar_variable, parser, engine + class TestDataFrameEvalNumExprPandas(tm.TestCase): @classmethod
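The heavy lifting for interpreting the many ways a dtype can be spelled (a python type, a ``np.dtype``, a numpy scalar type, or a string such as ``'u1'`` or ``'datetime'``) is done by the ``_get_dtype_from_object`` helper added to ``pandas/core/common.py`` above. Below is a minimal standalone sketch of that normalization, not the pandas internals themselves; the name ``dtype_from_object`` and the Python-3-only ``str`` check are illustrative simplifications:

    import numpy as np

    def dtype_from_object(dtype):
        """Best-effort mapping of a dtype-like object to a numpy scalar type (sketch)."""
        if isinstance(dtype, type) and issubclass(dtype, np.generic):
            return dtype                        # already a numpy scalar type, e.g. np.number
        if isinstance(dtype, np.dtype):
            return dtype.type                   # np.dtype('int64') -> np.int64
        if isinstance(dtype, str):
            if dtype in ('datetime', 'timedelta'):
                dtype += '64'                   # 'datetime' is shorthand for 'datetime64'
            attr = getattr(np, dtype, None)     # generic names: 'number', 'integer', 'bool_', ...
            if isinstance(attr, type) and issubclass(attr, np.generic):
                return attr
            return np.dtype(dtype).type         # dtype strings such as 'u1', 'i8', 'f4'
        return np.dtype(dtype).type             # plain python types such as int or float

    dtype_from_object(np.number)    # numpy.number (a generic dtype)
    dtype_from_object('u1')         # numpy.uint8
    dtype_from_object('datetime')   # numpy.datetime64

The real helper additionally rejects overly specific datetime units such as ``datetime64[D]``, which is what ``test_select_dtypes_bad_datetime64`` above exercises.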
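The selection itself then reduces to the mask logic in ``DataFrame.select_dtypes`` above: each column's ``dtype.type`` is tested with ``issubclass`` against the normalized ``include`` and ``exclude`` sets, and the two boolean masks are combined with ``&`` before slicing the columns (the info axis). A rough standalone sketch of that idea, using plain pandas/numpy rather than the internal helpers, on the same frame the tests use:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'a': list('abc'),
                       'b': list(range(1, 4)),
                       'c': np.arange(3, 6).astype('u1'),
                       'd': np.arange(4.0, 7.0),
                       'e': [True, False, True]})

    include = (np.number,)              # generic dtypes work because of issubclass
    exclude = (np.unsignedinteger,)

    # an empty include/exclude would default the corresponding mask to all-True
    include_these = df.dtypes.apply(lambda dt: issubclass(dt.type, include))
    exclude_these = df.dtypes.apply(lambda dt: not issubclass(dt.type, exclude))

    df.loc[:, include_these & exclude_these].columns.tolist()
    # ['b', 'd'] -- the numeric columns minus the uint8 one

Note that ``bool`` columns are not picked up by ``np.number`` (``np.bool_`` is not one of its subclasses), which is why the docs above spell ``include=['number', 'bool']`` to get both.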