Skip to content

BUG: Allow numeric ExtensionDtypes in DataFrame.select_dtypes #38246

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 36 commits into from
Dec 14, 2020
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
503f9fc
select dtypes
andrewgsavage Jul 19, 2020
ea1796f
flake8
andrewgsavage Jul 19, 2020
3932391
flake8
andrewgsavage Jul 19, 2020
7cf6e3e
black
andrewgsavage Jul 19, 2020
0cb170a
checks
andrewgsavage Jul 19, 2020
83c84ba
create compat func
andrewgsavage Jul 31, 2020
1087ff7
create compat func
andrewgsavage Jul 31, 2020
7fe272f
create compat func
andrewgsavage Jul 31, 2020
77243fd
test
andrewgsavage Aug 1, 2020
eb475a8
lint
andrewgsavage Aug 1, 2020
9b5db9a
lint
andrewgsavage Aug 1, 2020
9cbf294
remove import from pandas.core
andrewgsavage Oct 11, 2020
4c1d67a
Merge branch 'master' into select_dtypes
andrewgsavage Oct 11, 2020
14fabfc
move files, add docstring and examples
andrewgsavage Oct 19, 2020
da6cf68
isort, remove pint_pandas example
andrewgsavage Oct 19, 2020
3801310
isort, remove pint_pandas example
andrewgsavage Oct 19, 2020
9e23d20
remove unused type comment@
andrewgsavage Oct 19, 2020
13b1531
merge master
arw2019 Dec 2, 2020
41fd75e
pd namespace usage
arw2019 Dec 2, 2020
85d0ae8
remove tests from old location
arw2019 Dec 2, 2020
4445207
unused import
arw2019 Dec 2, 2020
31ef92f
consolidate return
arw2019 Dec 2, 2020
5d9d2c2
whatsnew
arw2019 Dec 2, 2020
5d60908
minimize diff
arw2019 Dec 3, 2020
b8495c3
Merge branch 'master' of https://github.com/pandas-dev/pandas into HEAD
arw2019 Dec 3, 2020
75ea3a2
don't make separate method + ignore mypy
arw2019 Dec 5, 2020
e84ecb0
Merge branch 'master' of https://github.com/pandas-dev/pandas into GH…
arw2019 Dec 5, 2020
1df34f5
review comment
arw2019 Dec 13, 2020
0b5a9ef
Merge branch 'master' of https://github.com/pandas-dev/pandas into GH…
arw2019 Dec 13, 2020
ce63fa7
move whatsnew to 1.3
arw2019 Dec 13, 2020
93c7403
parametrize test
arw2019 Dec 13, 2020
ef095b5
add
arw2019 Dec 13, 2020
7c25c80
Merge branch 'master' of https://github.com/pandas-dev/pandas into GH…
arw2019 Dec 14, 2020
9bbd980
review comment
arw2019 Dec 14, 2020
57b6126
remove commented code
arw2019 Dec 14, 2020
f0902be
Merge branch 'master' of https://github.com/pandas-dev/pandas into GH…
arw2019 Dec 14, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -622,6 +622,7 @@ Numeric
- Bug in :meth:`DataFrame.std` with ``timedelta64`` dtype and ``skipna=False`` (:issue:`37392`)
- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with ``datetime64`` dtype and ``skipna=False`` (:issue:`36907`)
- Bug in :meth:`DataFrame.idxmax` and :meth:`DataFrame.idxmin` with mixed dtypes incorrectly raising ``TypeError`` (:issue:`38195`)
- Bug in :meth:`DataFrame.select_dtypes` with ``include=np.number`` dropping numeric ``ExtensionDtype`` columns (:issue:`35340`)

Conversion
^^^^^^^^^^
Expand Down
38 changes: 38 additions & 0 deletions pandas/core/dtypes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1231,6 +1231,44 @@ def needs_i8_conversion(arr_or_dtype) -> bool:
)


def np_issubclass_compat(unique_dtype, dtypes_set) -> bool:
    """
    Check whether a dtype matches any of the dtypes in ``dtypes_set``.

    A dtype matches when its scalar ``type`` is a subclass of one of the
    entries of ``dtypes_set``, or when ``np.number`` is one of the entries
    and the dtype exposes a truthy ``_is_numeric`` attribute.

    Parameters
    ----------
    unique_dtype : dtype
        The dtype to check. It must expose a ``type`` attribute.
    dtypes_set : collection of type
        The types to check ``unique_dtype`` against.

    Returns
    -------
    bool
        Whether or not ``unique_dtype`` matches an entry of ``dtypes_set``.

    Examples
    --------
    >>> np_issubclass_compat(pd.Int16Dtype(), [np.bool_, np.floating])
    False
    >>> np_issubclass_compat(pd.Int16Dtype(), [np.integer])
    True
    >>> np_issubclass_compat(pd.BooleanDtype(), [np.bool_])
    True
    >>> np_issubclass_compat(pd.Float64Dtype(), [np.floating])
    True
    >>> np_issubclass_compat(pd.Float64Dtype(), [np.number])
    True
    """
    if issubclass(unique_dtype.type, tuple(dtypes_set)):
        return True
    # Fallback for extension dtypes: they flag themselves as numeric through
    # ``_is_numeric``. This only applies when ``np.number`` was requested.
    # ``bool(...)`` keeps the declared return type even if the attribute
    # holds a truthy non-bool value.
    return np.number in dtypes_set and bool(
        getattr(unique_dtype, "_is_numeric", False)
    )


def is_numeric_dtype(arr_or_dtype) -> bool:
"""
Check whether the provided array or dtype is of a numeric dtype.
Expand Down
10 changes: 3 additions & 7 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@
is_object_dtype,
is_scalar,
is_sequence,
np_issubclass_compat,
pandas_dtype,
)
from pandas.core.dtypes.missing import isna, notna
Expand Down Expand Up @@ -3681,6 +3682,7 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame:
4 True 1.0
5 False 2.0
"""

if not is_list_like(include):
include = (include,) if include is not None else ()
if not is_list_like(exclude):
Expand Down Expand Up @@ -3711,13 +3713,7 @@ def extract_unique_dtypes_from_dtypes_set(
extracted_dtypes = [
unique_dtype
for unique_dtype in unique_dtypes
# error: Argument 1 to "tuple" has incompatible type
# "FrozenSet[Union[ExtensionDtype, str, Any, Type[str],
# Type[float], Type[int], Type[complex], Type[bool]]]";
# expected "Iterable[Union[type, Tuple[Any, ...]]]"
if issubclass(
unique_dtype.type, tuple(dtypes_set) # type: ignore[arg-type]
)
if np_issubclass_compat(unique_dtype, dtypes_set)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i don't think we need to create yet another method here. is there a reason you cannot just use something like

return issubclass(unique_dtype.type, tuple(dtypes_set)) or (
    np.number in dtypes_set and is_numeric_dtype(unique_dtype)
)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no reason. Done

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Having now tried this I think the reason might be mypy

pandas/core/frame.py:3715: error: Argument 1 to "tuple" has incompatible type "FrozenSet[Union[ExtensionDtype, str, Any, Type[str], Type[float], Type[int], Type[complex], Type[bool], Type[object]]]"; expected "Iterable[Union[type, Tuple[Any, ...]]]"  [arg-type]

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reverted to having a separate method

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just ignore mypy here, this makes grokking the code way more complicated.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok!

]
return extracted_dtypes

Expand Down
48 changes: 48 additions & 0 deletions pandas/tests/frame/methods/test_select_dtypes.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,46 @@
import numpy as np
import pytest

from pandas.core.dtypes.dtypes import ExtensionDtype

import pandas as pd
from pandas import DataFrame, Timestamp
import pandas._testing as tm
from pandas.core.arrays import ExtensionArray


class DummyDtype(ExtensionDtype):
    """
    Minimal extension dtype whose ``_is_numeric`` flag is set at construction.

    Parameters
    ----------
    numeric : bool
        Value reported by the ``_is_numeric`` property.
    """

    type = int
    # ExtensionDtype derives equality and hashing from the attributes named
    # in ``_metadata``. Without this entry a numeric and a non-numeric
    # DummyDtype would compare equal.
    _metadata = ("_numeric",)

    def __init__(self, numeric):
        self._numeric = numeric

    @property
    def name(self):
        """Fixed string identifier of this dtype."""
        return "Dummy"

    @property
    def _is_numeric(self):
        """Whether this dtype instance reports itself as numeric."""
        return self._numeric


class DummyArray(ExtensionArray):
    """
    Bare-bones extension array wrapping ``data`` with a caller-supplied dtype.

    Parameters
    ----------
    data : sequence
        The values held by the array.
    dtype : ExtensionDtype
        The dtype reported by the ``dtype`` property.
    """

    def __init__(self, data, dtype):
        self.data = data
        self._dtype = dtype

    def __array__(self, dtype=None, copy=None):
        # NumPy may call ``__array__`` with no arguments and expects an
        # ndarray back, so ``dtype`` is optional and the data is converted.
        # ``copy`` is accepted for newer NumPy call conventions and is not
        # otherwise used.
        return np.asarray(self.data, dtype=dtype)

    @property
    def dtype(self):
        """The dtype supplied at construction."""
        return self._dtype

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, item):
        # Stub: element access is not implemented and always yields None.
        pass


class TestSelectDtypes:
Expand Down Expand Up @@ -322,3 +359,14 @@ def test_select_dtypes_typecodes(self):
expected = df
FLOAT_TYPES = list(np.typecodes["AllFloat"])
tm.assert_frame_equal(df.select_dtypes(FLOAT_TYPES), expected)

def test_select_dtypes_numeric(self):
    # GH 35340
    # A single-column frame backed by DummyArray must keep its column under
    # ``select_dtypes(np.number)`` only when the dtype reports itself as
    # numeric. The exact column count is asserted, because a bare ``!=``
    # shape check would also pass for any other wrong shape.
    cases = ((True, 1), (False, 0))
    for numeric, expected_ncols in cases:
        da = DummyArray([1, 2], dtype=DummyDtype(numeric=numeric))
        df = DataFrame(da)
        result = df.select_dtypes(np.number)
        assert result.shape == (len(df), expected_ncols)