From 0a73a8bfeb2585b8872de0d9ae61319dc5fa89c3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 10 Apr 2023 22:07:49 +0200 Subject: [PATCH 1/2] BUG: describe not respecting ArrowDtype in include/exclude --- doc/source/whatsnew/v2.0.1.rst | 1 + pandas/core/dtypes/common.py | 2 ++ pandas/core/frame.py | 1 + pandas/tests/frame/methods/test_describe.py | 20 ++++++++++++++++++++ 4 files changed, 24 insertions(+) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index caf237fb15163..b0a07219a89a5 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -26,6 +26,7 @@ Bug fixes - Bug in :meth:`Series.describe` not returning :class:`ArrowDtype` with ``pyarrow.float64`` type with numeric data (:issue:`52427`) - Fixed segfault in :meth:`Series.to_numpy` with ``null[pyarrow]`` dtype (:issue:`52443`) - Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`) +- Bug in :meth:`DataFrame.describe` not respecting ``ArrowDtype`` in ``include`` and ``exclude`` (:issue:`52570`) .. --------------------------------------------------------------------------- .. _whatsnew_201.other: diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index e22baacba0574..ca7d95d5afcc8 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1499,6 +1499,8 @@ def infer_dtype_from_object(dtype) -> type: except TypeError: # Should still pass if we don't have a date-like pass + if hasattr(dtype, "numpy_dtype"): + return dtype.numpy_dtype.type return dtype.type try: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fa36fef29979c..391c009c33cd0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4720,6 +4720,7 @@ def check_int_infer_dtype(dtypes): def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool: # GH 46870: BooleanDtype._is_numeric == True but should be excluded + dtype = dtype if not hasattr(dtype, "numpy_dtype") else dtype.numpy_dtype return issubclass(dtype.type, tuple(dtypes_set)) or ( np.number in dtypes_set and getattr(dtype, "_is_numeric", False) diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index e2b8a0f63c31a..fbe6ff356499f 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -395,3 +395,23 @@ def test_ea_with_na(self, any_numeric_ea_dtype): dtype="Float64", ) tm.assert_frame_equal(result, expected) + + def test_describe_exclude_pa_dtype(self): + # GH#52570 + pa = pytest.importorskip("pyarrow") + df = DataFrame( + { + "a": Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int8())), + "b": Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int16())), + "c": Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int32())), + } + ) + result = df.describe( + include=pd.ArrowDtype(pa.int8()), exclude=pd.ArrowDtype(pa.int32()) + ) + expected = DataFrame( + {"a": [3, 2, 1, 1, 1.5, 2, 2.5, 3]}, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + dtype=pd.ArrowDtype(pa.float64()), + ) + tm.assert_frame_equal(result, expected) From 218da60a1d8fe771e8ff49d7a2e8ea8dbbe12e13 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 11 Apr 2023 23:17:34 +0200 Subject: [PATCH 2/2] Add comment --- pandas/core/dtypes/common.py | 2 ++ pandas/core/frame.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index ca7d95d5afcc8..b852897afa630 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1500,6 +1500,8 @@ def infer_dtype_from_object(dtype) -> type: # Should still pass if we don't have a date-like pass if hasattr(dtype, "numpy_dtype"): + # TODO: Implement this properly + # https://github.com/pandas-dev/pandas/issues/52576 return dtype.numpy_dtype.type return dtype.type diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b44b9eece474e..ba9a41049ad29 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -132,6 +132,7 @@ PeriodArray, TimedeltaArray, ) +from pandas.core.arrays.arrow import ArrowDtype from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.construction import ( ensure_wrapped_if_datetimelike, @@ -4713,7 +4714,7 @@ def check_int_infer_dtype(dtypes): def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool: # GH 46870: BooleanDtype._is_numeric == True but should be excluded - dtype = dtype if not hasattr(dtype, "numpy_dtype") else dtype.numpy_dtype + dtype = dtype if not isinstance(dtype, ArrowDtype) else dtype.numpy_dtype return issubclass(dtype.type, tuple(dtypes_set)) or ( np.number in dtypes_set and getattr(dtype, "_is_numeric", False)