From 9b8415851bb664c5dae3cb17e826d72ad2014cab Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 12 Jul 2020 15:47:41 -0700 Subject: [PATCH 1/8] BUG: df.sum with Int64 dtype --- pandas/core/frame.py | 25 ++++++++++++++------ pandas/tests/arrays/integer/test_function.py | 9 +++++++ 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cfe5621fec14e..5b31034d2c0c0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -118,6 +118,7 @@ from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor +from pandas.core.construction import extract_array from pandas.core.generic import NDFrame, _shared_docs from pandas.core.indexes import base as ibase from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences @@ -8499,7 +8500,14 @@ def _count_level(self, level, axis=0, numeric_only=False): return result def _reduce( - self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds + self, + op, + name: str, + axis=0, + skipna=True, + numeric_only=None, + filter_type=None, + **kwds, ): assert filter_type is None or filter_type == "bool", filter_type @@ -8531,8 +8539,11 @@ def _reduce( labels = self._get_agg_axis(axis) constructor = self._constructor - def f(x): - return op(x, axis=axis, skipna=skipna, **kwds) + def func(values): + if is_extension_array_dtype(values.dtype): + return extract_array(values)._reduce(name, skipna=skipna, **kwds) + else: + return op(values, axis=axis, skipna=skipna, **kwds) def _get_data(axis_matters): if filter_type is None: @@ -8599,7 +8610,7 @@ def blk_func(values): from pandas.core.apply import frame_apply opa = frame_apply( - self, func=f, result_type="expand", ignore_failures=True + self, func=func, result_type="expand", ignore_failures=True ) result = opa.get_result() if result.ndim == 
self.ndim: @@ -8611,7 +8622,7 @@ def blk_func(values): values = data.values try: - result = f(values) + result = func(values) except TypeError: # e.g. in nanops trying to convert strs to float @@ -8622,7 +8633,7 @@ def blk_func(values): values = data.values with np.errstate(all="ignore"): - result = f(values) + result = func(values) else: if numeric_only: @@ -8633,7 +8644,7 @@ def blk_func(values): else: data = self values = data.values - result = f(values) + result = func(values) if filter_type == "bool" and is_object_dtype(values) and axis is None: # work around https://github.com/numpy/numpy/issues/10489 diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index 44c3077228e80..8774a0cf02b8f 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -133,6 +133,15 @@ def test_integer_array_numpy_sum(values, expected): assert result == expected +def test_mixed_frame_with_integer_sum(): + # GH#34520 + df = pd.DataFrame([["a", 1]], columns=list("ab")) + df = df.astype({"b": "Int64"}) + result = df.sum() + expected = pd.Series(["a", 1], index=["a", "b"]) + tm.assert_series_equal(result, expected) + + # TODO(jreback) - these need testing / are broken # shift From c89da437bd018dfd9365e702b039bc8928b7e6f7 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 14 Jul 2020 13:28:12 -0700 Subject: [PATCH 2/8] whatsnew --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index cfac916157649..7572472fb7434 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -913,6 +913,7 @@ Numeric - Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`) - Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`) - Bug in 
:class:`DataFrame` and :class:`Series` addition and subtraction between object-dtype objects and ``datetime64`` dtype objects (:issue:`33824`) +- Bug in :class:`DataFrame` reductions (e.g. ``df.min``, ``df.max``) with ``ExtensionArray`` dtypes (:issue:`34520`) Conversion ^^^^^^^^^^ From 261fa32debcb6bb6b79d4711ceeead43f3d4bab8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 12 Jul 2020 21:35:44 +0200 Subject: [PATCH 3/8] add test case for GH34520, copied from GH35112 Co-authored-by: Simon Hawkins --- pandas/tests/frame/test_analytics.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index db8bb5ca3c437..9e72cc7153c47 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1303,3 +1303,12 @@ def test_preserve_timezone(self, initial: str, method): df = DataFrame([expected]) result = getattr(df, method)(axis=1) tm.assert_series_equal(result, expected) + + +def test_mixed_frame_with_integer_sum(): + # https://github.com/pandas-dev/pandas/issues/34520 + df = pd.DataFrame([["a", 1]], columns=list("ab")) + df = df.astype({"b": "Int64"}) + result = df.sum() + expected = pd.Series(["a", 1], index=["a", "b"]) + tm.assert_series_equal(result, expected) From 9ee96691cc6f061cbda79920b7e58c86fd9c35d7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 12 Jul 2020 21:46:38 +0200 Subject: [PATCH 4/8] add test to ensure EA op is used for integer array --- pandas/tests/arrays/integer/test_function.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index 8774a0cf02b8f..0bd0c67f7aab1 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -142,6 +142,15 @@ def test_mixed_frame_with_integer_sum(): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("op", 
["sum", "prod", "min", "max"]) +def test_dataframe_reductions(op): + # https://github.com/pandas-dev/pandas/pull/32867 + # ensure the integers are not cast to float during reductions + df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")}) + result = getattr(df, op)() + assert isinstance(result["a"], np.int64) + + # TODO(jreback) - these need testing / are broken # shift From 390b9bbd0dcad3c068592cda5b9c06c5108ffe6b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 12 Jul 2020 22:32:12 +0200 Subject: [PATCH 5/8] add test for GH32651, copied from GH34210 Co-authored-by: Simon Hawkins --- pandas/tests/frame/test_analytics.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 9e72cc7153c47..9d6b9f39a0578 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1312,3 +1312,17 @@ def test_mixed_frame_with_integer_sum(): result = df.sum() expected = pd.Series(["a", 1], index=["a", "b"]) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("numeric_only", [True, False, None]) +@pytest.mark.parametrize("method", ["min", "max"]) +def test_minmax_extensionarray(method, numeric_only): + # https://github.com/pandas-dev/pandas/issues/32651 + int64_info = np.iinfo("int64") + ser = Series([int64_info.max, None, int64_info.min], dtype=pd.Int64Dtype()) + df = DataFrame({"Int64": ser}) + result = getattr(df, method)(numeric_only=numeric_only) + expected = Series( + [getattr(int64_info, method)], index=pd.Index(["Int64"], dtype="object") + ) + tm.assert_series_equal(result, expected) From 312cb9caee2860f4214b300f1d100d9534b9baeb Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 15 Jul 2020 14:41:51 +0200 Subject: [PATCH 6/8] remove now duplicated test --- pandas/tests/arrays/integer/test_function.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/pandas/tests/arrays/integer/test_function.py 
b/pandas/tests/arrays/integer/test_function.py index 0bd0c67f7aab1..a81434339fdae 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -133,15 +133,6 @@ def test_integer_array_numpy_sum(values, expected): assert result == expected -def test_mixed_frame_with_integer_sum(): - # GH#34520 - df = pd.DataFrame([["a", 1]], columns=list("ab")) - df = df.astype({"b": "Int64"}) - result = df.sum() - expected = pd.Series(["a", 1], index=["a", "b"]) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("op", ["sum", "prod", "min", "max"]) def test_dataframe_reductions(op): # https://github.com/pandas-dev/pandas/pull/32867 From 0f33353eef872d0410816eacb7f9836a15303618 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 15 Jul 2020 14:50:04 +0200 Subject: [PATCH 7/8] add self._mgr.any_extension_types check --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5b31034d2c0c0..3b6eb9e3a27c0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8590,7 +8590,7 @@ def blk_func(values): out[:] = coerce_to_dtypes(out.values, df.dtypes) return out - if not self._is_homogeneous_type: + if not self._is_homogeneous_type or self._mgr.any_extension_types: # try to avoid self.values call if filter_type is None and axis == 0 and len(self) > 0: From babedb9a55fc4d080521ceffb6a04e236af22026 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 15 Jul 2020 20:15:17 +0200 Subject: [PATCH 8/8] add issue number to whatsnew --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 7572472fb7434..9bc1499d5511e 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -913,7 +913,7 @@ Numeric - Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with 
mixed dtypes (:issue:`32995`) - Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`) - Bug in :class:`DataFrame` and :class:`Series` addition and subtraction between object-dtype objects and ``datetime64`` dtype objects (:issue:`33824`) -- Bug in :class:`DataFrame` reductions (e.g. ``df.min``, ``df.max``) with ``ExtensionArray`` dtypes (:issue:`34520`) +- Bug in :class:`DataFrame` reductions (e.g. ``df.min()``, ``df.max()``) with ``ExtensionArray`` dtypes (:issue:`34520`, :issue:`32651`) Conversion ^^^^^^^^^^