Skip to content

Commit 595208b

Browse files
authored
BUG: Use correct ExtensionArray reductions in DataFrame reductions (#35254)
1 parent f329d8e commit 595208b

File tree

4 files changed

+52
-8
lines changed

4 files changed

+52
-8
lines changed

doc/source/whatsnew/v1.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -916,6 +916,7 @@ Numeric
916916
- Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`)
917917
- Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`)
918918
- Bug in :class:`DataFrame` and :class:`Series` addition and subtraction between object-dtype objects and ``datetime64`` dtype objects (:issue:`33824`)
919+
- Bug in :class:`DataFrame` reductions (e.g. ``df.min()``, ``df.max()``) with ``ExtensionArray`` dtypes (:issue:`34520`, :issue:`32651`)
919920

920921
Conversion
921922
^^^^^^^^^^

pandas/core/frame.py

+19-8
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@
118118
from pandas.core.arrays import Categorical, ExtensionArray
119119
from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray
120120
from pandas.core.arrays.sparse import SparseFrameAccessor
121+
from pandas.core.construction import extract_array
121122
from pandas.core.generic import NDFrame, _shared_docs
122123
from pandas.core.indexes import base as ibase
123124
from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences
@@ -8512,7 +8513,14 @@ def _count_level(self, level, axis=0, numeric_only=False):
85128513
return result
85138514

85148515
def _reduce(
8515-
self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
8516+
self,
8517+
op,
8518+
name: str,
8519+
axis=0,
8520+
skipna=True,
8521+
numeric_only=None,
8522+
filter_type=None,
8523+
**kwds,
85168524
):
85178525

85188526
assert filter_type is None or filter_type == "bool", filter_type
@@ -8544,8 +8552,11 @@ def _reduce(
85448552
labels = self._get_agg_axis(axis)
85458553
constructor = self._constructor
85468554

8547-
def f(x):
8548-
return op(x, axis=axis, skipna=skipna, **kwds)
8555+
def func(values):
8556+
if is_extension_array_dtype(values.dtype):
8557+
return extract_array(values)._reduce(name, skipna=skipna, **kwds)
8558+
else:
8559+
return op(values, axis=axis, skipna=skipna, **kwds)
85498560

85508561
def _get_data(axis_matters):
85518562
if filter_type is None:
@@ -8592,7 +8603,7 @@ def blk_func(values):
85928603
out[:] = coerce_to_dtypes(out.values, df.dtypes)
85938604
return out
85948605

8595-
if not self._is_homogeneous_type:
8606+
if not self._is_homogeneous_type or self._mgr.any_extension_types:
85968607
# try to avoid self.values call
85978608

85988609
if filter_type is None and axis == 0 and len(self) > 0:
@@ -8612,7 +8623,7 @@ def blk_func(values):
86128623
from pandas.core.apply import frame_apply
86138624

86148625
opa = frame_apply(
8615-
self, func=f, result_type="expand", ignore_failures=True
8626+
self, func=func, result_type="expand", ignore_failures=True
86168627
)
86178628
result = opa.get_result()
86188629
if result.ndim == self.ndim:
@@ -8624,7 +8635,7 @@ def blk_func(values):
86248635
values = data.values
86258636

86268637
try:
8627-
result = f(values)
8638+
result = func(values)
86288639

86298640
except TypeError:
86308641
# e.g. in nanops trying to convert strs to float
@@ -8635,7 +8646,7 @@ def blk_func(values):
86358646

86368647
values = data.values
86378648
with np.errstate(all="ignore"):
8638-
result = f(values)
8649+
result = func(values)
86398650

86408651
else:
86418652
if numeric_only:
@@ -8646,7 +8657,7 @@ def blk_func(values):
86468657
else:
86478658
data = self
86488659
values = data.values
8649-
result = f(values)
8660+
result = func(values)
86508661

86518662
if filter_type == "bool" and is_object_dtype(values) and axis is None:
86528663
# work around https://github.com/numpy/numpy/issues/10489

pandas/tests/arrays/integer/test_function.py

+9
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,15 @@ def test_integer_array_numpy_sum(values, expected):
133133
assert result == expected
134134

135135

136+
@pytest.mark.parametrize("op", ["sum", "prod", "min", "max"])
def test_dataframe_reductions(op):
    """Check that a DataFrame reduction keeps an ``Int64`` column as integers.

    Parameters
    ----------
    op : str
        Name of the DataFrame reduction method to call (supplied by the
        ``parametrize`` decorator).
    """
    # https://github.com/pandas-dev/pandas/pull/32867
    # ensure the integers are not cast to float during reductions
    df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")})
    # Dispatch on the parametrized name: calling ``df.max()`` directly would
    # run the same reduction for every ``op`` and leave sum/prod/min untested.
    result = getattr(df, op)()
    assert isinstance(result["a"], np.int64)
143+
144+
136145
# TODO(jreback) - these need testing / are broken
137146

138147
# shift

pandas/tests/frame/test_analytics.py

+23
Original file line numberDiff line numberDiff line change
@@ -1303,3 +1303,26 @@ def test_preserve_timezone(self, initial: str, method):
13031303
df = DataFrame([expected])
13041304
result = getattr(df, method)(axis=1)
13051305
tm.assert_series_equal(result, expected)
1306+
1307+
1308+
def test_mixed_frame_with_integer_sum():
    """Sum a frame holding an object column next to an ``Int64`` column."""
    # https://github.com/pandas-dev/pandas/issues/34520
    frame = pd.DataFrame([["a", 1]], columns=list("ab")).astype({"b": "Int64"})
    summed = frame.sum()
    tm.assert_series_equal(summed, pd.Series(["a", 1], index=["a", "b"]))
1315+
1316+
1317+
@pytest.mark.parametrize("numeric_only", [True, False, None])
@pytest.mark.parametrize("method", ["min", "max"])
def test_minmax_extensionarray(method, numeric_only):
    """Compare ``DataFrame.min``/``max`` on an ``Int64`` column to the int64 limits.

    The column holds the int64 maximum, a missing value and the int64
    minimum; the reduction result is compared against the matching
    ``np.iinfo`` attribute.
    """
    # https://github.com/pandas-dev/pandas/issues/32651
    limits = np.iinfo("int64")
    column = Series([limits.max, None, limits.min], dtype=pd.Int64Dtype())
    frame = DataFrame({"Int64": column})

    reduction = getattr(frame, method)
    result = reduction(numeric_only=numeric_only)

    extreme = getattr(limits, method)
    expected = Series([extreme], index=pd.Index(["Int64"], dtype="object"))
    tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)