Skip to content

Commit 3de5714

Browse files
jbrockmendel authored and fangchenli committed
BUG: Use correct ExtensionArray reductions in DataFrame reductions (pandas-dev#35254)
1 parent 5c88f92 commit 3de5714

File tree

4 files changed

+52
-8
lines changed

4 files changed

+52
-8
lines changed

doc/source/whatsnew/v1.1.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -916,6 +916,7 @@ Numeric
916916
- Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`)
917917
- Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`)
918918
- Bug in :class:`DataFrame` and :class:`Series` addition and subtraction between object-dtype objects and ``datetime64`` dtype objects (:issue:`33824`)
919+
- Bug in :class:`DataFrame` reductions (e.g. ``df.min()``, ``df.max()``) with ``ExtensionArray`` dtypes (:issue:`34520`, :issue:`32651`)
919920

920921
Conversion
921922
^^^^^^^^^^

pandas/core/frame.py

+19-8
Original file line number | Diff line number | Diff line change
@@ -117,6 +117,7 @@
117117
from pandas.core.arrays import Categorical, ExtensionArray
118118
from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray
119119
from pandas.core.arrays.sparse import SparseFrameAccessor
120+
from pandas.core.construction import extract_array
120121
from pandas.core.generic import NDFrame, _shared_docs
121122
from pandas.core.indexes import base as ibase
122123
from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences
@@ -8509,7 +8510,14 @@ def _count_level(self, level, axis=0, numeric_only=False):
85098510
return result
85108511

85118512
def _reduce(
8512-
self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
8513+
self,
8514+
op,
8515+
name: str,
8516+
axis=0,
8517+
skipna=True,
8518+
numeric_only=None,
8519+
filter_type=None,
8520+
**kwds,
85138521
):
85148522

85158523
assert filter_type is None or filter_type == "bool", filter_type
@@ -8541,8 +8549,11 @@ def _reduce(
85418549
labels = self._get_agg_axis(axis)
85428550
constructor = self._constructor
85438551

8544-
def f(x):
8545-
return op(x, axis=axis, skipna=skipna, **kwds)
8552+
def func(values):
8553+
if is_extension_array_dtype(values.dtype):
8554+
return extract_array(values)._reduce(name, skipna=skipna, **kwds)
8555+
else:
8556+
return op(values, axis=axis, skipna=skipna, **kwds)
85468557

85478558
def _get_data(axis_matters):
85488559
if filter_type is None:
@@ -8589,7 +8600,7 @@ def blk_func(values):
85898600
out[:] = coerce_to_dtypes(out.values, df.dtypes)
85908601
return out
85918602

8592-
if not self._is_homogeneous_type:
8603+
if not self._is_homogeneous_type or self._mgr.any_extension_types:
85938604
# try to avoid self.values call
85948605

85958606
if filter_type is None and axis == 0 and len(self) > 0:
@@ -8609,7 +8620,7 @@ def blk_func(values):
86098620
from pandas.core.apply import frame_apply
86108621

86118622
opa = frame_apply(
8612-
self, func=f, result_type="expand", ignore_failures=True
8623+
self, func=func, result_type="expand", ignore_failures=True
86138624
)
86148625
result = opa.get_result()
86158626
if result.ndim == self.ndim:
@@ -8621,7 +8632,7 @@ def blk_func(values):
86218632
values = data.values
86228633

86238634
try:
8624-
result = f(values)
8635+
result = func(values)
86258636

86268637
except TypeError:
86278638
# e.g. in nanops trying to convert strs to float
@@ -8632,7 +8643,7 @@ def blk_func(values):
86328643

86338644
values = data.values
86348645
with np.errstate(all="ignore"):
8635-
result = f(values)
8646+
result = func(values)
86368647

86378648
else:
86388649
if numeric_only:
@@ -8643,7 +8654,7 @@ def blk_func(values):
86438654
else:
86448655
data = self
86458656
values = data.values
8646-
result = f(values)
8657+
result = func(values)
86478658

86488659
if filter_type == "bool" and is_object_dtype(values) and axis is None:
86498660
# work around https://github.com/numpy/numpy/issues/10489

pandas/tests/arrays/integer/test_function.py

+9
Original file line number | Diff line number | Diff line change
@@ -133,6 +133,15 @@ def test_integer_array_numpy_sum(values, expected):
133133
assert result == expected
134134

135135

136+
@pytest.mark.parametrize("op", ["sum", "prod", "min", "max"])
def test_dataframe_reductions(op):
    """Check that a DataFrame reduction over an ``Int64`` column stays integral.

    Parameters
    ----------
    op : str
        Name of the DataFrame reduction method to call, e.g. ``"sum"``.
    """
    # https://github.com/pandas-dev/pandas/pull/32867
    # ensure the integers are not cast to float during reductions
    df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")})
    # Dispatch on the parametrized name: a hard-coded ``df.max()`` would run
    # the same reduction four times and never exercise sum/prod/min.
    result = getattr(df, op)()
    assert isinstance(result["a"], np.int64)
143+
144+
136145
# TODO(jreback) - these need testing / are broken
137146

138147
# shift

pandas/tests/frame/test_analytics.py

+23
Original file line number | Diff line number | Diff line change
@@ -1303,3 +1303,26 @@ def test_preserve_timezone(self, initial: str, method):
13031303
df = DataFrame([expected])
13041304
result = getattr(df, method)(axis=1)
13051305
tm.assert_series_equal(result, expected)
1306+
1307+
1308+
def test_mixed_frame_with_integer_sum():
    """Sum a frame that mixes an object column with a nullable ``Int64`` column.

    Regression test for https://github.com/pandas-dev/pandas/issues/34520.
    """
    # Build a single-row frame, then convert only column "b" to nullable Int64.
    frame = pd.DataFrame([["a", 1]], columns=list("ab")).astype({"b": "Int64"})
    column_sums = frame.sum()
    tm.assert_series_equal(column_sums, pd.Series(["a", 1], index=["a", "b"]))
1315+
1316+
1317+
@pytest.mark.parametrize("numeric_only", [True, False, None])
@pytest.mark.parametrize("method", ["min", "max"])
def test_minmax_extensionarray(method, numeric_only):
    """Run ``DataFrame.min``/``max`` on an ``Int64`` column of int64 extremes.

    Regression test for https://github.com/pandas-dev/pandas/issues/32651.

    Parameters
    ----------
    method : str
        Reduction to call on the frame, ``"min"`` or ``"max"``.
    numeric_only : bool or None
        Forwarded unchanged to the reduction.
    """
    limits = np.iinfo("int64")
    # One missing value sits between the largest and smallest int64.
    column = Series([limits.max, None, limits.min], dtype=pd.Int64Dtype())
    frame = DataFrame({"Int64": column})

    reducer = getattr(frame, method)
    result = reducer(numeric_only=numeric_only)

    # ``np.iinfo`` exposes ``min``/``max`` under the same names as the
    # reductions, so the expected extreme is looked up by ``method`` too.
    extreme = getattr(limits, method)
    labels = pd.Index(["Int64"], dtype="object")
    expected = Series([extreme], index=labels)
    tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)