Skip to content

BUG: Use correct ExtensionArray reductions in DataFrame reductions #35254

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jul 15, 2020
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -913,6 +913,7 @@ Numeric
- Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`)
- Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`)
- Bug in :class:`DataFrame` and :class:`Series` addition and subtraction between object-dtype objects and ``datetime64`` dtype objects (:issue:`33824`)
- Bug in :class:`DataFrame` reductions (e.g. ``df.min()``, ``df.max()``) with ``ExtensionArray`` dtypes (:issue:`34520`, :issue:`32651`)

Conversion
^^^^^^^^^^
Expand Down
27 changes: 19 additions & 8 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@
from pandas.core.arrays import Categorical, ExtensionArray
from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.construction import extract_array
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.indexes import base as ibase
from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences
Expand Down Expand Up @@ -8499,7 +8500,14 @@ def _count_level(self, level, axis=0, numeric_only=False):
return result

def _reduce(
self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
self,
op,
name: str,
axis=0,
skipna=True,
numeric_only=None,
filter_type=None,
**kwds,
):

assert filter_type is None or filter_type == "bool", filter_type
Expand Down Expand Up @@ -8531,8 +8539,11 @@ def _reduce(
labels = self._get_agg_axis(axis)
constructor = self._constructor

def f(x):
return op(x, axis=axis, skipna=skipna, **kwds)
def func(values):
if is_extension_array_dtype(values.dtype):
return extract_array(values)._reduce(name, skipna=skipna, **kwds)
else:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we not use the same exact path for non extension arrays

these if/then for extension arrays are terrible

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we'd have to put the analogous check in each of the nanops.nanfoo functions, which I know @jorisvandenbossche wouldn't like.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is func now effectively the same as blk_func on L8575?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, doing it here is IMO much cleaner than in each nanops function.

is func now effectively the same as blk_func on L8575?

Not exactly, the axis handling is different (also blk_func knows it gets an array, so the extract_array stuff is not needed).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess this is OK as a short-term check, but seeing these constant "if is_extension_array do one thing, else do something else" branches basically means the API is missing lots of things. I would much rather fix it than keep doing this kind of thing. This makes the code extremely hard to read and understand.

I remember being very excited when we removed the is_ndarray check here and this became much simpler; now we are going backwards.

return op(values, axis=axis, skipna=skipna, **kwds)

def _get_data(axis_matters):
if filter_type is None:
Expand Down Expand Up @@ -8579,7 +8590,7 @@ def blk_func(values):
out[:] = coerce_to_dtypes(out.values, df.dtypes)
return out

if not self._is_homogeneous_type:
if not self._is_homogeneous_type or self._mgr.any_extension_types:
# try to avoid self.values call

if filter_type is None and axis == 0 and len(self) > 0:
Expand All @@ -8599,7 +8610,7 @@ def blk_func(values):
from pandas.core.apply import frame_apply

opa = frame_apply(
self, func=f, result_type="expand", ignore_failures=True
self, func=func, result_type="expand", ignore_failures=True
)
result = opa.get_result()
if result.ndim == self.ndim:
Expand All @@ -8611,7 +8622,7 @@ def blk_func(values):
values = data.values

try:
result = f(values)
result = func(values)

except TypeError:
# e.g. in nanops trying to convert strs to float
Expand All @@ -8622,7 +8633,7 @@ def blk_func(values):

values = data.values
with np.errstate(all="ignore"):
result = f(values)
result = func(values)

else:
if numeric_only:
Expand All @@ -8633,7 +8644,7 @@ def blk_func(values):
else:
data = self
values = data.values
result = f(values)
result = func(values)

if filter_type == "bool" and is_object_dtype(values) and axis is None:
# work around https://github.com/numpy/numpy/issues/10489
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/arrays/integer/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,15 @@ def test_integer_array_numpy_sum(values, expected):
assert result == expected


@pytest.mark.parametrize("op", ["sum", "prod", "min", "max"])
def test_dataframe_reductions(op):
    # https://github.com/pandas-dev/pandas/pull/32867
    # ensure the integers are not cast to float during reductions
    df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")})
    # BUG FIX: the body previously called df.max() unconditionally, so the
    # parametrized `op` was ignored and only "max" was ever exercised.
    # Dispatch on `op` so each of sum/prod/min/max is actually tested.
    result = getattr(df, op)()
    assert isinstance(result["a"], np.int64)


# TODO(jreback) - these need testing / are broken

# shift
Expand Down
23 changes: 23 additions & 0 deletions pandas/tests/frame/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1303,3 +1303,26 @@ def test_preserve_timezone(self, initial: str, method):
df = DataFrame([expected])
result = getattr(df, method)(axis=1)
tm.assert_series_equal(result, expected)


def test_mixed_frame_with_integer_sum():
    # Regression test for https://github.com/pandas-dev/pandas/issues/34520:
    # summing a frame that mixes an object column with a nullable Int64
    # column should not raise and should keep both columns' results.
    frame = pd.DataFrame([["a", 1]], columns=["a", "b"]).astype({"b": "Int64"})
    summed = frame.sum()
    expected = pd.Series(["a", 1], index=["a", "b"])
    tm.assert_series_equal(summed, expected)


@pytest.mark.parametrize("numeric_only", [True, False, None])
@pytest.mark.parametrize("method", ["min", "max"])
def test_minmax_extensionarray(method, numeric_only):
    # Regression test for https://github.com/pandas-dev/pandas/issues/32651:
    # min/max on an Int64 column must preserve the int64 extremes (no lossy
    # cast to float) and skip the NA entry for every numeric_only setting.
    bounds = np.iinfo("int64")
    values = Series([bounds.max, None, bounds.min], dtype=pd.Int64Dtype())
    frame = DataFrame({"Int64": values})
    result = getattr(frame, method)(numeric_only=numeric_only)
    expected = Series(
        [getattr(bounds, method)], index=pd.Index(["Int64"], dtype="object")
    )
    tm.assert_series_equal(result, expected)