Skip to content

BUG: DataFrame reductions with object dtype and axis=1 #50224

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
898759d
BUG: DataFrame reductions with object dtype and axis=1
rhshadrach Nov 10, 2022
cd696ae
Use intp
rhshadrach Nov 12, 2022
1e330fa
Merge branch 'main' of https://github.com/pandas-dev/pandas into obje…
rhshadrach Nov 12, 2022
993c4bb
Merge branch 'main' of https://github.com/pandas-dev/pandas into obje…
rhshadrach Nov 21, 2022
20b1c9c
Merge branch 'main' of https://github.com/pandas-dev/pandas into obje…
rhshadrach Nov 23, 2022
9de5b27
Merge cleanup
rhshadrach Nov 23, 2022
ceee136
Merge branch 'main' of https://github.com/pandas-dev/pandas into obje…
rhshadrach Dec 2, 2022
a373775
Update fixture name
rhshadrach Dec 2, 2022
b4def23
Merge branch 'main' of https://github.com/pandas-dev/pandas into obje…
rhshadrach Dec 2, 2022
10568da
WIP
rhshadrach Dec 6, 2022
b3e29ba
Merge branch 'main' of https://github.com/pandas-dev/pandas into obje…
rhshadrach Dec 10, 2022
d0307d9
Fixups
rhshadrach Dec 10, 2022
e4a47cc
BUG: DataFrame reductions with object dtype and axis=1
rhshadrach Nov 10, 2022
18d3163
Merge branch 'object_reduction_axis_1_attempt_2' of https://github.co…
rhshadrach Dec 13, 2022
1b58cb0
Merge branch 'main' of https://github.com/pandas-dev/pandas into obje…
rhshadrach Dec 13, 2022
4ec9237
fixups
rhshadrach Dec 14, 2022
aeea9b7
int64
rhshadrach Dec 15, 2022
5495f0c
type-ignore; int64 fix
rhshadrach Dec 15, 2022
85a66b6
fixup
rhshadrach Dec 15, 2022
b6a96c2
Change behavior to float for certain reductions
rhshadrach Dec 19, 2022
1708588
Merge branch 'main' of https://github.com/pandas-dev/pandas into obje…
rhshadrach Dec 19, 2022
3dbff78
Merge branch 'object_reduction_axis_1_attempt_2' of https://github.co…
rhshadrach Dec 19, 2022
70fc879
Cleanup
rhshadrach Dec 19, 2022
fa4f006
Merge branch 'main' of https://github.com/pandas-dev/pandas into obje…
rhshadrach Dec 20, 2022
6d936e9
Merge branch 'main' into object_reduction_axis_1_attempt_2
rhshadrach Dec 20, 2022
501cc69
cleanup
rhshadrach Dec 20, 2022
609d789
Merge branch 'main' of https://github.com/pandas-dev/pandas into obje…
rhshadrach Dec 20, 2022
7f7fa56
cleanup
rhshadrach Dec 20, 2022
0b61010
Merge branch 'object_reduction_axis_1_attempt_2' of https://github.co…
rhshadrach Dec 20, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -793,7 +793,7 @@ Timezones
Numeric
^^^^^^^
- Bug in :meth:`DataFrame.add` cannot apply ufunc when inputs contain mixed DataFrame type and Series type (:issue:`39853`)
- Bug in DataFrame reduction methods (e.g. :meth:`DataFrame.sum`) with object dtype, ``axis=1`` and ``numeric_only=False`` would not be coerced to float (:issue:`49551`)
- Bug in DataFrame reduction methods (e.g. :meth:`DataFrame.sum`) with object dtype, ``axis=1`` and ``numeric_only=False`` would have results unnecessarily coerced to float; coercion still occurs for reductions that necessarily result in floats (``mean``, ``var``, ``std``, ``skew``) (:issue:`49603`)
- Bug in :meth:`DataFrame.sem` and :meth:`Series.sem` where an erroneous ``TypeError`` would always raise when using data backed by an :class:`ArrowDtype` (:issue:`49759`)

Conversion
Expand Down
81 changes: 33 additions & 48 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -10493,10 +10493,6 @@ def _reduce(
axis = self._get_axis_number(axis)
assert axis in [0, 1]

def func(values: np.ndarray):
# We only use this in the case that operates on self.values
return op(values, axis=axis, skipna=skipna, **kwds)

def blk_func(values, axis: Axis = 1):
if isinstance(values, ExtensionArray):
if not is_1d_only_ea_dtype(values.dtype) and not isinstance(
Expand All @@ -10516,51 +10512,40 @@ def _get_data() -> DataFrame:
data = self._get_bool_data()
return data

if numeric_only or axis == 0:
# For numeric_only non-None and axis non-None, we know
# which blocks to use and no try/except is needed.
# For numeric_only=None, only the unambiguous case (axis==0 and no object
# dtypes) can be handled with BlockManager.reduce
# Case with EAs see GH#35881
df = self
if numeric_only:
df = _get_data()
if axis == 1:
df = df.T
axis = 0

# After possibly _get_data and transposing, we are now in the
# simple case where we can use BlockManager.reduce
res = df._mgr.reduce(blk_func)
out = df._constructor(res).iloc[0]
if out_dtype is not None:
out = out.astype(out_dtype)
if axis == 0 and len(self) == 0 and name in ["sum", "prod"]:
# Even if we are object dtype, follow numpy and return
# float64, see test_apply_funcs_over_empty
out = out.astype(np.float64)

return out

assert not numeric_only and axis == 1

data = self
values = data.values
result = func(values)

if hasattr(result, "dtype"):
if filter_type == "bool" and notna(result).all():
result = result.astype(np.bool_)
elif filter_type is None and is_object_dtype(result.dtype):
try:
result = result.astype(np.float64)
except (ValueError, TypeError):
# try to coerce to the original dtypes item by item if we can
pass
# Case with EAs see GH#35881
df = self
if numeric_only:
df = _get_data()
if axis == 1:
if len(df.index) == 0:
# Taking a transpose would result in no columns, losing the dtype.
# In the empty case, reducing along axis 0 or 1 gives the same
# result dtype, so reduce with axis=0 and ignore values
result = df._reduce(
op,
name,
axis=0,
skipna=skipna,
numeric_only=False,
filter_type=filter_type,
**kwds,
).iloc[:0]
result.index = df.index
return result
df = df.T

# After possibly _get_data and transposing, we are now in the
# simple case where we can use BlockManager.reduce
res = df._mgr.reduce(blk_func)
out = df._constructor(res).iloc[0]
if out_dtype is not None:
out = out.astype(out_dtype)
if len(self) == 0 and name in ["sum", "prod"]:
# Even if we are object dtype, follow numpy and return
# float64, see test_apply_funcs_over_empty
out = out.astype(np.float64)

labels = self._get_agg_axis(axis)
result = self._constructor_sliced(result, index=labels)
return result
return out

def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
"""
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -986,8 +986,9 @@ def reduce(self: T, func: Callable) -> T:
else:
# error: Argument 1 to "append" of "list" has incompatible type
# "ExtensionArray"; expected "ndarray"
dtype = arr.dtype if res is NaT else None
result_arrays.append(
sanitize_array([res], None) # type: ignore[arg-type]
sanitize_array([res], None, dtype=dtype) # type: ignore[arg-type]
)

index = Index._simple_new(np.array([None], dtype=object)) # placeholder
Expand Down
14 changes: 11 additions & 3 deletions pandas/core/nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -622,7 +622,7 @@ def nansum(
3.0
"""
values, mask, dtype, dtype_max, _ = _get_values(
values, skipna, fill_value=0, mask=mask
values, skipna, fill_value=0.0, mask=mask
)
dtype_sum = dtype_max
if is_float_dtype(dtype):
Expand Down Expand Up @@ -1389,7 +1389,7 @@ def nanprod(

if skipna and mask is not None:
values = values.copy()
values[mask] = 1
values[mask] = 1.0
result = values.prod(axis)
# error: Incompatible return value type (got "Union[ndarray, float]", expected
# "float")
Expand Down Expand Up @@ -1500,7 +1500,15 @@ def _maybe_null_out(
result[null_mask] = None
elif result is not NaT:
if check_below_min_count(shape, mask, min_count):
result = np.nan
result_dtype = getattr(result, "dtype", None)
if is_float_dtype(result_dtype):
# Preserve dtype when possible
# mypy doesn't infer result_dtype is not None
result = getattr(
np, f"float{8 * result_dtype.itemsize}" # type: ignore[union-attr]
)("nan")
else:
result = np.nan

return result

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/apply/test_frame_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -1432,7 +1432,7 @@ def test_apply_datetime_tz_issue():
def test_mixed_column_raises(df, method):
# GH 16832
if method == "sum":
msg = r'can only concatenate str \(not "int"\) to str'
msg = r'can only concatenate str \(not "float"\) to str'
else:
msg = "not supported between instances of 'str' and 'float'"
with pytest.raises(TypeError, match=msg):
Expand Down
59 changes: 54 additions & 5 deletions pandas/tests/frame/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,11 +318,26 @@ def wrapper(x):
DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object),
],
)
def test_stat_operators_attempt_obj_array(self, method, df):
def test_stat_operators_attempt_obj_array(
self, method, df, using_array_manager, axis
):
# GH#676
assert df.values.dtype == np.object_
result = getattr(df, method)(1)
expected = getattr(df.astype("f8"), method)(1)
result = getattr(df, method)(axis=axis)
expected = getattr(df.astype("f8"), method)(axis=axis)
# When the underlying values are an np.ndarray with object dtype:
# - When using blocks, `values.sum(axis=1, ...)` returns a np.array of dim 1
# and this remains object dtype
# - When using arrays, `values.sum(axis=0, ...)` returns a Python float
if not using_array_manager and method in ("sum", "prod", "min", "max"):
expected = expected.astype(object)
elif (
using_array_manager
and axis in (0, "index")
and method in ("min", "max")
and 0 in df.columns
):
expected = expected.astype("int64")
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"])
Expand Down Expand Up @@ -695,6 +710,42 @@ def test_sum_corner(self):
assert len(axis0) == 0
assert len(axis1) == 0

@pytest.mark.parametrize(
"index",
[
tm.makeRangeIndex(0),
tm.makeDateIndex(0),
tm.makeNumericIndex(0, dtype=int),
tm.makeNumericIndex(0, dtype=float),
tm.makeDateIndex(0, freq="M"),
tm.makePeriodIndex(0),
],
)
def test_axis_1_empty(self, all_reductions, index, using_array_manager):
df = DataFrame(columns=["a"], index=index)
result = getattr(df, all_reductions)(axis=1)
expected_dtype = {
"any": "bool",
"all": "bool",
"count": "int64",
"sum": "float",
"prod": "float",
"skew": "float",
"kurt": "float",
"sem": "float",
}.get(all_reductions, "object")
if using_array_manager and all_reductions in (
"max",
"min",
"mean",
"std",
"var",
"median",
):
expected_dtype = "float"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Compared with main, this PR does not change behavior for the array manager. For the block manager, the reducers on L738-743 return float dtype on main but object dtype here.

expected = Series([], index=index, dtype=expected_dtype)
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)])
@pytest.mark.parametrize("numeric_only", [None, True, False])
def test_sum_prod_nanops(self, method, unit, numeric_only):
Expand Down Expand Up @@ -1355,13 +1406,11 @@ def test_min_max_dt64_with_NaT_skipna_false(self, request, tz_naive_fixture):
res = df.min(axis=1, skipna=False)
expected = Series([df.loc[0, "a"], pd.NaT])
assert expected.dtype == df["a"].dtype

tm.assert_series_equal(res, expected)

res = df.max(axis=1, skipna=False)
expected = Series([df.loc[0, "b"], pd.NaT])
assert expected.dtype == df["a"].dtype

tm.assert_series_equal(res, expected)

def test_min_max_dt64_api_consistency_with_NaT(self):
Expand Down