Skip to content

Commit b836a88

Browse files
authored
BUG: DataFrame reductions dtypes on object input (#51335)
* BUG: DataFrame reductions dtypes * whatsnew * dtype fixup; whatsnew * Add test, fix whatsnew * Add datetime test * result_dtype.type * xfail * type-ignore
1 parent 1beec62 commit b836a88

File tree

8 files changed

+126
-72
lines changed

8 files changed

+126
-72
lines changed

doc/source/whatsnew/v2.0.0.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -814,7 +814,7 @@ Other API changes
814814
- The levels of the index of the :class:`Series` returned from ``Series.sparse.from_coo`` now always have dtype ``int32``. Previously they had dtype ``int64`` (:issue:`50926`)
815815
- :func:`to_datetime` with ``unit`` of either "Y" or "M" will now raise if a sequence contains a non-round ``float`` value, matching the ``Timestamp`` behavior (:issue:`50301`)
816816
- The methods :meth:`Series.round`, :meth:`DataFrame.__invert__`, :meth:`Series.__invert__`, :meth:`DataFrame.swapaxes`, :meth:`DataFrame.first`, :meth:`DataFrame.last`, :meth:`Series.first`, :meth:`Series.last` and :meth:`DataFrame.align` will now always return new objects (:issue:`51032`)
817-
- :class:`DataFrameGroupBy` aggregations (e.g. "sum") with object-dtype columns no longer infer non-object dtypes for their results, explicitly call ``result.infer_objects(copy=False)`` on the result to obtain the old behavior (:issue:`51205`)
817+
- :class:`DataFrame` and :class:`DataFrameGroupBy` aggregations (e.g. "sum") with object-dtype columns no longer infer non-object dtypes for their results; explicitly call ``result.infer_objects(copy=False)`` on the result to obtain the old behavior (:issue:`51205`, :issue:`49603`)
818818
- Added :func:`pandas.api.types.is_any_real_numeric_dtype` to check for real numeric dtypes (:issue:`51152`)
819819
-
820820

@@ -1226,11 +1226,11 @@ Numeric
12261226
^^^^^^^
12271227
- Bug in :meth:`DataFrame.add` where a ufunc could not be applied when inputs contained a mix of DataFrame and Series types (:issue:`39853`)
12281228
- Bug in arithmetic operations on :class:`Series` not propagating mask when combining masked dtypes and numpy dtypes (:issue:`45810`, :issue:`42630`)
1229-
- Bug in DataFrame reduction methods (e.g. :meth:`DataFrame.sum`) with object dtype, ``axis=1`` and ``numeric_only=False`` would not be coerced to float (:issue:`49551`)
12301229
- Bug in :meth:`DataFrame.sem` and :meth:`Series.sem` where an erroneous ``TypeError`` would always raise when using data backed by an :class:`ArrowDtype` (:issue:`49759`)
12311230
- Bug in :meth:`Series.__add__` casting to object for list and masked :class:`Series` (:issue:`22962`)
12321231
- Bug in :meth:`~arrays.ArrowExtensionArray.mode` where ``dropna=False`` was not respected when there were ``NA`` values (:issue:`50982`)
12331232
- Bug in :meth:`DataFrame.query` with ``engine="numexpr"`` where column names ``min`` or ``max`` would raise a ``TypeError`` (:issue:`50937`)
1233+
- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with tz-aware data containing ``pd.NaT`` and ``axis=1`` would return incorrect results (:issue:`51242`)
12341234

12351235
Conversion
12361236
^^^^^^^^^^

pandas/conftest.py

+8
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,14 @@ def ordered(request):
293293
return request.param
294294

295295

296+
@pytest.fixture(params=[True, False])
297+
def skipna(request):
298+
"""
299+
Boolean 'skipna' parameter.
300+
"""
301+
return request.param
302+
303+
296304
@pytest.fixture(params=["first", "last", False])
297305
def keep(request):
298306
"""

pandas/core/frame.py

+36-47
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,6 @@
141141
is_integer_dtype,
142142
is_iterator,
143143
is_list_like,
144-
is_object_dtype,
145144
is_scalar,
146145
is_sequence,
147146
needs_i8_conversion,
@@ -10458,54 +10457,44 @@ def _get_data() -> DataFrame:
1045810457
data = self._get_bool_data()
1045910458
return data
1046010459

10461-
if numeric_only or axis == 0:
10462-
# For numeric_only non-None and axis non-None, we know
10463-
# which blocks to use and no try/except is needed.
10464-
# For numeric_only=None only the case with axis==0 and no object
10465-
# dtypes are unambiguous can be handled with BlockManager.reduce
10466-
# Case with EAs see GH#35881
10467-
df = self
10468-
if numeric_only:
10469-
df = _get_data()
10470-
if axis == 1:
10471-
df = df.T
10472-
axis = 0
10473-
10474-
# After possibly _get_data and transposing, we are now in the
10475-
# simple case where we can use BlockManager.reduce
10476-
res = df._mgr.reduce(blk_func)
10477-
out = df._constructor(res).iloc[0]
10478-
if out_dtype is not None:
10479-
out = out.astype(out_dtype)
10480-
if axis == 0 and len(self) == 0 and name in ["sum", "prod"]:
10481-
# Even if we are object dtype, follow numpy and return
10482-
# float64, see test_apply_funcs_over_empty
10483-
out = out.astype(np.float64)
10484-
10485-
return out
10486-
10487-
assert not numeric_only and axis in (1, None)
10488-
10489-
data = self
10490-
values = data.values
10491-
result = func(values)
10492-
10493-
if hasattr(result, "dtype"):
10494-
if filter_type == "bool" and notna(result).all():
10495-
result = result.astype(np.bool_)
10496-
elif filter_type is None and is_object_dtype(result.dtype):
10497-
try:
10498-
result = result.astype(np.float64)
10499-
except (ValueError, TypeError):
10500-
# try to coerce to the original dtypes item by item if we can
10501-
pass
10502-
10460+
# Case with EAs see GH#35881
10461+
df = self
10462+
if numeric_only:
10463+
df = _get_data()
1050310464
if axis is None:
10504-
return result
10465+
return func(df.values)
10466+
elif axis == 1:
10467+
if len(df.index) == 0:
10468+
# Taking a transpose would result in no columns, losing the dtype.
10469+
# In the empty case, reducing along axis 0 or 1 gives the same
10470+
# result dtype, so reduce with axis=0 and ignore values
10471+
result = df._reduce(
10472+
op,
10473+
name,
10474+
axis=0,
10475+
skipna=skipna,
10476+
numeric_only=False,
10477+
filter_type=filter_type,
10478+
**kwds,
10479+
).iloc[:0]
10480+
result.index = df.index
10481+
return result
10482+
df = df.T
10483+
10484+
# After possibly _get_data and transposing, we are now in the
10485+
# simple case where we can use BlockManager.reduce
10486+
res = df._mgr.reduce(blk_func)
10487+
out = df._constructor(res).iloc[0]
10488+
if out_dtype is not None:
10489+
out = out.astype(out_dtype)
10490+
elif (df._mgr.get_dtypes() == object).any():
10491+
out = out.astype(object)
10492+
elif len(self) == 0 and name in ("sum", "prod"):
10493+
# Even if we are object dtype, follow numpy and return
10494+
# float64, see test_apply_funcs_over_empty
10495+
out = out.astype(np.float64)
1050510496

10506-
labels = self._get_agg_axis(axis)
10507-
result = self._constructor_sliced(result, index=labels)
10508-
return result
10497+
return out
1050910498

1051010499
def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
1051110500
"""

pandas/core/internals/array_manager.py

+4-8
Original file line numberDiff line numberDiff line change
@@ -984,14 +984,10 @@ def reduce(self: T, func: Callable) -> T:
984984
# TODO NaT doesn't preserve dtype, so we need to ensure to create
985985
# a timedelta result array if original was timedelta
986986
# what if datetime results in timedelta? (eg std)
987-
if res is NaT and is_timedelta64_ns_dtype(arr.dtype):
988-
result_arrays.append(np.array(["NaT"], dtype="timedelta64[ns]"))
989-
else:
990-
# error: Argument 1 to "append" of "list" has incompatible type
991-
# "ExtensionArray"; expected "ndarray"
992-
result_arrays.append(
993-
sanitize_array([res], None) # type: ignore[arg-type]
994-
)
987+
dtype = arr.dtype if res is NaT else None
988+
result_arrays.append(
989+
sanitize_array([res], None, dtype=dtype) # type: ignore[arg-type]
990+
)
995991

996992
index = Index._simple_new(np.array([None], dtype=object)) # placeholder
997993
columns = self.items

pandas/core/nanops.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -1535,7 +1535,12 @@ def _maybe_null_out(
15351535
result[null_mask] = None
15361536
elif result is not NaT:
15371537
if check_below_min_count(shape, mask, min_count):
1538-
result = np.nan
1538+
result_dtype = getattr(result, "dtype", None)
1539+
if is_float_dtype(result_dtype):
1540+
# error: Item "None" of "Optional[Any]" has no attribute "type"
1541+
result = result_dtype.type("nan") # type: ignore[union-attr]
1542+
else:
1543+
result = np.nan
15391544

15401545
return result
15411546

pandas/tests/apply/test_frame_apply.py

+2
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,8 @@ def test_apply_funcs_over_empty(func):
135135

136136
result = df.apply(getattr(np, func))
137137
expected = getattr(df, func)()
138+
if func in ("sum", "prod"):
139+
expected = expected.astype(float)
138140
tm.assert_series_equal(result, expected)
139141

140142

pandas/tests/frame/test_reductions.py

+68-6
Original file line numberDiff line numberDiff line change
@@ -317,11 +317,11 @@ def wrapper(x):
317317
DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object),
318318
],
319319
)
320-
def test_stat_operators_attempt_obj_array(self, method, df):
320+
def test_stat_operators_attempt_obj_array(self, method, df, axis):
321321
# GH#676
322322
assert df.values.dtype == np.object_
323-
result = getattr(df, method)(1)
324-
expected = getattr(df.astype("f8"), method)(1)
323+
result = getattr(df, method)(axis=axis)
324+
expected = getattr(df.astype("f8"), method)(axis=axis).astype(object)
325325
tm.assert_series_equal(result, expected)
326326

327327
@pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"])
@@ -424,7 +424,7 @@ def test_mean_mixed_string_decimal(self):
424424
with pytest.raises(TypeError, match="unsupported operand type"):
425425
df.mean()
426426
result = df[["A", "C"]].mean()
427-
expected = Series([2.7, 681.6], index=["A", "C"])
427+
expected = Series([2.7, 681.6], index=["A", "C"], dtype=object)
428428
tm.assert_series_equal(result, expected)
429429

430430
def test_var_std(self, datetime_frame):
@@ -687,6 +687,29 @@ def test_std_timedelta64_skipna_false(self):
687687
expected = Series([pd.Timedelta(0)] * 8 + [pd.NaT, pd.Timedelta(0)])
688688
tm.assert_series_equal(result, expected)
689689

690+
@pytest.mark.parametrize(
691+
"values", [["2022-01-01", "2022-01-02", pd.NaT, "2022-01-03"], 4 * [pd.NaT]]
692+
)
693+
def test_std_datetime64_with_nat(
694+
self, values, skipna, using_array_manager, request
695+
):
696+
# GH#51335
697+
if using_array_manager and (
698+
not skipna or all(value is pd.NaT for value in values)
699+
):
700+
mark = pytest.mark.xfail(
701+
reason="GH#51446: Incorrect type inference on NaT in reduction result"
702+
)
703+
request.node.add_marker(mark)
704+
df = DataFrame({"a": to_datetime(values)})
705+
result = df.std(skipna=skipna)
706+
if not skipna or all(value is pd.NaT for value in values):
707+
expected = Series({"a": pd.NaT}, dtype="timedelta64[ns]")
708+
else:
709+
# 86400000000000ns == 1 day
710+
expected = Series({"a": 86400000000000}, dtype="timedelta64[ns]")
711+
tm.assert_series_equal(result, expected)
712+
690713
def test_sum_corner(self):
691714
empty_frame = DataFrame()
692715

@@ -697,6 +720,29 @@ def test_sum_corner(self):
697720
assert len(axis0) == 0
698721
assert len(axis1) == 0
699722

723+
@pytest.mark.parametrize(
724+
"index",
725+
[
726+
tm.makeRangeIndex(0),
727+
tm.makeDateIndex(0),
728+
tm.makeNumericIndex(0, dtype=int),
729+
tm.makeNumericIndex(0, dtype=float),
730+
tm.makeDateIndex(0, freq="M"),
731+
tm.makePeriodIndex(0),
732+
],
733+
)
734+
def test_axis_1_empty(self, all_reductions, index, using_array_manager):
735+
df = DataFrame(columns=["a"], index=index)
736+
result = getattr(df, all_reductions)(axis=1)
737+
if all_reductions in ("any", "all"):
738+
expected_dtype = "bool"
739+
elif all_reductions == "count":
740+
expected_dtype = "int64"
741+
else:
742+
expected_dtype = "object"
743+
expected = Series([], index=index, dtype=expected_dtype)
744+
tm.assert_series_equal(result, expected)
745+
700746
@pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)])
701747
@pytest.mark.parametrize("numeric_only", [None, True, False])
702748
def test_sum_prod_nanops(self, method, unit, numeric_only):
@@ -1418,6 +1464,21 @@ def test_preserve_timezone(self, initial: str, method):
14181464
result = getattr(df, method)(axis=1)
14191465
tm.assert_series_equal(result, expected)
14201466

1467+
@pytest.mark.parametrize("method", ["min", "max"])
1468+
def test_minmax_tzaware_skipna_axis_1(self, method, skipna):
1469+
# GH#51242
1470+
val = to_datetime("1900-01-01", utc=True)
1471+
df = DataFrame(
1472+
{"a": Series([pd.NaT, pd.NaT, val]), "b": Series([pd.NaT, val, val])}
1473+
)
1474+
op = getattr(df, method)
1475+
result = op(axis=1, skipna=skipna)
1476+
if skipna:
1477+
expected = Series([pd.NaT, val, val])
1478+
else:
1479+
expected = Series([pd.NaT, pd.NaT, val])
1480+
tm.assert_series_equal(result, expected)
1481+
14211482
def test_frame_any_with_timedelta(self):
14221483
# GH#17667
14231484
df = DataFrame(
@@ -1609,12 +1670,13 @@ def test_prod_sum_min_count_mixed_object():
16091670

16101671

16111672
@pytest.mark.parametrize("method", ["min", "max", "mean", "median", "skew", "kurt"])
1612-
def test_reduction_axis_none_returns_scalar(method):
1673+
@pytest.mark.parametrize("numeric_only", [True, False])
1674+
def test_reduction_axis_none_returns_scalar(method, numeric_only):
16131675
# GH#21597 As of 2.0, axis=None reduces over all axes.
16141676

16151677
df = DataFrame(np.random.randn(4, 4))
16161678

1617-
result = getattr(df, method)(axis=None)
1679+
result = getattr(df, method)(axis=None, numeric_only=numeric_only)
16181680
np_arr = df.to_numpy()
16191681
if method in {"skew", "kurt"}:
16201682
comp_mod = pytest.importorskip("scipy.stats")

pandas/tests/test_nanops.py

-8
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,6 @@
2121
use_bn = nanops._USE_BOTTLENECK
2222

2323

24-
@pytest.fixture(params=[True, False])
25-
def skipna(request):
26-
"""
27-
Fixture to pass skipna to nanops functions.
28-
"""
29-
return request.param
30-
31-
3224
@pytest.fixture
3325
def disable_bottleneck(monkeypatch):
3426
with monkeypatch.context() as m:

0 commit comments

Comments
 (0)