diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index b006d3820889f..3cc55f8682670 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -778,7 +778,9 @@ Other API changes - The levels of the index of the :class:`Series` returned from ``Series.sparse.from_coo`` now always have dtype ``int32``. Previously they had dtype ``int64`` (:issue:`50926`) - :func:`to_datetime` with ``unit`` of either "Y" or "M" will now raise if a sequence contains a non-round ``float`` value, matching the ``Timestamp`` behavior (:issue:`50301`) - The methods :meth:`Series.round`, :meth:`DataFrame.__invert__`, :meth:`Series.__invert__`, :meth:`DataFrame.swapaxes`, :meth:`DataFrame.first`, :meth:`DataFrame.last`, :meth:`Series.first`, :meth:`Series.last` and :meth:`DataFrame.align` will now always return new objects (:issue:`51032`) +- :class:`DataFrameGroupBy` aggregations (e.g. "sum") with object-dtype columns no longer infer non-object dtypes for their results; explicitly call ``result.infer_objects(copy=False)`` on the result to obtain the old behavior (:issue:`51205`) - Added :func:`pandas.api.types.is_any_real_numeric_dtype` to check for real numeric dtypes (:issue:`51152`) +- .. --------------------------------------------------------------------------- .. _whatsnew_200.deprecations: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ed063a2987188..e42566bfa11a0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1495,6 +1495,9 @@ def _agg_py_fallback( # TODO: if we ever get "rank" working, exclude it here. 
res_values = type(values)._from_sequence(res_values, dtype=values.dtype) + elif ser.dtype == object: + res_values = res_values.astype(object, copy=False) + # If we are DataFrameGroupBy and went through a SeriesGroupByPath # then we need to reshape # GH#32223 includes case with IntegerArray values, ndarray res_values @@ -1537,8 +1540,7 @@ def array_func(values: ArrayLike) -> ArrayLike: new_mgr = data.grouped_reduce(array_func) res = self._wrap_agged_manager(new_mgr) out = self._wrap_aggregated_output(res) - if data.ndim == 2: - # TODO: don't special-case DataFrame vs Series + if self.axis == 1: out = out.infer_objects(copy=False) return out diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index e7be78be55620..210dba8bbc44c 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -258,6 +258,7 @@ def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype expected = DataFrame([expected] * 3, columns=["i", "j", "k"]).astype( result_dtype_dict ) + tm.assert_frame_equal(result, expected) @@ -675,6 +676,7 @@ def test_agg_split_object_part_datetime(): "F": [1], }, index=np.array([0]), + dtype=object, ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index eb667016b1e62..aad1218190a84 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -517,6 +517,7 @@ def test_sum_uint64_overflow(): expected = DataFrame( {1: [9223372036854775809, 9223372036854775811, 9223372036854775813]}, index=index, + dtype=object, ) expected.index.name = 0 diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index e93dd022f46ac..1f081daf41b75 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1509,6 +1509,12 @@ def 
test_deprecate_numeric_only_series(dtype, groupby_func, request): "sum", "diff", "pct_change", + "var", + "mean", + "median", + "min", + "max", + "prod", ) # Test default behavior; kernels that fail may be enabled in the future but kernels diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index a7bd89942ea79..e175f6dda980f 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2380,7 +2380,9 @@ def test_groupby_duplicate_columns(): ).astype(object) df.columns = ["A", "B", "B"] result = df.groupby([0, 0, 0, 0]).min() - expected = DataFrame([["e", "a", 1]], index=np.array([0]), columns=["A", "B", "B"]) + expected = DataFrame( + [["e", "a", 1]], index=np.array([0]), columns=["A", "B", "B"], dtype=object + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_min_max.py b/pandas/tests/groupby/test_min_max.py index 2a997b3c84216..11f62c5d03c49 100644 --- a/pandas/tests/groupby/test_min_max.py +++ b/pandas/tests/groupby/test_min_max.py @@ -148,9 +148,13 @@ def test_aggregate_numeric_object_dtype(): {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4}, ).astype(object) result = df.groupby("key").min() - expected = DataFrame( - {"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]} - ).set_index("key") + expected = ( + DataFrame( + {"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]}, + ) + .set_index("key") + .astype(object) + ) tm.assert_frame_equal(result, expected) # same but with numbers @@ -158,9 +162,11 @@ def test_aggregate_numeric_object_dtype(): {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": range(4)}, ).astype(object) result = df.groupby("key").min() - expected = DataFrame( - {"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]} - ).set_index("key") + expected = ( + DataFrame({"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]}) + .set_index("key") + .astype(object) + ) 
tm.assert_frame_equal(result, expected)