From 822ce66299ef03fb1e2d62d4795174b63d37cc71 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 6 Feb 2023 15:20:57 -0800 Subject: [PATCH 1/2] API: dont infer dtype for object-dtype groupby reductions --- doc/source/whatsnew/v2.0.0.rst | 2 ++ pandas/core/groupby/groupby.py | 6 ++++-- .../tests/groupby/aggregate/test_aggregate.py | 2 ++ pandas/tests/groupby/aggregate/test_other.py | 1 + pandas/tests/groupby/test_function.py | 6 ++++++ pandas/tests/groupby/test_groupby.py | 4 +++- pandas/tests/groupby/test_min_max.py | 18 ++++++++++++------ 7 files changed, 30 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index df30a31889a99..9afc1bae6d6d4 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -760,6 +760,8 @@ Other API changes - The levels of the index of the :class:`Series` returned from ``Series.sparse.from_coo`` now always have dtype ``int32``. Previously they had dtype ``int64`` (:issue:`50926`) - :func:`to_datetime` with ``unit`` of either "Y" or "M" will now raise if a sequence contains a non-round ``float`` value, matching the ``Timestamp`` behavior (:issue:`50301`) - The methods :meth:`Series.round`, :meth:`DataFrame.__invert__`, :meth:`Series.__invert__`, :meth:`DataFrame.swapaxes`, :meth:`DataFrame.first`, :meth:`DataFrame.last`, :meth:`Series.first`, :meth:`Series.last` and :meth:`DataFrame.align` will now always return new objects (:issue:`51032`) +- :class:`DataFrameGroupBy` aggregations (e.g. "sum") with object-dtype columns no longer infer non-object dtypes for their results, explicitly call ``result.infer_objects(copy=False)`` on the result to obtain the old behavior (:issue:`??`) +- .. --------------------------------------------------------------------------- .. 
_whatsnew_200.deprecations: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5f5bb1c8833da..8158729e7f779 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1482,6 +1482,9 @@ def _agg_py_fallback( # TODO: if we ever get "rank" working, exclude it here. res_values = type(values)._from_sequence(res_values, dtype=values.dtype) + elif ser.dtype == object: + res_values = res_values.astype(object, copy=False) + # If we are DataFrameGroupBy and went through a SeriesGroupByPath # then we need to reshape # GH#32223 includes case with IntegerArray values, ndarray res_values @@ -1524,8 +1527,7 @@ def array_func(values: ArrayLike) -> ArrayLike: new_mgr = data.grouped_reduce(array_func) res = self._wrap_agged_manager(new_mgr) out = self._wrap_aggregated_output(res) - if data.ndim == 2: - # TODO: don't special-case DataFrame vs Series + if self.axis == 1: out = out.infer_objects(copy=False) return out diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index e7be78be55620..210dba8bbc44c 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -258,6 +258,7 @@ def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype expected = DataFrame([expected] * 3, columns=["i", "j", "k"]).astype( result_dtype_dict ) + tm.assert_frame_equal(result, expected) @@ -675,6 +676,7 @@ def test_agg_split_object_part_datetime(): "F": [1], }, index=np.array([0]), + dtype=object, ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index eb667016b1e62..aad1218190a84 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -517,6 +517,7 @@ def test_sum_uint64_overflow(): expected = DataFrame( {1: [9223372036854775809, 9223372036854775811, 
9223372036854775813]}, index=index, + dtype=object, ) expected.index.name = 0 diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 1fd61e6eb268e..b51fc31eaef6d 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1509,6 +1509,12 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): "sum", "diff", "pct_change", + "var", + "mean", + "median", + "min", + "max", + "prod", ) # Test default behavior; kernels that fail may be enabled in the future but kernels diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index d7b015fa7104a..10118bb83a4a3 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2408,7 +2408,9 @@ def test_groupby_duplicate_columns(): ).astype(object) df.columns = ["A", "B", "B"] result = df.groupby([0, 0, 0, 0]).min() - expected = DataFrame([["e", "a", 1]], index=np.array([0]), columns=["A", "B", "B"]) + expected = DataFrame( + [["e", "a", 1]], index=np.array([0]), columns=["A", "B", "B"], dtype=object + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_min_max.py b/pandas/tests/groupby/test_min_max.py index 2a997b3c84216..11f62c5d03c49 100644 --- a/pandas/tests/groupby/test_min_max.py +++ b/pandas/tests/groupby/test_min_max.py @@ -148,9 +148,13 @@ def test_aggregate_numeric_object_dtype(): {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4}, ).astype(object) result = df.groupby("key").min() - expected = DataFrame( - {"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]} - ).set_index("key") + expected = ( + DataFrame( + {"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]}, + ) + .set_index("key") + .astype(object) + ) tm.assert_frame_equal(result, expected) # same but with numbers @@ -158,9 +162,11 @@ def test_aggregate_numeric_object_dtype(): {"key": ["A", "A", "B", "B"], "col1": 
list("abcd"), "col2": range(4)}, ).astype(object) result = df.groupby("key").min() - expected = DataFrame( - {"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]} - ).set_index("key") + expected = ( + DataFrame({"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]}) + .set_index("key") + .astype(object) + ) tm.assert_frame_equal(result, expected) From 730e10e8bb267abd15340642f885cb001d7e0b16 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 6 Feb 2023 15:22:02 -0800 Subject: [PATCH 2/2] GH ref --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 9afc1bae6d6d4..2f1ec69d80773 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -760,7 +760,7 @@ Other API changes - The levels of the index of the :class:`Series` returned from ``Series.sparse.from_coo`` now always have dtype ``int32``. Previously they had dtype ``int64`` (:issue:`50926`) - :func:`to_datetime` with ``unit`` of either "Y" or "M" will now raise if a sequence contains a non-round ``float`` value, matching the ``Timestamp`` behavior (:issue:`50301`) - The methods :meth:`Series.round`, :meth:`DataFrame.__invert__`, :meth:`Series.__invert__`, :meth:`DataFrame.swapaxes`, :meth:`DataFrame.first`, :meth:`DataFrame.last`, :meth:`Series.first`, :meth:`Series.last` and :meth:`DataFrame.align` will now always return new objects (:issue:`51032`) -- :class:`DataFrameGroupBy` aggregations (e.g. "sum") with object-dtype columns no longer infer non-object dtypes for their results, explicitly call ``result.infer_objects(copy=False)`` on the result to obtain the old behavior (:issue:`??`) +- :class:`DataFrameGroupBy` aggregations (e.g. "sum") with object-dtype columns no longer infer non-object dtypes for their results. Explicitly call ``result.infer_objects(copy=False)`` on the result to obtain the old behavior (:issue:`51205`) - .. 
---------------------------------------------------------------------------