diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index eed712ae7de0f..754426036acaa 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -336,13 +336,14 @@ Plotting
 
 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
-
+- Bug in :meth:`DataFrameGroupBy.agg` and :meth:`SeriesGroupBy.agg` with ``ExtensionDtype`` columns incorrectly casting results too aggressively (:issue:`38254`)
 - Bug in :meth:`SeriesGroupBy.value_counts` where unobserved categories in a grouped categorical series were not tallied (:issue:`38672`)
 - Bug in :meth:`.GroupBy.indices` would contain non-existent indices when null values were present in the groupby keys (:issue:`9304`)
 - Fixed bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` causing loss of precision through using Kahan summation (:issue:`38778`)
 - Fixed bug in :meth:`DataFrameGroupBy.cumsum`, :meth:`SeriesGroupBy.cumsum`, :meth:`DataFrameGroupBy.mean` and :meth:`SeriesGroupBy.mean` causing loss of precision through using Kahan summation (:issue:`38934`)
-- Bug in :meth:`.Resampler.aggregate` and :meth:`DataFrame.transform` raising ``TypeError`` instead of ``SpecificationError`` when missing keys had mixed dtypes (:issue:`39025`)
+- Bug in :meth:`Resampler.aggregate` and :meth:`DataFrame.transform` raising ``TypeError`` instead of ``SpecificationError`` when missing keys had mixed dtypes (:issue:`39025`)
 - Bug in :meth:`.DataFrameGroupBy.idxmin` and :meth:`.DataFrameGroupBy.idxmax` with ``ExtensionDtype`` columns (:issue:`38733`)
+-
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 517b848742541..4c04ba75a26fe 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -29,11 +29,7 @@
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import cache_readonly
 
-from pandas.core.dtypes.cast import (
-    maybe_cast_result,
-    maybe_cast_result_dtype,
-    maybe_downcast_to_dtype,
-)
+from pandas.core.dtypes.cast import maybe_cast_result_dtype, maybe_downcast_to_dtype
 from pandas.core.dtypes.common import (
     ensure_float,
     ensure_float64,
@@ -756,7 +752,7 @@ def _aggregate_series_pure_python(self, obj: Series, func: F):
             result[label] = res
 
         result = lib.maybe_convert_objects(result, try_float=0)
-        result = maybe_cast_result(result, obj, numeric_only=True)
+        # TODO: cast to EA once _from_sequence is reliably strict GH#38254
 
         return result, counts
 
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
index 08768bda312ba..94ae8b8b9906d 100644
--- a/pandas/tests/extension/decimal/test_decimal.py
+++ b/pandas/tests/extension/decimal/test_decimal.py
@@ -449,7 +449,8 @@ def test_groupby_agg():
     )
 
     # single key, selected column
-    expected = pd.Series(to_decimal([data[0], data[3]]))
+    # GH#38254 until _from_sequence is reliably strict, we cant retain dtype
+    expected = pd.Series(to_decimal([data[0], data[3]])).astype(object)
     result = df.groupby("id1")["decimals"].agg(lambda x: x.iloc[0])
     tm.assert_series_equal(result, expected, check_names=False)
     result = df["decimals"].groupby(df["id1"]).agg(lambda x: x.iloc[0])
@@ -459,14 +460,16 @@ def test_groupby_agg():
     expected = pd.Series(
         to_decimal([data[0], data[1], data[3]]),
         index=pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 1)]),
-    )
+    ).astype(object)
     result = df.groupby(["id1", "id2"])["decimals"].agg(lambda x: x.iloc[0])
     tm.assert_series_equal(result, expected, check_names=False)
     result = df["decimals"].groupby([df["id1"], df["id2"]]).agg(lambda x: x.iloc[0])
     tm.assert_series_equal(result, expected, check_names=False)
 
     # multiple columns
-    expected = pd.DataFrame({"id2": [0, 1], "decimals": to_decimal([data[0], data[3]])})
+    expected = pd.DataFrame(
+        {"id2": [0, 1], "decimals": to_decimal([data[0], data[3]]).astype(object)}
+    )
     result = df.groupby("id1").agg(lambda x: x.iloc[0])
     tm.assert_frame_equal(result, expected, check_names=False)
 
@@ -482,7 +485,11 @@ def DecimalArray__my_sum(self):
 
     data = make_data()[:5]
     df = pd.DataFrame({"id": [0, 0, 0, 1, 1], "decimals": DecimalArray(data)})
-    expected = pd.Series(to_decimal([data[0] + data[1] + data[2], data[3] + data[4]]))
+
+    # GH#38254 until _from_sequence is reliably strict, we cant retain dtype
+    expected = pd.Series(
+        to_decimal([data[0] + data[1] + data[2], data[3] + data[4]])
+    ).astype(object)
     result = df.groupby("id")["decimals"].agg(lambda x: x.values.my_sum())
     tm.assert_series_equal(result, expected, check_names=False)
 
diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py
index 5d0f6d6262899..67b99678ebec5 100644
--- a/pandas/tests/groupby/aggregate/test_other.py
+++ b/pandas/tests/groupby/aggregate/test_other.py
@@ -432,10 +432,13 @@ def test_agg_over_numpy_arrays():
     tm.assert_frame_equal(result, expected)
 
 
-def test_agg_tzaware_non_datetime_result():
+@pytest.mark.parametrize("as_period", [True, False])
+def test_agg_tzaware_non_datetime_result(as_period):
     # discussed in GH#29589, fixed in GH#29641, operating on tzaware values
     # with function that is not dtype-preserving
     dti = pd.date_range("2012-01-01", periods=4, tz="UTC")
+    if as_period:
+        dti = dti.tz_localize(None).to_period("D")
     df = DataFrame({"a": [0, 0, 1, 1], "b": dti})
     gb = df.groupby("a")
 
@@ -454,6 +457,9 @@ def test_agg_tzaware_non_datetime_result(as_period):
     result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0])
     expected = Series([pd.Timedelta(days=1), pd.Timedelta(days=1)], name="b")
     expected.index.name = "a"
+    if as_period:
+        expected = Series([pd.offsets.Day(1), pd.offsets.Day(1)], name="b")
+        expected.index.name = "a"
     tm.assert_series_equal(result, expected)
 
 
@@ -627,7 +633,8 @@ def test_groupby_agg_err_catching(err_cls):
         {"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)}
     )
 
-    expected = Series(to_decimal([data[0], data[3]]))
+    # GH#38254 until _from_sequence is strict, we cannot reliably cast agg results
+    expected = Series(to_decimal([data[0], data[3]])).astype(object)
 
     def weird_func(x):
         # weird function that raise something other than TypeError or IndexError