diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 2abe85f042af1..8025f7762f110 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -626,6 +626,54 @@ consistent with the behaviour of :class:`DataFrame` and :class:`Index`. DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning. Series([], dtype: float64) +Result dtype inference changes for resample operations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The rules for the result dtype in :meth:`DataFrame.resample` aggregations have changed for extension types (:issue:`31359`). +Previously, pandas would attempt to convert the result back to the original dtype, falling back to the usual +inference rules if that was not possible. Now, pandas will only return a result of the original dtype if the +scalar values in the result are instances of the extension dtype's scalar type. + +.. ipython:: python + + df = pd.DataFrame({"A": ['a', 'b']}, dtype='category', + index=pd.date_range('2000', periods=2)) + df + + +*pandas 0.25.x* + +.. code-block:: python + + >>> df.resample("2D").agg(lambda x: 'a').A.dtype + CategoricalDtype(categories=['a', 'b'], ordered=False) + +*pandas 1.0.0* + +.. ipython:: python + + df.resample("2D").agg(lambda x: 'a').A.dtype + +This fixes an inconsistency between ``resample`` and ``groupby``. +This also fixes a potential bug, where the **values** of the result might change +depending on how the results are cast back to the original dtype. + +*pandas 0.25.x* + +.. code-block:: python + + >>> df.resample("2D").agg(lambda x: 'c') + + A + 0 NaN + +*pandas 1.0.0* + +.. ipython:: python + + df.resample("2D").agg(lambda x: 'c') + + .. 
_whatsnew_100.api_breaking.python: Increased minimum version for Python diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index aa21aa452be95..02e9383314d36 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -813,9 +813,10 @@ def _try_cast(self, result, obj, numeric_only: bool = False): # datetime64tz is handled correctly in agg_series, # so is excluded here. - # return the same type (Series) as our caller - cls = dtype.construct_array_type() - result = try_cast_to_ea(cls, result, dtype=dtype) + if len(result) and isinstance(result[0], dtype.type): + cls = dtype.construct_array_type() + result = try_cast_to_ea(cls, result, dtype=dtype) + elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: result = maybe_downcast_to_dtype(result, dtype) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 679d3668523c2..2e95daa392976 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -543,6 +543,17 @@ def _cython_operation( if mask.any(): result = result.astype("float64") result[mask] = np.nan + elif ( + how == "add" + and is_integer_dtype(orig_values.dtype) + and is_extension_array_dtype(orig_values.dtype) + ): + # We need this to ensure that Series[Int64Dtype].resample().sum() + # remains int64 dtype. + # Two options for avoiding this special case + # 1. mask-aware ops and avoid casting to float with NaN above + # 2. 
specify the result dtype when calling this method + result = result.astype("int64") if kind == "aggregate" and self._filter_empty_groups and not counts.all(): assert result.ndim != 2 diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 67bdcc246579e..2d31996a8a964 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -663,6 +663,27 @@ def test_aggregate_mixed_types(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(reason="Not implemented.") +def test_aggregate_udf_na_extension_type(): + # https://github.com/pandas-dev/pandas/pull/31359 + # This is currently failing to cast back to Int64Dtype. + # The presence of the NA causes two problems + # 1. NA is not an instance of Int64Dtype.type (numpy.int64) + # 2. The presence of an NA forces object type, so the non-NA value is + # a Python int rather than a NumPy int64. Python ints aren't + # instances of numpy.int64. 
+ def aggfunc(x): + if all(x > 2): + return 1 + else: + return pd.NA + + df = pd.DataFrame({"A": pd.array([1, 2, 3])}) + result = df.groupby([1, 1, 2]).agg(aggfunc) + expected = pd.DataFrame({"A": pd.array([1, pd.NA], dtype="Int64")}, index=[1, 2]) + tm.assert_frame_equal(result, expected) + + class TestLambdaMangling: def test_basic(self): df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 9323946581a0d..1c2de8c8c223f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1342,3 +1342,37 @@ def test_series_groupby_categorical_aggregation_getitem(): result = groups["foo"].agg("mean") expected = groups.agg("mean")["foo"] tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "func, expected_values", + [(pd.Series.nunique, [1, 1, 2]), (pd.Series.count, [1, 2, 2])], +) +def test_groupby_agg_categorical_columns(func, expected_values): + # 31256 + df = pd.DataFrame( + { + "id": [0, 1, 2, 3, 4], + "groups": [0, 1, 1, 2, 2], + "value": pd.Categorical([0, 0, 0, 0, 1]), + } + ).set_index("id") + result = df.groupby("groups").agg(func) + + expected = pd.DataFrame( + {"value": expected_values}, index=pd.Index([0, 1, 2], name="groups"), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_agg_non_numeric(): + df = pd.DataFrame( + {"A": pd.Categorical(["a", "a", "b"], categories=["a", "b", "c"])} + ) + expected = pd.DataFrame({"A": [2, 1]}, index=[1, 2]) + + result = df.groupby([1, 2, 1]).agg(pd.Series.nunique) + tm.assert_frame_equal(result, expected) + + result = df.groupby([1, 2, 1]).nunique() + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 4860329718f54..3ad82b9e075a8 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ 
-122,7 +122,9 @@ def test_resample_integerarray(): result = ts.resample("3T").mean() expected = Series( - [1, 4, 7], index=pd.date_range("1/1/2000", periods=3, freq="3T"), dtype="Int64" + [1, 4, 7], + index=pd.date_range("1/1/2000", periods=3, freq="3T"), + dtype="float64", ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index d1bcdc55cb509..a4d14f127b80e 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -105,7 +105,7 @@ def test_resample_categorical_data_with_timedeltaindex(): index=pd.to_timedelta([0, 10], unit="s"), ) expected = expected.reindex(["Group_obj", "Group"], axis=1) - expected["Group"] = expected["Group_obj"].astype("category") + expected["Group"] = expected["Group_obj"] tm.assert_frame_equal(result, expected)