BUG: regression when applying groupby aggregation on categorical columns (#31359)

charlesdong1991 · web-flow · commit 06ef193a5c19 · 2020-01-29T14:20:52.000-06:00
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -626,6 +626,54 @@ consistent with the behaviour of :class:`DataFrame` and :class:`Index`.
    DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
    Series([], dtype: float64)
 
+Result dtype inference changes for resample operations
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The rules for the result dtype in :meth:`DataFrame.resample` aggregations have changed for extension types (:issue:`31359`).
+Previously, pandas would attempt to convert the result back to the original dtype, falling back to the usual
+inference rules if that was not possible. Now, pandas will only return a result of the original dtype if the
+scalar values in the result are instances of the extension dtype's scalar type.
+
+.. ipython:: python
+
+   df = pd.DataFrame({"A": ['a', 'b']}, dtype='category',
+                     index=pd.date_range('2000', periods=2))
+   df
+
+
+*pandas 0.25.x*
+
+.. code-block:: python
+
+   >>> df.resample("2D").agg(lambda x: 'a').A.dtype
+   CategoricalDtype(categories=['a', 'b'], ordered=False)
+
+*pandas 1.0.0*
+
+.. ipython:: python
+
+   df.resample("2D").agg(lambda x: 'a').A.dtype
+
+This fixes an inconsistency between ``resample`` and ``groupby``.
+This also fixes a potential bug, where the **values** of the result might change
+depending on how the results are cast back to the original dtype.
+
+*pandas 0.25.x*
+
+.. code-block:: python
+
+   >>> df.resample("2D").agg(lambda x: 'c')
+
+        A
+   0  NaN
+
+*pandas 1.0.0*
+
+.. ipython:: python
+
+   df.resample("2D").agg(lambda x: 'c')
+
+
 .. _whatsnew_100.api_breaking.python:
 
 Increased minimum version for Python
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -813,9 +813,10 @@ def _try_cast(self, result, obj, numeric_only: bool = False):
                 # datetime64tz is handled correctly in agg_series,
                 #  so is excluded here.
 
-                # return the same type (Series) as our caller
-                cls = dtype.construct_array_type()
-                result = try_cast_to_ea(cls, result, dtype=dtype)
+                if len(result) and isinstance(result[0], dtype.type):
+                    cls = dtype.construct_array_type()
+                    result = try_cast_to_ea(cls, result, dtype=dtype)
+
             elif numeric_only and is_numeric_dtype(dtype) or not numeric_only:
                 result = maybe_downcast_to_dtype(result, dtype)
 
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -543,6 +543,17 @@ def _cython_operation(
             if mask.any():
                 result = result.astype("float64")
                 result[mask] = np.nan
+        elif (
+            how == "add"
+            and is_integer_dtype(orig_values.dtype)
+            and is_extension_array_dtype(orig_values.dtype)
+        ):
+            # We need this to ensure that Series[Int64Dtype].resample().sum()
+            # remains int64 dtype.
+            # Two options for avoiding this special case:
+            # 1. mask-aware ops and avoid casting to float with NaN above
+            # 2. specify the result dtype when calling this method
+            result = result.astype("int64")
 
         if kind == "aggregate" and self._filter_empty_groups and not counts.all():
             assert result.ndim != 2
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -663,6 +663,27 @@ def test_aggregate_mixed_types():
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(reason="Not implemented.")
+def test_aggregate_udf_na_extension_type():
+    # https://github.com/pandas-dev/pandas/pull/31359
+    # This is currently failing to cast back to Int64Dtype.
+    # The presence of the NA causes two problems
+    # 1. NA is not an instance of Int64Dtype.type (numpy.int64)
+    # 2. The presence of an NA forces object dtype, so the non-NA value is
+    #    a Python int rather than a NumPy int64. Python ints aren't
+    #    instances of numpy.int64.
+    def aggfunc(x):
+        if all(x > 2):
+            return 1
+        else:
+            return pd.NA
+
+    df = pd.DataFrame({"A": pd.array([1, 2, 3])})
+    result = df.groupby([1, 1, 2]).agg(aggfunc)
+    expected = pd.DataFrame({"A": pd.array([1, pd.NA], dtype="Int64")}, index=[1, 2])
+    tm.assert_frame_equal(result, expected)
+
+
 class TestLambdaMangling:
     def test_basic(self):
         df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -1342,3 +1342,37 @@ def test_series_groupby_categorical_aggregation_getitem():
     result = groups["foo"].agg("mean")
     expected = groups.agg("mean")["foo"]
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "func, expected_values",
+    [(pd.Series.nunique, [1, 1, 2]), (pd.Series.count, [1, 2, 2])],
+)
+def test_groupby_agg_categorical_columns(func, expected_values):
+    # GH 31256
+    df = pd.DataFrame(
+        {
+            "id": [0, 1, 2, 3, 4],
+            "groups": [0, 1, 1, 2, 2],
+            "value": pd.Categorical([0, 0, 0, 0, 1]),
+        }
+    ).set_index("id")
+    result = df.groupby("groups").agg(func)
+
+    expected = pd.DataFrame(
+        {"value": expected_values}, index=pd.Index([0, 1, 2], name="groups"),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_agg_non_numeric():
+    df = pd.DataFrame(
+        {"A": pd.Categorical(["a", "a", "b"], categories=["a", "b", "c"])}
+    )
+    expected = pd.DataFrame({"A": [2, 1]}, index=[1, 2])
+
+    result = df.groupby([1, 2, 1]).agg(pd.Series.nunique)
+    tm.assert_frame_equal(result, expected)
+
+    result = df.groupby([1, 2, 1]).nunique()
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py
@@ -122,7 +122,9 @@ def test_resample_integerarray():
 
     result = ts.resample("3T").mean()
     expected = Series(
-        [1, 4, 7], index=pd.date_range("1/1/2000", periods=3, freq="3T"), dtype="Int64"
+        [1, 4, 7],
+        index=pd.date_range("1/1/2000", periods=3, freq="3T"),
+        dtype="float64",
     )
     tm.assert_series_equal(result, expected)
 
diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py
@@ -105,7 +105,7 @@ def test_resample_categorical_data_with_timedeltaindex():
         index=pd.to_timedelta([0, 10], unit="s"),
     )
     expected = expected.reindex(["Group_obj", "Group"], axis=1)
-    expected["Group"] = expected["Group_obj"].astype("category")
+    expected["Group"] = expected["Group_obj"]
     tm.assert_frame_equal(result, expected)
 
 

Original file line number	Diff line number	Diff line change
`@@ -122,7 +122,9 @@ def test_resample_integerarray():`
`122`	`122`
`123`	`123`	`result = ts.resample("3T").mean()`
`124`	`124`	`expected = Series(`
`125`		`- [1, 4, 7], index=pd.date_range("1/1/2000", periods=3, freq="3T"), dtype="Int64"`
	`125`	`+ [1, 4, 7],`
	`126`	`+ index=pd.date_range("1/1/2000", periods=3, freq="3T"),`
	`127`	`+ dtype="float64",`
`126`	`128`	`)`
`127`	`129`	`tm.assert_series_equal(result, expected)`
`128`	`130`
Original file line number	Diff line number	Diff line change
`@@ -105,7 +105,7 @@ def test_resample_categorical_data_with_timedeltaindex():`
`105`	`105`	`index=pd.to_timedelta([0, 10], unit="s"),`
`106`	`106`	`)`
`107`	`107`	`expected = expected.reindex(["Group_obj", "Group"], axis=1)`
`108`		`- expected["Group"] = expected["Group_obj"].astype("category")`
	`108`	`+ expected["Group"] = expected["Group_obj"]`
`109`	`109`	`tm.assert_frame_equal(result, expected)`
`110`	`110`
`111`	`111`