Skip to content

BUG: regression when applying groupby aggregation on categorical colu… #31428

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -626,6 +626,54 @@ consistent with the behaviour of :class:`DataFrame` and :class:`Index`.
DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
Series([], dtype: float64)

Result dtype inference changes for resample operations
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The rules for the result dtype in :meth:`DataFrame.resample` aggregations have changed for extension types (:issue:`31359`).
Previously, pandas would attempt to convert the result back to the original dtype, falling back to the usual
inference rules if that was not possible. Now, pandas will only return a result of the original dtype if the
scalar values in the result are instances of the extension dtype's scalar type.

.. ipython:: python

df = pd.DataFrame({"A": ['a', 'b']}, dtype='category',
index=pd.date_range('2000', periods=2))
df


*pandas 0.25.x*

.. code-block:: python

>>> df.resample("2D").agg(lambda x: 'a').A.dtype
CategoricalDtype(categories=['a', 'b'], ordered=False)

*pandas 1.0.0*

.. ipython:: python

df.resample("2D").agg(lambda x: 'a').A.dtype

This fixes an inconsistency between ``resample`` and ``groupby``.
This also fixes a potential bug, where the **values** of the result might change
depending on how the results are cast back to the original dtype.

*pandas 0.25.x*

.. code-block:: python

>>> df.resample("2D").agg(lambda x: 'c')

A
0 NaN

*pandas 1.0.0*

.. ipython:: python

df.resample("2D").agg(lambda x: 'c')


.. _whatsnew_100.api_breaking.python:

Increased minimum version for Python
Expand Down
7 changes: 4 additions & 3 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -813,9 +813,10 @@ def _try_cast(self, result, obj, numeric_only: bool = False):
# datetime64tz is handled correctly in agg_series,
# so is excluded here.

# return the same type (Series) as our caller
cls = dtype.construct_array_type()
result = try_cast_to_ea(cls, result, dtype=dtype)
if len(result) and isinstance(result[0], dtype.type):
cls = dtype.construct_array_type()
result = try_cast_to_ea(cls, result, dtype=dtype)

elif numeric_only and is_numeric_dtype(dtype) or not numeric_only:
result = maybe_downcast_to_dtype(result, dtype)

Expand Down
11 changes: 11 additions & 0 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,17 @@ def _cython_operation(
if mask.any():
result = result.astype("float64")
result[mask] = np.nan
elif (
how == "add"
and is_integer_dtype(orig_values.dtype)
and is_extension_array_dtype(orig_values.dtype)
):
# We need this to ensure that Series[Int64Dtype].resample().sum()
# remains int64 dtype.
# Two options for avoiding this special case
# 1. mask-aware ops and avoid casting to float with NaN above
# 2. specify the result dtype when calling this method
result = result.astype("int64")

if kind == "aggregate" and self._filter_empty_groups and not counts.all():
assert result.ndim != 2
Expand Down
37 changes: 37 additions & 0 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,43 @@ def test_lambda_named_agg(func):
tm.assert_frame_equal(result, expected)


def test_aggregate_mixed_types():
# GH 16916
df = pd.DataFrame(
data=np.array([0] * 9).reshape(3, 3), columns=list("XYZ"), index=list("abc")
)
df["grouping"] = ["group 1", "group 1", 2]
result = df.groupby("grouping").aggregate(lambda x: x.tolist())
expected_data = [[[0], [0], [0]], [[0, 0], [0, 0], [0, 0]]]
expected = pd.DataFrame(
expected_data,
index=Index([2, "group 1"], dtype="object", name="grouping"),
columns=Index(["X", "Y", "Z"], dtype="object"),
)
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(reason="Not implemented.")
def test_aggregate_udf_na_extension_type():
# https://github.com/pandas-dev/pandas/pull/31359
# This is currently failing to cast back to Int64Dtype.
# The presence of the NA causes two problems
# 1. NA is not an instance of Int64Dtype.type (numpy.int64)
# 2. The presence of an NA forces object type, so the non-NA values is
# a Python int rather than a NumPy int64. Python ints aren't
# instances of numpy.int64.
def aggfunc(x):
if all(x > 2):
return 1
else:
return pd.NA

df = pd.DataFrame({"A": pd.array([1, 2, 3])})
result = df.groupby([1, 1, 2]).agg(aggfunc)
expected = pd.DataFrame({"A": pd.array([1, pd.NA], dtype="Int64")}, index=[1, 2])
tm.assert_frame_equal(result, expected)


class TestLambdaMangling:
def test_maybe_mangle_lambdas_passthrough(self):
assert _maybe_mangle_lambdas("mean") == "mean"
Expand Down
34 changes: 34 additions & 0 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1342,3 +1342,37 @@ def test_series_groupby_categorical_aggregation_getitem():
result = groups["foo"].agg("mean")
expected = groups.agg("mean")["foo"]
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
"func, expected_values",
[(pd.Series.nunique, [1, 1, 2]), (pd.Series.count, [1, 2, 2])],
)
def test_groupby_agg_categorical_columns(func, expected_values):
# 31256
df = pd.DataFrame(
{
"id": [0, 1, 2, 3, 4],
"groups": [0, 1, 1, 2, 2],
"value": pd.Categorical([0, 0, 0, 0, 1]),
}
).set_index("id")
result = df.groupby("groups").agg(func)

expected = pd.DataFrame(
{"value": expected_values}, index=pd.Index([0, 1, 2], name="groups"),
)
tm.assert_frame_equal(result, expected)


def test_groupby_agg_non_numeric():
df = pd.DataFrame(
{"A": pd.Categorical(["a", "a", "b"], categories=["a", "b", "c"])}
)
expected = pd.DataFrame({"A": [2, 1]}, index=[1, 2])

result = df.groupby([1, 2, 1]).agg(pd.Series.nunique)
tm.assert_frame_equal(result, expected)

result = df.groupby([1, 2, 1]).nunique()
tm.assert_frame_equal(result, expected)
4 changes: 3 additions & 1 deletion pandas/tests/resample/test_datetime_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,9 @@ def test_resample_integerarray():

result = ts.resample("3T").mean()
expected = Series(
[1, 4, 7], index=pd.date_range("1/1/2000", periods=3, freq="3T"), dtype="Int64"
[1, 4, 7],
index=pd.date_range("1/1/2000", periods=3, freq="3T"),
dtype="float64",
)
tm.assert_series_equal(result, expected)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/resample/test_timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def test_resample_categorical_data_with_timedeltaindex():
index=pd.to_timedelta([0, 10], unit="s"),
)
expected = expected.reindex(["Group_obj", "Group"], axis=1)
expected["Group"] = expected["Group_obj"].astype("category")
expected["Group"] = expected["Group_obj"]
tm.assert_frame_equal(result, expected)


Expand Down