Skip to content

Commit 624a1be

Browse files
authored
BUG: DataFrame.groupby aggregations with categorical columns lead to incorrect results. (#32546)
1 parent 151b6e0 commit 624a1be

File tree

3 files changed

+36
-0
lines changed

3 files changed

+36
-0
lines changed

doc/source/whatsnew/v1.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -976,6 +976,7 @@ Groupby/resample/rolling
976976

977977
- Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`)
978978
- Bug in :meth:`DataFrameGroupBy.transform` produces incorrect result with transformation functions (:issue:`30918`)
979+
- Bug in :meth:`GroupBy.transform` returning the wrong result when grouping by multiple keys, some of which are categorical and others not (:issue:`32494`)
979980
- Bug in :meth:`GroupBy.count` causes segmentation fault when grouped-by column contains NaNs (:issue:`32841`)
980981
- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean series (:issue:`32894`)
981982
- Bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` where a large negative number would be returned when the number of non-null values was below ``min_count`` for nullable integer dtypes (:issue:`32861`)

pandas/core/groupby/generic.py

+2
Original file line numberDiff line numberDiff line change
@@ -546,6 +546,7 @@ def _transform_fast(self, result, func_nm: str) -> Series:
546546
builtin/cythonizable functions
547547
"""
548548
ids, _, ngroup = self.grouper.group_info
549+
result = result.reindex(self.grouper.result_index, copy=False)
549550
cast = self._transform_should_cast(func_nm)
550551
out = algorithms.take_1d(result._values, ids)
551552
if cast:
@@ -1496,6 +1497,7 @@ def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame:
14961497
# for each col, reshape to size of original frame
14971498
# by take operation
14981499
ids, _, ngroup = self.grouper.group_info
1500+
result = result.reindex(self.grouper.result_index, copy=False)
14991501
output = []
15001502
for i, _ in enumerate(result.columns):
15011503
res = algorithms.take_1d(result.iloc[:, i].values, ids)

pandas/tests/groupby/transform/test_transform.py

+33
Original file line numberDiff line numberDiff line change
@@ -1205,3 +1205,36 @@ def test_transform_lambda_indexing():
12051205
),
12061206
)
12071207
tm.assert_frame_equal(result, expected)
1208+
1209+
1210+
def test_categorical_and_not_categorical_key(observed):
1211+
# Checks that groupby-transform, when grouping by both a categorical
1212+
# and a non-categorical key, doesn't try to expand the output to include
1213+
# non-observed categories but instead matches the input shape.
1214+
# GH 32494
1215+
df_with_categorical = pd.DataFrame(
1216+
{
1217+
"A": pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"]),
1218+
"B": [1, 2, 3],
1219+
"C": ["a", "b", "a"],
1220+
}
1221+
)
1222+
df_without_categorical = pd.DataFrame(
1223+
{"A": ["a", "b", "a"], "B": [1, 2, 3], "C": ["a", "b", "a"]}
1224+
)
1225+
1226+
# DataFrame case
1227+
result = df_with_categorical.groupby(["A", "C"], observed=observed).transform("sum")
1228+
expected = df_without_categorical.groupby(["A", "C"]).transform("sum")
1229+
tm.assert_frame_equal(result, expected)
1230+
expected_explicit = pd.DataFrame({"B": [4, 2, 4]})
1231+
tm.assert_frame_equal(result, expected_explicit)
1232+
1233+
# Series case
1234+
result = df_with_categorical.groupby(["A", "C"], observed=observed)["B"].transform(
1235+
"sum"
1236+
)
1237+
expected = df_without_categorical.groupby(["A", "C"])["B"].transform("sum")
1238+
tm.assert_series_equal(result, expected)
1239+
expected_explicit = pd.Series([4, 2, 4], name="B")
1240+
tm.assert_series_equal(result, expected_explicit)

0 commit comments

Comments
 (0)