Skip to content

Commit c39773e

Browse files
authored
BUG: DataFrame.groupby(., dropna=True, axis=0) incorrectly throws ShapeError (#35751)
1 parent e06fbbe commit c39773e

File tree

5 files changed

+58
-27
lines changed

5 files changed

+58
-27
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@ Indexing
224224
Missing
225225
^^^^^^^
226226

227+
- Bug in :class:`Grouper` where the ``dropna`` argument was not propagated, and in :meth:`DataFrameGroupBy.transform` where missing values were not handled correctly for ``dropna=True`` (:issue:`35612`)
227228
-
228229
-
229230

pandas/core/groupby/generic.py

-1
Original file line numberDiff line numberDiff line change
@@ -553,7 +553,6 @@ def _transform_general(self, func, *args, **kwargs):
553553
result = maybe_downcast_numeric(result, self._selected_obj.dtype)
554554

555555
result.name = self._selected_obj.name
556-
result.index = self._selected_obj.index
557556
return result
558557

559558
def _transform_fast(self, result) -> Series:

pandas/core/groupby/groupby.py

+22-8
Original file line numberDiff line numberDiff line change
@@ -729,14 +729,28 @@ def _set_result_index_ordered(
729729
# set the result index on the passed values object and
730730
# return the new object, xref 8046
731731

732-
# the values/counts are repeated according to the group index
733-
# shortcut if we have an already ordered grouper
734-
if not self.grouper.is_monotonic:
735-
index = Index(np.concatenate(self._get_indices(self.grouper.result_index)))
736-
result.set_axis(index, axis=self.axis, inplace=True)
737-
result = result.sort_index(axis=self.axis)
738-
739-
result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
732+
if self.grouper.is_monotonic:
733+
# shortcut if we have an already ordered grouper
734+
result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
735+
return result
736+
737+
# row order is scrambled => sort the rows by position in original index
738+
original_positions = Index(
739+
np.concatenate(self._get_indices(self.grouper.result_index))
740+
)
741+
result.set_axis(original_positions, axis=self.axis, inplace=True)
742+
result = result.sort_index(axis=self.axis)
743+
744+
dropped_rows = len(result.index) < len(self.obj.index)
745+
746+
if dropped_rows:
747+
# get index by slicing original index according to original positions
748+
# slice drops attrs => use set_axis when no rows were dropped
749+
sorted_indexer = result.index
750+
result.index = self._selected_obj.index[sorted_indexer]
751+
else:
752+
result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
753+
740754
return result
741755

742756
@final

pandas/tests/groupby/test_groupby_dropna.py

+34-17
Original file line numberDiff line numberDiff line change
@@ -171,36 +171,53 @@ def test_grouper_dropna_propagation(dropna):
171171

172172

173173
@pytest.mark.parametrize(
174-
"dropna,df_expected,s_expected",
174+
"dropna,input_index,expected_data,expected_index",
175175
[
176-
pytest.param(
176+
(True, pd.RangeIndex(0, 4), {"B": [2, 2, 1]}, pd.RangeIndex(0, 3)),
177+
(True, list("abcd"), {"B": [2, 2, 1]}, list("abc")),
178+
(
177179
True,
178-
pd.DataFrame({"B": [2, 2, 1]}),
179-
pd.Series(data=[2, 2, 1], name="B"),
180-
marks=pytest.mark.xfail(raises=ValueError),
180+
pd.MultiIndex.from_tuples(
181+
[(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
182+
),
183+
{"B": [2, 2, 1]},
184+
pd.MultiIndex.from_tuples(
185+
[(1, "R"), (1, "B"), (2, "R")], names=["num", "col"]
186+
),
181187
),
188+
(False, pd.RangeIndex(0, 4), {"B": [2, 2, 1, 1]}, pd.RangeIndex(0, 4)),
189+
(False, list("abcd"), {"B": [2, 2, 1, 1]}, list("abcd")),
182190
(
183191
False,
184-
pd.DataFrame({"B": [2, 2, 1, 1]}),
185-
pd.Series(data=[2, 2, 1, 1], name="B"),
192+
pd.MultiIndex.from_tuples(
193+
[(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
194+
),
195+
{"B": [2, 2, 1, 1]},
196+
pd.MultiIndex.from_tuples(
197+
[(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
198+
),
186199
),
187200
],
188201
)
189-
def test_slice_groupby_then_transform(dropna, df_expected, s_expected):
190-
# GH35014
202+
def test_groupby_dataframe_slice_then_transform(
203+
dropna, input_index, expected_data, expected_index
204+
):
205+
# GH35014 & GH35612
191206

192-
df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]})
207+
df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=input_index)
193208
gb = df.groupby("A", dropna=dropna)
194209

195-
res = gb.transform(len)
196-
tm.assert_frame_equal(res, df_expected)
210+
result = gb.transform(len)
211+
expected = pd.DataFrame(expected_data, index=expected_index)
212+
tm.assert_frame_equal(result, expected)
197213

198-
gb_slice = gb[["B"]]
199-
res = gb_slice.transform(len)
200-
tm.assert_frame_equal(res, df_expected)
214+
result = gb[["B"]].transform(len)
215+
expected = pd.DataFrame(expected_data, index=expected_index)
216+
tm.assert_frame_equal(result, expected)
201217

202-
res = gb["B"].transform(len)
203-
tm.assert_series_equal(res, s_expected)
218+
result = gb["B"].transform(len)
219+
expected = pd.Series(expected_data["B"], index=expected_index, name="B")
220+
tm.assert_series_equal(result, expected)
204221

205222

206223
@pytest.mark.parametrize(

pandas/tests/groupby/test_grouping.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -626,7 +626,7 @@ def test_list_grouper_with_nat(self):
626626
[
627627
(
628628
"transform",
629-
Series(name=2, dtype=np.float64, index=pd.RangeIndex(0, 0, 1)),
629+
Series(name=2, dtype=np.float64, index=Index([])),
630630
),
631631
(
632632
"agg",

0 commit comments

Comments
 (0)