Skip to content

Commit 25077a3

Browse files
authored
REF: _cython_transform operate blockwise (#41344)
1 parent afab286 commit 25077a3

File tree

5 files changed

+110
-41
lines changed

5 files changed

+110
-41
lines changed

pandas/_libs/groupby.pyx

+16-11
Original file line numberDiff line numberDiff line change
@@ -1136,19 +1136,24 @@ def group_rank(float64_t[:, ::1] out,
11361136
This method modifies the `out` parameter rather than returning an object
11371137
"""
11381138
cdef:
1139+
Py_ssize_t i, k, N
11391140
ndarray[float64_t, ndim=1] result
11401141

1141-
result = rank_1d(
1142-
values=values[:, 0],
1143-
labels=labels,
1144-
is_datetimelike=is_datetimelike,
1145-
ties_method=ties_method,
1146-
ascending=ascending,
1147-
pct=pct,
1148-
na_option=na_option
1149-
)
1150-
for i in range(len(result)):
1151-
out[i, 0] = result[i]
1142+
N = values.shape[1]
1143+
1144+
for k in range(N):
1145+
result = rank_1d(
1146+
values=values[:, k],
1147+
labels=labels,
1148+
is_datetimelike=is_datetimelike,
1149+
ties_method=ties_method,
1150+
ascending=ascending,
1151+
pct=pct,
1152+
na_option=na_option
1153+
)
1154+
for i in range(len(result)):
1155+
# TODO: why can't we do out[:, k] = result?
1156+
out[i, k] = result[i]
11521157

11531158

11541159
# ----------------------------------------------------------------------

pandas/core/groupby/generic.py

+50
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,26 @@ def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
528528
func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
529529
)
530530

531+
def _cython_transform(
532+
self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
533+
):
534+
assert axis == 0 # handled by caller
535+
536+
obj = self._selected_obj
537+
538+
is_numeric = is_numeric_dtype(obj.dtype)
539+
if numeric_only and not is_numeric:
540+
raise DataError("No numeric types to aggregate")
541+
542+
try:
543+
result = self.grouper._cython_operation(
544+
"transform", obj._values, how, axis, **kwargs
545+
)
546+
except (NotImplementedError, TypeError):
547+
raise DataError("No numeric types to aggregate")
548+
549+
return obj._constructor(result, index=self.obj.index, name=obj.name)
550+
531551
def _transform_general(self, func: Callable, *args, **kwargs) -> Series:
532552
"""
533553
Transform with a callable `func`.
@@ -1247,6 +1267,36 @@ def _wrap_applied_output_series(
12471267

12481268
return self._reindex_output(result)
12491269

1270+
def _cython_transform(
1271+
self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
1272+
) -> DataFrame:
1273+
assert axis == 0 # handled by caller
1274+
# TODO: no tests with self.ndim == 1 for DataFrameGroupBy
1275+
1276+
# With self.axis == 0, we have multi-block tests
1277+
# e.g. test_rank_min_int, test_cython_transform_frame
1278+
# test_transform_numeric_ret
1279+
# With self.axis == 1, _get_data_to_aggregate does a transpose
1280+
# so we always have a single block.
1281+
mgr: Manager2D = self._get_data_to_aggregate()
1282+
if numeric_only:
1283+
mgr = mgr.get_numeric_data(copy=False)
1284+
1285+
def arr_func(bvalues: ArrayLike) -> ArrayLike:
1286+
return self.grouper._cython_operation(
1287+
"transform", bvalues, how, 1, **kwargs
1288+
)
1289+
1290+
# We could use `mgr.apply` here and not have to set_axis, but
1291+
# we would have to do shape gymnastics for ArrayManager compat
1292+
res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True)
1293+
res_mgr.set_axis(1, mgr.axes[1])
1294+
1295+
res_df = self.obj._constructor(res_mgr)
1296+
if self.axis == 1:
1297+
res_df = res_df.T
1298+
return res_df
1299+
12501300
def _transform_general(self, func, *args, **kwargs):
12511301
from pandas.core.reshape.concat import concat
12521302

pandas/core/groupby/groupby.py

+1-23
Original file line numberDiff line numberDiff line change
@@ -1350,32 +1350,10 @@ def _cython_agg_general(
13501350
):
13511351
raise AbstractMethodError(self)
13521352

1353-
@final
13541353
def _cython_transform(
13551354
self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
13561355
):
1357-
output: dict[base.OutputKey, ArrayLike] = {}
1358-
1359-
for idx, obj in enumerate(self._iterate_slices()):
1360-
name = obj.name
1361-
is_numeric = is_numeric_dtype(obj.dtype)
1362-
if numeric_only and not is_numeric:
1363-
continue
1364-
1365-
try:
1366-
result = self.grouper._cython_operation(
1367-
"transform", obj._values, how, axis, **kwargs
1368-
)
1369-
except (NotImplementedError, TypeError):
1370-
continue
1371-
1372-
key = base.OutputKey(label=name, position=idx)
1373-
output[key] = result
1374-
1375-
if not output:
1376-
raise DataError("No numeric types to aggregate")
1377-
1378-
return self._wrap_transformed_output(output)
1356+
raise AbstractMethodError(self)
13791357

13801358
@final
13811359
def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):

pandas/tests/apply/test_frame_transform.py

+13
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,19 @@ def test_transform_groupby_kernel(axis, float_frame, op, request):
5151
result = float_frame.transform(op, axis, *args)
5252
tm.assert_frame_equal(result, expected)
5353

54+
# same thing, but ensuring we have multiple blocks
55+
assert "E" not in float_frame.columns
56+
float_frame["E"] = float_frame["A"].copy()
57+
assert len(float_frame._mgr.arrays) > 1
58+
59+
if axis == 0 or axis == "index":
60+
ones = np.ones(float_frame.shape[0])
61+
else:
62+
ones = np.ones(float_frame.shape[1])
63+
expected2 = float_frame.groupby(ones, axis=axis).transform(op, *args)
64+
result2 = float_frame.transform(op, axis, *args)
65+
tm.assert_frame_equal(result2, expected2)
66+
5467

5568
@pytest.mark.parametrize(
5669
"ops, names",

pandas/tests/groupby/test_rank.py

+30-7
Original file line numberDiff line numberDiff line change
@@ -584,21 +584,23 @@ def test_rank_multiindex():
584584
# GH27721
585585
df = concat(
586586
{
587-
"a": DataFrame({"col1": [1, 2], "col2": [3, 4]}),
587+
"a": DataFrame({"col1": [3, 4], "col2": [1, 2]}),
588588
"b": DataFrame({"col3": [5, 6], "col4": [7, 8]}),
589589
},
590590
axis=1,
591591
)
592592

593-
result = df.groupby(level=0, axis=1).rank(axis=1, ascending=False, method="first")
593+
gb = df.groupby(level=0, axis=1)
594+
result = gb.rank(axis=1)
595+
594596
expected = concat(
595-
{
596-
"a": DataFrame({"col1": [2.0, 2.0], "col2": [1.0, 1.0]}),
597-
"b": DataFrame({"col3": [2.0, 2.0], "col4": [1.0, 1.0]}),
598-
},
597+
[
598+
df["a"].rank(axis=1),
599+
df["b"].rank(axis=1),
600+
],
599601
axis=1,
602+
keys=["a", "b"],
600603
)
601-
602604
tm.assert_frame_equal(result, expected)
603605

604606

@@ -615,3 +617,24 @@ def test_groupby_axis0_rank_axis1():
615617
# This should match what we get when "manually" operating group-by-group
616618
expected = concat([df.loc["a"].rank(axis=1), df.loc["b"].rank(axis=1)], axis=0)
617619
tm.assert_frame_equal(res, expected)
620+
621+
# check that we haven't accidentally written a case that coincidentally
622+
# matches rank(axis=0)
623+
alt = gb.rank(axis=0)
624+
assert not alt.equals(expected)
625+
626+
627+
def test_groupby_axis0_cummax_axis1():
628+
# case where groupby axis is 0 and axis keyword in transform is 1
629+
630+
# df has mixed dtype -> multiple blocks
631+
df = DataFrame(
632+
{0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]},
633+
index=["a", "a", "b", "b"],
634+
)
635+
gb = df.groupby(level=0, axis=0)
636+
637+
cmax = gb.cummax(axis=1)
638+
expected = df[[0, 1]].astype(np.float64)
639+
expected[2] = expected[1]
640+
tm.assert_frame_equal(cmax, expected)

0 commit comments

Comments
 (0)