Skip to content

REF: _cython_transform operate blockwise #41344

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 16 additions & 11 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1136,19 +1136,24 @@ def group_rank(float64_t[:, ::1] out,
This method modifies the `out` parameter rather than returning an object
"""
cdef:
Py_ssize_t i, k, N
ndarray[float64_t, ndim=1] result

result = rank_1d(
values=values[:, 0],
labels=labels,
is_datetimelike=is_datetimelike,
ties_method=ties_method,
ascending=ascending,
pct=pct,
na_option=na_option
)
for i in range(len(result)):
out[i, 0] = result[i]
N = values.shape[1]

for k in range(N):
result = rank_1d(
values=values[:, k],
labels=labels,
is_datetimelike=is_datetimelike,
ties_method=ties_method,
ascending=ascending,
pct=pct,
na_option=na_option
)
for i in range(len(result)):
# TODO: why can't we do out[:, k] = result?
out[i, k] = result[i]


# ----------------------------------------------------------------------
Expand Down
50 changes: 50 additions & 0 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,26 @@ def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
)

def _cython_transform(
    self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
):
    """
    Run the cython transform kernel ``how`` on the selected object.

    Parameters
    ----------
    how : str
        Name of the transform, forwarded to ``grouper._cython_operation``.
    numeric_only : bool, default True
        If True, a non-numeric dtype is rejected before the kernel is called.
    axis : int, default 0
        Must be 0; this is asserted rather than handled here.
    **kwargs
        Forwarded unchanged to ``grouper._cython_operation``.

    Returns
    -------
    Object built by ``self._selected_obj._constructor`` from the transformed
    values, using the index of ``self.obj`` and the name of the selected
    object.

    Raises
    ------
    DataError
        If ``numeric_only`` is True and the selected object's dtype is not
        numeric, or if the cython operation raises ``NotImplementedError``
        or ``TypeError`` (the original exception is attached as the cause).
    """
    assert axis == 0  # handled by caller

    obj = self._selected_obj

    if numeric_only and not is_numeric_dtype(obj.dtype):
        raise DataError("No numeric types to aggregate")

    try:
        result = self.grouper._cython_operation(
            "transform", obj._values, how, axis, **kwargs
        )
    except (NotImplementedError, TypeError) as err:
        # Chain explicitly so the kernel's own error stays reachable
        # through ``__cause__`` instead of only as implicit context.
        raise DataError("No numeric types to aggregate") from err

    return obj._constructor(result, index=self.obj.index, name=obj.name)

def _transform_general(self, func: Callable, *args, **kwargs) -> Series:
"""
Transform with a callable `func`.
Expand Down Expand Up @@ -1258,6 +1278,36 @@ def _wrap_applied_output_series(

return self._reindex_output(result)

def _cython_transform(
    self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
) -> DataFrame:
    """
    Apply the cython transform kernel ``how`` through the block manager.

    Parameters
    ----------
    how : str
        Name of the transform, forwarded to ``grouper._cython_operation``.
    numeric_only : bool, default True
        If True, the manager is first narrowed with ``get_numeric_data``.
    axis : int, default 0
        Must be 0; this is asserted rather than handled here.
    **kwargs
        Forwarded unchanged to ``grouper._cython_operation``.

    Returns
    -------
    DataFrame
        Built from the transformed manager and transposed when
        ``self.axis == 1``.
    """
    assert axis == 0  # handled by caller
    # TODO: no tests with self.ndim == 1 for DataFrameGroupBy

    # With self.axis == 0, we have multi-block tests
    #  e.g. test_rank_min_int, test_cython_transform_frame
    #  test_transform_numeric_ret
    # With self.axis == 1, _get_data_to_aggregate does a transpose
    #  so we always have a single block.
    data_mgr: Manager2D = self._get_data_to_aggregate()
    if numeric_only:
        data_mgr = data_mgr.get_numeric_data(copy=False)

    # We could use `mgr.apply` here and not have to set_axis, but
    #  we would have to do shape gymnastics for ArrayManager compat
    transformed = data_mgr.grouped_reduce(
        lambda bvalues: self.grouper._cython_operation(
            "transform", bvalues, how, 1, **kwargs
        ),
        ignore_failures=True,
    )
    # Restore the second axis labels of the input manager on the result.
    transformed.set_axis(1, data_mgr.axes[1])

    frame = self.obj._constructor(transformed)
    return frame.T if self.axis == 1 else frame

def _transform_general(self, func, *args, **kwargs):
from pandas.core.reshape.concat import concat

Expand Down
24 changes: 1 addition & 23 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1361,32 +1361,10 @@ def _cython_agg_general(
):
raise AbstractMethodError(self)

@final
def _cython_transform(
self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
):
output: dict[base.OutputKey, ArrayLike] = {}

for idx, obj in enumerate(self._iterate_slices()):
name = obj.name
is_numeric = is_numeric_dtype(obj.dtype)
if numeric_only and not is_numeric:
continue

try:
result = self.grouper._cython_operation(
"transform", obj._values, how, axis, **kwargs
)
except (NotImplementedError, TypeError):
continue

key = base.OutputKey(label=name, position=idx)
output[key] = result

if not output:
raise DataError("No numeric types to aggregate")

return self._wrap_transformed_output(output)
raise AbstractMethodError(self)

@final
def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/apply/test_frame_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,19 @@ def test_transform_groupby_kernel(axis, float_frame, op, request):
result = float_frame.transform(op, axis, *args)
tm.assert_frame_equal(result, expected)

# same thing, but ensuring we have multiple blocks
assert "E" not in float_frame.columns
float_frame["E"] = float_frame["A"].copy()
assert len(float_frame._mgr.arrays) > 1

if axis == 0 or axis == "index":
ones = np.ones(float_frame.shape[0])
else:
ones = np.ones(float_frame.shape[1])
expected2 = float_frame.groupby(ones, axis=axis).transform(op, *args)
result2 = float_frame.transform(op, axis, *args)
tm.assert_frame_equal(result2, expected2)


@pytest.mark.parametrize(
"ops, names",
Expand Down
37 changes: 30 additions & 7 deletions pandas/tests/groupby/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -584,21 +584,23 @@ def test_rank_multiindex():
# GH27721
df = concat(
{
"a": DataFrame({"col1": [1, 2], "col2": [3, 4]}),
"a": DataFrame({"col1": [3, 4], "col2": [1, 2]}),
"b": DataFrame({"col3": [5, 6], "col4": [7, 8]}),
},
axis=1,
)

result = df.groupby(level=0, axis=1).rank(axis=1, ascending=False, method="first")
gb = df.groupby(level=0, axis=1)
result = gb.rank(axis=1)

expected = concat(
{
"a": DataFrame({"col1": [2.0, 2.0], "col2": [1.0, 1.0]}),
"b": DataFrame({"col3": [2.0, 2.0], "col4": [1.0, 1.0]}),
},
[
df["a"].rank(axis=1),
df["b"].rank(axis=1),
],
axis=1,
keys=["a", "b"],
)

tm.assert_frame_equal(result, expected)


Expand All @@ -615,3 +617,24 @@ def test_groupby_axis0_rank_axis1():
# This should match what we get when "manually" operating group-by-group
expected = concat([df.loc["a"].rank(axis=1), df.loc["b"].rank(axis=1)], axis=0)
tm.assert_frame_equal(res, expected)

# check that we haven't accidentally written a case that coincidentally
# matches rank(axis=0)
alt = gb.rank(axis=0)
assert not alt.equals(expected)


def test_groupby_axis0_cummax_axis1():
    # Grouping runs along axis 0 while the transform's own ``axis`` keyword
    # is 1.

    # Int and float columns together give a mixed-dtype frame, i.e. more
    # than one block.
    data = {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]}
    frame = DataFrame(data, index=["a", "a", "b", "b"])
    grouped = frame.groupby(level=0, axis=0)

    result = grouped.cummax(axis=1)

    # In every row the value in column 2 is smaller than the one in
    # column 1, so the expected running maximum at column 2 repeats column 1.
    expected = frame[[0, 1]].astype(np.float64)
    expected[2] = expected[1]
    tm.assert_frame_equal(result, expected)