REF: Groupby.pad/backfill operate blockwise #43478

Merged: 3 commits, Sep 10, 2021
2 changes: 1 addition & 1 deletion pandas/_libs/groupby.pyi
@@ -32,7 +32,7 @@ def group_shift_indexer(
     periods: int,
 ) -> None: ...
 def group_fillna_indexer(
-    out: np.ndarray,  # ndarray[int64_t]
+    out: np.ndarray,  # ndarray[intp_t]
     labels: np.ndarray,  # ndarray[int64_t]
     mask: np.ndarray,  # ndarray[uint8_t]
     direction: Literal["ffill", "bfill"],
4 changes: 2 additions & 2 deletions pandas/_libs/groupby.pyx
@@ -321,15 +321,15 @@ def group_shift_indexer(int64_t[::1] out, const intp_t[::1] labels,

 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_fillna_indexer(ndarray[int64_t] out, ndarray[intp_t] labels,
+def group_fillna_indexer(ndarray[intp_t] out, ndarray[intp_t] labels,
                          ndarray[uint8_t] mask, str direction,
                          int64_t limit, bint dropna) -> None:
     """
     Indexes how to fill values forwards or backwards within a group.

     Parameters
     ----------
-    out : np.ndarray[np.int64]
+    out : np.ndarray[np.intp]
         Values into which this method will write its results.
     labels : np.ndarray[np.intp]
         Array containing unique label for each group, with its ordering
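For orientation, here is a rough pure-Python/NumPy emulation of the indexer this Cython routine produces (illustrative only; it skips the dropna handling of NA group labels). The caller applies the indexer with a take, and -1 marks positions that stay missing:

import numpy as np

def fillna_indexer_sketch(mask, labels, direction="ffill", limit=-1):
    # For each position, record where to take the fill value from: the
    # position itself if it is valid, otherwise the nearest valid position
    # in the same group (within `limit` steps), else -1.
    n = len(labels)
    out = np.empty(n, dtype=np.intp)  # note: intp, matching the new signature
    order = range(n) if direction == "ffill" else range(n - 1, -1, -1)
    last = {}  # group label -> (position of last valid value, filled run length)
    for i in order:
        lab = labels[i]
        if not mask[i]:
            last[lab] = (i, 0)
            out[i] = i
        else:
            pos, run = last.get(lab, (-1, 0))
            run += 1
            out[i] = -1 if pos == -1 or (limit != -1 and run > limit) else pos
            last[lab] = (pos, run)
    return out

labels = np.array([0, 0, 1, 0, 1], dtype=np.intp)
mask = np.array([False, True, False, True, True])
print(fillna_indexer_sketch(mask, labels))  # [0 0 2 0 2]

With the signature change above, the out buffer is now typed as np.intp rather than np.int64, matching what an indexer passed to a take expects.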
66 changes: 48 additions & 18 deletions pandas/core/groupby/groupby.py
Expand Up @@ -1174,7 +1174,10 @@ def _wrap_transformed_output(
         Series or DataFrame
             Series for SeriesGroupBy, DataFrame for DataFrameGroupBy
         """
-        result = self._indexed_output_to_ndframe(output)
+        if isinstance(output, (Series, DataFrame)):
+            result = output
+        else:
+            result = self._indexed_output_to_ndframe(output)

         if self.axis == 1:
             # Only relevant for DataFrameGroupBy
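Since the new blockwise _fill path below hands this method an already-constructed Series or DataFrame, it gains a pass-through branch. A minimal standalone sketch of that dispatch (names here are illustrative, not pandas internals):

import pandas as pd

def wrap_transformed(output, indexed_output_to_ndframe):
    # New behaviour: a result that already arrives as an NDFrame (the
    # blockwise path) is used as-is; dict-of-arrays outputs still go
    # through the legacy conversion.
    if isinstance(output, (pd.Series, pd.DataFrame)):
        return output
    return indexed_output_to_ndframe(output)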
@@ -2258,17 +2261,55 @@ def _fill(self, direction: Literal["ffill", "bfill"], limit=None):
         if limit is None:
             limit = -1

-        return self._get_cythonized_result(
+        ids, _, _ = self.grouper.group_info
+
+        col_func = partial(
             libgroupby.group_fillna_indexer,
-            numeric_only=False,
-            needs_mask=True,
-            cython_dtype=np.dtype(np.int64),
-            result_is_index=True,
+            labels=ids,
             direction=direction,
             limit=limit,
             dropna=self.dropna,
         )

+        def blk_func(values: ArrayLike) -> ArrayLike:
+            mask = isna(values)
+            if values.ndim == 1:
+                indexer = np.empty(values.shape, dtype=np.intp)
+                col_func(out=indexer, mask=mask)
+                return algorithms.take_nd(values, indexer)
+
+            else:
+                # We broadcast algorithms.take_nd analogous to
+                #  np.take_along_axis
+
+                # Note: we only get here with backfill/pad,
+                #  so if we have a dtype that cannot hold NAs,
+                #  then there will be no -1s in indexer, so we can use
+                #  the original dtype (no need to ensure_dtype_can_hold_na)
+                if isinstance(values, np.ndarray):
+                    out = np.empty(values.shape, dtype=values.dtype)
+                else:
+                    out = type(values)._empty(values.shape, dtype=values.dtype)
+
+                for i in range(len(values)):
+                    # call group_fillna_indexer column-wise
+                    indexer = np.empty(values.shape[1], dtype=np.intp)
+                    col_func(out=indexer, mask=mask[i])
+                    out[i, :] = algorithms.take_nd(values[i], indexer)
+                return out
+
+        obj = self._obj_with_exclusions
+        if self.axis == 1:
+            obj = obj.T
+        mgr = obj._mgr
+        res_mgr = mgr.apply(blk_func)
+
+        new_obj = obj._constructor(res_mgr)
+        if isinstance(new_obj, Series):
+            new_obj.name = obj.name
+
+        return self._wrap_transformed_output(new_obj)
+
     @final
     @Substitution(name="groupby")
     def pad(self, limit=None):
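The 2-D branch of blk_func loops take_nd over the rows of a block (blocks store a frame's values as n_columns x n_rows), which the in-code comment likens to np.take_along_axis. A small NumPy sketch of that equivalence, using toy data and a hypothetical per-row ffill indexer:

import numpy as np

# A 2-D block stores frame values as (n_columns, n_rows), so forward-filling
# each DataFrame column means taking along axis 1 with one indexer per block
# row.  Toy data below, not pandas internals.
block = np.array([[1.0, np.nan, np.nan, 4.0],
                  [2.0, np.nan, 3.0, np.nan]])

# hypothetical per-row ffill indexers (a single group, no limit)
indexer = np.array([[0, 0, 0, 3],
                    [0, 0, 2, 2]])

# looping a 1-D take row by row, as blk_func does with take_nd ...
looped = np.empty_like(block)
for i in range(len(block)):
    looped[i, :] = block[i].take(indexer[i])

# ... matches a single vectorized call
vectorized = np.take_along_axis(block, indexer, axis=1)
assert np.array_equal(looped, vectorized)
print(looped)  # [[1. 1. 1. 4.]
               #  [2. 2. 3. 3.]]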
@@ -2948,7 +2989,6 @@ def _get_cythonized_result(
         min_count: int | None = None,
         needs_mask: bool = False,
         needs_ngroups: bool = False,
-        result_is_index: bool = False,
         pre_processing=None,
         post_processing=None,
         fill_value=None,
@@ -2985,9 +3025,6 @@
         needs_nullable : bool, default False
             Whether a bool specifying if the input is nullable is part
             of the Cython call signature
-        result_is_index : bool, default False
-            Whether the result of the Cython operation is an index of
-            values to be retrieved, instead of the actual values themselves
         pre_processing : function, default None
             Function to be applied to `values` prior to passing to Cython.
             Function should return a tuple where the first element is the
@@ -3013,8 +3050,6 @@
"""
numeric_only = self._resolve_numeric_only(numeric_only)

if result_is_index and aggregate:
raise ValueError("'result_is_index' and 'aggregate' cannot both be True!")
if post_processing and not callable(post_processing):
raise ValueError("'post_processing' must be a callable!")
if pre_processing:
@@ -3086,14 +3121,9 @@ def blk_func(values: ArrayLike) -> ArrayLike:

             func(**kwargs)  # Call func to modify indexer values in place

-            if result_is_index:
-                result = algorithms.take_nd(values, result, fill_value=fill_value)
-
             if real_2d and values.ndim == 1:
                 assert result.shape[1] == 1, result.shape
-                # error: No overload variant of "__getitem__" of "ExtensionArray"
-                # matches argument type "Tuple[slice, int]"
-                result = result[:, 0]  # type: ignore[call-overload]
+                result = result[:, 0]
                 if needs_mask:
                     mask = mask[:, 0]

2 changes: 2 additions & 0 deletions pandas/tests/groupby/test_missing.py
@@ -130,6 +130,8 @@ def test_ffill_handles_nan_groups(dropna, method, has_nan_group):

     ridx = expected_rows.get((method, dropna, has_nan_group))
     expected = df_without_nan_rows.reindex(ridx).reset_index(drop=True)
+    # columns are a 'take' on df.columns, which are object dtype
+    expected.columns = expected.columns.astype(object)

     tm.assert_frame_equal(result, expected)

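The added comment notes that the result's columns are produced by a take on df.columns, which are object dtype, so the expected frame's columns are cast to match. A tiny sketch of that dtype detail, with toy data:

import numpy as np
import pandas as pd

cols = pd.Index(["a", "b", "c"], dtype=object)
taken = cols.take(np.array([0, 2]))
assert taken.dtype == object  # taking from an object-dtype Index keeps object dtype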