Skip to content

CLN: Clean groupby ops from unreached code paths #48698

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Sep 23, 2022
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 24 additions & 82 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,6 @@
PeriodArray,
TimedeltaArray,
)
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.arrays.floating import FloatingDtype
from pandas.core.arrays.integer import IntegerDtype
from pandas.core.arrays.masked import (
BaseMaskedArray,
BaseMaskedDtype,
Expand Down Expand Up @@ -147,26 +144,6 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
},
}

# "group_any" and "group_all" are also support masks, but don't go
# through WrappedCythonOp
_MASKED_CYTHON_FUNCTIONS = {
"cummin",
"cummax",
"min",
"max",
"last",
"first",
"rank",
"sum",
"ohlc",
"cumprod",
"cumsum",
"prod",
"mean",
"var",
"median",
}

_cython_arity = {"ohlc": 4} # OHLC

# Note: we make this a classmethod and pass kind+how so that caching
Expand Down Expand Up @@ -220,8 +197,8 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
"""
how = self.how

if how in ["median"]:
# these two only have float64 implementations
if how == "median":
# median only has a float64 implementation
# We should only get here with is_numeric, as non-numeric cases
# should raise in _get_cython_function
values = ensure_float64(values)
Expand Down Expand Up @@ -293,7 +270,7 @@ def _get_output_shape(self, ngroups: int, values: np.ndarray) -> Shape:

out_shape: Shape
if how == "ohlc":
out_shape = (ngroups, 4)
out_shape = (ngroups, arity)
elif arity > 1:
raise NotImplementedError(
"arity of more than 1 is not supported for the 'how' argument"
Expand Down Expand Up @@ -342,9 +319,6 @@ def _get_result_dtype(self, dtype: np.dtype) -> np.dtype:
return np.dtype(np.float64)
return dtype

def uses_mask(self) -> bool:
return self.how in self._MASKED_CYTHON_FUNCTIONS

@final
def _ea_wrap_cython_operation(
self,
Expand All @@ -358,7 +332,7 @@ def _ea_wrap_cython_operation(
If we have an ExtensionArray, unwrap, call _cython_operation, and
re-wrap if appropriate.
"""
if isinstance(values, BaseMaskedArray) and self.uses_mask():
if isinstance(values, BaseMaskedArray):
return self._masked_ea_wrap_cython_operation(
values,
min_count=min_count,
Expand All @@ -367,7 +341,7 @@ def _ea_wrap_cython_operation(
**kwargs,
)

elif isinstance(values, Categorical) and self.uses_mask():
elif isinstance(values, Categorical):
assert self.how == "rank" # the only one implemented ATM
assert values.ordered # checked earlier
mask = values.isna()
Expand Down Expand Up @@ -398,7 +372,7 @@ def _ea_wrap_cython_operation(
)

if self.how in self.cast_blocklist:
# i.e. how in ["rank"], since other cast_blocklist methods dont go
# i.e. how in ["rank"], since other cast_blocklist methods don't go
# through cython_operation
return res_values

Expand All @@ -411,12 +385,6 @@ def _ea_to_cython_values(self, values: ExtensionArray) -> np.ndarray:
# All of the functions implemented here are ordinal, so we can
# operate on the tz-naive equivalents
npvalues = values._ndarray.view("M8[ns]")
elif isinstance(values.dtype, (BooleanDtype, IntegerDtype)):
# IntegerArray or BooleanArray
npvalues = values.to_numpy("float64", na_value=np.nan)
elif isinstance(values.dtype, FloatingDtype):
# FloatingArray
npvalues = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan)
elif isinstance(values.dtype, StringDtype):
# StringArray
npvalues = values.to_numpy(object, na_value=np.nan)
Expand All @@ -440,12 +408,6 @@ def _reconstruct_ea_result(
string_array_cls = dtype.construct_array_type()
return string_array_cls._from_sequence(res_values, dtype=dtype)

elif isinstance(values.dtype, BaseMaskedDtype):
new_dtype = self._get_result_dtype(values.dtype.numpy_dtype)
dtype = BaseMaskedDtype.from_numpy_dtype(new_dtype)
masked_array_cls = dtype.construct_array_type()
return masked_array_cls._from_sequence(res_values, dtype=dtype)

elif isinstance(values, (DatetimeArray, TimedeltaArray, PeriodArray)):
# In to_cython_values we took a view as M8[ns]
assert res_values.dtype == "M8[ns]"
Expand Down Expand Up @@ -489,7 +451,9 @@ def _masked_ea_wrap_cython_operation(
)

if self.how == "ohlc":
result_mask = np.tile(result_mask, (4, 1)).T
result_mask = np.tile(
result_mask, (self._cython_arity.get(self.how, 1), 1)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick can you get the arity in a separate line

).T

# res_values should already have the correct dtype, we just need to
# wrap in a MaskedArray
Expand Down Expand Up @@ -580,7 +544,7 @@ def _call_cython_op(
result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
if self.kind == "aggregate":
counts = np.zeros(ngroups, dtype=np.int64)
if self.how in ["min", "max", "mean", "last", "first"]:
if self.how in ["min", "max", "mean", "last", "first", "sum"]:
func(
out=result,
counts=counts,
Expand All @@ -591,18 +555,6 @@ def _call_cython_op(
result_mask=result_mask,
is_datetimelike=is_datetimelike,
)
elif self.how in ["sum"]:
# We support datetimelike
func(
out=result,
counts=counts,
values=values,
labels=comp_ids,
mask=mask,
result_mask=result_mask,
min_count=min_count,
is_datetimelike=is_datetimelike,
)
elif self.how in ["var", "ohlc", "prod", "median"]:
func(
result,
Expand All @@ -615,31 +567,21 @@ def _call_cython_op(
**kwargs,
)
else:
func(result, counts, values, comp_ids, min_count)
raise NotImplementedError(f"{self.how} is not implemented")
else:
# TODO: min_count
if self.uses_mask():
if self.how != "rank":
# TODO: should rank take result_mask?
kwargs["result_mask"] = result_mask
func(
out=result,
values=values,
labels=comp_ids,
ngroups=ngroups,
is_datetimelike=is_datetimelike,
mask=mask,
**kwargs,
)
else:
func(
out=result,
values=values,
labels=comp_ids,
ngroups=ngroups,
is_datetimelike=is_datetimelike,
**kwargs,
)
if self.how != "rank":
# TODO: should rank take result_mask?
kwargs["result_mask"] = result_mask
func(
out=result,
values=values,
labels=comp_ids,
ngroups=ngroups,
is_datetimelike=is_datetimelike,
mask=mask,
**kwargs,
)

if self.kind == "aggregate":
# i.e. counts is defined. Locations where count<min_count
Expand All @@ -650,7 +592,7 @@ def _call_cython_op(
cutoff = max(0 if self.how in ["sum", "prod"] else 1, min_count)
empty_groups = counts < cutoff
if empty_groups.any():
if result_mask is not None and self.uses_mask():
if result_mask is not None:
assert result_mask[empty_groups].all()
else:
# Note: this conversion could be lossy, see GH#40767
Expand Down