-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
SubClassedDataFrame.groupby().mean()
etc. use method of SubClassedDataFrame
#51765
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 28 commits
df9b39a
de8c231
d057cd0
e56b488
5338d3f
5fe7f82
8f32b12
4aa2b85
8d7346d
37ae233
aa57cc2
b3df075
adc132a
31868ff
c0b2ad7
98b7986
053c865
2efa052
1505a1c
af9ac26
f4bc548
12a9fa8
f46eea9
185a3c1
bf9bde6
036d662
f348648
48ceb0a
a6be1ea
f0ed14a
6631b1e
94dc186
310c339
27c4ed9
9bafd9a
963b3fe
8a9f30f
518d42e
6320057
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -284,7 +284,9 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) | |
) | ||
|
||
# result is a dict whose keys are the elements of result_index | ||
result = Series(result, index=self.grouper.result_index) | ||
result = self._obj_1d_constructor( | ||
result, index=self.grouper.result_index | ||
) | ||
result = self._wrap_aggregated_output(result) | ||
return result | ||
|
||
|
@@ -681,14 +683,20 @@ def value_counts( | |
|
||
index_names = self.grouper.names + [self.obj.name] | ||
|
||
constructor_1d = ( | ||
self._obj_1d_constructor | ||
if isinstance(self._obj_1d_constructor, Series) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think |
||
else Series | ||
) | ||
|
||
if is_categorical_dtype(val.dtype) or ( | ||
bins is not None and not np.iterable(bins) | ||
): | ||
# scalar bins cannot be done at top level | ||
# in a backward compatible way | ||
# GH38672 relates to categorical dtype | ||
ser = self.apply( | ||
Series.value_counts, | ||
constructor_1d.value_counts, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could this also be something like (I didn't look in detail at how this code works, so potentially this doesn't make sense at all) |
||
normalize=normalize, | ||
sort=sort, | ||
ascending=ascending, | ||
|
@@ -707,7 +715,7 @@ def value_counts( | |
llab = lambda lab, inc: lab[inc] | ||
else: | ||
# lab is a Categorical with categories an IntervalIndex | ||
cat_ser = cut(Series(val), bins, include_lowest=True) | ||
cat_ser = cut(self.obj._constructor(val), bins, include_lowest=True) | ||
cat_obj = cast("Categorical", cat_ser._values) | ||
lev = cat_obj.categories | ||
lab = lev.take( | ||
|
@@ -1308,9 +1316,9 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) | |
elif relabeling: | ||
# this should be the only (non-raising) case with relabeling | ||
# used reordered index of columns | ||
result = cast(DataFrame, result) | ||
result = cast(self.obj._constructor, result) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These `cast(...)` calls will be wrong if `_constructor` is not a class. |
||
result = result.iloc[:, order] | ||
result = cast(DataFrame, result) | ||
result = cast(self.obj._constructor, result) | ||
# error: Incompatible types in assignment (expression has type | ||
# "Optional[List[str]]", variable has type | ||
# "Union[Union[Union[ExtensionArray, ndarray[Any, Any]], | ||
|
@@ -1353,7 +1361,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) | |
else: | ||
# GH#32040, GH#35246 | ||
# e.g. test_groupby_as_index_select_column_sum_empty_df | ||
result = cast(DataFrame, result) | ||
result = cast(self._constructor, result) | ||
result.columns = self._obj_with_exclusions.columns.copy() | ||
|
||
if not self.as_index: | ||
|
@@ -1481,7 +1489,7 @@ def _wrap_applied_output_series( | |
is_transform: bool, | ||
) -> DataFrame | Series: | ||
kwargs = first_not_none._construct_axes_dict() | ||
backup = Series(**kwargs) | ||
backup = self._obj_1d_constructor(**kwargs) | ||
values = [x if (x is not None) else backup for x in values] | ||
|
||
all_indexed_same = all_indexes_same(x.index for x in values) | ||
|
@@ -1876,7 +1884,9 @@ def _apply_to_column_groupbys(self, func) -> DataFrame: | |
|
||
if not len(results): | ||
# concat would raise | ||
res_df = DataFrame([], columns=columns, index=self.grouper.result_index) | ||
res_df = self.obj._constructor( | ||
[], columns=columns, index=self.grouper.result_index | ||
) | ||
else: | ||
res_df = concat(results, keys=columns, axis=1) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1859,14 +1859,26 @@ def mean( | |
Name: B, dtype: float64 | ||
""" | ||
|
||
if not ( | ||
type(self.obj).mean is Series.mean or type(self.obj).mean is DataFrame.mean | ||
): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This might be a bad idea, but what about a Boolean flag as a class attribute on Series / DataFrame that subclasses can override if they want to use their own methods? It would default to False, perhaps if only to introduce this with a deprecation warning. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
No:
Using my example where the mean should always be 1. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. However, transform() does work:
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for checking. In my opinion, that isn't necessarily a blocker for this, but we are expanding the behavior here and introducing new bugs that should get fixed. If we are to move forward with some form of implementation for this, I think the test coverage here should be expanded to include at least agg, apply, and transform (even if they are currently producing incorrect results). |
||
|
||
def f(df, *args, **kwargs): | ||
return self.obj._constructor(df).mean(numeric_only=numeric_only) | ||
|
||
result = self.agg(f) | ||
return result.__finalize__(self.obj, method="groupby") | ||
|
||
if maybe_use_numba(engine): | ||
from pandas.core._numba.kernels import sliding_mean | ||
|
||
return self._numba_agg_general(sliding_mean, engine_kwargs) | ||
else: | ||
result = self._cython_agg_general( | ||
"mean", | ||
alt=lambda x: Series(x).mean(numeric_only=numeric_only), | ||
alt=lambda x: self._obj_1d_constructor(x).mean( | ||
numeric_only=numeric_only | ||
), | ||
numeric_only=numeric_only, | ||
) | ||
return result.__finalize__(self.obj, method="groupby") | ||
|
@@ -1892,9 +1904,20 @@ def median(self, numeric_only: bool = False): | |
Series or DataFrame | ||
Median of values within each group. | ||
""" | ||
if not ( | ||
type(self.obj).median is Series.median | ||
or type(self.obj).median is DataFrame.median | ||
): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If this pattern is going to show up a lot, does it merit a decorator? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I've had a first pass at making a decorator, but I'm not sure how to deal with |
||
|
||
def f(df, *args, **kwargs): | ||
return self.obj._constructor(df).median(numeric_only=numeric_only) | ||
|
||
result = self.agg(f) | ||
return result.__finalize__(self.obj, method="groupby") | ||
|
||
result = self._cython_agg_general( | ||
"median", | ||
alt=lambda x: Series(x).median(numeric_only=numeric_only), | ||
alt=lambda x: self._obj_1d_constructor(x).median(numeric_only=numeric_only), | ||
numeric_only=numeric_only, | ||
) | ||
return result.__finalize__(self.obj, method="groupby") | ||
|
@@ -1950,6 +1973,16 @@ def std( | |
Series or DataFrame | ||
Standard deviation of values within each group. | ||
""" | ||
if not ( | ||
type(self.obj).std is Series.std or type(self.obj).std is DataFrame.std | ||
): | ||
|
||
def f(df, *args, **kwargs): | ||
return self.obj._constructor(df).std(numeric_only=numeric_only) | ||
|
||
result = self.agg(f) | ||
return result.__finalize__(self.obj, method="groupby") | ||
|
||
if maybe_use_numba(engine): | ||
from pandas.core._numba.kernels import sliding_var | ||
|
||
|
@@ -2013,14 +2046,24 @@ def var( | |
Series or DataFrame | ||
Variance of values within each group. | ||
""" | ||
if not ( | ||
type(self.obj).var is Series.var or type(self.obj).var is DataFrame.var | ||
): | ||
|
||
def f(df, *args, **kwargs): | ||
return self.obj._constructor(df).var(numeric_only=numeric_only) | ||
|
||
result = self.agg(f) | ||
return result.__finalize__(self.obj, method="groupby") | ||
|
||
if maybe_use_numba(engine): | ||
from pandas.core._numba.kernels import sliding_var | ||
|
||
return self._numba_agg_general(sliding_var, engine_kwargs, ddof) | ||
else: | ||
return self._cython_agg_general( | ||
"var", | ||
alt=lambda x: Series(x).var(ddof=ddof), | ||
alt=lambda x: self._obj_1d_constructor(x).var(ddof=ddof), | ||
numeric_only=numeric_only, | ||
ddof=ddof, | ||
) | ||
|
@@ -2184,6 +2227,17 @@ def sem(self, ddof: int = 1, numeric_only: bool = False): | |
Series or DataFrame | ||
Standard error of the mean of values within each group. | ||
""" | ||
# TODO: think sem() needs considering more closely | ||
if not ( | ||
type(self.obj).sem is Series.sem or type(self.obj).sem is DataFrame.sem | ||
): | ||
|
||
def f(df, *args, **kwargs): | ||
return self.obj._constructor(df).sem(numeric_only=numeric_only) | ||
|
||
result = self.agg(f) | ||
return result.__finalize__(self.obj, method="groupby") | ||
|
||
if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype): | ||
raise TypeError( | ||
f"{type(self).__name__}.sem called with " | ||
|
@@ -2236,6 +2290,16 @@ def sum( | |
engine: str | None = None, | ||
engine_kwargs: dict[str, bool] | None = None, | ||
): | ||
if not ( | ||
type(self.obj).sum is Series.sum or type(self.obj).sum is DataFrame.sum | ||
): | ||
|
||
def f(df, *args, **kwargs): | ||
return self.obj._constructor(df).sum(numeric_only=numeric_only) | ||
|
||
result = self.agg(f) | ||
return result.__finalize__(self.obj, method="groupby") | ||
|
||
if maybe_use_numba(engine): | ||
from pandas.core._numba.kernels import sliding_sum | ||
|
||
|
@@ -2260,6 +2324,16 @@ def sum( | |
@final | ||
@doc(_groupby_agg_method_template, fname="prod", no=False, mc=0) | ||
def prod(self, numeric_only: bool = False, min_count: int = 0): | ||
if not ( | ||
type(self.obj).prod is Series.prod or type(self.obj).prod is DataFrame.prod | ||
): | ||
|
||
def f(df, *args, **kwargs): | ||
return self.obj._constructor(df).prod(numeric_only=numeric_only) | ||
|
||
result = self.agg(f) | ||
return result.__finalize__(self.obj, method="groupby") | ||
|
||
return self._agg_general( | ||
numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod | ||
) | ||
|
@@ -2273,6 +2347,16 @@ def min( | |
engine: str | None = None, | ||
engine_kwargs: dict[str, bool] | None = None, | ||
): | ||
if not ( | ||
type(self.obj).min is Series.min or type(self.obj).min is DataFrame.min | ||
): | ||
|
||
def f(df, *args, **kwargs): | ||
return self.obj._constructor(df).min(numeric_only=numeric_only) | ||
|
||
result = self.agg(f) | ||
return result.__finalize__(self.obj, method="groupby") | ||
|
||
if maybe_use_numba(engine): | ||
from pandas.core._numba.kernels import sliding_min_max | ||
|
||
|
@@ -2294,6 +2378,16 @@ def max( | |
engine: str | None = None, | ||
engine_kwargs: dict[str, bool] | None = None, | ||
): | ||
if not ( | ||
type(self.obj).max is Series.max or type(self.obj).max is DataFrame.max | ||
): | ||
|
||
def f(df, *args, **kwargs): | ||
return self.obj._constructor(df).max(numeric_only=numeric_only) | ||
|
||
result = self.agg(f) | ||
return result.__finalize__(self.obj, method="groupby") | ||
|
||
if maybe_use_numba(engine): | ||
from pandas.core._numba.kernels import sliding_min_max | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this belongs below, in the Groupby/resample/rolling section?