-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
SubClassedDataFrame.groupby().mean()
etc. use method of SubClassedDataFrame
#51765
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 20 commits
df9b39a
de8c231
d057cd0
e56b488
5338d3f
5fe7f82
8f32b12
4aa2b85
8d7346d
37ae233
aa57cc2
b3df075
adc132a
31868ff
c0b2ad7
98b7986
053c865
2efa052
1505a1c
af9ac26
f4bc548
12a9fa8
f46eea9
185a3c1
bf9bde6
036d662
f348648
48ceb0a
a6be1ea
f0ed14a
6631b1e
94dc186
310c339
27c4ed9
9bafd9a
963b3fe
8a9f30f
518d42e
6320057
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -284,7 +284,9 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) | |
) | ||
|
||
# result is a dict whose keys are the elements of result_index | ||
result = Series(result, index=self.grouper.result_index) | ||
result = self._obj_1d_constructor( | ||
result, index=self.grouper.result_index | ||
) | ||
result = self._wrap_aggregated_output(result) | ||
return result | ||
|
||
|
@@ -681,14 +683,20 @@ def value_counts( | |
|
||
index_names = self.grouper.names + [self.obj.name] | ||
|
||
constructor_1d = ( | ||
self._obj_1d_constructor | ||
if isinstance(self._obj_1d_constructor, Series) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think |
||
else Series | ||
) | ||
|
||
if is_categorical_dtype(val.dtype) or ( | ||
bins is not None and not np.iterable(bins) | ||
): | ||
# scalar bins cannot be done at top level | ||
# in a backward compatible way | ||
# GH38672 relates to categorical dtype | ||
ser = self.apply( | ||
Series.value_counts, | ||
constructor_1d.value_counts, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could this also be something like (I didn't look in detail at how this code works, so potentially this doesn't make sense at all) |
||
normalize=normalize, | ||
sort=sort, | ||
ascending=ascending, | ||
|
@@ -707,7 +715,7 @@ def value_counts( | |
llab = lambda lab, inc: lab[inc] | ||
else: | ||
# lab is a Categorical with categories an IntervalIndex | ||
cat_ser = cut(Series(val), bins, include_lowest=True) | ||
cat_ser = cut(self.obj._constructor(val), bins, include_lowest=True) | ||
cat_obj = cast("Categorical", cat_ser._values) | ||
lev = cat_obj.categories | ||
lab = lev.take( | ||
|
@@ -1290,9 +1298,9 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) | |
elif relabeling: | ||
# this should be the only (non-raising) case with relabeling | ||
# used reordered index of columns | ||
result = cast(DataFrame, result) | ||
result = cast(self.obj._constructor, result) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These will be wrong if `_constructor` is not a class. |
||
result = result.iloc[:, order] | ||
result = cast(DataFrame, result) | ||
result = cast(self.obj._constructor, result) | ||
# error: Incompatible types in assignment (expression has type | ||
# "Optional[List[str]]", variable has type | ||
# "Union[Union[Union[ExtensionArray, ndarray[Any, Any]], | ||
|
@@ -1335,7 +1343,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) | |
else: | ||
# GH#32040, GH#35246 | ||
# e.g. test_groupby_as_index_select_column_sum_empty_df | ||
result = cast(DataFrame, result) | ||
result = cast(self._obj_1d_constructor, result) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. `_constructor`, not `_1d_constructor`. Also, I think this is OK as is.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I have also wrestled with whether results should stick to the subclass or just be DataFrames.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What does mypy resolve |
||
result.columns = self._obj_with_exclusions.columns.copy() | ||
|
||
if not self.as_index: | ||
|
@@ -1463,7 +1471,7 @@ def _wrap_applied_output_series( | |
is_transform: bool, | ||
) -> DataFrame | Series: | ||
kwargs = first_not_none._construct_axes_dict() | ||
backup = Series(**kwargs) | ||
backup = self._obj_1d_constructor(**kwargs) | ||
values = [x if (x is not None) else backup for x in values] | ||
|
||
all_indexed_same = all_indexes_same(x.index for x in values) | ||
|
@@ -1858,7 +1866,9 @@ def _apply_to_column_groupbys(self, func) -> DataFrame: | |
|
||
if not len(results): | ||
# concat would raise | ||
res_df = DataFrame([], columns=columns, index=self.grouper.result_index) | ||
res_df = self.obj._constructor( | ||
[], columns=columns, index=self.grouper.result_index | ||
) | ||
else: | ||
res_df = concat(results, keys=columns, axis=1) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1629,7 +1629,7 @@ def _cumcount_array(self, ascending: bool = True) -> np.ndarray: | |
|
||
@final | ||
@property | ||
def _obj_1d_constructor(self) -> Callable: | ||
def _obj_1d_constructor(self): | ||
# GH28330 preserve subclassed Series/DataFrames | ||
if isinstance(self.obj, DataFrame): | ||
return self.obj._constructor_sliced | ||
|
@@ -1837,14 +1837,24 @@ def mean( | |
Name: B, dtype: float64 | ||
""" | ||
|
||
if not (self.obj.mean is Series.mean or self.obj.mean is DataFrame.mean): | ||
AdamOrmondroyd marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def f(df, *args, **kwargs): | ||
return self.obj._constructor(df).mean() | ||
|
||
result = self.agg(f) | ||
return result.__finalize__(self.obj, method="groupby") | ||
|
||
if maybe_use_numba(engine): | ||
from pandas.core._numba.kernels import sliding_mean | ||
|
||
return self._numba_agg_general(sliding_mean, engine_kwargs) | ||
else: | ||
result = self._cython_agg_general( | ||
"mean", | ||
alt=lambda x: Series(x).mean(numeric_only=numeric_only), | ||
alt=lambda x: self._obj_1d_constructor(x).mean( | ||
numeric_only=numeric_only | ||
), | ||
numeric_only=numeric_only, | ||
) | ||
return result.__finalize__(self.obj, method="groupby") | ||
|
@@ -1870,9 +1880,19 @@ def median(self, numeric_only: bool = False): | |
Series or DataFrame | ||
Median of values within each group. | ||
""" | ||
if not ( | ||
self.obj.median is Series.median or self.obj.median is DataFrame.median | ||
): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If this pattern is going to show up a lot, does it merit a decorator?
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I've had a first pass at making a decorator; not sure how to deal with |
||
|
||
def f(df, *args, **kwargs): | ||
return self.obj._constructor(df).median() | ||
|
||
result = self.agg(f) | ||
return result.__finalize__(self.obj, method="groupby") | ||
|
||
result = self._cython_agg_general( | ||
"median", | ||
alt=lambda x: Series(x).median(numeric_only=numeric_only), | ||
alt=lambda x: self._obj_1d_constructor(x).median(numeric_only=numeric_only), | ||
numeric_only=numeric_only, | ||
) | ||
return result.__finalize__(self.obj, method="groupby") | ||
|
@@ -1928,6 +1948,14 @@ def std( | |
Series or DataFrame | ||
Standard deviation of values within each group. | ||
""" | ||
if not (self.obj.std is Series.std or self.obj.std is DataFrame.std): | ||
|
||
def f(df, *args, **kwargs): | ||
return self.obj._constructor(df).std() | ||
|
||
result = self.agg(f) | ||
return result.__finalize__(self.obj, method="groupby") | ||
|
||
if maybe_use_numba(engine): | ||
from pandas.core._numba.kernels import sliding_var | ||
|
||
|
@@ -2011,14 +2039,22 @@ def var( | |
Series or DataFrame | ||
Variance of values within each group. | ||
""" | ||
if not (self.obj.var is Series.var or self.obj.var is DataFrame.var): | ||
|
||
def f(df, *args, **kwargs): | ||
return self.obj._constructor(df).var() | ||
|
||
result = self.agg(f) | ||
return result.__finalize__(self.obj, method="groupby") | ||
|
||
if maybe_use_numba(engine): | ||
from pandas.core._numba.kernels import sliding_var | ||
|
||
return self._numba_agg_general(sliding_var, engine_kwargs, ddof) | ||
else: | ||
return self._cython_agg_general( | ||
"var", | ||
alt=lambda x: Series(x).var(ddof=ddof), | ||
alt=lambda x: self._obj_1d_constructor(x).var(ddof=ddof), | ||
numeric_only=numeric_only, | ||
ddof=ddof, | ||
) | ||
|
@@ -2180,6 +2216,15 @@ def sem(self, ddof: int = 1, numeric_only: bool = False): | |
Series or DataFrame | ||
Standard error of the mean of values within each group. | ||
""" | ||
# TODO: think sem() needs considering more closely | ||
if not (self.obj.sem is Series.sem or self.obj.sem is DataFrame.sem): | ||
|
||
def f(df, *args, **kwargs): | ||
return self.obj._constructor(df).sem() | ||
|
||
result = self.agg(f) | ||
return result.__finalize__(self.obj, method="groupby") | ||
|
||
if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype): | ||
raise TypeError( | ||
f"{type(self).__name__}.sem called with " | ||
|
@@ -2238,6 +2283,14 @@ def sum( | |
engine: str | None = None, | ||
engine_kwargs: dict[str, bool] | None = None, | ||
): | ||
if not (self.obj.sum is Series.sum or self.obj.sum is DataFrame.sum): | ||
|
||
def f(df, *args, **kwargs): | ||
return self.obj._constructor(df).sum() | ||
|
||
result = self.agg(f) | ||
return result.__finalize__(self.obj, method="groupby") | ||
|
||
if maybe_use_numba(engine): | ||
from pandas.core._numba.kernels import sliding_sum | ||
|
||
|
@@ -2262,6 +2315,14 @@ def sum( | |
@final | ||
@doc(_groupby_agg_method_template, fname="prod", no=False, mc=0) | ||
def prod(self, numeric_only: bool = False, min_count: int = 0): | ||
if not (self.obj.prod is Series.prod or self.obj.prod is DataFrame.prod): | ||
|
||
def f(df, *args, **kwargs): | ||
return self.obj._constructor(df).prod() | ||
|
||
result = self.agg(f) | ||
return result.__finalize__(self.obj, method="groupby") | ||
|
||
return self._agg_general( | ||
numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod | ||
) | ||
|
@@ -2275,6 +2336,14 @@ def min( | |
engine: str | None = None, | ||
engine_kwargs: dict[str, bool] | None = None, | ||
): | ||
if not (self.obj.min is Series.min or self.obj.min is DataFrame.min): | ||
|
||
def f(df, *args, **kwargs): | ||
return self.obj._constructor(df).min() | ||
|
||
result = self.agg(f) | ||
return result.__finalize__(self.obj, method="groupby") | ||
|
||
if maybe_use_numba(engine): | ||
from pandas.core._numba.kernels import sliding_min_max | ||
|
||
|
@@ -2296,6 +2365,14 @@ def max( | |
engine: str | None = None, | ||
engine_kwargs: dict[str, bool] | None = None, | ||
): | ||
if not (self.obj.max is Series.max or self.obj.max is DataFrame.max): | ||
|
||
def f(df, *args, **kwargs): | ||
return self.obj._constructor(df).max() | ||
|
||
result = self.agg(f) | ||
return result.__finalize__(self.obj, method="groupby") | ||
|
||
if maybe_use_numba(engine): | ||
from pandas.core._numba.kernels import sliding_min_max | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this belongs below, in the Groupby/resample/rolling section?