From fa96aff292ddc12fbf536eef2abdb6b7739a99c1 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 1 Oct 2021 11:47:06 -0700 Subject: [PATCH 1/5] Merge master i think --- pandas/core/groupby/generic.py | 6 ++++++ pandas/core/groupby/groupby.py | 14 ++++++++++++++ pandas/core/groupby/ops.py | 8 ++++++++ pandas/tests/groupby/test_groupby.py | 24 ++++++++++++++++++++++++ 4 files changed, 52 insertions(+) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 59c57cf4a1ea0..2e4b5dc42a80d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1145,6 +1145,10 @@ def _wrap_applied_output(self, data, keys, values, not_indexed_same=False): index=self.grouper.result_index, columns=data.columns ) result = result.astype(data.dtypes.to_dict(), copy=False) + # TODO: will this astype mess up with non-unique columns? + #if not len(values): + # breakpoint() + # #result = result[[]] return result # GH12824 @@ -1824,6 +1828,7 @@ def func(df): result = [index[i] if i >= 0 else np.nan for i in indices] return df._constructor_sliced(result, index=res.index) + func.__name__ = "idxmax" return self._python_apply_general(func, self._obj_with_exclusions) @Appender(DataFrame.idxmin.__doc__) @@ -1845,6 +1850,7 @@ def func(df): result = [index[i] if i >= 0 else np.nan for i in indices] return df._constructor_sliced(result, index=res.index) + func.__name__ = "idxmin" return self._python_apply_general(func, self._obj_with_exclusions) boxplot = boxplot_frame_groupby diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 939cff16bf1ae..6b4dec1374561 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -942,6 +942,11 @@ def curried(x): if name in base.plotting_methods: return self.apply(curried) + #if self.ndim == 1 and self._obj_with_exclusions.size == 0 and name not in ["idxmax", "idxmin"]:# and self._obj_with_exclusions.dtype != object: + # # make sure the function is called at least once so we get + # # a TypeError if appropriate + # curried(self._obj_with_exclusions.iloc[:0]) + return self._python_apply_general(curried, self._obj_with_exclusions) wrapper.__name__ = name @@ -1257,6 +1262,14 @@ def f(g): # ignore SettingWithCopy here in case the user mutates with option_context("mode.chained_assignment", None): + if not callable(f): + if hasattr(self, f): + res = getattr(self, f) + if inspect.isfunction(res) or inspect.ismethod(res): + return res() + return res + + raise TypeError(f"invalid func {f}") try: result = self._python_apply_general(f, self._selected_obj) except TypeError: @@ -1302,6 +1315,7 @@ def _python_apply_general( def _python_agg_general(self, func, *args, **kwargs): func = com.is_builtin_func(func) f = lambda x: func(x, *args, **kwargs) + f.__name__ = func.__name__ # iterate through "columns" ex exclusions to populate output dict output: dict[base.OutputKey, ArrayLike] = {} diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 6d8881d12dbb7..ce57898739431 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -720,6 +720,8 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): group_keys = self._get_group_keys() result_values = None + assert callable(f) + if data.ndim == 2 and any( isinstance(x, ExtensionArray) for x in data._iter_column_arrays() ): @@ -783,6 +785,12 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): mutated = True result_values.append(res) + if not isinstance(f, str): # TODO: WTF? + if len(group_keys) == 0 and f.__name__ not in ["idxmin", "idxmax", "nanargmin", "nanargmax"]: + # there have been zero function calls, so we do one dummy call + # so that we can raise TypeError where appropriate + f(data.iloc[:0]) + return group_keys, result_values, mutated @cache_readonly diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 538a707aa3580..06b2cff1211f5 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1898,6 +1898,30 @@ def get_result(): tm.assert_equal(result, expected) return + if op in ["mad", "min", "max", "skew"] and isinstance(values, Categorical): + # Categorical doesn't implement, so with numeric_only=True + # these are dropped and we get an empty DataFrame back + result = get_result() + expected = df.set_index(keys)[[]] + + # with numeric_only=True, these are dropped, and we get + # an empty DataFrame back + if len(keys) != 1: + # Categorical is special without 'observed=True' + lev = Categorical([0], dtype=values.dtype) + mi = MultiIndex.from_product([lev, lev], names=keys) + expected = DataFrame([], columns=[], index=mi) + else: + # all columns are dropped, but we end up with one row + # Categorical is special without 'observed=True' + lev = Categorical([0], dtype=values.dtype) + ci = Index(lev, name=keys[0]) + expected = DataFrame([], columns=[], index=ci) + #expected = df.set_index(keys)[columns] + + tm.assert_equal(result, expected) + return + result = get_result() expected = df.set_index(keys)[columns] if override_dtype is not None: From dd4c10c4657bee97ebbf244cb3e0e398344bfbe5 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 1 Oct 2021 11:47:29 -0700 Subject: [PATCH 2/5] merge mater i think --- pandas/core/groupby/generic.py | 2 +- pandas/core/groupby/groupby.py | 2 +- pandas/core/groupby/ops.py | 7 ++++++- pandas/tests/groupby/test_groupby.py | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2e4b5dc42a80d..8866db1be1796 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1146,7 +1146,7 @@ def _wrap_applied_output(self, data, keys, values, not_indexed_same=False): ) result = result.astype(data.dtypes.to_dict(), copy=False) # TODO: will this astype mess up with non-unique columns? - #if not len(values): + # if not len(values): # breakpoint() # #result = result[[]] return result diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6b4dec1374561..1b49db8a46128 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -942,7 +942,7 @@ def curried(x): if name in base.plotting_methods: return self.apply(curried) - #if self.ndim == 1 and self._obj_with_exclusions.size == 0 and name not in ["idxmax", "idxmin"]:# and self._obj_with_exclusions.dtype != object: + # if self.ndim == 1 and self._obj_with_exclusions.size == 0 and name not in ["idxmax", "idxmin"]:# and self._obj_with_exclusions.dtype != object: # # make sure the function is called at least once so we get # # a TypeError if appropriate # curried(self._obj_with_exclusions.iloc[:0]) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index ce57898739431..ae14c062114ac 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -786,7 +786,12 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): result_values.append(res) if not isinstance(f, str): # TODO: WTF? - if len(group_keys) == 0 and f.__name__ not in ["idxmin", "idxmax", "nanargmin", "nanargmax"]: + if len(group_keys) == 0 and f.__name__ not in [ + "idxmin", + "idxmax", + "nanargmin", + "nanargmax", + ]: # there have been zero function calls, so we do one dummy call # so that we can raise TypeError where appropriate f(data.iloc[:0]) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 06b2cff1211f5..e1e84f179c46d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1917,7 +1917,7 @@ def get_result(): lev = Categorical([0], dtype=values.dtype) ci = Index(lev, name=keys[0]) expected = DataFrame([], columns=[], index=ci) - #expected = df.set_index(keys)[columns] + # expected = df.set_index(keys)[columns] tm.assert_equal(result, expected) return From 9db6fab82eda0da0ff5c109cf6318da8c7a33d52 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 18 Oct 2021 13:54:11 -0700 Subject: [PATCH 3/5] TST: fix groupby-empty xfails --- pandas/core/groupby/generic.py | 4 -- pandas/core/groupby/groupby.py | 14 ----- pandas/core/groupby/ops.py | 22 +++---- pandas/tests/groupby/test_groupby.py | 93 ++++++++++++++++++---------- 4 files changed, 70 insertions(+), 63 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 886fa3315e9ec..3b54918ae99c1 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -987,10 +987,6 @@ def _wrap_applied_output( index=self.grouper.result_index, columns=data.columns ) result = result.astype(data.dtypes.to_dict(), copy=False) - # TODO: will this astype mess up with non-unique columns? - # if not len(values): - # breakpoint() - # #result = result[[]] return result # GH12824 diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 90e048e38b7f7..07cef290c8919 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -939,11 +939,6 @@ def curried(x): if name in base.plotting_methods: return self.apply(curried) - # if self.ndim == 1 and self._obj_with_exclusions.size == 0 and name not in ["idxmax", "idxmin"]:# and self._obj_with_exclusions.dtype != object: - # # make sure the function is called at least once so we get - # # a TypeError if appropriate - # curried(self._obj_with_exclusions.iloc[:0]) - return self._python_apply_general(curried, self._obj_with_exclusions) wrapper.__name__ = name @@ -1368,14 +1363,6 @@ def f(g): # ignore SettingWithCopy here in case the user mutates with option_context("mode.chained_assignment", None): - if not callable(f): - if hasattr(self, f): - res = getattr(self, f) - if inspect.isfunction(res) or inspect.ismethod(res): - return res() - return res - - raise TypeError(f"invalid func {f}") try: result = self._python_apply_general(f, self._selected_obj) except TypeError: @@ -1431,7 +1418,6 @@ def _python_apply_general( def _python_agg_general(self, func, *args, **kwargs): func = com.is_builtin_func(func) f = lambda x: func(x, *args, **kwargs) - f.__name__ = func.__name__ # iterate through "columns" ex exclusions to populate output dict output: dict[base.OutputKey, ArrayLike] = {} diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index e0ede565e3cac..3032f77e7c28e 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -737,7 +737,6 @@ def apply( ) -> tuple[list, bool]: mutated = self.mutated splitter = self._get_splitter(data, axis=axis) - group_keys = self.group_keys_seq result_values = [] @@ -754,16 +753,17 @@ def apply( mutated = True result_values.append(res) - if not isinstance(f, str): # TODO: WTF? - if len(group_keys) == 0 and f.__name__ not in [ - "idxmin", - "idxmax", - "nanargmin", - "nanargmax", - ]: - # there have been zero function calls, so we do one dummy call - # so that we can raise TypeError where appropriate - f(data.iloc[:0]) + # gettr pattern for __name__ is needed for functools.partial objects + if len(group_keys) == 0 and getattr(f, "__name__", None) not in [ + "idxmin", + "idxmax", + "nanargmin", + "nanargmax", + ]: + # If group_keys is empty, then no function calls have been made, + # so we will not have raised even if this is an invalid dtype. + # So do one dummy call here to raise appropriate TypeError. + f(data.iloc[:0]) return result_values, mutated diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 71bb8c569f69a..00c37b9aa5fb5 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -24,6 +24,11 @@ to_datetime, ) import pandas._testing as tm +from pandas.core.arrays import ( + BooleanArray, + FloatingArray, + IntegerArray, +) from pandas.core.base import SpecificationError import pandas.core.common as com @@ -1829,7 +1834,7 @@ def test_empty_groupby(columns, keys, values, method, op, request): if ( isinstance(values, Categorical) and not isinstance(columns, list) - and op in ["sum", "prod"] + and op in ["sum", "prod", "skew", "mad"] ): # handled below GH#41291 pass @@ -1851,11 +1856,7 @@ def test_empty_groupby(columns, keys, values, method, op, request): raises=TypeError, match="'Categorical' does not implement" ) request.node.add_marker(mark) - elif ( - isinstance(values, Categorical) - and len(keys) == 1 - and op in ["mad", "min", "max", "sum", "prod", "skew"] - ): + elif isinstance(values, Categorical) and len(keys) == 1 and op in ["sum", "prod"]: mark = pytest.mark.xfail( raises=AssertionError, match="(DataFrame|Series) are different" ) @@ -1869,7 +1870,17 @@ def test_empty_groupby(columns, keys, values, method, op, request): raises=AssertionError, match="(DataFrame|Series) are different" ) request.node.add_marker(mark) - elif isinstance(values, pd.core.arrays.BooleanArray) and op in ["sum", "prod"]: + elif ( + isinstance(values, (IntegerArray, FloatingArray)) + and op == "mad" + and isinstance(columns, list) + ): + mark = pytest.mark.xfail( + raises=TypeError, match="can only perform ops with numeric values" + ) + request.node.add_marker(mark) + + elif isinstance(values, BooleanArray) and op in ["sum", "prod"]: # We expect to get Int64 back for these override_dtype = "Int64" @@ -1895,19 +1906,29 @@ def get_result(): if columns == "C": # i.e. SeriesGroupBy - if op in ["prod", "sum"]: + if op in ["prod", "sum", "skew"]: # ops that require more than just ordered-ness if df.dtypes[0].kind == "M": # GH#41291 # datetime64 -> prod and sum are invalid - msg = "datetime64 type does not support" + if op == "skew": + msg = "'DatetimeArray' does not implement reduction 'skew'" + else: + msg = "datetime64 type does not support" with pytest.raises(TypeError, match=msg): get_result() return - elif isinstance(values, Categorical): + if op in ["prod", "sum", "skew", "mad"]: + if isinstance(values, Categorical): # GH#41291 - msg = "category type does not support" + if op == "mad": + # mad calls mean, which Categorical doesn't implement + msg = "'Categorical' does not implement reduction 'mean'" + elif op == "skew": + msg = f"'Categorical' does not implement reduction '{op}'" + else: + msg = "category type does not support" with pytest.raises(TypeError, match=msg): get_result() @@ -1954,29 +1975,33 @@ def get_result(): tm.assert_equal(result, expected) return - if op in ["mad", "min", "max", "skew"] and isinstance(values, Categorical): - # Categorical doesn't implement, so with numeric_only=True - # these are dropped and we get an empty DataFrame back - result = get_result() - expected = df.set_index(keys)[[]] - - # with numeric_only=True, these are dropped, and we get - # an empty DataFrame back - if len(keys) != 1: - # Categorical is special without 'observed=True' - lev = Categorical([0], dtype=values.dtype) - mi = MultiIndex.from_product([lev, lev], names=keys) - expected = DataFrame([], columns=[], index=mi) - else: - # all columns are dropped, but we end up with one row - # Categorical is special without 'observed=True' - lev = Categorical([0], dtype=values.dtype) - ci = Index(lev, name=keys[0]) - expected = DataFrame([], columns=[], index=ci) - # expected = df.set_index(keys)[columns] - - tm.assert_equal(result, expected) - return + if ( + op in ["mad", "min", "max", "skew"] + and isinstance(values, Categorical) + and len(keys) == 1 + ): + # Categorical doesn't implement, so with numeric_only=True + # these are dropped and we get an empty DataFrame back + result = get_result() + expected = df.set_index(keys)[[]] + + # with numeric_only=True, these are dropped, and we get + # an empty DataFrame back + if len(keys) != 1: + # Categorical is special without 'observed=True' + lev = Categorical([0], dtype=values.dtype) + mi = MultiIndex.from_product([lev, lev], names=keys) + expected = DataFrame([], columns=[], index=mi) + else: + # all columns are dropped, but we end up with one row + # Categorical is special without 'observed=True' + lev = Categorical([0], dtype=values.dtype) + ci = Index(lev, name=keys[0]) + expected = DataFrame([], columns=[], index=ci) + # expected = df.set_index(keys)[columns] + + tm.assert_equal(result, expected) + return result = get_result() expected = df.set_index(keys)[columns] From 41bce1bd17b09c62fc0d2abe2b7eda70eb71710b Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 18 Oct 2021 14:04:00 -0700 Subject: [PATCH 4/5] typo fixup --- pandas/core/groupby/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 3032f77e7c28e..60c8851f059fe 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -753,7 +753,7 @@ def apply( mutated = True result_values.append(res) - # gettr pattern for __name__ is needed for functools.partial objects + # getattr pattern for __name__ is needed for functools.partial objects if len(group_keys) == 0 and getattr(f, "__name__", None) not in [ "idxmin", "idxmax", From 78a747fea40a3b31f96b791688b881bc56f94fcd Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 20 Oct 2021 11:32:07 -0700 Subject: [PATCH 5/5] TST: ArrayManager compat --- pandas/tests/groupby/test_groupby.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 00c37b9aa5fb5..203d8abb465d0 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1827,7 +1827,7 @@ def test_pivot_table_values_key_error(): ) @pytest.mark.filterwarnings("ignore:Dropping invalid columns:FutureWarning") @pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning") -def test_empty_groupby(columns, keys, values, method, op, request): +def test_empty_groupby(columns, keys, values, method, op, request, using_array_manager): # GH8093 & GH26411 override_dtype = None @@ -1837,7 +1837,13 @@ def test_empty_groupby(columns, keys, values, method, op, request): and op in ["sum", "prod", "skew", "mad"] ): # handled below GH#41291 - pass + + if using_array_manager and op == "mad": + right_msg = "Cannot interpret 'CategoricalDtype.* as a data type" + msg = "Regex pattern \"'Categorical' does not implement.*" + right_msg + mark = pytest.mark.xfail(raises=AssertionError, match=msg) + request.node.add_marker(mark) + elif ( isinstance(values, Categorical) and len(keys) == 1 @@ -1880,6 +1886,19 @@ def test_empty_groupby(columns, keys, values, method, op, request): ) request.node.add_marker(mark) + elif ( + op == "mad" + and not isinstance(columns, list) + and isinstance(values, pd.DatetimeIndex) + and values.tz is not None + and using_array_manager + ): + mark = pytest.mark.xfail( + raises=TypeError, + match=r"Cannot interpret 'datetime64\[ns, US/Eastern\]' as a data type", + ) + request.node.add_marker(mark) + elif isinstance(values, BooleanArray) and op in ["sum", "prod"]: # We expect to get Int64 back for these override_dtype = "Int64"