From 91aa285144d2a3dd5c276eec549f3cdbe01772f2 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Thu, 26 Aug 2021 00:14:22 +0530 Subject: [PATCH 01/26] BUG: groupby agg fails silently with mixed dtypes --- doc/source/whatsnew/v1.3.3.rst | 2 +- pandas/core/groupby/generic.py | 4 +++- pandas/tests/groupby/test_groupby.py | 14 ++++++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.3.rst b/doc/source/whatsnew/v1.3.3.rst index 1340188c3d609..f4e4134346e15 100644 --- a/doc/source/whatsnew/v1.3.3.rst +++ b/doc/source/whatsnew/v1.3.3.rst @@ -17,7 +17,7 @@ Fixed regressions - Fixed regression in :class:`DataFrame` constructor failing to broadcast for defined :class:`Index` and len one list of :class:`Timestamp` (:issue:`42810`) - Performance regression in :meth:`core.window.ewm.ExponentialMovingWindow.mean` (:issue:`42333`) - Fixed regression in :meth:`.GroupBy.agg` incorrectly raising in some cases (:issue:`42390`) -- +- Fixed regression in :meth:`.GroupBy.sum`, :meth:`.GroupBy.std` and :meth:`.GroupBy.var` (:issue:`43209`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 35cb247e96bc3..5409f33c238f6 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -41,6 +41,7 @@ doc, ) +from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( ensure_int64, is_bool, @@ -1597,7 +1598,8 @@ def _gotitem(self, key, ndim: int, subset=None): def _get_data_to_aggregate(self) -> Manager2D: obj = self._obj_with_exclusions if self.axis == 1: - return obj.T._mgr + transposed_dtype = find_common_type(obj.dtypes.tolist()) + return obj.T.astype(transposed_dtype)._mgr else: return obj._mgr diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index a714abd461461..3d91cd4354178 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2419,3 +2419,17 @@ def test_rolling_wrong_param_min_period(): result_error_msg = r"__init__\(\) got an unexpected keyword argument 'min_period'" with pytest.raises(TypeError, match=result_error_msg): test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum() + + +@pytest.mark.parametrize("dtype", [int, "Int64"]) +def test_multiindex_groupby(dtype): + # GH#43209 + df = DataFrame( + [[1, 2, 3, 4, 5, 6]] * 3, + columns=MultiIndex.from_product([["a", "b"], ["i", "j", "k"]]), + ).astype({("a", "j"): dtype, ("b", "j"): dtype}) + result = df.groupby(level=1, axis=1).sum() + expected = DataFrame( + [[5, 7, 9], [5, 7, 9], [5, 7, 9]], columns=["i", "j", "k"], dtype=dtype + ) + tm.assert_frame_equal(result, expected) From 9b9b7af8f0c4cbfafce4a29959101a72f4122323 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Thu, 26 Aug 2021 00:18:18 +0530 Subject: [PATCH 02/26] updated whatsnew --- doc/source/whatsnew/v1.3.3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.3.rst b/doc/source/whatsnew/v1.3.3.rst index f4e4134346e15..f7e3609301991 100644 --- a/doc/source/whatsnew/v1.3.3.rst +++ b/doc/source/whatsnew/v1.3.3.rst @@ -17,7 +17,7 @@ Fixed regressions - Fixed regression in :class:`DataFrame` constructor failing to broadcast for defined :class:`Index` and len one list of :class:`Timestamp` (:issue:`42810`) - Performance regression in :meth:`core.window.ewm.ExponentialMovingWindow.mean` (:issue:`42333`) - Fixed regression in :meth:`.GroupBy.agg` incorrectly raising in some cases (:issue:`42390`) -- Fixed regression in :meth:`.GroupBy.sum`, :meth:`.GroupBy.std` and :meth:`.GroupBy.var` (:issue:`43209`) +- Fixed regression in :meth:`.GroupBy.agg` where it was failing silently with mixed data types (:issue:`43209`) .. --------------------------------------------------------------------------- From d968e5772f08b399eba956fa4f59d0a003fc6a16 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Thu, 26 Aug 2021 01:01:06 +0530 Subject: [PATCH 03/26] added tests for var and std --- pandas/core/groupby/generic.py | 3 ++- pandas/tests/groupby/test_groupby.py | 15 +++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 5409f33c238f6..9d6ac77223146 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1057,7 +1057,8 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) def _iterate_slices(self) -> Iterable[Series]: obj = self._selected_obj if self.axis == 1: - obj = obj.T + transposed_dtype = find_common_type(obj.dtypes.tolist()) + obj = obj.T.astype(transposed_dtype) if isinstance(obj, Series) and obj.name not in self.exclusions: # Occurs when doing DataFrameGroupBy(...)["X"] diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 3d91cd4354178..eb1b31f65655a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2421,15 +2421,18 @@ def test_rolling_wrong_param_min_period(): test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum() +@pytest.mark.parametrize( + "func, expected", + [("sum", [5, 7, 9]), ("std", [4.5 ** 0.5] * 3), ("var", [4.5] * 3)], +) @pytest.mark.parametrize("dtype", [int, "Int64"]) -def test_multiindex_groupby(dtype): +def test_multiindex_groupby(dtype, func, expected): # GH#43209 df = DataFrame( [[1, 2, 3, 4, 5, 6]] * 3, columns=MultiIndex.from_product([["a", "b"], ["i", "j", "k"]]), ).astype({("a", "j"): dtype, ("b", "j"): dtype}) - result = df.groupby(level=1, axis=1).sum() - expected = DataFrame( - [[5, 7, 9], [5, 7, 9], [5, 7, 9]], columns=["i", "j", "k"], dtype=dtype - ) - tm.assert_frame_equal(result, expected) + result = df.groupby(level=1, axis=1).agg(func) + out_dtype = dtype if func == "sum" else None + expected = DataFrame([expected] * 3, columns=["i", "j", "k"], dtype=out_dtype) + tm.assert_frame_equal(result, expected, check_dtype=False) From 46a29e05bae0b0a77d6cc617f27fb649be1adece Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Mon, 30 Aug 2021 20:13:30 +0530 Subject: [PATCH 04/26] reorganized tests --- pandas/tests/groupby/test_groupby.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index eb1b31f65655a..52d48800519a0 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2422,17 +2422,23 @@ def test_rolling_wrong_param_min_period(): @pytest.mark.parametrize( - "func, expected", - [("sum", [5, 7, 9]), ("std", [4.5 ** 0.5] * 3), ("var", [4.5] * 3)], + "func, expected, dtype, result_dtype", + [ + ("sum", [5, 7, 9], int, int), + ("std", [4.5 ** 0.5] * 3, int, float), + ("var", [4.5] * 3, int, float), + ("sum", [5, 7, 9], "Int64", "Int64"), + # result_dtype should ideally be Float64 + ("std", [4.5 ** 0.5] * 3, "Int64", float), + ("var", [4.5] * 3, "Int64", "Float64"), + ], ) -@pytest.mark.parametrize("dtype", [int, "Int64"]) -def test_multiindex_groupby(dtype, func, expected): +def test_multiindex_groupby(func, expected, dtype, result_dtype): # GH#43209 df = DataFrame( [[1, 2, 3, 4, 5, 6]] * 3, columns=MultiIndex.from_product([["a", "b"], ["i", "j", "k"]]), ).astype({("a", "j"): dtype, ("b", "j"): dtype}) result = df.groupby(level=1, axis=1).agg(func) - out_dtype = dtype if func == "sum" else None - expected = DataFrame([expected] * 3, columns=["i", "j", "k"], dtype=out_dtype) - tm.assert_frame_equal(result, expected, check_dtype=False) + expected = DataFrame([expected] * 3, columns=["i", "j", "k"], dtype=result_dtype) + tm.assert_frame_equal(result, expected) From 16e28db834c477d28d9eaa45eb6656c8a99707b7 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Mon, 30 Aug 2021 22:03:10 +0530 Subject: [PATCH 05/26] specified int64 in result --- pandas/tests/groupby/test_groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 52d48800519a0..1afd448759a31 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2424,7 +2424,7 @@ def test_rolling_wrong_param_min_period(): @pytest.mark.parametrize( "func, expected, dtype, result_dtype", [ - ("sum", [5, 7, 9], int, int), + ("sum", [5, 7, 9], int, "int64"), ("std", [4.5 ** 0.5] * 3, int, float), ("var", [4.5] * 3, int, float), ("sum", [5, 7, 9], "Int64", "Int64"), From 854ecda409b2505d8018e376290f08c7244b55f4 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Tue, 31 Aug 2021 21:25:47 +0530 Subject: [PATCH 06/26] added copy=False to astype --- pandas/core/groupby/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9d6ac77223146..1987baa96e44c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1058,7 +1058,7 @@ def _iterate_slices(self) -> Iterable[Series]: obj = self._selected_obj if self.axis == 1: transposed_dtype = find_common_type(obj.dtypes.tolist()) - obj = obj.T.astype(transposed_dtype) + obj = obj.T.astype(transposed_dtype, copy=False) if isinstance(obj, Series) and obj.name not in self.exclusions: # Occurs when doing DataFrameGroupBy(...)["X"] @@ -1600,7 +1600,7 @@ def _get_data_to_aggregate(self) -> Manager2D: obj = self._obj_with_exclusions if self.axis == 1: transposed_dtype = find_common_type(obj.dtypes.tolist()) - return obj.T.astype(transposed_dtype)._mgr + return obj.T.astype(transposed_dtype, copy=False)._mgr else: return obj._mgr From 7dd27f3ff34220073628f91e71e4b4110e121de9 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Wed, 1 Sep 2021 01:26:36 +0530 Subject: [PATCH 07/26] updated whatsnew --- doc/source/whatsnew/v1.3.3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.3.rst b/doc/source/whatsnew/v1.3.3.rst index 29eb5e2d95650..af39c7fd492fe 100644 --- a/doc/source/whatsnew/v1.3.3.rst +++ b/doc/source/whatsnew/v1.3.3.rst @@ -17,7 +17,7 @@ Fixed regressions - Fixed regression in :class:`DataFrame` constructor failing to broadcast for defined :class:`Index` and len one list of :class:`Timestamp` (:issue:`42810`) - Performance regression in :meth:`core.window.ewm.ExponentialMovingWindow.mean` (:issue:`42333`) - Fixed regression in :meth:`.GroupBy.agg` incorrectly raising in some cases (:issue:`42390`) -- Fixed regression in :meth:`.GroupBy.agg` where it was failing silently with mixed data types (:issue:`43209`) +- Fixed regression in :meth:`.GroupBy.agg` where it was failing silently with mixed data types along ``axis=1`` and :class:`Multiindex` (:issue:`43209`) - Fixed regression in :meth:`RangeIndex.where` and :meth:`RangeIndex.putmask` raising ``AssertionError`` when result did not represent a :class:`RangeIndex` (:issue:`43240`) .. --------------------------------------------------------------------------- From 309ee59478023166021384db6edc6fe898d5394f Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Wed, 1 Sep 2021 01:28:26 +0530 Subject: [PATCH 08/26] typo corrected --- doc/source/whatsnew/v1.3.3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.3.rst b/doc/source/whatsnew/v1.3.3.rst index af39c7fd492fe..1a18e4485997b 100644 --- a/doc/source/whatsnew/v1.3.3.rst +++ b/doc/source/whatsnew/v1.3.3.rst @@ -17,7 +17,7 @@ Fixed regressions - Fixed regression in :class:`DataFrame` constructor failing to broadcast for defined :class:`Index` and len one list of :class:`Timestamp` (:issue:`42810`) - Performance regression in :meth:`core.window.ewm.ExponentialMovingWindow.mean` (:issue:`42333`) - Fixed regression in :meth:`.GroupBy.agg` incorrectly raising in some cases (:issue:`42390`) -- Fixed regression in :meth:`.GroupBy.agg` where it was failing silently with mixed data types along ``axis=1`` and :class:`Multiindex` (:issue:`43209`) +- Fixed regression in :meth:`.GroupBy.agg` where it was failing silently with mixed data types along ``axis=1`` and :class:`MultiIndex` (:issue:`43209`) - Fixed regression in :meth:`RangeIndex.where` and :meth:`RangeIndex.putmask` raising ``AssertionError`` when result did not represent a :class:`RangeIndex` (:issue:`43240`) .. --------------------------------------------------------------------------- From 8d1bfb1cb92e3d632f460334df30f10824c0fad1 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Wed, 1 Sep 2021 11:20:01 +0530 Subject: [PATCH 09/26] added issue ref --- pandas/core/groupby/generic.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1987baa96e44c..de7627515ce68 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1057,6 +1057,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) def _iterate_slices(self) -> Iterable[Series]: obj = self._selected_obj if self.axis == 1: + # To be removed post #GH #43337 fix transposed_dtype = find_common_type(obj.dtypes.tolist()) obj = obj.T.astype(transposed_dtype, copy=False) @@ -1599,6 +1600,7 @@ def _gotitem(self, key, ndim: int, subset=None): def _get_data_to_aggregate(self) -> Manager2D: obj = self._obj_with_exclusions if self.axis == 1: + # To be removed post #GH #43337 fix transposed_dtype = find_common_type(obj.dtypes.tolist()) return obj.T.astype(transposed_dtype, copy=False)._mgr else: From 41471aa464962cbc64450f588482d21025e8b50b Mon Sep 17 00:00:00 2001 From: Shoham Debnath Date: Thu, 2 Sep 2021 23:27:05 +0530 Subject: [PATCH 10/26] added issue ref to test --- pandas/tests/groupby/test_groupby.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 1afd448759a31..91d2c00843cbe 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2429,6 +2429,7 @@ def test_rolling_wrong_param_min_period(): ("var", [4.5] * 3, int, float), ("sum", [5, 7, 9], "Int64", "Int64"), # result_dtype should ideally be Float64 + # GH#43330 ("std", [4.5 ** 0.5] * 3, "Int64", float), ("var", [4.5] * 3, "Int64", "Float64"), ], From d6992e54f4fd684a1e291e7409209d4840c914b2 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sun, 5 Sep 2021 21:06:43 +0530 Subject: [PATCH 11/26] reverted old; raising DataError as 1.2.5 --- pandas/core/groupby/generic.py | 17 ++++---- pandas/core/groupby/groupby.py | 61 +++++++++++++++++++--------- pandas/tests/groupby/test_groupby.py | 22 +++++----- 3 files changed, 61 insertions(+), 39 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index de7627515ce68..a4b366b118b8b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -41,7 +41,6 @@ doc, ) -from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( ensure_int64, is_bool, @@ -67,7 +66,10 @@ validate_func_kwargs, ) from pandas.core.apply import GroupByApply -from pandas.core.base import SpecificationError +from pandas.core.base import ( + DataError, + SpecificationError, +) import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame @@ -1057,9 +1059,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) def _iterate_slices(self) -> Iterable[Series]: obj = self._selected_obj if self.axis == 1: - # To be removed post #GH #43337 fix - transposed_dtype = find_common_type(obj.dtypes.tolist()) - obj = obj.T.astype(transposed_dtype, copy=False) + obj = obj.T if isinstance(obj, Series) and obj.name not in self.exclusions: # Occurs when doing DataFrameGroupBy(...)["X"] @@ -1099,6 +1099,9 @@ def array_func(values: ArrayLike) -> ArrayLike: # continue and exclude the block new_mgr = data.grouped_reduce(array_func, ignore_failures=True) + if not len(new_mgr) and len(data): + raise DataError("No numeric types to aggregate") + if len(new_mgr) < len(data): warnings.warn( f"Dropping invalid columns in {type(self).__name__}.{how} " @@ -1600,9 +1603,7 @@ def _gotitem(self, key, ndim: int, subset=None): def _get_data_to_aggregate(self) -> Manager2D: obj = self._obj_with_exclusions if self.axis == 1: - # To be removed post #GH #43337 fix - transposed_dtype = find_common_type(obj.dtypes.tolist()) - return obj.T.astype(transposed_dtype, copy=False)._mgr + return obj.T._mgr else: return obj._mgr diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 79ee71ddb1047..5852794631d87 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1361,12 +1361,19 @@ def _agg_general( with group_selection_context(self): # try a cython aggregation if we can - result = self._cython_agg_general( - how=alias, - alt=npfunc, - numeric_only=numeric_only, - min_count=min_count, - ) + result = None + try: + result = self._cython_agg_general( + how=alias, + alt=npfunc, + numeric_only=numeric_only, + min_count=min_count, + ) + except DataError: + pass + + if result is None: + result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) return result.__finalize__(self.obj, method="groupby") def _agg_py_fallback( @@ -1740,16 +1747,21 @@ def std(self, ddof: int = 1): Series or DataFrame Standard deviation of values within each group. """ - return self._get_cythonized_result( - libgroupby.group_var, - aggregate=True, - needs_counts=True, - needs_values=True, - needs_2d=True, - cython_dtype=np.dtype(np.float64), - post_processing=lambda vals, inference: np.sqrt(vals), - ddof=ddof, - ) + try: + return self._get_cythonized_result( + libgroupby.group_var, + aggregate=True, + needs_counts=True, + needs_values=True, + needs_2d=True, + cython_dtype=np.dtype(np.float64), + post_processing=lambda vals, inference: np.sqrt(vals), + ddof=ddof, + ) + except DataError: + func = lambda x: x.std(ddof=ddof) + with group_selection_context(self): + return self._python_agg_general(func) @final @Substitution(name="groupby") @@ -1771,10 +1783,17 @@ def var(self, ddof: int = 1): Variance of values within each group. """ if ddof == 1: - numeric_only = self._resolve_numeric_only(lib.no_default) - return self._cython_agg_general( - "var", alt=lambda x: Series(x).var(ddof=ddof), numeric_only=numeric_only - ) + try: + numeric_only = self._resolve_numeric_only(lib.no_default) + return self._cython_agg_general( + "var", + alt=lambda x: Series(x).var(ddof=ddof), + numeric_only=numeric_only, + ) + except DataError: + func = lambda x: x.var(ddof=ddof) + with group_selection_context(self): + return self._python_agg_general(func) else: func = lambda x: x.var(ddof=ddof) with group_selection_context(self): @@ -3049,6 +3068,8 @@ def blk_func(values: ArrayLike) -> ArrayLike: # error_msg is "" on an frame/series with no rows or columns if not output and error_msg != "": raise TypeError(error_msg) + elif not output and error_msg == "" and not self._obj_with_exclusions.empty: + raise DataError("No numeric types to aggregate") if aggregate: return self._wrap_aggregated_output(output) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c96e0c94a7134..13bc1f7912319 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2446,24 +2446,24 @@ def test_rolling_wrong_param_min_period(): @pytest.mark.parametrize( - "func, expected, dtype, result_dtype", + "func, expected, dtype, result_dtype_dict", [ - ("sum", [5, 7, 9], int, "int64"), - ("std", [4.5 ** 0.5] * 3, int, float), - ("var", [4.5] * 3, int, float), - ("sum", [5, 7, 9], "Int64", "Int64"), - # result_dtype should ideally be Float64 - # GH#43330 - ("std", [4.5 ** 0.5] * 3, "Int64", float), - ("var", [4.5] * 3, "Int64", "Float64"), + ("sum", [5, 7, 9], "int64", {}), + ("std", [4.5 ** 0.5] * 3, int, {"i": float, "j": float, "k": float}), + ("var", [4.5] * 3, int, {"i": float, "j": float, "k": float}), + ("sum", [5, 7, 9], "Int64", {"j": "float64"}), + ("std", [4.5 ** 0.5] * 3, "Int64", {"i": float, "j": float, "k": float}), + ("var", [4.5] * 3, "Int64", {"i": float, "j": float, "k": float}), ], ) -def test_multiindex_groupby(func, expected, dtype, result_dtype): +def test_multiindex_groupby(func, expected, dtype, result_dtype_dict): # GH#43209 df = DataFrame( [[1, 2, 3, 4, 5, 6]] * 3, columns=MultiIndex.from_product([["a", "b"], ["i", "j", "k"]]), ).astype({("a", "j"): dtype, ("b", "j"): dtype}) result = df.groupby(level=1, axis=1).agg(func) - expected = DataFrame([expected] * 3, columns=["i", "j", "k"], dtype=result_dtype) + expected = DataFrame([expected] * 3, columns=["i", "j", "k"]).astype( + result_dtype_dict + ) tm.assert_frame_equal(result, expected) From ab1fd879ec856efecbae53cf980dc13eb1a25efd Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sun, 5 Sep 2021 23:27:01 +0530 Subject: [PATCH 12/26] used _selected_obj instead of mgr --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a4b366b118b8b..05253a849541f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1099,7 +1099,7 @@ def array_func(values: ArrayLike) -> ArrayLike: # continue and exclude the block new_mgr = data.grouped_reduce(array_func, ignore_failures=True) - if not len(new_mgr) and len(data): + if not len(new_mgr) and len(self._selected_obj): raise DataError("No numeric types to aggregate") if len(new_mgr) < len(data): From 9039124269b055e8cf54da73e169d5168bb3ec07 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Mon, 6 Sep 2021 00:23:48 +0530 Subject: [PATCH 13/26] =0 --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 05253a849541f..e4db5084d17f5 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1099,7 +1099,7 @@ def array_func(values: ArrayLike) -> ArrayLike: # continue and exclude the block new_mgr = data.grouped_reduce(array_func, ignore_failures=True) - if not len(new_mgr) and len(self._selected_obj): + if not len(new_mgr) and len(self._selected_obj) == 0: raise DataError("No numeric types to aggregate") if len(new_mgr) < len(data): From 64ca85a9004d141d86d1968dc91b0336b31cf2e8 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Fri, 10 Sep 2021 00:22:11 +0530 Subject: [PATCH 14/26] draft --- pandas/core/groupby/generic.py | 17 +++++----- pandas/core/groupby/groupby.py | 61 ++++++++++++---------------------- 2 files changed, 30 insertions(+), 48 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 240c1eac34e03..150b8f6832ce3 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -41,6 +41,7 @@ doc, ) +from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( ensure_int64, is_bool, @@ -66,10 +67,7 @@ validate_func_kwargs, ) from pandas.core.apply import GroupByApply -from pandas.core.base import ( - DataError, - SpecificationError, -) +from pandas.core.base import SpecificationError import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame @@ -1041,7 +1039,9 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) def _iterate_slices(self) -> Iterable[Series]: obj = self._selected_obj if self.axis == 1: - obj = obj.T + # To be removed post #GH #43337 fix + transposed_dtype = find_common_type(obj.dtypes.tolist()) + obj = obj.T.astype(transposed_dtype, copy=False) if isinstance(obj, Series) and obj.name not in self.exclusions: # Occurs when doing DataFrameGroupBy(...)["X"] @@ -1081,9 +1081,6 @@ def array_func(values: ArrayLike) -> ArrayLike: # continue and exclude the block new_mgr = data.grouped_reduce(array_func, ignore_failures=True) - if not len(new_mgr) and len(self._selected_obj) == 0: - raise DataError("No numeric types to aggregate") - if len(new_mgr) < len(data): warnings.warn( f"Dropping invalid columns in {type(self).__name__}.{how} " @@ -1585,7 +1582,9 @@ def _gotitem(self, key, ndim: int, subset=None): def _get_data_to_aggregate(self) -> Manager2D: obj = self._obj_with_exclusions if self.axis == 1: - return obj.T._mgr + # To be removed post #GH #43337 fix + transposed_dtype = find_common_type(obj.dtypes.tolist()) + return obj.T.astype(transposed_dtype, copy=False)._mgr else: return obj._mgr diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 1cfa6e1d2e714..fe57d47d9b740 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1410,19 +1410,12 @@ def _agg_general( with group_selection_context(self): # try a cython aggregation if we can - result = None - try: - result = self._cython_agg_general( - how=alias, - alt=npfunc, - numeric_only=numeric_only, - min_count=min_count, - ) - except DataError: - pass - - if result is None: - result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) + result = self._cython_agg_general( + how=alias, + alt=npfunc, + numeric_only=numeric_only, + min_count=min_count, + ) return result.__finalize__(self.obj, method="groupby") def _agg_py_fallback( @@ -1796,21 +1789,16 @@ def std(self, ddof: int = 1): Series or DataFrame Standard deviation of values within each group. """ - try: - return self._get_cythonized_result( - libgroupby.group_var, - aggregate=True, - needs_counts=True, - needs_values=True, - needs_2d=True, - cython_dtype=np.dtype(np.float64), - post_processing=lambda vals, inference: np.sqrt(vals), - ddof=ddof, - ) - except DataError: - func = lambda x: x.std(ddof=ddof) - with group_selection_context(self): - return self._python_agg_general(func) + return self._get_cythonized_result( + libgroupby.group_var, + aggregate=True, + needs_counts=True, + needs_values=True, + needs_2d=True, + cython_dtype=np.dtype(np.float64), + post_processing=lambda vals, inference: np.sqrt(vals), + ddof=ddof, + ) @final @Substitution(name="groupby") @@ -1832,17 +1820,12 @@ def var(self, ddof: int = 1): Variance of values within each group. """ if ddof == 1: - try: - numeric_only = self._resolve_numeric_only(lib.no_default) - return self._cython_agg_general( - "var", - alt=lambda x: Series(x).var(ddof=ddof), - numeric_only=numeric_only, - ) - except DataError: - func = lambda x: x.var(ddof=ddof) - with group_selection_context(self): - return self._python_agg_general(func) + numeric_only = self._resolve_numeric_only(lib.no_default) + return self._cython_agg_general( + "var", + alt=lambda x: Series(x).var(ddof=ddof), + numeric_only=numeric_only, + ) else: func = lambda x: x.var(ddof=ddof) with group_selection_context(self): From 4ae0d6bfd221e6087ac5da80dcb63a7856dac889 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Fri, 10 Sep 2021 02:29:23 +0530 Subject: [PATCH 15/26] cast mi groupbysum --- pandas/core/groupby/groupby.py | 13 ++++++++----- pandas/tests/groupby/test_groupby.py | 4 ++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b3c7a0b8b9ee9..732b961c525cf 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1855,9 +1855,7 @@ def var(self, ddof: int = 1): if ddof == 1: numeric_only = self._resolve_numeric_only(lib.no_default) return self._cython_agg_general( - "var", - alt=lambda x: Series(x).var(ddof=ddof), - numeric_only=numeric_only, + "var", alt=lambda x: Series(x).var(ddof=ddof), numeric_only=numeric_only ) else: func = lambda x: x.var(ddof=ddof) @@ -1937,6 +1935,13 @@ def sum( alias="add", npfunc=np.sum, ) + if self.axis == 1 and isinstance(self.obj.columns, MultiIndex): + dtypes_df = self.obj.dtypes.unstack().head(1) + if len(set(dtypes_df.values[0])) > 1: + # if self.obj has mixed dtype + result = result.astype( + dict(zip(dtypes_df.columns, dtypes_df.values[0])) + ) return self._reindex_output(result, fill_value=0) @@ -3130,8 +3135,6 @@ def blk_func(values: ArrayLike) -> ArrayLike: # error_msg is "" on an frame/series with no rows or columns if not output and error_msg != "": raise TypeError(error_msg) - elif not output and error_msg == "" and not self._obj_with_exclusions.empty: - raise DataError("No numeric types to aggregate") if aggregate: return self._wrap_aggregated_output(output) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 13bc1f7912319..41330ef698039 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2451,9 +2451,9 @@ def test_rolling_wrong_param_min_period(): ("sum", [5, 7, 9], "int64", {}), ("std", [4.5 ** 0.5] * 3, int, {"i": float, "j": float, "k": float}), ("var", [4.5] * 3, int, {"i": float, "j": float, "k": float}), - ("sum", [5, 7, 9], "Int64", {"j": "float64"}), + ("sum", [5, 7, 9], "Int64", {"j": "Int64"}), ("std", [4.5 ** 0.5] * 3, "Int64", {"i": float, "j": float, "k": float}), - ("var", [4.5] * 3, "Int64", {"i": float, "j": float, "k": float}), + ("var", [4.5] * 3, "Int64", {"i": "Float64", "j": "Float64", "k": "Float64"}), ], ) def test_multiindex_groupby(func, expected, dtype, result_dtype_dict): From 572f23c3baaaef81543578a5126e62e66350e539 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Fri, 10 Sep 2021 11:46:38 +0530 Subject: [PATCH 16/26] dropped na --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 349a5630f893a..696344da14481 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1944,7 +1944,7 @@ def sum( npfunc=np.sum, ) if self.axis == 1 and isinstance(self.obj.columns, MultiIndex): - dtypes_df = self.obj.dtypes.unstack().head(1) + dtypes_df = self.obj.dtypes.unstack().head(1).dropna(axis=1) if len(set(dtypes_df.values[0])) > 1: # if self.obj has mixed dtype result = result.astype( From 625a7517f5de233ab86a71325722e52ab3bb2207 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sat, 11 Sep 2021 21:19:13 +0530 Subject: [PATCH 17/26] try casting in _wrap_agged_manager --- pandas/core/groupby/generic.py | 23 +++++++++++++++++++++++ pandas/core/groupby/groupby.py | 8 -------- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 7ef3159bc5e3b..dd27bb0c39e6f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1594,11 +1594,34 @@ def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: if self.axis == 1: result = result.T + result = self._maybe_recast_columns_if_transposed(result) # Note: we only need to pass datetime=True in order to get numeric # values converted return self._reindex_output(result)._convert(datetime=True) + def _maybe_recast_columns_if_transposed(self, result) -> DataFrame: + obj = self._selected_obj + dtype_df = obj.dtypes + if obj._is_homogeneous_type: + # not mixed dtype + return result + if isinstance(obj.columns, MultiIndex): + dtype_df_dict = ( + dtype_df.reset_index(level=self.level) + .drop_duplicates(f"level_{self.level}") + .set_index(f"level_{self.level}") + .to_dict()[0] + ) + else: + dtype_df_dict = dtype_df.drop_duplicates().loc[result.index].to_dict() + result_cast = result.astype(dtype_df_dict, copy=False, errors="ignore") + if np.array_equal(result_cast.values, result.values): + # able to safely cast + return result_cast + else: + return result + def _iterate_column_groupbys(self, obj: FrameOrSeries): for i, colname in enumerate(obj.columns): yield colname, SeriesGroupBy( diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 343d372f4bf3a..fe3eaa5be2173 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1956,14 +1956,6 @@ def sum( alias="add", npfunc=np.sum, ) - if self.axis == 1 and isinstance(self.obj.columns, MultiIndex): - dtypes_df = self.obj.dtypes.unstack().head(1).dropna(axis=1) - if len(set(dtypes_df.values[0])) > 1: - # if self.obj has mixed dtype - result = result.astype( - dict(zip(dtypes_df.columns, dtypes_df.values[0])) - ) - return self._reindex_output(result, fill_value=0) @final From 9b0acd73a0eeade01fc8601b7802dc2346ef33e9 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sat, 11 Sep 2021 22:52:41 +0530 Subject: [PATCH 18/26] added test axis=1 --- pandas/core/groupby/generic.py | 2 +- pandas/tests/groupby/test_groupby.py | 27 ++++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index dd27bb0c39e6f..22bf056ef814f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1614,7 +1614,7 @@ def _maybe_recast_columns_if_transposed(self, result) -> DataFrame: .to_dict()[0] ) else: - dtype_df_dict = dtype_df.drop_duplicates().loc[result.index].to_dict() + dtype_df_dict = dtype_df.drop_duplicates().loc[result.columns].to_dict() result_cast = result.astype(dtype_df_dict, copy=False, errors="ignore") if np.array_equal(result_cast.values, result.values): # able to safely cast diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 2469c7fc5ecfe..fe1d28241bc51 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2522,7 +2522,7 @@ def test_rolling_wrong_param_min_period(): ("var", [4.5] * 3, "Int64", {"i": "Float64", "j": "Float64", "k": "Float64"}), ], ) -def test_multiindex_groupby(func, expected, dtype, result_dtype_dict): +def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype_dict): # GH#43209 df = DataFrame( [[1, 2, 3, 4, 5, 6]] * 3, @@ -2533,3 +2533,28 @@ def test_multiindex_groupby(func, expected, dtype, result_dtype_dict): result_dtype_dict ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "func, expected_data, result_dtype_dict", + [ + ("sum", [[2, 4], [10, 12], [18, 20]], {10: "Int64", 20: "int64"}), + # should ideally by Int64 #43330 + ("std", [[2 ** 0.5] * 2] * 3, float), + ("var", [[2] * 2] * 3, {10: "Int64", 20: "int64"}), + ], +) +def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict): + # GH#43209 + df = DataFrame( + np.arange(12).reshape(3, 4), + index=Index([0, 1, 0], name="y"), + columns=Index([10, 20, 10, 20], name="x"), + ).astype({10: "Int64"}) + result = df.groupby("x", axis=1).agg(func) + expected = DataFrame( + data=expected_data, + index=Index([0, 1, 0], name="y"), + columns=Index([10, 20], name="x"), + ).astype(result_dtype_dict) + tm.assert_frame_equal(result, expected) From 265b3bb17c33d18ec40165646eb6341ae6a4ef32 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sun, 12 Sep 2021 01:22:03 +0530 Subject: [PATCH 19/26] overrid int64; failing in 32bit --- pandas/tests/groupby/test_groupby.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index fe1d28241bc51..b318eae6528bb 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2550,6 +2550,7 @@ def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict): np.arange(12).reshape(3, 4), index=Index([0, 1, 0], name="y"), columns=Index([10, 20, 10, 20], name="x"), + dtype="int64", ).astype({10: "Int64"}) result = df.groupby("x", axis=1).agg(func) expected = DataFrame( From 753c7dfe30a5dc5eae2cab716a414c7c48e99623 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Mon, 13 Sep 2021 21:18:23 +0530 Subject: [PATCH 20/26] updated whatsnew --- doc/source/whatsnew/v1.3.3.rst | 1 - doc/source/whatsnew/v1.3.4.rst | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.3.rst b/doc/source/whatsnew/v1.3.3.rst index 0bd660d47ae8f..ecec6d975ccb7 100644 --- a/doc/source/whatsnew/v1.3.3.rst +++ b/doc/source/whatsnew/v1.3.3.rst @@ -16,7 +16,6 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :class:`DataFrame` constructor failing to broadcast for defined :class:`Index` and len one list of :class:`Timestamp` (:issue:`42810`) - Fixed regression in :meth:`.GroupBy.agg` incorrectly raising in some cases (:issue:`42390`) -- Fixed regression in :meth:`.GroupBy.agg` where it was failing silently with mixed data types along ``axis=1`` and :class:`MultiIndex` (:issue:`43209`) - Fixed regression in :meth:`.GroupBy.apply` where ``nan`` values were dropped even with ``dropna=False`` (:issue:`43205`) - Fixed regression in :meth:`.GroupBy.quantile` which was failing with ``pandas.NA`` (:issue:`42849`) - Fixed regression in :meth:`merge` where ``on`` columns with ``ExtensionDtype`` or ``bool`` data types were cast to ``object`` in ``right`` and ``outer`` merge (:issue:`40073`) diff --git a/doc/source/whatsnew/v1.3.4.rst b/doc/source/whatsnew/v1.3.4.rst index 273686f0aaa8f..231bae94bf6c3 100644 --- a/doc/source/whatsnew/v1.3.4.rst +++ b/doc/source/whatsnew/v1.3.4.rst @@ -14,7 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- +- Fixed regression in :meth:`.GroupBy.agg` where it was failing silently with mixed data types along ``axis=1`` and :class:`MultiIndex` (:issue:`43209`) - .. --------------------------------------------------------------------------- From de86f725c61fe3aa180799d8c49634b9138f56ea Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Mon, 13 Sep 2021 22:59:26 +0530 Subject: [PATCH 21/26] astype for std --- pandas/core/groupby/groupby.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4ae29b4b64e64..6acc76d251cb4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -62,6 +62,7 @@ class providing the base-class of operations. ) from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( is_bool_dtype, is_datetime64_dtype, @@ -3140,7 +3141,9 @@ def blk_func(values: ArrayLike) -> ArrayLike: mgr = obj._mgr if self.axis == 1: - mgr = obj.T._mgr + # To be removed post #GH #43337 fix + transposed_dtype = find_common_type(obj.dtypes.tolist()) + mgr = obj.T.astype(transposed_dtype, copy=False)._mgr if numeric_only: mgr = mgr.get_numeric_data() From 2a29451207520ab76694742ddeb9b18dd57fd510 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Tue, 14 Sep 2021 20:36:53 +0530 Subject: [PATCH 22/26] smaller patch for 1.3.x --- pandas/core/groupby/generic.py | 32 ++-------------------------- pandas/core/groupby/groupby.py | 11 +++++----- pandas/tests/groupby/test_groupby.py | 10 ++++----- 3 files changed, 13 insertions(+), 40 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 53b21a89e03c8..46e42326d4191 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -38,7 +38,6 @@ doc, ) -from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( ensure_int64, is_bool, @@ -956,9 +955,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) def _iterate_slices(self) -> Iterable[Series]: obj = self._selected_obj if self.axis == 1: - # To be removed post #GH #43337 fix - transposed_dtype = find_common_type(obj.dtypes.tolist()) - obj = obj.T.astype(transposed_dtype, copy=False) + obj = obj.T if isinstance(obj, Series) and obj.name not in self.exclusions: # Occurs when doing DataFrameGroupBy(...)["X"] @@ -1470,9 +1467,7 @@ def _gotitem(self, key, ndim: int, subset=None): def _get_data_to_aggregate(self) -> Manager2D: obj = self._obj_with_exclusions if self.axis == 1: - # To be removed post #GH #43337 fix - transposed_dtype = find_common_type(obj.dtypes.tolist()) - return obj.T.astype(transposed_dtype, copy=False)._mgr + return obj.T._mgr else: return obj._mgr @@ -1520,34 +1515,11 @@ def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: if self.axis == 1: result = result.T - result = self._maybe_recast_columns_if_transposed(result) # Note: we only need to pass datetime=True in order to get numeric # values converted return self._reindex_output(result)._convert(datetime=True) - def _maybe_recast_columns_if_transposed(self, result) -> DataFrame: - obj = self._selected_obj - dtype_df = obj.dtypes - if obj._is_homogeneous_type: - # not mixed dtype - return result - if isinstance(obj.columns, MultiIndex): - dtype_df_dict = ( - dtype_df.reset_index(level=self.level) - .drop_duplicates(f"level_{self.level}") - .set_index(f"level_{self.level}") - .to_dict()[0] - ) - else: - dtype_df_dict = dtype_df.drop_duplicates().loc[result.columns].to_dict() - result_cast = result.astype(dtype_df_dict, copy=False, errors="ignore") - if np.array_equal(result_cast.values, result.values): - # able to safely cast - return result_cast - else: - return result - def _iterate_column_groupbys(self, obj: FrameOrSeries): for i, colname in enumerate(obj.columns): yield colname, SeriesGroupBy( diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6acc76d251cb4..d4a595836481d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -62,7 +62,6 @@ class providing the base-class of operations. ) from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( is_bool_dtype, is_datetime64_dtype, @@ -1214,7 +1213,10 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: numeric_only = True # GH#42395 GH#43108 GH#43154 # Regression from 1.2.5 to 1.3 caused object columns to be dropped - obj = self._obj_with_exclusions + if self.axis: + obj = self._obj_with_exclusions.T + else: + obj = self._obj_with_exclusions check = obj._get_numeric_data() if len(obj.columns) and not len(check.columns) and not obj.empty: numeric_only = False @@ -1981,6 +1983,7 @@ def sum( alias="add", npfunc=np.sum, ) + return self._reindex_output(result, fill_value=0) @final @@ -3141,9 +3144,7 @@ def blk_func(values: ArrayLike) -> ArrayLike: mgr = obj._mgr if self.axis == 1: - # To be removed post #GH #43337 fix - transposed_dtype = find_common_type(obj.dtypes.tolist()) - mgr = obj.T.astype(transposed_dtype, copy=False)._mgr + mgr = obj.T._mgr if numeric_only: mgr = mgr.get_numeric_data() diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b318eae6528bb..20ad6bb9f7224 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2517,9 +2517,9 @@ def test_rolling_wrong_param_min_period(): ("sum", [5, 7, 9], "int64", {}), ("std", [4.5 ** 0.5] * 3, int, {"i": float, "j": float, "k": float}), ("var", [4.5] * 3, int, {"i": float, "j": float, "k": float}), - ("sum", [5, 7, 9], "Int64", {"j": "Int64"}), + ("sum", [5, 7, 9], "Int64", {"j": "int64"}), ("std", [4.5 ** 0.5] * 3, "Int64", {"i": float, "j": float, "k": float}), - ("var", [4.5] * 3, "Int64", {"i": "Float64", "j": "Float64", "k": "Float64"}), + ("var", [4.5] * 3, "Int64", {"i": "float64", "j": "float64", "k": "float64"}), ], ) def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype_dict): @@ -2538,10 +2538,10 @@ def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype @pytest.mark.parametrize( "func, expected_data, result_dtype_dict", [ - ("sum", [[2, 4], [10, 12], [18, 20]], {10: "Int64", 20: "int64"}), + ("sum", [[2, 4], [10, 12], [18, 20]], {10: "int64", 20: "int64"}), # should ideally by Int64 #43330 - ("std", [[2 ** 0.5] * 2] * 3, float), - ("var", [[2] * 2] * 3, {10: "Int64", 20: "int64"}), + ("std", [[2 ** 0.5] * 2] * 3, "float64"), + ("var", [[2] * 2] * 3, {10: "float64", 20: "float64"}), ], ) def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict): From 00838be6b8706fdfedb09af08aef3235d7f63c2c Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Fri, 17 Sep 2021 00:46:50 +0530 Subject: [PATCH 23/26] changes wrt 1.2.5 --- pandas/tests/groupby/test_groupby.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 20ad6bb9f7224..6402b66cc8f01 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2516,10 +2516,17 @@ def test_rolling_wrong_param_min_period(): [ ("sum", [5, 7, 9], "int64", {}), ("std", [4.5 ** 0.5] * 3, int, {"i": float, "j": float, "k": float}), + # 1.2.5: ValueError: Length mismatch: Expected axis + # has 0 elements, new values have 3 elements ("var", [4.5] * 3, int, {"i": float, "j": float, "k": float}), + # 1.2.5: DataError: No numeric types to aggregate ("sum", [5, 7, 9], "Int64", {"j": "int64"}), + # 1.2.5: j:float64 ("std", [4.5 ** 0.5] * 3, "Int64", {"i": float, "j": float, "k": float}), + # 1.2.5: ValueError: Length mismatch: Expected axis + # has 0 elements, new values have 3 elements ("var", [4.5] * 3, "Int64", {"i": "float64", "j": "float64", "k": "float64"}), + # 1.2.5: DataError: No numeric types to aggregate ], ) def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype_dict): @@ -2539,9 +2546,13 @@ def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype "func, expected_data, result_dtype_dict", [ ("sum", [[2, 4], [10, 12], [18, 20]], {10: "int64", 20: "int64"}), - # should ideally by Int64 #43330 + # 1.2.5: 10: float64 + # std should ideally return Int64 #43330 ("std", [[2 ** 0.5] * 2] * 3, "float64"), + # 1.2.5: ValueError: Length mismatch: Expected axis + # has 0 elements, new values have 3 elements ("var", [[2] * 2] * 3, {10: "float64", 20: "float64"}), + # 1.2.5: DataError: No numeric types to aggregate ], ) def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict): From 600b71e499b982768e8e3924d996c2f9c63ae849 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Wed, 29 Sep 2021 12:21:04 +0530 Subject: [PATCH 24/26] removed comments highlighting diff with 1.2.5 from test --- pandas/tests/groupby/test_groupby.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 6402b66cc8f01..2a691a723cfd4 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2546,13 +2546,9 @@ def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype "func, expected_data, result_dtype_dict", [ ("sum", [[2, 4], [10, 12], [18, 20]], {10: "int64", 20: "int64"}), - # 1.2.5: 10: float64 # std should ideally return Int64 #43330 ("std", [[2 ** 0.5] * 2] * 3, "float64"), - # 1.2.5: ValueError: Length mismatch: Expected axis - # has 0 elements, new values have 3 elements ("var", [[2] * 2] * 3, {10: "float64", 20: "float64"}), - # 1.2.5: DataError: No numeric types to aggregate ], ) def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict): From fe132784b9f08a9b322f51621a3132a925de0b79 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Wed, 29 Sep 2021 12:45:26 +0530 Subject: [PATCH 25/26] removed comments from test2 --- pandas/tests/groupby/test_groupby.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 2a691a723cfd4..843e6765cee0d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2516,17 +2516,10 @@ def test_rolling_wrong_param_min_period(): [ ("sum", [5, 7, 9], "int64", {}), ("std", [4.5 ** 0.5] * 3, int, {"i": float, "j": float, "k": float}), - # 1.2.5: ValueError: Length mismatch: Expected axis - # has 0 elements, new values have 3 elements ("var", [4.5] * 3, int, {"i": float, "j": float, "k": float}), - # 1.2.5: DataError: No numeric types to aggregate ("sum", [5, 7, 9], "Int64", {"j": "int64"}), - # 1.2.5: j:float64 ("std", [4.5 ** 0.5] * 3, "Int64", {"i": float, "j": float, "k": float}), - # 1.2.5: ValueError: Length mismatch: Expected axis - # has 0 elements, new values have 3 elements ("var", [4.5] * 3, "Int64", {"i": "float64", "j": "float64", "k": "float64"}), - # 1.2.5: DataError: No numeric types to aggregate ], ) def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype_dict): @@ -2546,7 +2539,7 @@ def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype "func, expected_data, result_dtype_dict", [ ("sum", [[2, 4], [10, 12], [18, 20]], {10: "int64", 20: "int64"}), - # std should ideally return Int64 #43330 + # std should ideally return Int64 / Float64 #43330 ("std", [[2 ** 0.5] * 2] * 3, "float64"), ("var", [[2] * 2] * 3, {10: "float64", 20: "float64"}), ], From 1aaf326a5c83eab9c8c2832a521cf7c8ee3878b7 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Wed, 29 Sep 2021 22:34:40 +0530 Subject: [PATCH 26/26] moved tests to test_aggregate --- .../tests/groupby/aggregate/test_aggregate.py | 50 +++++++++++++++++++ pandas/tests/groupby/test_groupby.py | 50 ------------------- 2 files changed, 50 insertions(+), 50 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 7bb850d38340f..d8d53c355bd0f 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -225,6 +225,56 @@ def test_agg_str_with_kwarg_axis_1_raises(df, reduction_func): gb.agg(reduction_func, axis=1) +@pytest.mark.parametrize( + "func, expected, dtype, result_dtype_dict", + [ + ("sum", [5, 7, 9], "int64", {}), + ("std", [4.5 ** 0.5] * 3, int, {"i": float, "j": float, "k": float}), + ("var", [4.5] * 3, int, {"i": float, "j": float, "k": float}), + ("sum", [5, 7, 9], "Int64", {"j": "int64"}), + ("std", [4.5 ** 0.5] * 3, "Int64", {"i": float, "j": float, "k": float}), + ("var", [4.5] * 3, "Int64", {"i": "float64", "j": "float64", "k": "float64"}), + ], +) +def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype_dict): + # GH#43209 + df = DataFrame( + [[1, 2, 3, 4, 5, 6]] * 3, + columns=MultiIndex.from_product([["a", "b"], ["i", "j", "k"]]), + ).astype({("a", "j"): dtype, ("b", "j"): dtype}) + result = df.groupby(level=1, axis=1).agg(func) + expected = DataFrame([expected] * 3, columns=["i", "j", "k"]).astype( + result_dtype_dict + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "func, expected_data, result_dtype_dict", + [ + ("sum", [[2, 4], [10, 12], [18, 20]], {10: "int64", 20: "int64"}), + # std should ideally return Int64 / Float64 #43330 + ("std", [[2 ** 0.5] * 2] * 3, "float64"), + ("var", [[2] * 2] * 3, {10: "float64", 20: "float64"}), + ], +) +def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict): + # GH#43209 + df = DataFrame( + np.arange(12).reshape(3, 4), + index=Index([0, 1, 0], name="y"), + columns=Index([10, 20, 10, 20], name="x"), + dtype="int64", + ).astype({10: "Int64"}) + result = df.groupby("x", axis=1).agg(func) + expected = DataFrame( + data=expected_data, + index=Index([0, 1, 0], name="y"), + columns=Index([10, 20], name="x"), + ).astype(result_dtype_dict) + tm.assert_frame_equal(result, expected) + + def test_aggregate_item_by_item(df): grouped = df.groupby("A") diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 843e6765cee0d..b9a6730996a02 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2509,53 +2509,3 @@ def test_rolling_wrong_param_min_period(): result_error_msg = r"__init__\(\) got an unexpected keyword argument 'min_period'" with pytest.raises(TypeError, match=result_error_msg): test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum() - - -@pytest.mark.parametrize( - "func, expected, dtype, result_dtype_dict", - [ - ("sum", [5, 7, 9], "int64", {}), - ("std", [4.5 ** 0.5] * 3, int, {"i": float, "j": float, "k": float}), - ("var", [4.5] * 3, int, {"i": float, "j": float, "k": float}), - ("sum", [5, 7, 9], "Int64", {"j": "int64"}), - ("std", [4.5 ** 0.5] * 3, "Int64", {"i": float, "j": float, "k": float}), - ("var", [4.5] * 3, "Int64", {"i": "float64", "j": "float64", "k": "float64"}), - ], -) -def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype_dict): - # GH#43209 - df = DataFrame( - [[1, 2, 3, 4, 5, 6]] * 3, - columns=MultiIndex.from_product([["a", "b"], ["i", "j", "k"]]), - ).astype({("a", "j"): dtype, ("b", "j"): dtype}) - result = df.groupby(level=1, axis=1).agg(func) - expected = DataFrame([expected] * 3, columns=["i", "j", "k"]).astype( - result_dtype_dict - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "func, expected_data, result_dtype_dict", - [ - ("sum", [[2, 4], [10, 12], [18, 20]], {10: "int64", 20: "int64"}), - # std should ideally return Int64 / Float64 #43330 - ("std", [[2 ** 0.5] * 2] * 3, "float64"), - ("var", [[2] * 2] * 3, {10: "float64", 20: "float64"}), - ], -) -def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict): - # GH#43209 - df = DataFrame( - np.arange(12).reshape(3, 4), - index=Index([0, 1, 0], name="y"), - columns=Index([10, 20, 10, 20], name="x"), - dtype="int64", - ).astype({10: "Int64"}) - result = df.groupby("x", axis=1).agg(func) - expected = DataFrame( - data=expected_data, - index=Index([0, 1, 0], name="y"), - columns=Index([10, 20], name="x"), - ).astype(result_dtype_dict) - tm.assert_frame_equal(result, expected)