From 6b0f5e71746779c0017824b7fa859235ca2c1cff Mon Sep 17 00:00:00 2001 From: Zerobugs_DesignLab Date: Sat, 21 Aug 2021 19:33:13 +0530 Subject: [PATCH 01/34] Updating _resolve_numeric_only function of GroupBy --- pandas/core/groupby/groupby.py | 20 ++++++++++++++++++- .../tests/groupby/aggregate/test_aggregate.py | 14 +++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6761387b0dbc2..c72c398ca1e1c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -10,6 +10,7 @@ class providing the base-class of operations. from contextlib import contextmanager import datetime +import numpy as np from functools import ( partial, wraps, @@ -1119,7 +1120,24 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: # i.e. not explicitly passed by user if self.obj.ndim == 2: # i.e. DataFrameGroupBy - numeric_only = True + # Checking if the dataframe has non-numeric features + + df_cols = self.obj.columns + num_cols = self.obj.select_dtypes(\ + include=[np.number,np.datetime64,np.timedelta64]).columns + # Removing the key columns + + if set(num_cols).intersection(set(self.keys)) : + df_cols = set(df_cols)- set(self.keys) + num_cols = set(num_cols)- set(self.keys) + else: + df_cols = set(df_cols)- set(self.keys) + num_cols=set(num_cols) + + if len(obj_cols-num_cols) > 0: + numeric_only = False + else: + numeric_only=True else: numeric_only = False diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 0a693967fbb19..f70cbc86fe33a 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -115,6 +115,20 @@ def test_groupby_aggregation_mixed_dtype(): tm.assert_frame_equal(result, expected) + expected2 = DataFrame( + { + "v1": [15, 7, 9, 3, 3, 5], + "v2": [165, 77, 99, 33, 33, 55], + "by2":[293, 194, 0, 'damp', 'dry', 'wetred'] + }, + index= Index([1, 2, 12, 'big', 'blue', 'red'],\ + dtype='object', name='by1'), + ) + + g = df.groupby(["by1"]) + result = g.sum() + tm.assert_frame_equal(result, expected2) + def test_groupby_aggregation_multi_level_column(): # GH 29772 lst = [ From 116534f3e0107cbd6f4f429e8d90ec1100a38d7c Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Sat, 21 Aug 2021 19:57:52 +0530 Subject: [PATCH 02/34] Update groupby.py Updated PEP8 issues in groupby.py --- pandas/core/groupby/groupby.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c72c398ca1e1c..0e210fb80170a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1123,21 +1123,21 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: # Checking if the dataframe has non-numeric features df_cols = self.obj.columns - num_cols = self.obj.select_dtypes(\ - include=[np.number,np.datetime64,np.timedelta64]).columns - # Removing the key columns + num_cols = self.obj.select_dtypes( + include=[np.number, np.datetime64, np.timedelta64]).columns + # Removing the key columns if set(num_cols).intersection(set(self.keys)) : - df_cols = set(df_cols)- set(self.keys) - num_cols = set(num_cols)- set(self.keys) + df_cols = set(df_cols) - set(self.keys) + num_cols = set(num_cols) - set(self.keys) else: - df_cols = set(df_cols)- set(self.keys) - num_cols=set(num_cols) + df_cols = set(df_cols) - set(self.keys) + num_cols = set(num_cols) - if len(obj_cols-num_cols) > 0: + if len(obj_cols - num_cols) > 0: numeric_only = False else: - numeric_only=True + numeric_only = True else: numeric_only = False From 7b5ecb469ceb3ad63db01657306d362b6a6f3319 Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Sat, 21 Aug 2021 20:03:06 +0530 Subject: [PATCH 03/34] Update test_aggregate.py Solved PEP8 issues in test_aggregate --- pandas/tests/groupby/aggregate/test_aggregate.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index f70cbc86fe33a..996407df4b0d5 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -113,16 +113,14 @@ def test_groupby_aggregation_mixed_dtype(): g = df.groupby(["by1", "by2"]) result = g[["v1", "v2"]].mean() tm.assert_frame_equal(result, expected) - - expected2 = DataFrame( { "v1": [15, 7, 9, 3, 3, 5], "v2": [165, 77, 99, 33, 33, 55], - "by2":[293, 194, 0, 'damp', 'dry', 'wetred'] + "by2": [293, 194, 0, 'damp', 'dry', 'wetred'] }, - index= Index([1, 2, 12, 'big', 'blue', 'red'],\ - dtype='object', name='by1'), + index = Index([1,2,12,'big','blue','red'], + dtype='object', name='by1'), ) g = df.groupby(["by1"]) From 7faf1fca9b7a479723eb7e8f680f5f53ee18b3b3 Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Sat, 21 Aug 2021 20:38:14 +0530 Subject: [PATCH 04/34] Update groupby.py Simplifying the resolve function --- pandas/core/groupby/groupby.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0e210fb80170a..a6b8e51138ee8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1122,19 +1122,10 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: # i.e. DataFrameGroupBy # Checking if the dataframe has non-numeric features - df_cols = self.obj.columns - num_cols = self.obj.select_dtypes( - include=[np.number, np.datetime64, np.timedelta64]).columns - # Removing the key columns - - if set(num_cols).intersection(set(self.keys)) : - df_cols = set(df_cols) - set(self.keys) - num_cols = set(num_cols) - set(self.keys) - else: - df_cols = set(df_cols) - set(self.keys) - num_cols = set(num_cols) - - if len(obj_cols - num_cols) > 0: + non_num_cols = self.obj.select_dtypes( + exclude=[np.number, np.datetime64, np.timedelta64]).columns + + if len(set(non_num_cols) - set(self.keys)) > 0: numeric_only = False else: numeric_only = True From d773e7a49bf88fd9a730f824b16af1c1144fd552 Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Sat, 21 Aug 2021 20:47:13 +0530 Subject: [PATCH 05/34] Update groupby.py Solving PEP8 Issues --- pandas/core/groupby/groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a6b8e51138ee8..2f4de2656b684 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1124,7 +1124,6 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: non_num_cols = self.obj.select_dtypes( exclude=[np.number, np.datetime64, np.timedelta64]).columns - if len(set(non_num_cols) - set(self.keys)) > 0: numeric_only = False else: From 5b4e799d1a662babafd0a36a224876023fe57104 Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Sat, 21 Aug 2021 20:51:41 +0530 Subject: [PATCH 06/34] Update test_aggregate.py --- pandas/tests/groupby/aggregate/test_aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 996407df4b0d5..84e1bf65e4250 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -119,7 +119,7 @@ def test_groupby_aggregation_mixed_dtype(): "v2": [165, 77, 99, 33, 33, 55], "by2": [293, 194, 0, 'damp', 'dry', 'wetred'] }, - index = Index([1,2,12,'big','blue','red'], + index=Index([1, 2, 12, 'big', 'blue', 'red'], dtype='object', name='by1'), ) From 3d2e78e7bb5ed65a8ae9405412a5ba93d1ec49b2 Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Sat, 21 Aug 2021 21:28:11 +0530 Subject: [PATCH 07/34] Update groupby.py --- pandas/core/groupby/groupby.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 2f4de2656b684..a6b8e51138ee8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1124,6 +1124,7 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: non_num_cols = self.obj.select_dtypes( exclude=[np.number, np.datetime64, np.timedelta64]).columns + if len(set(non_num_cols) - set(self.keys)) > 0: numeric_only = False else: From 5836f91544fcdfc2ffeeccf2a3261480130f53c9 Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Sat, 21 Aug 2021 23:07:51 +0530 Subject: [PATCH 08/34] Update groupby.py Solving issue when self.keys is of NoneType --- pandas/core/groupby/groupby.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a6b8e51138ee8..075575e03e519 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -10,7 +10,7 @@ class providing the base-class of operations. from contextlib import contextmanager import datetime -import numpy as np + from functools import ( partial, wraps, @@ -1124,11 +1124,16 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: non_num_cols = self.obj.select_dtypes( exclude=[np.number, np.datetime64, np.timedelta64]).columns - - if len(set(non_num_cols) - set(self.keys)) > 0: - numeric_only = False + if self.keys is not None: + if len(set(non_num_cols) - set(self.keys)) > 0: + numeric_only = False + else: + numeric_only = True else: - numeric_only = True + if len(non_num_cols)>0: + numeric_only = False + else: + numeric_only = True else: numeric_only = False From 1eb0e25484b542b0250550f3fe3b3e774e19031b Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Sat, 21 Aug 2021 23:13:45 +0530 Subject: [PATCH 09/34] Update groupby.py --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 075575e03e519..0523dd869d66a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1130,7 +1130,7 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: else: numeric_only = True else: - if len(non_num_cols)>0: + if len(non_num_cols) > 0: numeric_only = False else: numeric_only = True From a0391e66cdf1d2896a3841cd946064659edf0551 Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Sat, 21 Aug 2021 23:16:32 +0530 Subject: [PATCH 10/34] Update test_aggregate.py --- pandas/tests/groupby/aggregate/test_aggregate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 84e1bf65e4250..22ebeac082245 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -119,8 +119,8 @@ def test_groupby_aggregation_mixed_dtype(): "v2": [165, 77, 99, 33, 33, 55], "by2": [293, 194, 0, 'damp', 'dry', 'wetred'] }, - index=Index([1, 2, 12, 'big', 'blue', 'red'], - dtype='object', name='by1'), + index=Index([1, 2, 12, 'big', 'blue', 'red'], + dtype='object', name='by1'), ) g = df.groupby(["by1"]) From 2a1835ad4ba014cf6f559d1f52d7a1862aeacbb2 Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Sat, 21 Aug 2021 23:17:26 +0530 Subject: [PATCH 11/34] Update test_aggregate.py --- pandas/tests/groupby/aggregate/test_aggregate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 22ebeac082245..0a39cdb5fe6ee 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -119,8 +119,8 @@ def test_groupby_aggregation_mixed_dtype(): "v2": [165, 77, 99, 33, 33, 55], "by2": [293, 194, 0, 'damp', 'dry', 'wetred'] }, - index=Index([1, 2, 12, 'big', 'blue', 'red'], - dtype='object', name='by1'), + index=Index([1, 2, 12, 'big', 'blue', 'red'], + dtype='object', name='by1'), ) g = df.groupby(["by1"]) From 66ecb968d998727b716e6afa7b5938c28b6770ec Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Sat, 21 Aug 2021 23:18:17 +0530 Subject: [PATCH 12/34] Update test_aggregate.py --- pandas/tests/groupby/aggregate/test_aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 0a39cdb5fe6ee..a35f94cd18d6f 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -120,7 +120,7 @@ def test_groupby_aggregation_mixed_dtype(): "by2": [293, 194, 0, 'damp', 'dry', 'wetred'] }, index=Index([1, 2, 12, 'big', 'blue', 'red'], - dtype='object', name='by1'), + dtype='object', name='by1'), ) g = df.groupby(["by1"]) From c80fa9ef01811ea87383379287641e6112f5976e Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Sun, 22 Aug 2021 10:25:08 +0530 Subject: [PATCH 13/34] Update groupby.py Checks for the presence of numeric features in columns to be aggregated --- pandas/core/groupby/groupby.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0523dd869d66a..e51f1b77386fd 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1120,20 +1120,18 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: # i.e. not explicitly passed by user if self.obj.ndim == 2: # i.e. DataFrameGroupBy - # Checking if the dataframe has non-numeric features + # Checking if the dataframe has numeric features for aggregation + + cols_for_agg = set(self.obj.select_dtypes( + include=[np.number, np.datetime64, np.timedelta64]).columns) - non_num_cols = self.obj.select_dtypes( - exclude=[np.number, np.datetime64, np.timedelta64]).columns if self.keys is not None: - if len(set(non_num_cols) - set(self.keys)) > 0: - numeric_only = False - else: - numeric_only = True + cols_for_agg = cols_for_agg - set(self.keys) + + if len(cols_for_agg) > 0: + numeric_only = True else: - if len(non_num_cols) > 0: - numeric_only = False - else: - numeric_only = True + numeric_only = False else: numeric_only = False From 95b653338282cc06bd602824f32df2b70877e809 Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Sun, 22 Aug 2021 10:30:09 +0530 Subject: [PATCH 14/34] Update groupby.py --- pandas/core/groupby/groupby.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e51f1b77386fd..eea76f4decbda 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1127,11 +1127,10 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: if self.keys is not None: cols_for_agg = cols_for_agg - set(self.keys) - if len(cols_for_agg) > 0: numeric_only = True else: - numeric_only = False + numeric_only = False else: numeric_only = False From fa8a8caadf2ae504a2a21f70ea7ecf2a4c4981e4 Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Sun, 22 Aug 2021 10:46:09 +0530 Subject: [PATCH 15/34] Update test_aggregate.py Adding a new test case of only non_numeric features --- .../tests/groupby/aggregate/test_aggregate.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index a35f94cd18d6f..50601e7c44714 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -113,19 +113,22 @@ def test_groupby_aggregation_mixed_dtype(): g = df.groupby(["by1", "by2"]) result = g[["v1", "v2"]].mean() tm.assert_frame_equal(result, expected) - expected2 = DataFrame( + +def test_groupby_aggregation_non_numeric_dtype(): + + df=DataFrame([["M", [1]], ["M", [1]], ["W", [10]], ["W", [20]]], + columns=["MW", "v"]) + + expected = DataFrame( { - "v1": [15, 7, 9, 3, 3, 5], - "v2": [165, 77, 99, 33, 33, 55], - "by2": [293, 194, 0, 'damp', 'dry', 'wetred'] + "v": [[1,1],[10,20]], + }, - index=Index([1, 2, 12, 'big', 'blue', 'red'], - dtype='object', name='by1'), + index = Index(['M', 'W'], dtype='object', name='MW'), ) - - g = df.groupby(["by1"]) + g = df.groupby(by=["MW"]) result = g.sum() - tm.assert_frame_equal(result, expected2) + tm.assert_frame_equal(result, expected) def test_groupby_aggregation_multi_level_column(): # GH 29772 From e3f676752d9ea40e0e430ee72373494fd295db02 Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Sun, 22 Aug 2021 10:53:36 +0530 Subject: [PATCH 16/34] Update test_aggregate.py --- pandas/tests/groupby/aggregate/test_aggregate.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 50601e7c44714..1d0f27bd77d15 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -114,22 +114,23 @@ def test_groupby_aggregation_mixed_dtype(): result = g[["v1", "v2"]].mean() tm.assert_frame_equal(result, expected) + def test_groupby_aggregation_non_numeric_dtype(): - df=DataFrame([["M", [1]], ["M", [1]], ["W", [10]], ["W", [20]]], - columns=["MW", "v"]) + df = DataFrame([["M", [1]], ["M", [1]], ["W", [10]], ["W", [20]]], + columns=["MW", "v"]) expected = DataFrame( { - "v": [[1,1],[10,20]], - + "v": [[1, 1], [10, 20]], }, - index = Index(['M', 'W'], dtype='object', name='MW'), + index=Index(['M', 'W'], dtype='object', name='MW'), ) g = df.groupby(by=["MW"]) result = g.sum() tm.assert_frame_equal(result, expected) + def test_groupby_aggregation_multi_level_column(): # GH 29772 lst = [ From ed668e6334b6a43c788ced2d7771f48b2ed0c478 Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Sun, 22 Aug 2021 14:55:15 +0530 Subject: [PATCH 17/34] Update groupby.py --- pandas/core/groupby/groupby.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index eea76f4decbda..0b96e222c2848 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1124,9 +1124,6 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: cols_for_agg = set(self.obj.select_dtypes( include=[np.number, np.datetime64, np.timedelta64]).columns) - - if self.keys is not None: - cols_for_agg = cols_for_agg - set(self.keys) if len(cols_for_agg) > 0: numeric_only = True else: From f30855a1c72ac3efdcde0d36fb32c3a189ef21b2 Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Sun, 22 Aug 2021 16:04:56 +0530 Subject: [PATCH 18/34] Update groupby.py --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0b96e222c2848..9b5dc6010dfd5 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1123,7 +1123,7 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: # Checking if the dataframe has numeric features for aggregation cols_for_agg = set(self.obj.select_dtypes( - include=[np.number, np.datetime64, np.timedelta64]).columns) + include=[np.number, np.datetime64, np.timedelta64, 'category']).columns) if len(cols_for_agg) > 0: numeric_only = True else: From 65261b4455c5dfc5e45186c802b8e1383e0b1f7d Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Sun, 22 Aug 2021 16:06:51 +0530 Subject: [PATCH 19/34] Update groupby.py --- pandas/core/groupby/groupby.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9b5dc6010dfd5..0b3049ea68446 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1123,7 +1123,8 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: # Checking if the dataframe has numeric features for aggregation cols_for_agg = set(self.obj.select_dtypes( - include=[np.number, np.datetime64, np.timedelta64, 'category']).columns) + include=[np.number, np.datetime64, np.timedelta64, 'category'] + ).columns) if len(cols_for_agg) > 0: numeric_only = True else: From 2477aa311bd15ad6192ccad795ff30d8d6d260a3 Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Sun, 22 Aug 2021 19:43:12 +0530 Subject: [PATCH 20/34] Update groupby.py Checking for empty dataframes passed in groupby --- pandas/core/groupby/groupby.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0b3049ea68446..ffc9c6e078abe 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1100,6 +1100,12 @@ def _wrap_transformed_output(self, output: Mapping[base.OutputKey, ArrayLike]): def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = False): raise AbstractMethodError(self) + def is_empty_df(self): + if self.obj.empty: + return True + else: + return False + def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: """ Determine subclass-specific default value for 'numeric_only'. @@ -1120,15 +1126,18 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: # i.e. not explicitly passed by user if self.obj.ndim == 2: # i.e. DataFrameGroupBy - # Checking if the dataframe has numeric features for aggregation - - cols_for_agg = set(self.obj.select_dtypes( - include=[np.number, np.datetime64, np.timedelta64, 'category'] - ).columns) - if len(cols_for_agg) > 0: - numeric_only = True + # Checking for empty object + if not is_empty_df(self): + # Checking if the dataframe has numeric features for aggregation + cols_for_agg = set(self.obj.select_dtypes( + include=[np.number, np.datetime64, np.timedelta64] + )) + if len(cols_for_agg) > 0: + numeric_only = True + else: + numeric_only = False else: - numeric_only = False + numeric_only = True else: numeric_only = False From da24f29880df65941c07e0fa15c7ee6eef6d5a62 Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Sun, 22 Aug 2021 20:32:58 +0530 Subject: [PATCH 21/34] Update groupby.py --- pandas/core/groupby/groupby.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ffc9c6e078abe..7739d4c6167a0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1100,11 +1100,7 @@ def _wrap_transformed_output(self, output: Mapping[base.OutputKey, ArrayLike]): def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = False): raise AbstractMethodError(self) - def is_empty_df(self): - if self.obj.empty: - return True - else: - return False + def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: """ @@ -1127,7 +1123,7 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: if self.obj.ndim == 2: # i.e. DataFrameGroupBy # Checking for empty object - if not is_empty_df(self): + if not self.obj.empty: # Checking if the dataframe has numeric features for aggregation cols_for_agg = set(self.obj.select_dtypes( include=[np.number, np.datetime64, np.timedelta64] From 40753534a0fd5b36de3cca96a2a9e5c5a5bddfed Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Tue, 24 Aug 2021 21:19:48 +0530 Subject: [PATCH 22/34] Update groupby.py Going back to the previous behaviour --- pandas/core/groupby/groupby.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7739d4c6167a0..4867da23a70dc 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1101,7 +1101,7 @@ def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = Fals raise AbstractMethodError(self) - + ''' def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: """ Determine subclass-specific default value for 'numeric_only'. @@ -1140,7 +1140,32 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: # error: Incompatible return value type (got "Union[bool, NoDefault]", # expected "bool") return numeric_only # type: ignore[return-value] + ''' + + def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: + """ + Determine subclass-specific default value for 'numeric_only'. + For SeriesGroupBy we want the default to be False (to match Series behavior). + For DataFrameGroupBy we want it to be True (for backwards-compat). + Parameters + ---------- + numeric_only : bool or lib.no_default + Returns + ------- + bool + """ + # GH#41291 + if numeric_only is lib.no_default: + # i.e. not explicitly passed by user + if self.obj.ndim == 2: + # i.e. DataFrameGroupBy + numeric_only = True + else: + numeric_only = False + # error: Incompatible return value type (got "Union[bool, NoDefault]", + # expected "bool") + return numeric_only # type: ignore[return-value] # ----------------------------------------------------------------- # numba From 463a7c95ca576019397a297ef910f344ec08993d Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Tue, 24 Aug 2021 21:21:46 +0530 Subject: [PATCH 23/34] Update test_aggregate.py Removing the test case --- pandas/tests/groupby/aggregate/test_aggregate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 1d0f27bd77d15..1f7a97bf4d50c 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -114,7 +114,7 @@ def test_groupby_aggregation_mixed_dtype(): result = g[["v1", "v2"]].mean() tm.assert_frame_equal(result, expected) - +''' def test_groupby_aggregation_non_numeric_dtype(): df = DataFrame([["M", [1]], ["M", [1]], ["W", [10]], ["W", [20]]], @@ -129,7 +129,7 @@ def test_groupby_aggregation_non_numeric_dtype(): g = df.groupby(by=["MW"]) result = g.sum() tm.assert_frame_equal(result, expected) - +''' def test_groupby_aggregation_multi_level_column(): # GH 29772 From dd562c55f59feb1f7bc9feb772a10f358a0b1ad1 Mon Sep 17 00:00:00 2001 From: Prerana Chakraborty <40196782+kurchi1205@users.noreply.github.com> Date: Tue, 24 Aug 2021 21:26:00 +0530 Subject: [PATCH 24/34] Update frame.py Issuing a warning for default value of numeric only --- pandas/core/frame.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 61dde5f9dff74..2b6232b325759 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -38,7 +38,7 @@ import numpy.ma as ma from pandas._config import get_option - +from pandas.util._exceptions import find_stack_level from pandas._libs import ( algos as libalgos, lib, @@ -7622,7 +7622,18 @@ def groupby( ) else: squeeze = False - + if len(self.select_dtypes( + include=[np.number, np.datetime64, np.timedelta64] + ).columns) == 0: + warnings.warn( + ( + "The `numeric_only` parameter defaults to True in current version. " + "Will be set according to the datatypes " + "in a future version." + ), + FutureWarning, + stacklevel=find_stack_level(), + ) if level is None and by is None: raise TypeError("You have to supply one of 'by' and 'level'") axis = self._get_axis_number(axis) From f400174bf6f15069d4a0a2f812f13fb0c1892452 Mon Sep 17 00:00:00 2001 From: Zerobugs_DesignLab Date: Sun, 29 Aug 2021 12:59:03 +0530 Subject: [PATCH 25/34] Making the necessary to _resolve_numeric_only and adding testcase --- pandas/core/frame.py | 15 +----- pandas/core/groupby/groupby.py | 53 +++---------------- .../tests/groupby/aggregate/test_aggregate.py | 19 ++++--- 3 files changed, 20 insertions(+), 67 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2b6232b325759..61dde5f9dff74 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -38,7 +38,7 @@ import numpy.ma as ma from pandas._config import get_option -from pandas.util._exceptions import find_stack_level + from pandas._libs import ( algos as libalgos, lib, @@ -7622,18 +7622,7 @@ def groupby( ) else: squeeze = False - if len(self.select_dtypes( - include=[np.number, np.datetime64, np.timedelta64] - ).columns) == 0: - warnings.warn( - ( - "The `numeric_only` parameter defaults to True in current version. " - "Will be set according to the datatypes " - "in a future version." - ), - FutureWarning, - stacklevel=find_stack_level(), - ) + if level is None and by is None: raise TypeError("You have to supply one of 'by' and 'level'") axis = self._get_axis_number(axis) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4867da23a70dc..2263edfdd5fdd 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -10,7 +10,6 @@ class providing the base-class of operations. from contextlib import contextmanager import datetime - from functools import ( partial, wraps, @@ -1100,48 +1099,6 @@ def _wrap_transformed_output(self, output: Mapping[base.OutputKey, ArrayLike]): def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = False): raise AbstractMethodError(self) - - ''' - def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: - """ - Determine subclass-specific default value for 'numeric_only'. - - For SeriesGroupBy we want the default to be False (to match Series behavior). - For DataFrameGroupBy we want it to be True (for backwards-compat). - - Parameters - ---------- - numeric_only : bool or lib.no_default - - Returns - ------- - bool - """ - # GH#41291 - if numeric_only is lib.no_default: - # i.e. not explicitly passed by user - if self.obj.ndim == 2: - # i.e. DataFrameGroupBy - # Checking for empty object - if not self.obj.empty: - # Checking if the dataframe has numeric features for aggregation - cols_for_agg = set(self.obj.select_dtypes( - include=[np.number, np.datetime64, np.timedelta64] - )) - if len(cols_for_agg) > 0: - numeric_only = True - else: - numeric_only = False - else: - numeric_only = True - else: - numeric_only = False - - # error: Incompatible return value type (got "Union[bool, NoDefault]", - # expected "bool") - return numeric_only # type: ignore[return-value] - ''' - def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: """ Determine subclass-specific default value for 'numeric_only'. @@ -1160,12 +1117,16 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: if self.obj.ndim == 2: # i.e. DataFrameGroupBy numeric_only = True + obj = self._obj_with_exclusions + check = obj._get_numeric_data() + if len(obj.columns) and not len(check.columns): + warnings.warn("... Explicitly pass numeric_only ...") + numeric_only = False + else: numeric_only = False + return numeric_only - # error: Incompatible return value type (got "Union[bool, NoDefault]", - # expected "bool") - return numeric_only # type: ignore[return-value] # ----------------------------------------------------------------- # numba diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 1f7a97bf4d50c..b065e4eb0b278 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -114,22 +114,25 @@ def test_groupby_aggregation_mixed_dtype(): result = g[["v1", "v2"]].mean() tm.assert_frame_equal(result, expected) -''' + def test_groupby_aggregation_non_numeric_dtype(): - df = DataFrame([["M", [1]], ["M", [1]], ["W", [10]], ["W", [20]]], - columns=["MW", "v"]) + df = DataFrame( + [["M", [1]], ["M", [1]], ["W", [10]], ["W", [20]]], columns=["MW", "v"] + ) expected = DataFrame( { "v": [[1, 1], [10, 20]], }, - index=Index(['M', 'W'], dtype='object', name='MW'), + index=Index(["M", "W"], dtype="object", name="MW"), ) - g = df.groupby(by=["MW"]) - result = g.sum() - tm.assert_frame_equal(result, expected) -''' + + with tm.assert_produces_warning(UserWarning): + g = df.groupby(by=["MW"]) + result = g.sum() + tm.assert_frame_equal(result, expected) + def test_groupby_aggregation_multi_level_column(): # GH 29772 From 56f28d7e78707a2309d1c4db3f3d7bf31daccb94 Mon Sep 17 00:00:00 2001 From: Zerobugs_DesignLab Date: Sun, 29 Aug 2021 14:40:20 +0530 Subject: [PATCH 26/34] Checking for empty DataFrame --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 2263edfdd5fdd..9d177f85c72cd 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1119,7 +1119,7 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: numeric_only = True obj = self._obj_with_exclusions check = obj._get_numeric_data() - if len(obj.columns) and not len(check.columns): + if len(obj.columns) and not len(check.columns) and not obj.empty: warnings.warn("... Explicitly pass numeric_only ...") numeric_only = False From a3b09e6f39abb46a661f3510bc81d4cbd0319028 Mon Sep 17 00:00:00 2001 From: Zerobugs_DesignLab Date: Mon, 30 Aug 2021 18:39:22 +0530 Subject: [PATCH 27/34] Adding test cases --- .../tests/groupby/aggregate/test_aggregate.py | 39 ++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index b065e4eb0b278..a07facd5835f0 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -20,6 +20,8 @@ MultiIndex, Series, concat, + Timedelta, + Int64Index ) import pandas._testing as tm from pandas.core.base import SpecificationError @@ -116,7 +118,7 @@ def test_groupby_aggregation_mixed_dtype(): def test_groupby_aggregation_non_numeric_dtype(): - + # GH #43108 df = DataFrame( [["M", [1]], ["M", [1]], ["W", [10]], ["W", [20]]], columns=["MW", "v"] ) @@ -134,6 +136,41 @@ def test_groupby_aggregation_non_numeric_dtype(): tm.assert_frame_equal(result, expected) +def test_groupby_aggregation_multi_non_numeric_dtype(): + # GH #42395 + df = DataFrame({"x": [1,0,1,1,0], "y": [Timedelta(i, "days") for i in range(1,6)], "z": [Timedelta(i*10, "days") for i in range(1,6)]}) + + expected = DataFrame( + { + "y": [Timedelta(i, "days") for i in range(7,9)], + "z": [Timedelta(i*10, "days") for i in range(7,9)] + }, + index=Int64Index([0, 1], dtype='int64', name='x'), + ) + + with tm.assert_produces_warning(UserWarning): + g = df.groupby(by=["x"]) + result = g.sum() + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_numeric_with_non_numeric_dtype(): + # GH #43108 + df = DataFrame({"x": [1,0,1,1,0], "y": [Timedelta(i, "days") for i in range(1,6)], "z": [i for i in range(1,6)]}) + + expected = DataFrame( + { + "z": [7, 8] + }, + index=Int64Index([0, 1], dtype='int64', name='x'), + ) + + + g = df.groupby(by=["x"]) + result = g.sum() + tm.assert_frame_equal(result, expected) + + def test_groupby_aggregation_multi_level_column(): # GH 29772 lst = [ From f7e2d590b2a0a38194b7844db24546366b66fa66 Mon Sep 17 00:00:00 2001 From: Zerobugs_DesignLab Date: Mon, 30 Aug 2021 19:24:58 +0530 Subject: [PATCH 28/34] No changes --- .../tests/groupby/aggregate/test_aggregate.py | 33 ++++++++++++------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index a07facd5835f0..b7a5b0cecd32c 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -17,11 +17,11 @@ from pandas import ( DataFrame, Index, + Int64Index, MultiIndex, Series, - concat, Timedelta, - Int64Index + concat, ) import pandas._testing as tm from pandas.core.base import SpecificationError @@ -138,14 +138,20 @@ def test_groupby_aggregation_non_numeric_dtype(): def test_groupby_aggregation_multi_non_numeric_dtype(): # GH #42395 - df = DataFrame({"x": [1,0,1,1,0], "y": [Timedelta(i, "days") for i in range(1,6)], "z": [Timedelta(i*10, "days") for i in range(1,6)]}) + df = DataFrame( + { + "x": [1, 0, 1, 1, 0], + "y": [Timedelta(i, "days") for i in range(1, 6)], + "z": [Timedelta(i * 10, "days") for i in range(1, 6)], + } + ) expected = DataFrame( { - "y": [Timedelta(i, "days") for i in range(7,9)], - "z": [Timedelta(i*10, "days") for i in range(7,9)] + "y": [Timedelta(i, "days") for i in range(7, 9)], + "z": [Timedelta(i * 10, "days") for i in range(7, 9)], }, - index=Int64Index([0, 1], dtype='int64', name='x'), + index=Int64Index([0, 1], dtype="int64", name="x"), ) with tm.assert_produces_warning(UserWarning): @@ -156,16 +162,19 @@ def test_groupby_aggregation_multi_non_numeric_dtype(): def test_groupby_aggregation_numeric_with_non_numeric_dtype(): # GH #43108 - df = DataFrame({"x": [1,0,1,1,0], "y": [Timedelta(i, "days") for i in range(1,6)], "z": [i for i in range(1,6)]}) + df = DataFrame( + { + "x": [1, 0, 1, 1, 0], + "y": [Timedelta(i, "days") for i in range(1, 6)], + "z": [i for i in range(1, 6)], + } + ) expected = DataFrame( - { - "z": [7, 8] - }, - index=Int64Index([0, 1], dtype='int64', name='x'), + {"z": [7, 8]}, + index=Int64Index([0, 1], dtype="int64", name="x"), ) - g = df.groupby(by=["x"]) result = g.sum() tm.assert_frame_equal(result, expected) From 270549a012007ace891b7b91f68d9fa4fe956142 Mon Sep 17 00:00:00 2001 From: Zerobugs_DesignLab Date: Wed, 1 Sep 2021 11:06:41 +0530 Subject: [PATCH 29/34] Changes in tests and warning --- pandas/core/groupby/groupby.py | 1 - .../tests/groupby/aggregate/test_aggregate.py | 63 ------------------- pandas/tests/groupby/test_function.py | 63 +++++++++++++++++++ 3 files changed, 63 insertions(+), 64 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0b216e5417463..8ce4c2cf0a4f4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1120,7 +1120,6 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: obj = self._obj_with_exclusions check = obj._get_numeric_data() if len(obj.columns) and not len(check.columns) and not obj.empty: - warnings.warn("... Explicitly pass numeric_only ...") numeric_only = False else: diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index b7a5b0cecd32c..c73e8d1b02a6d 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -117,69 +117,6 @@ def test_groupby_aggregation_mixed_dtype(): tm.assert_frame_equal(result, expected) -def test_groupby_aggregation_non_numeric_dtype(): - # GH #43108 - df = DataFrame( - [["M", [1]], ["M", [1]], ["W", [10]], ["W", [20]]], columns=["MW", "v"] - ) - - expected = DataFrame( - { - "v": [[1, 1], [10, 20]], - }, - index=Index(["M", "W"], dtype="object", name="MW"), - ) - - with tm.assert_produces_warning(UserWarning): - g = df.groupby(by=["MW"]) - result = g.sum() - tm.assert_frame_equal(result, expected) - - -def test_groupby_aggregation_multi_non_numeric_dtype(): - # GH #42395 - df = DataFrame( - { - "x": [1, 0, 1, 1, 0], - "y": [Timedelta(i, "days") for i in range(1, 6)], - "z": [Timedelta(i * 10, "days") for i in range(1, 6)], - } - ) - - expected = DataFrame( - { - "y": [Timedelta(i, "days") for i in range(7, 9)], - "z": [Timedelta(i * 10, "days") for i in range(7, 9)], - }, - index=Int64Index([0, 1], dtype="int64", name="x"), - ) - - with tm.assert_produces_warning(UserWarning): - g = df.groupby(by=["x"]) - result = g.sum() - tm.assert_frame_equal(result, expected) - - -def test_groupby_aggregation_numeric_with_non_numeric_dtype(): - # GH #43108 - df = DataFrame( - { - "x": [1, 0, 1, 1, 0], - "y": [Timedelta(i, "days") for i in range(1, 6)], - "z": [i for i in range(1, 6)], - } - ) - - expected = DataFrame( - {"z": [7, 8]}, - index=Int64Index([0, 1], dtype="int64", name="x"), - ) - - g = df.groupby(by=["x"]) - result = g.sum() - tm.assert_frame_equal(result, expected) - - def test_groupby_aggregation_multi_level_column(): # GH 29772 lst = [ diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 3ae11847cc06b..3deb5d662a547 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -262,6 +262,69 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): tm.assert_index_equal(result.columns, expected_columns) + def test_groupby_aggregation_non_numeric_dtype(): + # GH #43108 + df = DataFrame( + [["M", [1]], ["M", [1]], ["W", [10]], ["W", [20]]], columns=["MW", "v"] + ) + + expected = DataFrame( + { + "v": [[1, 1], [10, 20]], + }, + index=Index(["M", "W"], dtype="object", name="MW"), + ) + + + gb = df.groupby(by=["MW"]) + result = gb.sum() + tm.assert_frame_equal(result, expected) + + + def test_groupby_aggregation_multi_non_numeric_dtype(): + # GH #42395 + df = DataFrame( + { + "x": [1, 0, 1, 1, 0], + "y": [Timedelta(i, "days") for i in range(1, 6)], + "z": [Timedelta(i * 10, "days") for i in range(1, 6)], + } + ) + + expected = DataFrame( + { + "y": [Timedelta(i, "days") for i in range(7, 9)], + "z": [Timedelta(i * 10, "days") for i in range(7, 9)], + }, + index=Int64Index([0, 1], dtype="int64", name="x"), + ) + + + gb = df.groupby(by=["x"]) + result = gb.sum() + tm.assert_frame_equal(result, expected) + + + def test_groupby_aggregation_numeric_with_non_numeric_dtype(): + # GH #43108 + df = DataFrame( + { + "x": [1, 0, 1, 1, 0], + "y": [Timedelta(i, "days") for i in range(1, 6)], + "z": [i for i in range(1, 6)], + } + ) + + expected = DataFrame( + {"z": [7, 8]}, + index=Int64Index([0, 1], dtype="int64", name="x"), + ) + + gb = df.groupby(by=["x"]) + result = gb.sum() + tm.assert_frame_equal(result, expected) + + class TestGroupByNonCythonPaths: # GH#5610 non-cython calls should not include the grouper From 17808bfb0a5140082af5b13cc3cb9a1cb9d25312 Mon Sep 17 00:00:00 2001 From: Zerobugs_DesignLab Date: Wed, 1 Sep 2021 12:07:36 +0530 Subject: [PATCH 30/34] Solving the errors --- pandas/tests/groupby/aggregate/test_aggregate.py | 2 -- pandas/tests/groupby/test_function.py | 7 ++----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index c73e8d1b02a6d..0a693967fbb19 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -17,10 +17,8 @@ from pandas import ( DataFrame, Index, - Int64Index, MultiIndex, Series, - Timedelta, concat, ) import pandas._testing as tm diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 3deb5d662a547..92f9f7d22df5e 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -10,8 +10,10 @@ from pandas import ( DataFrame, Index, + Int64Index, MultiIndex, Series, + Timedelta, Timestamp, date_range, ) @@ -275,12 +277,10 @@ def test_groupby_aggregation_non_numeric_dtype(): index=Index(["M", "W"], dtype="object", name="MW"), ) - gb = df.groupby(by=["MW"]) result = gb.sum() tm.assert_frame_equal(result, expected) - def test_groupby_aggregation_multi_non_numeric_dtype(): # GH #42395 df = DataFrame( @@ -299,12 +299,10 @@ def test_groupby_aggregation_multi_non_numeric_dtype(): index=Int64Index([0, 1], dtype="int64", name="x"), ) - gb = df.groupby(by=["x"]) result = gb.sum() tm.assert_frame_equal(result, expected) - def test_groupby_aggregation_numeric_with_non_numeric_dtype(): # GH #43108 df = DataFrame( @@ -325,7 +323,6 @@ def test_groupby_aggregation_numeric_with_non_numeric_dtype(): tm.assert_frame_equal(result, expected) - class TestGroupByNonCythonPaths: # GH#5610 non-cython calls should not include the grouper # Tests for code not expected to go through cython paths. From 08b2429f4a995a563518d31f414c4f233871110a Mon Sep 17 00:00:00 2001 From: Zerobugs_DesignLab Date: Wed, 1 Sep 2021 13:08:31 +0530 Subject: [PATCH 31/34] Solving the errors --- pandas/tests/groupby/test_function.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 92f9f7d22df5e..33066e848c8ef 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -264,7 +264,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): tm.assert_index_equal(result.columns, expected_columns) - def test_groupby_aggregation_non_numeric_dtype(): + def test_groupby_aggregation_non_numeric_dtype(self): # GH #43108 df = DataFrame( [["M", [1]], ["M", [1]], ["W", [10]], ["W", [20]]], columns=["MW", "v"] @@ -281,7 +281,7 @@ def test_groupby_aggregation_non_numeric_dtype(): result = gb.sum() tm.assert_frame_equal(result, expected) - def test_groupby_aggregation_multi_non_numeric_dtype(): + def test_groupby_aggregation_multi_non_numeric_dtype(self): # GH #42395 df = DataFrame( { @@ -303,7 +303,7 @@ def test_groupby_aggregation_multi_non_numeric_dtype(): result = gb.sum() tm.assert_frame_equal(result, expected) - def test_groupby_aggregation_numeric_with_non_numeric_dtype(): + def test_groupby_aggregation_numeric_with_non_numeric_dtype(self): # GH #43108 df = DataFrame( { From 45f54d6cb548828072fc2d5885687d4d4db5b712 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Wed, 8 Sep 2021 22:17:10 -0400 Subject: [PATCH 32/34] whatsnew 1.3.3, move tests, restore mypy --- doc/source/whatsnew/v1.3.3.rst | 1 + pandas/core/groupby/groupby.py | 10 ++++- pandas/tests/groupby/test_function.py | 60 ------------------------- pandas/tests/groupby/test_groupby.py | 63 +++++++++++++++++++++++++++ 4 files changed, 73 insertions(+), 61 deletions(-) diff --git a/doc/source/whatsnew/v1.3.3.rst b/doc/source/whatsnew/v1.3.3.rst index 5ffc1a20b382f..952dda8665e1c 100644 --- a/doc/source/whatsnew/v1.3.3.rst +++ b/doc/source/whatsnew/v1.3.3.rst @@ -26,6 +26,7 @@ Fixed regressions - Fixed regression in :func:`is_list_like` where objects with ``__iter__`` set to ``None`` would be identified as iterable (:issue:`43373`) - Fixed regression in :meth:`.Resampler.aggregate` when used after column selection would raise if ``func`` is a list of aggregation functions (:issue:`42905`) - Fixed regression in :meth:`DataFrame.corr` where Kendall correlation would produce incorrect results for columns with repeated values (:issue:`43401`) +- Fixed regression in :meth:`DataFrame.groupby` where aggregation on columns with object types dropped results on those columns (:issue:`42395`, :issue:`43108`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8022d967a90d3..2da42b3fa11e7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1152,11 +1152,14 @@ def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = Fals def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: """ Determine subclass-specific default value for 'numeric_only'. + For SeriesGroupBy we want the default to be False (to match Series behavior). For DataFrameGroupBy we want it to be True (for backwards-compat). + Parameters ---------- numeric_only : bool or lib.no_default + Returns ------- bool @@ -1167,14 +1170,19 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: if self.obj.ndim == 2: # i.e. DataFrameGroupBy numeric_only = True + # GH#42395 GH#43108 GH#43154 + # Regression from 1.2.5 to 1.3 caused object columns to be dropped obj = self._obj_with_exclusions check = obj._get_numeric_data() if len(obj.columns) and not len(check.columns) and not obj.empty: numeric_only = False + # TODO: v1.4+ Add FutureWarning else: numeric_only = False - return numeric_only + # error: Incompatible return value type (got "Union[bool, NoDefault]", + # expected "bool") + return numeric_only # type: ignore[return-value] @cache_readonly def _group_keys_index(self) -> Index: diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 33066e848c8ef..3ae11847cc06b 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -10,10 +10,8 @@ from pandas import ( DataFrame, Index, - Int64Index, MultiIndex, Series, - Timedelta, Timestamp, date_range, ) @@ -264,64 +262,6 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): tm.assert_index_equal(result.columns, expected_columns) - def test_groupby_aggregation_non_numeric_dtype(self): - # GH #43108 - df = DataFrame( - [["M", [1]], ["M", [1]], ["W", [10]], ["W", [20]]], columns=["MW", "v"] - ) - - expected = DataFrame( - { - "v": [[1, 1], [10, 20]], - }, - index=Index(["M", "W"], dtype="object", name="MW"), - ) - - gb = df.groupby(by=["MW"]) - result = gb.sum() - tm.assert_frame_equal(result, expected) - - def test_groupby_aggregation_multi_non_numeric_dtype(self): - # GH #42395 - df = DataFrame( - { - "x": [1, 0, 1, 1, 0], - "y": [Timedelta(i, "days") for i in range(1, 6)], - "z": [Timedelta(i * 10, "days") for i in range(1, 6)], - } - ) - - expected = DataFrame( - { - "y": [Timedelta(i, "days") for i in range(7, 9)], - "z": [Timedelta(i * 10, "days") for i in range(7, 9)], - }, - index=Int64Index([0, 1], dtype="int64", name="x"), - ) - - gb = df.groupby(by=["x"]) - result = gb.sum() - tm.assert_frame_equal(result, expected) - - def test_groupby_aggregation_numeric_with_non_numeric_dtype(self): - # GH #43108 - df = DataFrame( - { - "x": [1, 0, 1, 1, 0], - "y": [Timedelta(i, "days") for i in range(1, 6)], - "z": [i for i in range(1, 6)], - } - ) - - expected = DataFrame( - {"z": [7, 8]}, - index=Int64Index([0, 1], dtype="int64", name="x"), - ) - - gb = df.groupby(by=["x"]) - result = gb.sum() - tm.assert_frame_equal(result, expected) - class TestGroupByNonCythonPaths: # GH#5610 non-cython calls should not include the grouper diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index f26f18c9c20a0..7e8a0157f3b5c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -14,9 +14,11 @@ DataFrame, Grouper, Index, + Int64Index, MultiIndex, RangeIndex, Series, + Timedelta, Timestamp, date_range, read_csv, @@ -2392,6 +2394,67 @@ def test_groupby_empty_multi_column(as_index, numeric_only): tm.assert_frame_equal(result, expected) +def test_groupby_aggregation_non_numeric_dtype(): + # GH #43108 + df = DataFrame( + [["M", [1]], ["M", [1]], ["W", [10]], ["W", [20]]], columns=["MW", "v"] + ) + + expected = DataFrame( + { + "v": [[1, 1], [10, 20]], + }, + index=Index(["M", "W"], dtype="object", name="MW"), + ) + + gb = df.groupby(by=["MW"]) + result = gb.sum() + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_multi_non_numeric_dtype(): + # GH #42395 + df = DataFrame( + { + "x": [1, 0, 1, 1, 0], + "y": [Timedelta(i, "days") for i in range(1, 6)], + "z": [Timedelta(i * 10, "days") for i in range(1, 6)], + } + ) + + expected = DataFrame( + { + "y": [Timedelta(i, "days") for i in range(7, 9)], + "z": [Timedelta(i * 10, "days") for i in range(7, 9)], + }, + index=Int64Index([0, 1], dtype="int64", name="x"), + ) + + gb = df.groupby(by=["x"]) + result = gb.sum() + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_numeric_with_non_numeric_dtype(): + # GH #43108 + df = DataFrame( + { + "x": [1, 0, 1, 1, 0], + "y": [Timedelta(i, "days") for i in range(1, 6)], + "z": list(range(1, 6)), + } + ) + + expected = DataFrame( + {"z": [7, 8]}, + index=Int64Index([0, 1], dtype="int64", name="x"), + ) + + gb = df.groupby(by=["x"]) + result = gb.sum() + tm.assert_frame_equal(result, expected) + + def test_groupby_filtered_df_std(): # GH 16174 dicts = [ From 00124239c36d959296ea22022b978c637ced3a08 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Wed, 8 Sep 2021 22:18:11 -0400 Subject: [PATCH 33/34] add back blank line --- pandas/core/groupby/groupby.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 2da42b3fa11e7..5b2de5d779aa1 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1180,6 +1180,7 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: else: numeric_only = False + # error: Incompatible return value type (got "Union[bool, NoDefault]", # expected "bool") return numeric_only # type: ignore[return-value] From d46091edc35be4606696431a8dd80ab25d335630 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Thu, 9 Sep 2021 12:36:28 -0400 Subject: [PATCH 34/34] add FutureWarning. Avoid Int64Index --- pandas/tests/groupby/aggregate/test_cython.py | 3 ++- pandas/tests/groupby/test_groupby.py | 5 ++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index a035c5500e2dc..694f843ec138f 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -97,7 +97,8 @@ def test_cython_agg_nothing_to_agg(): frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) - result = frame[["b"]].groupby(frame["a"]).mean() + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = frame[["b"]].groupby(frame["a"]).mean() expected = DataFrame([], index=frame["a"].sort_values().drop_duplicates()) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7e8a0157f3b5c..1e11b54f14329 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -14,7 +14,6 @@ DataFrame, Grouper, Index, - Int64Index, MultiIndex, RangeIndex, Series, @@ -2427,7 +2426,7 @@ def test_groupby_aggregation_multi_non_numeric_dtype(): "y": [Timedelta(i, "days") for i in range(7, 9)], "z": [Timedelta(i * 10, "days") for i in range(7, 9)], }, - index=Int64Index([0, 1], dtype="int64", name="x"), + index=Index([0, 1], dtype="int64", name="x"), ) gb = df.groupby(by=["x"]) @@ -2447,7 +2446,7 @@ def test_groupby_aggregation_numeric_with_non_numeric_dtype(): expected = DataFrame( {"z": [7, 8]}, - index=Int64Index([0, 1], dtype="int64", name="x"), + index=Index([0, 1], dtype="int64", name="x"), ) gb = df.groupby(by=["x"])