@@ -640,7 +640,7 @@ def test_frame_multi_key_function_list():
640
640
tm .assert_frame_equal (agged , expected )
641
641
642
642
643
- def test_frame_multi_key_function_list_partial_failure ():
643
+ def test_frame_multi_key_function_list_partial_failure (using_infer_string ):
644
644
data = DataFrame (
645
645
{
646
646
"A" : [
@@ -691,6 +691,8 @@ def test_frame_multi_key_function_list_partial_failure():
691
691
grouped = data .groupby (["A" , "B" ])
692
692
funcs = ["mean" , "std" ]
693
693
msg = re .escape ("agg function failed [how->mean,dtype->" )
694
+ if using_infer_string :
695
+ msg = "dtype 'str' does not support operation 'mean'"
694
696
with pytest .raises (TypeError , match = msg ):
695
697
grouped .agg (funcs )
696
698
@@ -981,9 +983,11 @@ def test_groupby_multi_corner(df):
981
983
tm .assert_frame_equal (agged , expected )
982
984
983
985
984
- def test_raises_on_nuisance (df ):
986
+ def test_raises_on_nuisance (df , using_infer_string ):
985
987
grouped = df .groupby ("A" )
986
988
msg = re .escape ("agg function failed [how->mean,dtype->" )
989
+ if using_infer_string :
990
+ msg = "dtype 'str' does not support operation 'mean'"
987
991
with pytest .raises (TypeError , match = msg ):
988
992
grouped .agg ("mean" )
989
993
with pytest .raises (TypeError , match = msg ):
@@ -1026,15 +1030,18 @@ def test_keep_nuisance_agg(df, agg_function):
1026
1030
["sum" , "mean" , "prod" , "std" , "var" , "sem" , "median" ],
1027
1031
)
1028
1032
@pytest .mark .parametrize ("numeric_only" , [True , False ])
1029
- def test_omit_nuisance_agg (df , agg_function , numeric_only ):
1033
+ def test_omit_nuisance_agg (df , agg_function , numeric_only , using_infer_string ):
1030
1034
# GH 38774, GH 38815
1031
1035
grouped = df .groupby ("A" )
1032
1036
1033
1037
no_drop_nuisance = ("var" , "std" , "sem" , "mean" , "prod" , "median" )
1034
1038
if agg_function in no_drop_nuisance and not numeric_only :
1035
1039
# Added numeric_only as part of GH#46560; these do not drop nuisance
1036
1040
# columns when numeric_only is False
1037
- if agg_function in ("std" , "sem" ):
1041
+ if using_infer_string :
1042
+ msg = f"dtype 'str' does not support operation '{ agg_function } '"
1043
+ klass = TypeError
1044
+ elif agg_function in ("std" , "sem" ):
1038
1045
klass = ValueError
1039
1046
msg = "could not convert string to float: 'one'"
1040
1047
else :
@@ -1055,16 +1062,24 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only):
1055
1062
tm .assert_frame_equal (result , expected )
1056
1063
1057
1064
1058
- def test_raise_on_nuisance_python_single (df ):
1065
+ def test_raise_on_nuisance_python_single (df , using_infer_string ):
1059
1066
# GH 38815
1060
1067
grouped = df .groupby ("A" )
1061
- with pytest .raises (ValueError , match = "could not convert" ):
1068
+
1069
+ err = ValueError
1070
+ msg = "could not convert"
1071
+ if using_infer_string :
1072
+ err = TypeError
1073
+ msg = "dtype 'str' does not support operation 'skew'"
1074
+ with pytest .raises (err , match = msg ):
1062
1075
grouped .skew ()
1063
1076
1064
1077
1065
- def test_raise_on_nuisance_python_multiple (three_group ):
1078
+ def test_raise_on_nuisance_python_multiple (three_group , using_infer_string ):
1066
1079
grouped = three_group .groupby (["A" , "B" ])
1067
1080
msg = re .escape ("agg function failed [how->mean,dtype->" )
1081
+ if using_infer_string :
1082
+ msg = "dtype 'str' does not support operation 'mean'"
1068
1083
with pytest .raises (TypeError , match = msg ):
1069
1084
grouped .agg ("mean" )
1070
1085
with pytest .raises (TypeError , match = msg ):
@@ -1102,12 +1117,16 @@ def test_nonsense_func():
1102
1117
df .groupby (lambda x : x + "foo" )
1103
1118
1104
1119
1105
- def test_wrap_aggregated_output_multindex (multiindex_dataframe_random_data ):
1120
+ def test_wrap_aggregated_output_multindex (
1121
+ multiindex_dataframe_random_data , using_infer_string
1122
+ ):
1106
1123
df = multiindex_dataframe_random_data .T
1107
1124
df ["baz" , "two" ] = "peekaboo"
1108
1125
1109
1126
keys = [np .array ([0 , 0 , 1 ]), np .array ([0 , 0 , 1 ])]
1110
1127
msg = re .escape ("agg function failed [how->mean,dtype->" )
1128
+ if using_infer_string :
1129
+ msg = "dtype 'str' does not support operation 'mean'"
1111
1130
with pytest .raises (TypeError , match = msg ):
1112
1131
df .groupby (keys ).agg ("mean" )
1113
1132
agged = df .drop (columns = ("baz" , "two" )).groupby (keys ).agg ("mean" )
@@ -1299,8 +1318,10 @@ def test_groupby_with_hier_columns():
1299
1318
1300
1319
def test_grouping_ndarray (df ):
1301
1320
grouped = df .groupby (df ["A" ].values )
1321
+ grouped2 = df .groupby (df ["A" ].rename (None ))
1322
+
1302
1323
result = grouped .sum ()
1303
- expected = df . groupby ( df [ "A" ]. rename ( None )) .sum ()
1324
+ expected = grouped2 .sum ()
1304
1325
tm .assert_frame_equal (result , expected )
1305
1326
1306
1327
@@ -1793,8 +1814,8 @@ def test_no_dummy_key_names(df):
1793
1814
result = df .groupby (df ["A" ].values ).sum ()
1794
1815
assert result .index .name is None
1795
1816
1796
- result = df .groupby ([df ["A" ].values , df ["B" ].values ]).sum ()
1797
- assert result .index .names == (None , None )
1817
+ result2 = df .groupby ([df ["A" ].values , df ["B" ].values ]).sum ()
1818
+ assert result2 .index .names == (None , None )
1798
1819
1799
1820
1800
1821
def test_groupby_sort_multiindex_series ():
@@ -2099,6 +2120,7 @@ def get_categorical_invalid_expected():
2099
2120
is_per = isinstance (df .dtypes .iloc [0 ], pd .PeriodDtype )
2100
2121
is_dt64 = df .dtypes .iloc [0 ].kind == "M"
2101
2122
is_cat = isinstance (values , Categorical )
2123
+ is_str = isinstance (df .dtypes .iloc [0 ], pd .StringDtype )
2102
2124
2103
2125
if (
2104
2126
isinstance (values , Categorical )
@@ -2123,13 +2145,15 @@ def get_categorical_invalid_expected():
2123
2145
2124
2146
if op in ["prod" , "sum" , "skew" ]:
2125
2147
# ops that require more than just ordered-ness
2126
- if is_dt64 or is_cat or is_per :
2148
+ if is_dt64 or is_cat or is_per or ( is_str and op != "sum" ) :
2127
2149
# GH#41291
2128
2150
# datetime64 -> prod and sum are invalid
2129
2151
if is_dt64 :
2130
2152
msg = "datetime64 type does not support"
2131
2153
elif is_per :
2132
2154
msg = "Period type does not support"
2155
+ elif is_str :
2156
+ msg = f"dtype 'str' does not support operation '{ op } '"
2133
2157
else :
2134
2158
msg = "category type does not support"
2135
2159
if op == "skew" :
@@ -3083,7 +3107,7 @@ def test_obj_with_exclusions_duplicate_columns():
3083
3107
def test_groupby_numeric_only_std_no_result (numeric_only ):
3084
3108
# GH 51080
3085
3109
dicts_non_numeric = [{"a" : "foo" , "b" : "bar" }, {"a" : "car" , "b" : "dar" }]
3086
- df = DataFrame (dicts_non_numeric )
3110
+ df = DataFrame (dicts_non_numeric , dtype = object )
3087
3111
dfgb = df .groupby ("a" , as_index = False , sort = False )
3088
3112
3089
3113
if numeric_only :
@@ -3142,10 +3166,14 @@ def test_grouping_with_categorical_interval_columns():
3142
3166
def test_groupby_sum_on_nan_should_return_nan (bug_var ):
3143
3167
# GH 24196
3144
3168
df = DataFrame ({"A" : [bug_var , bug_var , bug_var , np .nan ]})
3169
+ if isinstance (bug_var , str ):
3170
+ df = df .astype (object )
3145
3171
dfgb = df .groupby (lambda x : x )
3146
3172
result = dfgb .sum (min_count = 1 )
3147
3173
3148
- expected_df = DataFrame ([bug_var , bug_var , bug_var , None ], columns = ["A" ])
3174
+ expected_df = DataFrame (
3175
+ [bug_var , bug_var , bug_var , None ], columns = ["A" ], dtype = df ["A" ].dtype
3176
+ )
3149
3177
tm .assert_frame_equal (result , expected_df )
3150
3178
3151
3179
0 commit comments