@@ -425,7 +425,7 @@ def test_frame_multi_key_function_list():
425
425
tm .assert_frame_equal (agged , expected )
426
426
427
427
428
- def test_frame_multi_key_function_list_partial_failure ():
428
+ def test_frame_multi_key_function_list_partial_failure (using_infer_string ):
429
429
data = DataFrame (
430
430
{
431
431
"A" : [
@@ -476,6 +476,8 @@ def test_frame_multi_key_function_list_partial_failure():
476
476
grouped = data .groupby (["A" , "B" ])
477
477
funcs = ["mean" , "std" ]
478
478
msg = re .escape ("agg function failed [how->mean,dtype->" )
479
+ if using_infer_string :
480
+ msg = "dtype 'str' does not support operation 'mean'"
479
481
with pytest .raises (TypeError , match = msg ):
480
482
grouped .agg (funcs )
481
483
@@ -662,9 +664,11 @@ def test_groupby_multi_corner(df):
662
664
tm .assert_frame_equal (agged , expected )
663
665
664
666
665
- def test_raises_on_nuisance (df ):
667
+ def test_raises_on_nuisance (df , using_infer_string ):
666
668
grouped = df .groupby ("A" )
667
669
msg = re .escape ("agg function failed [how->mean,dtype->" )
670
+ if using_infer_string :
671
+ msg = "dtype 'str' does not support operation 'mean'"
668
672
with pytest .raises (TypeError , match = msg ):
669
673
grouped .agg ("mean" )
670
674
with pytest .raises (TypeError , match = msg ):
@@ -699,15 +703,18 @@ def test_keep_nuisance_agg(df, agg_function):
699
703
["sum" , "mean" , "prod" , "std" , "var" , "sem" , "median" ],
700
704
)
701
705
@pytest .mark .parametrize ("numeric_only" , [True , False ])
702
- def test_omit_nuisance_agg (df , agg_function , numeric_only ):
706
+ def test_omit_nuisance_agg (df , agg_function , numeric_only , using_infer_string ):
703
707
# GH 38774, GH 38815
704
708
grouped = df .groupby ("A" )
705
709
706
710
no_drop_nuisance = ("var" , "std" , "sem" , "mean" , "prod" , "median" )
707
711
if agg_function in no_drop_nuisance and not numeric_only :
708
712
# Added numeric_only as part of GH#46560; these do not drop nuisance
709
713
# columns when numeric_only is False
710
- if agg_function in ("std" , "sem" ):
714
+ if using_infer_string :
715
+ msg = f"dtype 'str' does not support operation '{ agg_function } '"
716
+ klass = TypeError
717
+ elif agg_function in ("std" , "sem" ):
711
718
klass = ValueError
712
719
msg = "could not convert string to float: 'one'"
713
720
else :
@@ -728,16 +735,24 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only):
728
735
tm .assert_frame_equal (result , expected )
729
736
730
737
731
- def test_raise_on_nuisance_python_single (df ):
738
+ def test_raise_on_nuisance_python_single (df , using_infer_string ):
732
739
# GH 38815
733
740
grouped = df .groupby ("A" )
734
- with pytest .raises (ValueError , match = "could not convert" ):
741
+
742
+ err = ValueError
743
+ msg = "could not convert"
744
+ if using_infer_string :
745
+ err = TypeError
746
+ msg = "dtype 'str' does not support operation 'skew'"
747
+ with pytest .raises (err , match = msg ):
735
748
grouped .skew ()
736
749
737
750
738
- def test_raise_on_nuisance_python_multiple (three_group ):
751
+ def test_raise_on_nuisance_python_multiple (three_group , using_infer_string ):
739
752
grouped = three_group .groupby (["A" , "B" ])
740
753
msg = re .escape ("agg function failed [how->mean,dtype->" )
754
+ if using_infer_string :
755
+ msg = "dtype 'str' does not support operation 'mean'"
741
756
with pytest .raises (TypeError , match = msg ):
742
757
grouped .agg ("mean" )
743
758
with pytest .raises (TypeError , match = msg ):
@@ -775,12 +790,16 @@ def test_nonsense_func():
775
790
df .groupby (lambda x : x + "foo" )
776
791
777
792
778
- def test_wrap_aggregated_output_multindex (multiindex_dataframe_random_data ):
793
+ def test_wrap_aggregated_output_multindex (
794
+ multiindex_dataframe_random_data , using_infer_string
795
+ ):
779
796
df = multiindex_dataframe_random_data .T
780
797
df ["baz" , "two" ] = "peekaboo"
781
798
782
799
keys = [np .array ([0 , 0 , 1 ]), np .array ([0 , 0 , 1 ])]
783
800
msg = re .escape ("agg function failed [how->mean,dtype->" )
801
+ if using_infer_string :
802
+ msg = "dtype 'str' does not support operation 'mean'"
784
803
with pytest .raises (TypeError , match = msg ):
785
804
df .groupby (keys ).agg ("mean" )
786
805
agged = df .drop (columns = ("baz" , "two" )).groupby (keys ).agg ("mean" )
@@ -960,8 +979,10 @@ def test_groupby_with_hier_columns():
960
979
961
980
def test_grouping_ndarray (df ):
962
981
grouped = df .groupby (df ["A" ].values )
982
+ grouped2 = df .groupby (df ["A" ].rename (None ))
983
+
963
984
result = grouped .sum ()
964
- expected = df . groupby ( df [ "A" ]. rename ( None )) .sum ()
985
+ expected = grouped2 .sum ()
965
986
tm .assert_frame_equal (result , expected )
966
987
967
988
@@ -1457,8 +1478,8 @@ def test_no_dummy_key_names(df):
1457
1478
result = df .groupby (df ["A" ].values ).sum ()
1458
1479
assert result .index .name is None
1459
1480
1460
- result = df .groupby ([df ["A" ].values , df ["B" ].values ]).sum ()
1461
- assert result .index .names == (None , None )
1481
+ result2 = df .groupby ([df ["A" ].values , df ["B" ].values ]).sum ()
1482
+ assert result2 .index .names == (None , None )
1462
1483
1463
1484
1464
1485
def test_groupby_sort_multiindex_series ():
@@ -1761,6 +1782,7 @@ def get_categorical_invalid_expected():
1761
1782
is_per = isinstance (df .dtypes .iloc [0 ], pd .PeriodDtype )
1762
1783
is_dt64 = df .dtypes .iloc [0 ].kind == "M"
1763
1784
is_cat = isinstance (values , Categorical )
1785
+ is_str = isinstance (df .dtypes .iloc [0 ], pd .StringDtype )
1764
1786
1765
1787
if (
1766
1788
isinstance (values , Categorical )
@@ -1785,13 +1807,15 @@ def get_categorical_invalid_expected():
1785
1807
1786
1808
if op in ["prod" , "sum" , "skew" ]:
1787
1809
# ops that require more than just ordered-ness
1788
- if is_dt64 or is_cat or is_per :
1810
+ if is_dt64 or is_cat or is_per or ( is_str and op != "sum" ) :
1789
1811
# GH#41291
1790
1812
# datetime64 -> prod and sum are invalid
1791
1813
if is_dt64 :
1792
1814
msg = "datetime64 type does not support"
1793
1815
elif is_per :
1794
1816
msg = "Period type does not support"
1817
+ elif is_str :
1818
+ msg = f"dtype 'str' does not support operation '{ op } '"
1795
1819
else :
1796
1820
msg = "category type does not support"
1797
1821
if op == "skew" :
@@ -2714,7 +2738,7 @@ def test_obj_with_exclusions_duplicate_columns():
2714
2738
def test_groupby_numeric_only_std_no_result (numeric_only ):
2715
2739
# GH 51080
2716
2740
dicts_non_numeric = [{"a" : "foo" , "b" : "bar" }, {"a" : "car" , "b" : "dar" }]
2717
- df = DataFrame (dicts_non_numeric )
2741
+ df = DataFrame (dicts_non_numeric , dtype = object )
2718
2742
dfgb = df .groupby ("a" , as_index = False , sort = False )
2719
2743
2720
2744
if numeric_only :
@@ -2773,10 +2797,14 @@ def test_grouping_with_categorical_interval_columns():
2773
2797
def test_groupby_sum_on_nan_should_return_nan (bug_var ):
2774
2798
# GH 24196
2775
2799
df = DataFrame ({"A" : [bug_var , bug_var , bug_var , np .nan ]})
2800
+ if isinstance (bug_var , str ):
2801
+ df = df .astype (object )
2776
2802
dfgb = df .groupby (lambda x : x )
2777
2803
result = dfgb .sum (min_count = 1 )
2778
2804
2779
- expected_df = DataFrame ([bug_var , bug_var , bug_var , None ], columns = ["A" ])
2805
+ expected_df = DataFrame (
2806
+ [bug_var , bug_var , bug_var , None ], columns = ["A" ], dtype = df ["A" ].dtype
2807
+ )
2780
2808
tm .assert_frame_equal (result , expected_df )
2781
2809
2782
2810
0 commit comments