@@ -188,7 +188,10 @@ def test_groupby_as_index_apply(pdf, gdf, as_index, engine):
188
188
gdf = gdf .groupby ("y" , as_index = as_index ).apply (
189
189
lambda df : df ["x" ].mean (), engine = engine
190
190
)
191
- pdf = pdf .groupby ("y" , as_index = as_index ).apply (lambda df : df ["x" ].mean ())
191
+ kwargs = {"func" : lambda df : df ["x" ].mean ()}
192
+ if PANDAS_GE_220 :
193
+ kwargs ["include_groups" ] = False
194
+ pdf = pdf .groupby ("y" , as_index = as_index ).apply (** kwargs )
192
195
assert_groupby_results_equal (pdf , gdf )
193
196
194
197
@@ -311,8 +314,12 @@ def foo(df):
311
314
df ["out" ] = df ["val1" ] + df ["val2" ]
312
315
return df
313
316
314
- expect = expect_grpby .apply (foo )
315
- got = got_grpby .apply (foo )
317
+ if PANDAS_GE_220 :
318
+ kwargs = {"include_groups" : False }
319
+ else :
320
+ kwargs = {}
321
+ expect = expect_grpby .apply (foo , ** kwargs )
322
+ got = got_grpby .apply (foo , ** kwargs )
316
323
assert_groupby_results_equal (expect , got )
317
324
318
325
@@ -346,24 +353,24 @@ def test_groupby_apply_args(func, args):
346
353
["key1" , "key2" ], as_index = False , group_keys = False
347
354
)
348
355
got_grpby = df .groupby (["key1" , "key2" ])
349
-
350
- expect = expect_grpby .apply (func , * args )
351
- got = got_grpby .apply (func , * args )
356
+ if PANDAS_GE_220 :
357
+ kwargs = {"include_groups" : False }
358
+ else :
359
+ kwargs = {}
360
+ expect = expect_grpby .apply (func , * args , ** kwargs )
361
+ got = got_grpby .apply (func , * args , ** kwargs )
352
362
assert_groupby_results_equal (expect , got )
353
363
354
364
355
365
def test_groupby_apply_grouped ():
356
366
np .random .seed (0 )
357
367
df = DataFrame ()
358
368
nelem = 20
359
- df ["key1" ] = np . random . randint ( 0 , 3 , nelem )
360
- df ["key2" ] = np . random . randint ( 0 , 2 , nelem )
361
- df ["val1" ] = np . random . random (nelem )
362
- df ["val2" ] = np . random . random (nelem )
369
+ df ["key1" ] = range ( nelem )
370
+ df ["key2" ] = range ( nelem )
371
+ df ["val1" ] = range (nelem )
372
+ df ["val2" ] = range (nelem )
363
373
364
- expect_grpby = df .to_pandas ().groupby (
365
- ["key1" , "key2" ], as_index = False , group_keys = False
366
- )
367
374
got_grpby = df .groupby (["key1" , "key2" ])
368
375
369
376
def foo (key1 , val1 , com1 , com2 ):
@@ -380,14 +387,11 @@ def foo(key1, val1, com1, com2):
380
387
381
388
got = got .to_pandas ()
382
389
383
- # Get expected result by emulating the operation in pandas
384
- def emulate (df ):
385
- df ["com1" ] = df .key1 * 10000 + df .val1
386
- df ["com2" ] = np .arange (len (df ), dtype = np .int32 )
387
- return df
388
-
389
- expect = expect_grpby .apply (emulate )
390
- expect = expect .sort_values (["key1" , "key2" ])
390
+ expect = df .copy ()
391
+ expect ["com1" ] = (expect ["key1" ] * 10000 + expect ["key1" ]).astype (
392
+ np .float64
393
+ )
394
+ expect ["com2" ] = np .zeros (nelem , dtype = np .int32 )
391
395
392
396
assert_groupby_results_equal (expect , got )
393
397
@@ -462,8 +466,14 @@ def run_groupby_apply_jit_test(data, func, keys, *args):
462
466
got_groupby_obj = data .groupby (keys )
463
467
464
468
# compare cuDF jit to pandas
465
- cudf_jit_result = got_groupby_obj .apply (func , * args , engine = "jit" )
466
- pandas_result = expect_groupby_obj .apply (func , * args )
469
+ if PANDAS_GE_220 :
470
+ kwargs = {"include_groups" : False }
471
+ else :
472
+ kwargs = {}
473
+ cudf_jit_result = got_groupby_obj .apply (
474
+ func , * args , engine = "jit" , ** kwargs
475
+ )
476
+ pandas_result = expect_groupby_obj .apply (func , * args , ** kwargs )
467
477
assert_groupby_results_equal (cudf_jit_result , pandas_result )
468
478
469
479
@@ -776,7 +786,7 @@ def test_groupby_apply_jit_block_divergence():
776
786
)
777
787
778
788
def diverging_block (grp_df ):
779
- if grp_df ["a " ].mean () > 0 :
789
+ if grp_df ["b " ].mean () > 1 :
780
790
return grp_df ["b" ].mean ()
781
791
return 0
782
792
@@ -831,27 +841,41 @@ def f(group):
831
841
return group .sum ()
832
842
833
843
part = partial (f )
834
-
835
- expect = pdf .groupby ("a" ).apply (part )
836
- got = gdf .groupby ("a" ).apply (part , engine = "auto" )
837
-
844
+ if PANDAS_GE_220 :
845
+ kwargs = {"include_groups" : False }
846
+ else :
847
+ kwargs = {}
848
+ expect = pdf .groupby ("a" ).apply (part , ** kwargs )
849
+ got = gdf .groupby ("a" ).apply (part , engine = "auto" , ** kwargs )
838
850
assert_groupby_results_equal (expect , got )
839
851
840
852
841
- @pytest .mark .parametrize ("func" , [lambda group : group .x + group .y ])
842
- def test_groupby_apply_return_col_from_df (func ):
853
+ def test_groupby_apply_return_col_from_df ():
843
854
# tests a UDF that consists of purely column-wise
844
855
# ops, such as `lambda group: group.x + group.y`
845
856
# which returns a column
846
- df = cudf .datasets .randomdata ()
857
+ func = lambda group : group .x + group .y # noqa:E731
858
+ df = cudf .DataFrame (
859
+ {
860
+ "id" : range (10 ),
861
+ "x" : range (10 ),
862
+ "y" : range (10 ),
863
+ }
864
+ )
847
865
pdf = df .to_pandas ()
848
866
849
867
def func (df ):
850
868
return df .x + df .y
851
869
852
- expect = pdf .groupby ("id" ).apply (func )
853
- got = df .groupby ("id" ).apply (func )
854
-
870
+ if PANDAS_GE_220 :
871
+ kwargs = {"include_groups" : False }
872
+ else :
873
+ kwargs = {}
874
+ got = df .groupby ("id" ).apply (func , ** kwargs )
875
+ expect = pdf .groupby ("id" ).apply (func , ** kwargs )
876
+ # pandas seems to erroneously add an extra MultiIndex (MI) level of ids
877
+ # TODO: Figure out how pandas groupby.apply determines the columns
878
+ expect = pd .DataFrame (expect .droplevel (1 ), columns = got .columns )
855
879
assert_groupby_results_equal (expect , got )
856
880
857
881
@@ -863,8 +887,12 @@ def test_groupby_apply_return_df(func):
863
887
df = cudf .DataFrame ({"a" : [1 , 1 , 2 , 2 ], "b" : [1 , 2 , 3 , 4 ]})
864
888
pdf = df .to_pandas ()
865
889
866
- expect = pdf .groupby ("a" ).apply (func )
867
- got = df .groupby ("a" ).apply (func )
890
+ if PANDAS_GE_220 :
891
+ kwargs = {"include_groups" : False }
892
+ else :
893
+ kwargs = {}
894
+ expect = pdf .groupby ("a" ).apply (func , ** kwargs )
895
+ got = df .groupby ("a" ).apply (func , ** kwargs )
868
896
assert_groupby_results_equal (expect , got )
869
897
870
898
@@ -1910,14 +1938,21 @@ def test_groupby_apply_noempty_group():
1910
1938
{"a" : [1 , 1 , 2 , 2 ], "b" : [1 , 2 , 1 , 2 ], "c" : [1 , 2 , 3 , 4 ]}
1911
1939
)
1912
1940
gdf = cudf .from_pandas (pdf )
1913
- assert_groupby_results_equal (
1941
+ if PANDAS_GE_220 :
1942
+ kwargs = {"include_groups" : False }
1943
+ else :
1944
+ kwargs = {}
1945
+ expect = (
1914
1946
pdf .groupby ("a" , group_keys = False )
1915
- .apply (lambda x : x .iloc [[0 , 1 ]])
1916
- .reset_index (drop = True ),
1947
+ .apply (lambda x : x .iloc [[0 , 1 ]], ** kwargs )
1948
+ .reset_index (drop = True )
1949
+ )
1950
+ got = (
1917
1951
gdf .groupby ("a" )
1918
- .apply (lambda x : x .iloc [[0 , 1 ]])
1919
- .reset_index (drop = True ),
1952
+ .apply (lambda x : x .iloc [[0 , 1 ]], ** kwargs )
1953
+ .reset_index (drop = True )
1920
1954
)
1955
+ assert_groupby_results_equal (expect , got )
1921
1956
1922
1957
1923
1958
def test_reset_index_after_empty_groupby ():
@@ -2198,8 +2233,12 @@ def test_groupby_apply_return_scalars(func, args):
2198
2233
)
2199
2234
gdf = cudf .from_pandas (pdf )
2200
2235
2201
- expected = pdf .groupby ("A" ).apply (func , * args )
2202
- actual = gdf .groupby ("A" ).apply (func , * args )
2236
+ if PANDAS_GE_220 :
2237
+ kwargs = {"include_groups" : False }
2238
+ else :
2239
+ kwargs = {}
2240
+ expected = pdf .groupby ("A" ).apply (func , * args , ** kwargs )
2241
+ actual = gdf .groupby ("A" ).apply (func , * args , ** kwargs )
2203
2242
2204
2243
assert_groupby_results_equal (expected , actual )
2205
2244
@@ -2242,8 +2281,14 @@ def test_groupby_apply_return_series_dataframe(func, args):
2242
2281
)
2243
2282
gdf = cudf .from_pandas (pdf )
2244
2283
2245
- expected = pdf .groupby (["key" ], group_keys = False ).apply (func , * args )
2246
- actual = gdf .groupby (["key" ]).apply (func , * args )
2284
+ if PANDAS_GE_220 :
2285
+ kwargs = {"include_groups" : False }
2286
+ else :
2287
+ kwargs = {}
2288
+ expected = pdf .groupby (["key" ], group_keys = False ).apply (
2289
+ func , * args , ** kwargs
2290
+ )
2291
+ actual = gdf .groupby (["key" ]).apply (func , * args , ** kwargs )
2247
2292
2248
2293
assert_groupby_results_equal (expected , actual )
2249
2294
0 commit comments