@@ -110,7 +110,7 @@ def func(df):
110
110
return df
111
111
112
112
assert_eq (df .groupby ('a' ).apply (func ),
113
- ddf .groupby ('a' ).apply (func ))
113
+ ddf .groupby ('a' ).apply (func , meta = { "a" : int , "b" : float } ))
114
114
115
115
116
116
@pytest .mark .parametrize ('grouper' , [
@@ -131,8 +131,10 @@ def func(df):
131
131
df ['b' ] = df .b - df .b .mean ()
132
132
return df
133
133
134
+ # last one causes a DeprecationWarning from pandas, hard to track down...
134
135
assert_eq (df .groupby (grouper (df )).apply (func ),
135
- ddf .groupby (grouper (ddf )).apply (func ))
136
+ ddf .groupby (grouper (ddf )).apply (func , meta = {"a" : int , "d" : int ,
137
+ "b" : float }))
136
138
137
139
138
140
def test_groupby_dir ():
@@ -158,14 +160,15 @@ def func(df):
158
160
return df .assign (b = df .b - df .b .mean ())
159
161
160
162
with dask .set_options (get = get ):
161
- assert_eq (ddf .groupby ('a' ).apply (func ),
162
- pdf .groupby ('a' ).apply (func ))
163
+ with pytest .warns (None ):
164
+ assert_eq (ddf .groupby ('a' ).apply (func ),
165
+ pdf .groupby ('a' ).apply (func ))
163
166
164
- assert_eq (ddf .groupby ('a' ).apply (func ).set_index ('a' ),
165
- pdf .groupby ('a' ).apply (func ).set_index ('a' ))
167
+ assert_eq (ddf .groupby ('a' ).apply (func ).set_index ('a' ),
168
+ pdf .groupby ('a' ).apply (func ).set_index ('a' ))
166
169
167
- assert_eq (pdf2 .groupby (pdf2 .index ).apply (func ),
168
- ddf2 .groupby (ddf2 .index ).apply (func ))
170
+ assert_eq (pdf2 .groupby (pdf2 .index ).apply (func ),
171
+ ddf2 .groupby (ddf2 .index ).apply (func ))
169
172
170
173
171
174
def test_groupby_multilevel_getitem ():
@@ -258,7 +261,8 @@ def test_series_groupby_propagates_names():
258
261
df = pd .DataFrame ({'x' : [1 , 2 , 3 ], 'y' : [4 , 5 , 6 ]})
259
262
ddf = dd .from_pandas (df , 2 )
260
263
func = lambda df : df ['y' ].sum ()
261
- result = ddf .groupby ('x' ).apply (func )
264
+ with pytest .warns (UserWarning ):
265
+ result = ddf .groupby ('x' ).apply (func )
262
266
expected = df .groupby ('x' ).apply (func )
263
267
assert_eq (result , expected )
264
268
@@ -507,40 +511,42 @@ def call(g, m, **kwargs):
507
511
508
512
509
513
def test_apply_shuffle ():
514
+ import warnings ; warnings .simplefilter ("error" , UserWarning )
510
515
pdf = pd .DataFrame ({'A' : [1 , 2 , 3 , 4 ] * 5 ,
511
516
'B' : np .random .randn (20 ),
512
517
'C' : np .random .randn (20 ),
513
518
'D' : np .random .randn (20 )})
514
519
ddf = dd .from_pandas (pdf , 3 )
515
520
516
- assert_eq (ddf .groupby ('A' ).apply (lambda x : x .sum ()),
517
- pdf .groupby ('A' ).apply (lambda x : x .sum ()))
521
+ with pytest .warns (UserWarning ):
522
+ assert_eq (ddf .groupby ('A' ).apply (lambda x : x .sum ()),
523
+ pdf .groupby ('A' ).apply (lambda x : x .sum ()))
518
524
519
- assert_eq (ddf .groupby (ddf ['A' ]).apply (lambda x : x .sum ()),
520
- pdf .groupby (pdf ['A' ]).apply (lambda x : x .sum ()))
525
+ assert_eq (ddf .groupby (ddf ['A' ]).apply (lambda x : x .sum ()),
526
+ pdf .groupby (pdf ['A' ]).apply (lambda x : x .sum ()))
521
527
522
- assert_eq (ddf .groupby (ddf ['A' ] + 1 ).apply (lambda x : x .sum ()),
523
- pdf .groupby (pdf ['A' ] + 1 ).apply (lambda x : x .sum ()))
528
+ assert_eq (ddf .groupby (ddf ['A' ] + 1 ).apply (lambda x : x .sum ()),
529
+ pdf .groupby (pdf ['A' ] + 1 ).apply (lambda x : x .sum ()))
524
530
525
- # SeriesGroupBy
526
- assert_eq (ddf .groupby ('A' )['B' ].apply (lambda x : x .sum ()),
527
- pdf .groupby ('A' )['B' ].apply (lambda x : x .sum ()))
531
+ # SeriesGroupBy
532
+ assert_eq (ddf .groupby ('A' )['B' ].apply (lambda x : x .sum ()),
533
+ pdf .groupby ('A' )['B' ].apply (lambda x : x .sum ()))
528
534
529
- assert_eq (ddf .groupby (ddf ['A' ])['B' ].apply (lambda x : x .sum ()),
530
- pdf .groupby (pdf ['A' ])['B' ].apply (lambda x : x .sum ()))
535
+ assert_eq (ddf .groupby (ddf ['A' ])['B' ].apply (lambda x : x .sum ()),
536
+ pdf .groupby (pdf ['A' ])['B' ].apply (lambda x : x .sum ()))
531
537
532
- assert_eq (ddf .groupby (ddf ['A' ] + 1 )['B' ].apply (lambda x : x .sum ()),
533
- pdf .groupby (pdf ['A' ] + 1 )['B' ].apply (lambda x : x .sum ()))
538
+ assert_eq (ddf .groupby (ddf ['A' ] + 1 )['B' ].apply (lambda x : x .sum ()),
539
+ pdf .groupby (pdf ['A' ] + 1 )['B' ].apply (lambda x : x .sum ()))
534
540
535
- # DataFrameGroupBy with column slice
536
- assert_eq (ddf .groupby ('A' )[['B' , 'C' ]].apply (lambda x : x .sum ()),
537
- pdf .groupby ('A' )[['B' , 'C' ]].apply (lambda x : x .sum ()))
541
+ # DataFrameGroupBy with column slice
542
+ assert_eq (ddf .groupby ('A' )[['B' , 'C' ]].apply (lambda x : x .sum ()),
543
+ pdf .groupby ('A' )[['B' , 'C' ]].apply (lambda x : x .sum ()))
538
544
539
- assert_eq (ddf .groupby (ddf ['A' ])[['B' , 'C' ]].apply (lambda x : x .sum ()),
540
- pdf .groupby (pdf ['A' ])[['B' , 'C' ]].apply (lambda x : x .sum ()))
545
+ assert_eq (ddf .groupby (ddf ['A' ])[['B' , 'C' ]].apply (lambda x : x .sum ()),
546
+ pdf .groupby (pdf ['A' ])[['B' , 'C' ]].apply (lambda x : x .sum ()))
541
547
542
- assert_eq (ddf .groupby (ddf ['A' ] + 1 )[['B' , 'C' ]].apply (lambda x : x .sum ()),
543
- pdf .groupby (pdf ['A' ] + 1 )[['B' , 'C' ]].apply (lambda x : x .sum ()))
548
+ assert_eq (ddf .groupby (ddf ['A' ] + 1 )[['B' , 'C' ]].apply (lambda x : x .sum ()),
549
+ pdf .groupby (pdf ['A' ] + 1 )[['B' , 'C' ]].apply (lambda x : x .sum ()))
544
550
545
551
546
552
@pytest .mark .parametrize ('grouper' , [
@@ -559,17 +565,18 @@ def test_apply_shuffle_multilevel(grouper):
559
565
'D' : np .random .randn (20 )})
560
566
ddf = dd .from_pandas (pdf , 3 )
561
567
562
- # DataFrameGroupBy
563
- assert_eq (ddf .groupby (grouper (ddf )).apply (lambda x : x .sum ()),
564
- pdf .groupby (grouper (pdf )).apply (lambda x : x .sum ()))
568
+ with pytest .warns (UserWarning ):
569
+ # DataFrameGroupBy
570
+ assert_eq (ddf .groupby (grouper (ddf )).apply (lambda x : x .sum ()),
571
+ pdf .groupby (grouper (pdf )).apply (lambda x : x .sum ()))
565
572
566
- # SeriesGroupBy
567
- assert_eq (ddf .groupby (grouper (ddf ))['B' ].apply (lambda x : x .sum ()),
568
- pdf .groupby (grouper (pdf ))['B' ].apply (lambda x : x .sum ()))
573
+ # SeriesGroupBy
574
+ assert_eq (ddf .groupby (grouper (ddf ))['B' ].apply (lambda x : x .sum ()),
575
+ pdf .groupby (grouper (pdf ))['B' ].apply (lambda x : x .sum ()))
569
576
570
- # DataFrameGroupBy with column slice
571
- assert_eq (ddf .groupby (grouper (ddf ))[['B' , 'C' ]].apply (lambda x : x .sum ()),
572
- pdf .groupby (grouper (pdf ))[['B' , 'C' ]].apply (lambda x : x .sum ()))
577
+ # DataFrameGroupBy with column slice
578
+ assert_eq (ddf .groupby (grouper (ddf ))[['B' , 'C' ]].apply (lambda x : x .sum ()),
579
+ pdf .groupby (grouper (pdf ))[['B' , 'C' ]].apply (lambda x : x .sum ()))
573
580
574
581
575
582
def test_numeric_column_names ():
@@ -581,7 +588,7 @@ def test_numeric_column_names():
581
588
ddf = dd .from_pandas (df , npartitions = 2 )
582
589
assert_eq (ddf .groupby (0 ).sum (), df .groupby (0 ).sum ())
583
590
assert_eq (ddf .groupby ([0 , 2 ]).sum (), df .groupby ([0 , 2 ]).sum ())
584
- assert_eq (ddf .groupby (0 ).apply (lambda x : x ),
591
+ assert_eq (ddf .groupby (0 ).apply (lambda x : x , meta = { 0 : int , 1 : int , 2 : int } ),
585
592
df .groupby (0 ).apply (lambda x : x ))
586
593
587
594
@@ -594,12 +601,14 @@ def test_groupby_apply_tasks():
594
601
with dask .set_options (shuffle = 'tasks' ):
595
602
for ind in [lambda x : 'A' , lambda x : x .A ]:
596
603
a = df .groupby (ind (df )).apply (len )
597
- b = ddf .groupby (ind (ddf )).apply (len )
604
+ with pytest .warns (UserWarning ):
605
+ b = ddf .groupby (ind (ddf )).apply (len )
598
606
assert_eq (a , b .compute ())
599
607
assert not any ('partd' in k [0 ] for k in b .dask )
600
608
601
609
a = df .groupby (ind (df )).B .apply (len )
602
- b = ddf .groupby (ind (ddf )).B .apply (len )
610
+ with pytest .warns (UserWarning ):
611
+ b = ddf .groupby (ind (ddf )).B .apply (len )
603
612
assert_eq (a , b .compute ())
604
613
assert not any ('partd' in k [0 ] for k in b .dask )
605
614
@@ -610,7 +619,8 @@ def test_groupby_multiprocessing():
610
619
'B' : ['1' ,'1' ,'a' ,'a' ,'a' ]})
611
620
ddf = dd .from_pandas (df , npartitions = 3 )
612
621
with dask .set_options (get = get ):
613
- assert_eq (ddf .groupby ('B' ).apply (lambda x : x ),
622
+ assert_eq (ddf .groupby ('B' ).apply (lambda x : x , meta = {"A" : int ,
623
+ "B" : object }),
614
624
df .groupby ('B' ).apply (lambda x : x ))
615
625
616
626
@@ -652,8 +662,12 @@ def test_aggregate__examples(spec, split_every, grouper):
652
662
columns = ['c' , 'b' , 'a' , 'd' ])
653
663
ddf = dd .from_pandas (pdf , npartitions = 10 )
654
664
655
- assert_eq (pdf .groupby (grouper (pdf )).agg (spec ),
656
- ddf .groupby (grouper (ddf )).agg (spec , split_every = split_every ))
665
+ # Warning from pandas deprecation .agg(dict[dict])
666
+ # it's from pandas, so no reason to assert the deprecation warning,
667
+ # but we should still test it for now
668
+ with pytest .warns (None ):
669
+ assert_eq (pdf .groupby (grouper (pdf )).agg (spec ),
670
+ ddf .groupby (grouper (ddf )).agg (spec , split_every = split_every ))
657
671
658
672
659
673
@pytest .mark .parametrize ('spec' , [
@@ -678,9 +692,12 @@ def test_series_aggregate__examples(spec, split_every, grouper):
678
692
679
693
ddf = dd .from_pandas (pdf , npartitions = 10 )
680
694
ds = ddf ['c' ]
681
-
682
- assert_eq (ps .groupby (grouper (pdf )).agg (spec ),
683
- ds .groupby (grouper (ddf )).agg (spec , split_every = split_every ))
695
+ # Warning from pandas deprecation .agg(dict[dict])
696
+ # it's from pandas, so no reason to assert the deprecation warning,
697
+ # but we should still test it for now
698
+ with pytest .warns (None ):
699
+ assert_eq (ps .groupby (grouper (pdf )).agg (spec ),
700
+ ds .groupby (grouper (ddf )).agg (spec , split_every = split_every ))
684
701
685
702
686
703
@pytest .mark .parametrize ('spec' , [
0 commit comments