@@ -82,7 +82,7 @@ def get_stats(group):
82
82
assert result .index .names [0 ] == "C"
83
83
84
84
85
- def test_basic (using_infer_string ): # TODO: split this test
85
+ def test_basic ():
86
86
cats = Categorical (
87
87
["a" , "a" , "a" , "b" , "b" , "b" , "c" , "c" , "c" ],
88
88
categories = ["a" , "b" , "c" , "d" ],
@@ -95,17 +95,20 @@ def test_basic(using_infer_string): # TODO: split this test
95
95
result = data .groupby ("b" , observed = False ).mean ()
96
96
tm .assert_frame_equal (result , expected )
97
97
98
+
99
+ def test_basic_single_grouper ():
98
100
cat1 = Categorical (["a" , "a" , "b" , "b" ], categories = ["a" , "b" , "z" ], ordered = True )
99
101
cat2 = Categorical (["c" , "d" , "c" , "d" ], categories = ["c" , "d" , "y" ], ordered = True )
100
102
df = DataFrame ({"A" : cat1 , "B" : cat2 , "values" : [1 , 2 , 3 , 4 ]})
101
103
102
- # single grouper
103
104
gb = df .groupby ("A" , observed = False )
104
105
exp_idx = CategoricalIndex (["a" , "b" , "z" ], name = "A" , ordered = True )
105
106
expected = DataFrame ({"values" : Series ([3 , 7 , 0 ], index = exp_idx )})
106
107
result = gb .sum (numeric_only = True )
107
108
tm .assert_frame_equal (result , expected )
108
109
110
+
111
+ def test_basic_string (using_infer_string ):
109
112
# GH 8623
110
113
x = DataFrame (
111
114
[[1 , "John P. Doe" ], [2 , "Jane Dove" ], [1 , "John P. Doe" ]],
@@ -133,8 +136,9 @@ def f(x):
133
136
expected ["person_name" ] = expected ["person_name" ].astype (dtype )
134
137
tm .assert_frame_equal (result , expected )
135
138
139
+
140
+ def test_basic_monotonic ():
136
141
# GH 9921
137
- # Monotonic
138
142
df = DataFrame ({"a" : [5 , 15 , 25 ]})
139
143
c = pd .cut (df .a , bins = [0 , 10 , 20 , 30 , 40 ])
140
144
@@ -165,7 +169,8 @@ def f(x):
165
169
tm .assert_series_equal (df .a .groupby (c , observed = False ).filter (np .all ), df ["a" ])
166
170
tm .assert_frame_equal (df .groupby (c , observed = False ).filter (np .all ), df )
167
171
168
- # Non-monotonic
172
+
173
+ def test_basic_non_monotonic ():
169
174
df = DataFrame ({"a" : [5 , 15 , 25 , - 5 ]})
170
175
c = pd .cut (df .a , bins = [- 10 , 0 , 10 , 20 , 30 , 40 ])
171
176
@@ -183,6 +188,8 @@ def f(x):
183
188
df .groupby (c , observed = False ).transform (lambda xs : np .sum (xs )), df [["a" ]]
184
189
)
185
190
191
+
192
+ def test_basic_cut_grouping ():
186
193
# GH 9603
187
194
df = DataFrame ({"a" : [1 , 0 , 0 , 0 ]})
188
195
c = pd .cut (df .a , [0 , 1 , 2 , 3 , 4 ], labels = Categorical (list ("abcd" )))
@@ -193,13 +200,14 @@ def f(x):
193
200
expected .index .name = "a"
194
201
tm .assert_series_equal (result , expected )
195
202
196
- # more basic
203
+
204
+ def test_more_basic ():
197
205
levels = ["foo" , "bar" , "baz" , "qux" ]
198
- codes = np .random .default_rng (2 ).integers (0 , 4 , size = 100 )
206
+ codes = np .random .default_rng (2 ).integers (0 , 4 , size = 10 )
199
207
200
208
cats = Categorical .from_codes (codes , levels , ordered = True )
201
209
202
- data = DataFrame (np .random .default_rng (2 ).standard_normal ((100 , 4 )))
210
+ data = DataFrame (np .random .default_rng (2 ).standard_normal ((10 , 4 )))
203
211
204
212
result = data .groupby (cats , observed = False ).mean ()
205
213
@@ -225,9 +233,9 @@ def f(x):
225
233
# GH 10460
226
234
expc = Categorical .from_codes (np .arange (4 ).repeat (8 ), levels , ordered = True )
227
235
exp = CategoricalIndex (expc )
228
- tm .assert_index_equal (( desc_result .stack ().index .get_level_values (0 ) ), exp )
236
+ tm .assert_index_equal (desc_result .stack ().index .get_level_values (0 ), exp )
229
237
exp = Index (["count" , "mean" , "std" , "min" , "25%" , "50%" , "75%" , "max" ] * 4 )
230
- tm .assert_index_equal (( desc_result .stack ().index .get_level_values (1 ) ), exp )
238
+ tm .assert_index_equal (desc_result .stack ().index .get_level_values (1 ), exp )
231
239
232
240
233
241
def test_level_get_group (observed ):
@@ -352,6 +360,8 @@ def test_observed(observed):
352
360
353
361
tm .assert_frame_equal (result , expected )
354
362
363
+
364
+ def test_observed_single_column (observed ):
355
365
# https://github.com/pandas-dev/pandas/issues/8138
356
366
d = {
357
367
"cat" : Categorical (
@@ -362,7 +372,6 @@ def test_observed(observed):
362
372
}
363
373
df = DataFrame (d )
364
374
365
- # Grouping on a single column
366
375
groups_single_key = df .groupby ("cat" , observed = observed )
367
376
result = groups_single_key .mean ()
368
377
@@ -378,7 +387,17 @@ def test_observed(observed):
378
387
379
388
tm .assert_frame_equal (result , expected )
380
389
381
- # Grouping on two columns
390
+
391
+ def test_observed_two_columns (observed ):
392
+ # https://github.com/pandas-dev/pandas/issues/8138
393
+ d = {
394
+ "cat" : Categorical (
395
+ ["a" , "b" , "a" , "b" ], categories = ["a" , "b" , "c" ], ordered = True
396
+ ),
397
+ "ints" : [1 , 1 , 2 , 2 ],
398
+ "val" : [10 , 20 , 30 , 40 ],
399
+ }
400
+ df = DataFrame (d )
382
401
groups_double_key = df .groupby (["cat" , "ints" ], observed = observed )
383
402
result = groups_double_key .agg ("mean" )
384
403
expected = DataFrame (
@@ -404,6 +423,8 @@ def test_observed(observed):
404
423
expected = df [(df .cat == c ) & (df .ints == i )]
405
424
tm .assert_frame_equal (result , expected )
406
425
426
+
427
+ def test_observed_with_as_index (observed ):
407
428
# gh-8869
408
429
# with as_index
409
430
d = {
@@ -591,7 +612,6 @@ def test_dataframe_categorical_with_nan(observed):
591
612
592
613
593
614
@pytest .mark .parametrize ("ordered" , [True , False ])
594
- @pytest .mark .parametrize ("observed" , [True , False ])
595
615
def test_dataframe_categorical_ordered_observed_sort (ordered , observed , sort ):
596
616
# GH 25871: Fix groupby sorting on ordered Categoricals
597
617
# GH 25167: Groupby with observed=True doesn't sort
@@ -627,11 +647,11 @@ def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
627
647
def test_datetime ():
628
648
# GH9049: ensure backward compatibility
629
649
levels = pd .date_range ("2014-01-01" , periods = 4 )
630
- codes = np .random .default_rng (2 ).integers (0 , 4 , size = 100 )
650
+ codes = np .random .default_rng (2 ).integers (0 , 4 , size = 10 )
631
651
632
652
cats = Categorical .from_codes (codes , levels , ordered = True )
633
653
634
- data = DataFrame (np .random .default_rng (2 ).standard_normal ((100 , 4 )))
654
+ data = DataFrame (np .random .default_rng (2 ).standard_normal ((10 , 4 )))
635
655
result = data .groupby (cats , observed = False ).mean ()
636
656
637
657
expected = data .groupby (np .asarray (cats ), observed = False ).mean ()
@@ -832,7 +852,10 @@ def test_preserve_categories():
832
852
df .groupby ("A" , sort = False , observed = False ).first ().index , nosort_index
833
853
)
834
854
835
- # ordered=False
855
+
856
+ def test_preserve_categories_ordered_false ():
857
+ # GH-13179
858
+ categories = list ("abc" )
836
859
df = DataFrame ({"A" : Categorical (list ("ba" ), categories = categories , ordered = False )})
837
860
sort_index = CategoricalIndex (categories , categories , ordered = False , name = "A" )
838
861
# GH#48749 - don't change order of categories
@@ -846,7 +869,8 @@ def test_preserve_categories():
846
869
)
847
870
848
871
849
- def test_preserve_categorical_dtype ():
872
+ @pytest .mark .parametrize ("col" , ["C1" , "C2" ])
873
+ def test_preserve_categorical_dtype (col ):
850
874
# GH13743, GH13854
851
875
df = DataFrame (
852
876
{
@@ -865,18 +889,15 @@ def test_preserve_categorical_dtype():
865
889
"C2" : Categorical (list ("bac" ), categories = list ("bac" ), ordered = True ),
866
890
}
867
891
)
868
- for col in ["C1" , "C2" ]:
869
- result1 = df .groupby (by = col , as_index = False , observed = False ).mean (
870
- numeric_only = True
871
- )
872
- result2 = (
873
- df .groupby (by = col , as_index = True , observed = False )
874
- .mean (numeric_only = True )
875
- .reset_index ()
876
- )
877
- expected = exp_full .reindex (columns = result1 .columns )
878
- tm .assert_frame_equal (result1 , expected )
879
- tm .assert_frame_equal (result2 , expected )
892
+ result1 = df .groupby (by = col , as_index = False , observed = False ).mean (numeric_only = True )
893
+ result2 = (
894
+ df .groupby (by = col , as_index = True , observed = False )
895
+ .mean (numeric_only = True )
896
+ .reset_index ()
897
+ )
898
+ expected = exp_full .reindex (columns = result1 .columns )
899
+ tm .assert_frame_equal (result1 , expected )
900
+ tm .assert_frame_equal (result2 , expected )
880
901
881
902
882
903
@pytest .mark .parametrize (
@@ -931,6 +952,8 @@ def test_categorical_no_compress():
931
952
)
932
953
tm .assert_series_equal (result , exp )
933
954
955
+
956
+ def test_categorical_no_compress_string ():
934
957
cats = Categorical (
935
958
["a" , "a" , "a" , "b" , "b" , "b" , "c" , "c" , "c" ],
936
959
categories = ["a" , "b" , "c" , "d" ],
@@ -965,7 +988,7 @@ def test_sort():
965
988
# has a sorted x axis
966
989
# self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')
967
990
968
- df = DataFrame ({"value" : np .random .default_rng (2 ).integers (0 , 10000 , 100 )})
991
+ df = DataFrame ({"value" : np .random .default_rng (2 ).integers (0 , 10000 , 10 )})
969
992
labels = [f"{ i } - { i + 499 } " for i in range (0 , 10000 , 500 )]
970
993
cat_labels = Categorical (labels , labels )
971
994
0 commit comments