@@ -556,7 +556,8 @@ class _GroupBy(PandasObject, SelectionMixin):
556
556
557
557
def __init__ (self , obj , keys = None , axis = 0 , level = None ,
558
558
grouper = None , exclusions = None , selection = None , as_index = True ,
559
- sort = True , group_keys = True , squeeze = False , ** kwargs ):
559
+ sort = True , group_keys = True , squeeze = False ,
560
+ observed = None , ** kwargs ):
560
561
561
562
self ._selection = selection
562
563
@@ -576,13 +577,15 @@ def __init__(self, obj, keys=None, axis=0, level=None,
576
577
self .sort = sort
577
578
self .group_keys = group_keys
578
579
self .squeeze = squeeze
580
+ self .observed = observed
579
581
self .mutated = kwargs .pop ('mutated' , False )
580
582
581
583
if grouper is None :
582
584
grouper , exclusions , obj = _get_grouper (obj , keys ,
583
585
axis = axis ,
584
586
level = level ,
585
587
sort = sort ,
588
+ observed = observed ,
586
589
mutated = self .mutated )
587
590
588
591
self .obj = obj
@@ -2331,18 +2334,21 @@ def ngroups(self):
2331
2334
def recons_labels (self ):
2332
2335
comp_ids , obs_ids , _ = self .group_info
2333
2336
labels = (ping .labels for ping in self .groupings )
2334
- return decons_obs_group_ids (comp_ids ,
2335
- obs_ids , self .shape , labels , xnull = True )
2337
+ return decons_obs_group_ids (
2338
+ comp_ids , obs_ids , self .shape , labels , xnull = True )
2336
2339
2337
2340
@cache_readonly
2338
2341
def result_index (self ):
2339
2342
if not self .compressed and len (self .groupings ) == 1 :
2340
2343
return self .groupings [0 ].group_index .rename (self .names [0 ])
2341
2344
2342
- return MultiIndex (levels = [ping .group_index for ping in self .groupings ],
2343
- labels = self .recons_labels ,
2344
- verify_integrity = False ,
2345
- names = self .names )
2345
+ labels = self .recons_labels
2346
+ levels = [ping .group_index for ping in self .groupings ]
2347
+ result = MultiIndex (levels = levels ,
2348
+ labels = labels ,
2349
+ verify_integrity = False ,
2350
+ names = self .names )
2351
+ return result
2346
2352
2347
2353
def get_group_levels (self ):
2348
2354
if not self .compressed and len (self .groupings ) == 1 :
@@ -2883,6 +2889,7 @@ class Grouping(object):
2883
2889
obj :
2884
2890
name :
2885
2891
level :
2892
+ observed : if the grouper is a Categorical, use only the observed values
2886
2893
in_axis : if the Grouping is a column in self.obj and hence among
2887
2894
Groupby.exclusions list
2888
2895
@@ -2898,14 +2905,15 @@ class Grouping(object):
2898
2905
"""
2899
2906
2900
2907
def __init__ (self , index , grouper = None , obj = None , name = None , level = None ,
2901
- sort = True , in_axis = False ):
2908
+ sort = True , observed = None , in_axis = False ):
2902
2909
2903
2910
self .name = name
2904
2911
self .level = level
2905
2912
self .grouper = _convert_grouper (index , grouper )
2906
2913
self .index = index
2907
2914
self .sort = sort
2908
2915
self .obj = obj
2916
+ self .observed = observed
2909
2917
self .in_axis = in_axis
2910
2918
2911
2919
# right place for this?
@@ -2954,16 +2962,34 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
2954
2962
elif is_categorical_dtype (self .grouper ):
2955
2963
2956
2964
self .grouper = self .grouper ._codes_for_groupby (self .sort )
2965
+ codes = self .grouper .codes
2966
+ categories = self .grouper .categories
2957
2967
2958
2968
# we make a CategoricalIndex out of the cat grouper
2959
2969
# preserving the categories / ordered attributes
2960
- self ._labels = self .grouper .codes
2970
+ self ._labels = codes
2971
+
2972
+ # Use the observed values of the grouper if indicated
2973
+ observed = self .observed
2974
+ if observed is None :
2975
+ msg = ("pass observed=True to ensure that a "
2976
+ "categorical grouper only returns the "
2977
+ "observed groupers, or\n "
2978
+ "observed=False to return NA for non-observed"
2979
+ "values\n " )
2980
+ warnings .warn (msg , FutureWarning , stacklevel = 5 )
2981
+ observed = False
2982
+
2983
+ if observed :
2984
+ codes = algorithms .unique1d (codes )
2985
+ else :
2986
+ codes = np .arange (len (categories ))
2961
2987
2962
- c = self .grouper .categories
2963
2988
self ._group_index = CategoricalIndex (
2964
- Categorical .from_codes (np .arange (len (c )),
2965
- categories = c ,
2966
- ordered = self .grouper .ordered ))
2989
+ Categorical .from_codes (
2990
+ codes = codes ,
2991
+ categories = categories ,
2992
+ ordered = self .grouper .ordered ))
2967
2993
2968
2994
# we are done
2969
2995
if isinstance (self .grouper , Grouping ):
@@ -3048,7 +3074,7 @@ def groups(self):
3048
3074
3049
3075
3050
3076
def _get_grouper (obj , key = None , axis = 0 , level = None , sort = True ,
3051
- mutated = False , validate = True ):
3077
+ observed = None , mutated = False , validate = True ):
3052
3078
"""
3053
3079
create and return a BaseGrouper, which is an internal
3054
3080
mapping of how to create the grouper indexers.
@@ -3065,6 +3091,9 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
3065
3091
are and then creates a Grouping for each one, combined into
3066
3092
a BaseGrouper.
3067
3093
3094
+ If observed & we have a categorical grouper, only show the observed
3095
+ values
3096
+
3068
3097
If validate, then check for key/level overlaps
3069
3098
3070
3099
"""
@@ -3243,6 +3272,7 @@ def is_in_obj(gpr):
3243
3272
name = name ,
3244
3273
level = level ,
3245
3274
sort = sort ,
3275
+ observed = observed ,
3246
3276
in_axis = in_axis ) \
3247
3277
if not isinstance (gpr , Grouping ) else gpr
3248
3278
@@ -4154,7 +4184,7 @@ def first_not_none(values):
4154
4184
not_indexed_same = not_indexed_same )
4155
4185
elif self .grouper .groupings is not None :
4156
4186
if len (self .grouper .groupings ) > 1 :
4157
- key_index = MultiIndex . from_tuples ( keys , names = key_names )
4187
+ key_index = self . grouper . result_index
4158
4188
4159
4189
else :
4160
4190
ping = self .grouper .groupings [0 ]
@@ -4244,8 +4274,9 @@ def first_not_none(values):
4244
4274
4245
4275
# normally use vstack as it's faster than concat
4246
4276
# and if we have mi-columns
4247
- if isinstance (v .index ,
4248
- MultiIndex ) or key_index is None :
4277
+ if (isinstance (v .index , MultiIndex ) or
4278
+ key_index is None or
4279
+ isinstance (key_index , MultiIndex )):
4249
4280
stacked_values = np .vstack (map (np .asarray , values ))
4250
4281
result = DataFrame (stacked_values , index = key_index ,
4251
4282
columns = index )
@@ -4696,6 +4727,14 @@ def _reindex_output(self, result):
4696
4727
4697
4728
This can re-expand the output space
4698
4729
"""
4730
+
4731
+ # TODO(jreback): remove completely
4732
+ # when observed parameter is defaulted to True
4733
+ # gh-20583
4734
+
4735
+ if self .observed :
4736
+ return result
4737
+
4699
4738
groupings = self .grouper .groupings
4700
4739
if groupings is None :
4701
4740
return result
0 commit comments