@@ -256,11 +256,13 @@ def __init__(self, key=None, level=None, freq=None, axis=0, sort=False):
256
256
def ax(self):
    """Return the object currently stored in ``self.grouper``."""
    return self.grouper
258
258
259
- def _get_grouper (self , obj ):
259
+ def _get_grouper (self , obj , validate = True ):
260
260
"""
261
261
Parameters
262
262
----------
263
263
obj : the subject object
264
+ validate : boolean, default True
265
+ if True, validate the grouper
264
266
265
267
Returns
266
268
-------
@@ -271,7 +273,8 @@ def _get_grouper(self, obj):
271
273
self .grouper , exclusions , self .obj = _get_grouper (self .obj , [self .key ],
272
274
axis = self .axis ,
273
275
level = self .level ,
274
- sort = self .sort )
276
+ sort = self .sort ,
277
+ validate = validate )
275
278
return self .binner , self .grouper , self .obj
276
279
277
280
def _set_grouper (self , obj , sort = False ):
@@ -326,12 +329,6 @@ def _set_grouper(self, obj, sort=False):
326
329
self .grouper = ax
327
330
return self .grouper
328
331
329
- def _get_binner_for_grouping (self , obj ):
330
- """ default to the standard binner here """
331
- group_axis = obj ._get_axis (self .axis )
332
- return Grouping (group_axis , None , obj = obj , name = self .key ,
333
- level = self .level , sort = self .sort , in_axis = False )
334
-
335
332
@property
def groups(self):
    """Return the ``groups`` attribute of the object held in ``self.grouper``."""
    return self.grouper.groups
@@ -1733,16 +1730,34 @@ class BaseGrouper(object):
1733
1730
"""
1734
1731
This is an internal Grouper class, which actually holds
1735
1732
the generated groups
1733
+
1734
+ Parameters
1735
+ ----------
1736
+ axis : int
1737
+ the axis to group
1738
+ groupings : array of grouping
1739
+ all the grouping instances to handle in this grouper
1740
+ for example for grouper list to groupby, need to pass the list
1741
+ sort : boolean, default True
1742
+ whether this grouper will give sorted result or not
1743
+ group_keys : boolean, default True
1744
+ mutated : boolean, default False
1745
+ indexer : intp array, optional
1746
+ the indexer created by Grouper
1747
+ some groupers (TimeGrouper) will sort its axis and its
1748
+ group_info is also sorted, so need the indexer to reorder
1749
+
1736
1750
"""
1737
1751
1738
1752
def __init__(self, axis, groupings, sort=True, group_keys=True,
             mutated=False, indexer=None):
    """
    Store the grouping configuration on the instance.

    Parameters
    ----------
    axis : stored as ``self.axis``
    groupings : sized collection, stored as ``self.groupings``
    sort : stored as ``self.sort``, default True
    group_keys : stored as ``self.group_keys``, default True
    mutated : stored as ``self.mutated``, default False
    indexer : stored as ``self.indexer``, default None
    """
    # both flags are True unless exactly one grouping was passed
    not_single = len(groupings) != 1
    self._filter_empty_groups = not_single
    self.compressed = not_single

    self.axis = axis
    self.groupings = groupings
    self.sort = sort
    self.group_keys = group_keys
    self.mutated = mutated
    self.indexer = indexer
1746
1761
1747
1762
@property
1748
1763
def shape (self ):
@@ -1888,6 +1903,15 @@ def group_info(self):
1888
1903
comp_ids = _ensure_int64 (comp_ids )
1889
1904
return comp_ids , obs_group_ids , ngroups
1890
1905
1906
@cache_readonly
def label_info(self):
    """
    Return the labels of the items in the original grouped axis.

    The labels are the first element of ``self.group_info``. When
    ``self.indexer`` is set, they are reordered with a lexsort whose
    primary key is the indexer and whose secondary key is the labels.
    """
    labels, _obs_ids, _ngroups = self.group_info
    if self.indexer is None:
        return labels

    # np.lexsort treats the LAST key as the primary sort key
    order = np.lexsort((labels, self.indexer))
    return labels[order]
1914
+
1891
1915
def _get_compressed_labels (self ):
1892
1916
all_labels = [ping .labels for ping in self .groupings ]
1893
1917
if len (all_labels ) > 1 :
@@ -2288,11 +2312,42 @@ def generate_bins_generic(values, binner, closed):
2288
2312
2289
2313
class BinGrouper (BaseGrouper ):
2290
2314
2291
- def __init__ (self , bins , binlabels , filter_empty = False , mutated = False ):
2315
+ """
2316
+ This is an internal Grouper class
2317
+
2318
+ Parameters
2319
+ ----------
2320
+ bins : the split index of binlabels to group the item of axis
2321
+ binlabels : the label list
2322
+ filter_empty : boolean, default False
2323
+ mutated : boolean, default False
2324
+ indexer : an intp array
2325
+
2326
+ Examples
2327
+ --------
2328
+ bins: [2, 4, 6, 8, 10]
2329
+ binlabels: DatetimeIndex(['2005-01-01', '2005-01-03',
2330
+ '2005-01-05', '2005-01-07', '2005-01-09'],
2331
+ dtype='datetime64[ns]', freq='2D')
2332
+
2333
+ the group_info, which contains the label of each item in grouped
2334
+ axis, the index of label in label list, group number, is
2335
+
2336
+ (array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]), array([0, 1, 2, 3, 4]), 5)
2337
+
2338
+ means that, the grouped axis has 10 items, can be grouped into 5
2339
+ labels, the first and second items belong to the first label, the
2340
+ third and fourth items belong to the second label, and so on
2341
+
2342
+ """
2343
+
2344
def __init__(self, bins, binlabels, filter_empty=False, mutated=False,
             indexer=None):
    """
    Store the bin edges, bin labels and flags on the instance.

    ``bins`` is passed through ``_ensure_int64`` and ``binlabels``
    through ``_ensure_index`` before being stored; the remaining
    arguments are stored unchanged.
    """
    # plain pass-through attributes
    self.indexer = indexer
    self.mutated = mutated
    self._filter_empty_groups = filter_empty

    # coerced attributes
    self.bins = _ensure_int64(bins)
    self.binlabels = _ensure_index(binlabels)
2296
2351
2297
2352
@cache_readonly
2298
2353
def groups (self ):
@@ -2460,6 +2515,19 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
2460
2515
self .grouper , self ._labels , self ._group_index = \
2461
2516
index ._get_grouper_for_level (self .grouper , level )
2462
2517
2518
+ # a passed Grouper like, directly get the grouper in the same way
2519
+ # as single grouper groupby, use the group_info to get labels
2520
+ elif isinstance (self .grouper , Grouper ):
2521
+ # get the new grouper; we already have disambiguated
2522
+ # what key/level refer to exactly, don't need to
2523
+ # check again as we have by this point converted these
2524
+ # to an actual value (rather than a pd.Grouper)
2525
+ _ , grouper , _ = self .grouper ._get_grouper (self .obj , validate = False )
2526
+ if self .name is None :
2527
+ self .name = grouper .result_index .name
2528
+ self .obj = self .grouper .obj
2529
+ self .grouper = grouper
2530
+
2463
2531
else :
2464
2532
if self .grouper is None and self .name is not None :
2465
2533
self .grouper = self .obj [self .name ]
@@ -2482,16 +2550,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
2482
2550
categories = c ,
2483
2551
ordered = self .grouper .ordered ))
2484
2552
2485
- # a passed Grouper like
2486
- elif isinstance (self .grouper , Grouper ):
2487
-
2488
- # get the new grouper
2489
- grouper = self .grouper ._get_binner_for_grouping (self .obj )
2490
- self .obj = self .grouper .obj
2491
- self .grouper = grouper
2492
- if self .name is None :
2493
- self .name = grouper .name
2494
-
2495
2553
# we are done
2496
2554
if isinstance (self .grouper , Grouping ):
2497
2555
self .grouper = self .grouper .grouper
@@ -2536,6 +2594,10 @@ def ngroups(self):
2536
2594
2537
2595
@cache_readonly
def indices(self):
    """
    Return the indices for this grouping.

    When ``self.grouper`` is a ``BaseGrouper`` its own ``indices`` are
    returned; otherwise the grouper is converted with
    ``_ensure_categorical`` and the result's ``_reverse_indexer()`` is
    returned.
    """
    grouper = self.grouper

    # we have a list of groupers
    if isinstance(grouper, BaseGrouper):
        return grouper.indices

    return _ensure_categorical(grouper)._reverse_indexer()
2541
2603
@@ -2553,9 +2615,14 @@ def group_index(self):
2553
2615
2554
2616
def _make_labels(self):
    """
    Populate ``self._labels`` and ``self._group_index`` if either is unset.

    For a ``BaseGrouper`` the values come from its ``label_info`` and
    ``result_index``; otherwise ``self.grouper`` is factorized and the
    uniques are wrapped in an ``Index`` named ``self.name``.
    """
    # nothing to do when both values are already present
    if self._labels is not None and self._group_index is not None:
        return

    grouper = self.grouper
    if isinstance(grouper, BaseGrouper):
        # we have a list of groupers
        labels = grouper.label_info
        uniques = grouper.result_index
    else:
        labels, raw_uniques = algorithms.factorize(grouper, sort=self.sort)
        uniques = Index(raw_uniques, name=self.name)

    self._labels = labels
    self._group_index = uniques
2561
2628
@@ -2566,7 +2633,7 @@ def groups(self):
2566
2633
2567
2634
2568
2635
def _get_grouper (obj , key = None , axis = 0 , level = None , sort = True ,
2569
- mutated = False ):
2636
+ mutated = False , validate = True ):
2570
2637
"""
2571
2638
create and return a BaseGrouper, which is an internal
2572
2639
mapping of how to create the grouper indexers.
@@ -2583,6 +2650,8 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
2583
2650
are and then creates a Grouping for each one, combined into
2584
2651
a BaseGrouper.
2585
2652
2653
+ If validate, then check for key/level overlaps
2654
+
2586
2655
"""
2587
2656
group_axis = obj ._get_axis (axis )
2588
2657
@@ -2707,7 +2776,7 @@ def is_in_obj(gpr):
2707
2776
2708
2777
elif is_in_axis (gpr ): # df.groupby('name')
2709
2778
if gpr in obj :
2710
- if gpr in obj .index .names :
2779
+ if validate and gpr in obj .index .names :
2711
2780
warnings .warn (
2712
2781
("'%s' is both a column name and an index level.\n "
2713
2782
"Defaulting to column but "
0 commit comments