Skip to content

Commit 5e965e9

Browse files
committed
CLN: refactor of groupby/resample to handle Grouper
in a more elegant / cleaner way by keeping internal groupby state inside the Grouper rather than passing around lots of results. DOC: minor doc edits for groupby.rst / v0.14.0. PEP8: minor pep changes.
1 parent 2f667db commit 5e965e9

File tree

5 files changed

+134
-74
lines changed

5 files changed

+134
-74
lines changed

doc/source/groupby.rst

+8
Original file line numberDiff line numberDiff line change
@@ -754,6 +754,8 @@ that could be potential groupers.
754754
df.groupby([pd.Grouper(freq='6M',level='Date'),'Buyer']).sum()
755755
756756
757+
.. _groupby.nth:
758+
757759
Taking the first rows of each group
758760
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
759761

@@ -833,6 +835,9 @@ To see the order in which each row appears within its group, use the
833835
Examples
834836
--------
835837

838+
Regrouping by factor
839+
~~~~~~~~~~~~~~~~~~~~
840+
836841
Regroup columns of a DataFrame according to their sum, and sum the aggregated ones.
837842

838843
.. ipython:: python
@@ -842,6 +847,9 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on
842847
df.groupby(df.sum(), axis=1).sum()
843848
844849
850+
Returning a Series to propagate names
851+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
852+
845853
Group DataFrame columns, compute a set of metrics and return a named Series.
846854
The Series name is used as the name for the column index. This is especially
847855
useful in conjunction with reshaping operations such as stacking in which the

doc/source/v0.14.0.txt

+5-2
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ Highlights include:
1111

1212
- MultiIndexing Using Slicers
1313
- Joining a singly-indexed DataFrame with a multi-indexed DataFrame
14+
- More flexible groupby specifications
1415

1516
API changes
1617
~~~~~~~~~~~
@@ -80,7 +81,7 @@ These are out-of-bounds selections
8081
g[['B']].head(1)
8182

8283
- groupby ``nth`` now filters by default, with optional dropna argument to ignore
83-
NaN (to replicate the previous behaviour.)
84+
NaN (to replicate the previous behaviour). See :ref:`the docs <groupby.nth>`.
8485

8586
.. ipython:: python
8687

@@ -90,7 +91,8 @@ These are out-of-bounds selections
9091

9192
g.nth(0, dropna='any') # similar to old behaviour
9293

93-
- Allow specification of a more complex groupby via ``pd.Groupby``, See :ref:`the docs <groupby.specify>`. (:issue:`3794`)
94+
- Allow specification of a more complex groupby via ``pd.Grouper``, such as grouping
95+
by a Time and a string field simultaneously. See :ref:`the docs <groupby.specify>`. (:issue:`3794`)
9496

9597
- Local variable usage has changed in
9698
:func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query`
@@ -123,6 +125,7 @@ These are out-of-bounds selections
123125
.. ipython:: python
124126

125127
i[[0,1,2]].astype(np.int_)
128+
126129
- ``set_index`` no longer converts MultiIndexes to an Index of tuples. For example,
127130
the old behavior returned an Index in this case (:issue:`6459`):
128131

pandas/core/groupby.py

+59-21
Original file line numberDiff line numberDiff line change
@@ -170,12 +170,21 @@ def __new__(cls, *args, **kwargs):
170170
return super(Grouper, cls).__new__(cls)
171171

172172
def __init__(self, key=None, level=None, freq=None, axis=None, sort=True):
173-
self.key = key
174-
self.level = level
175-
self.freq = freq
176-
self.axis = axis
177-
self.sort = sort
178-
self.grouper = None
173+
self.key=key
174+
self.level=level
175+
self.freq=freq
176+
self.axis=axis
177+
self.sort=sort
178+
179+
self.grouper=None
180+
self.obj=None
181+
self.indexer=None
182+
self.binner=None
183+
self.grouper=None
184+
185+
@property
186+
def ax(self):
187+
return self.grouper
179188

180189
def get_grouper(self, obj):
181190

@@ -189,20 +198,17 @@ def get_grouper(self, obj):
189198
a tuple of binner, grouper, obj (possibly sorted)
190199
"""
191200

192-
# default is to not use a binner
193-
return None, self.get_grouper_for_ax(obj), obj
201+
self.set_grouper(obj)
202+
return self.binner, self.grouper, self.obj
194203

195-
def get_grouper_for_ax(self, obj):
204+
def set_grouper(self, obj):
196205
"""
197-
given an object and the specifcations, return a grouper for this particular specification
206+
given an object and the specifications, set up the internal grouper for this particular specification
198207
199208
Parameters
200209
----------
201210
obj : the subject object
202211
203-
Returns
204-
-------
205-
grouper : an index mapping, or a BinGrouper like object
206212
"""
207213

208214
if self.key is not None and self.level is not None:
@@ -236,10 +242,18 @@ def get_grouper_for_ax(self, obj):
236242
if not (level == 0 or level == ax.name):
237243
raise ValueError("The grouper level {0} is not valid".format(level))
238244

239-
return self._get_grouper_for_ax(ax)
245+
# possibly sort
246+
if not ax.is_monotonic:
247+
indexer = self.indexer = ax.argsort(kind='quicksort')
248+
ax = ax.take(indexer)
249+
obj = obj.take(indexer, axis=self.axis, convert=False, is_copy=False)
250+
251+
self.obj = obj
252+
self.grouper = ax
253+
return self.grouper
240254

241-
def _get_grouper_for_ax(self, ax):
242-
return ax
255+
def get_binner_for_grouping(self, obj):
256+
raise NotImplementedError
243257

244258
@property
245259
def groups(self):
@@ -1572,7 +1586,6 @@ class Grouping(object):
15721586
index : Index
15731587
grouper :
15741588
obj :
1575-
axis :
15761589
name :
15771590
level :
15781591
@@ -1670,9 +1683,11 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
16701683
# a passed Grouper like
16711684
elif isinstance(self.grouper, Grouper):
16721685

1673-
self.grouper = self.grouper.get_grouper_for_ax(obj)
1686+
# get the new grouper
1687+
grouper = self.grouper.get_binner_for_grouping(obj)
1688+
self.grouper = grouper
16741689
if self.name is None:
1675-
self.name = self.grouper.name
1690+
self.name = grouper.name
16761691

16771692
# no level passed
16781693
if not isinstance(self.grouper, (Series, np.ndarray)):
@@ -1742,8 +1757,28 @@ def groups(self):
17421757

17431758

17441759
def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
1760+
"""
1761+
create and return a BaseGrouper, which is an internal
1762+
mapping of how to create the grouper indexers.
1763+
This may be composed of multiple Grouping objects, indicating
1764+
multiple groupers
1765+
1766+
Groupers are ultimately index mappings. They can originate as:
1767+
index mappings, keys to columns, functions, or Groupers
1768+
1769+
Groupers enable local references to axis,level,sort, while
1770+
the passed in axis, level, and sort are 'global'.
1771+
1772+
This routine tries to figure out what the passed in references
1773+
are and then creates a Grouping for each one, combined into
1774+
a BaseGrouper.
1775+
1776+
"""
1777+
17451778
group_axis = obj._get_axis(axis)
17461779

1780+
# validate that the passed level is compatible with the passed
1781+
# axis of the object
17471782
if level is not None:
17481783
if not isinstance(group_axis, MultiIndex):
17491784
if isinstance(level, compat.string_types):
@@ -1756,9 +1791,12 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
17561791
level = None
17571792
key = group_axis
17581793

1794+
# a passed in Grouper, directly convert
17591795
if isinstance(key, Grouper):
1760-
binner, gpr, obj = key.get_grouper(obj)
1761-
return gpr, [], obj
1796+
binner, grouper, obj = key.get_grouper(obj)
1797+
return grouper, [], obj
1798+
1799+
# already have a BaseGrouper, just return it
17621800
elif isinstance(key, BaseGrouper):
17631801
return key, [], obj
17641802

pandas/tests/test_groupby.py

+19
Original file line numberDiff line numberDiff line change
@@ -2933,6 +2933,7 @@ def test_timegrouper_with_reg_groups(self):
29332933
DT.datetime(2013,12,31,0,0),
29342934
DT.datetime(2013,12,31,0,0),
29352935
]}).set_index(['Date','Buyer'])
2936+
29362937
result = df.groupby([pd.Grouper(freq='A'),'Buyer']).sum()
29372938
assert_frame_equal(result,expected)
29382939

@@ -3022,6 +3023,24 @@ def test_timegrouper_with_reg_groups(self):
30223023
# error as we have both a level and a name!
30233024
self.assertRaises(ValueError, lambda : df.groupby([pd.Grouper(freq='1M',key='Date',level='Date'),'Buyer']).sum())
30243025

3026+
3027+
# single groupers
3028+
expected = DataFrame({ 'Quantity' : [31],
3029+
'Date' : [DT.datetime(2013,10,31,0,0)] }).set_index('Date')
3030+
result = df.groupby(pd.Grouper(freq='1M')).sum()
3031+
assert_frame_equal(result, expected)
3032+
3033+
result = df.groupby([pd.Grouper(freq='1M')]).sum()
3034+
assert_frame_equal(result, expected)
3035+
3036+
expected = DataFrame({ 'Quantity' : [31],
3037+
'Date' : [DT.datetime(2013,11,30,0,0)] }).set_index('Date')
3038+
result = df.groupby(pd.Grouper(freq='1M',key='Date')).sum()
3039+
assert_frame_equal(result, expected)
3040+
3041+
result = df.groupby([pd.Grouper(freq='1M',key='Date')]).sum()
3042+
assert_frame_equal(result, expected)
3043+
30253044
def test_cumcount(self):
30263045
df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'])
30273046
g = df.groupby('A')

pandas/tseries/resample.py

+43-51
Original file line numberDiff line numberDiff line change
@@ -70,14 +70,17 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean',
7070
self.limit = limit
7171
self.base = base
7272

73+
# by definition we always sort
74+
kwargs['sort'] = True
75+
7376
super(TimeGrouper, self).__init__(freq=freq, axis=axis, **kwargs)
7477

7578
def resample(self, obj):
76-
ax = obj._get_axis(self.axis)
79+
self.set_grouper(obj)
80+
ax = self.grouper
7781

78-
obj = self._ensure_sortedness(obj)
7982
if isinstance(ax, DatetimeIndex):
80-
rs = self._resample_timestamps(obj)
83+
rs = self._resample_timestamps()
8184
elif isinstance(ax, PeriodIndex):
8285
offset = to_offset(self.freq)
8386
if offset.n > 1:
@@ -87,12 +90,13 @@ def resample(self, obj):
8790
self.kind = 'timestamp'
8891

8992
if self.kind is None or self.kind == 'period':
90-
rs = self._resample_periods(obj)
93+
rs = self._resample_periods()
9194
else:
92-
obj = obj.to_timestamp(how=self.convention)
93-
rs = self._resample_timestamps(obj)
95+
obj = self.obj.to_timestamp(how=self.convention)
96+
self.set_grouper(obj)
97+
rs = self._resample_timestamps()
9498
elif len(ax) == 0:
95-
return obj
99+
return self.obj
96100
else: # pragma: no cover
97101
raise TypeError('Only valid with DatetimeIndex or PeriodIndex')
98102

@@ -101,60 +105,42 @@ def resample(self, obj):
101105
return rs
102106

103107
def get_grouper(self, obj):
104-
# return a tuple of (binner, grouper, obj)
105-
return self._get_time_grouper(obj)
106-
107-
def _get_grouper_for_ax(self, ax):
108-
# return an ordering of the transformed group labels,
109-
# suitable for multi-grouping, e.g the labels for
110-
# the resampled intervals
108+
self.set_grouper(obj)
109+
return self.get_binner_for_resample()
111110

112-
indexer = None
113-
if not ax.is_monotonic:
114-
indexer = ax.argsort(kind='quicksort')
115-
ax = ax.take(indexer)
111+
def get_binner_for_resample(self):
112+
# create the BinGrouper
113+
# assume that self.set_grouper(obj) has already been called
116114

115+
ax = self.ax
117116
if self.kind is None or self.kind == 'timestamp':
118-
binner, bins, binlabels = self._get_time_bins(ax)
117+
self.binner, bins, binlabels = self._get_time_bins(ax)
119118
else:
120-
binner, bins, binlabels = self._get_time_period_bins(ax)
119+
self.binner, bins, binlabels = self._get_time_period_bins(ax)
121120

122-
grp = BinGrouper(bins, binlabels)
121+
self.grouper = BinGrouper(bins, binlabels)
122+
return self.binner, self.grouper, self.obj
123+
124+
def get_binner_for_grouping(self, obj):
125+
# return an ordering of the transformed group labels,
126+
# suitable for multi-grouping, e.g the labels for
127+
# the resampled intervals
128+
ax = self.set_grouper(obj)
129+
self.get_binner_for_resample()
123130

124131
# create the grouper
132+
binner = self.binner
125133
l = []
126-
for key, group in grp.get_iterator(ax):
134+
for key, group in self.grouper.get_iterator(ax):
127135
l.extend([key]*len(group))
128136
grouper = binner.__class__(l,freq=binner.freq,name=binner.name)
129137

130138
# since we may have had to sort
131139
# may need to reorder groups here
132-
if indexer is not None:
133-
grouper = grouper.take(indexer)
140+
if self.indexer is not None:
141+
grouper = grouper.take(self.indexer)
134142
return grouper
135143

136-
def _ensure_sortedness(self, obj):
137-
# ensure that our object is sorted
138-
ax = obj._get_axis(self.axis)
139-
if not ax.is_monotonic:
140-
try:
141-
obj = obj.sort_index(axis=self.axis)
142-
except:
143-
obj = obj.sort_index()
144-
return obj
145-
146-
def _get_time_grouper(self, obj):
147-
obj = self._ensure_sortedness(obj)
148-
ax = obj._get_axis(self.axis)
149-
150-
if self.kind is None or self.kind == 'timestamp':
151-
binner, bins, binlabels = self._get_time_bins(ax)
152-
else:
153-
binner, bins, binlabels = self._get_time_period_bins(ax)
154-
155-
grouper = BinGrouper(bins, binlabels)
156-
return binner, grouper, obj
157-
158144
def _get_time_bins(self, ax):
159145
if not isinstance(ax, DatetimeIndex):
160146
raise TypeError('axis must be a DatetimeIndex, but got '
@@ -243,10 +229,14 @@ def _get_time_period_bins(self, ax):
243229
def _agg_method(self):
244230
return self.how if self.how else _DEFAULT_METHOD
245231

246-
def _resample_timestamps(self, obj):
247-
axlabels = obj._get_axis(self.axis)
232+
def _resample_timestamps(self):
233+
# assumes set_grouper(obj) already called
234+
axlabels = self.ax
248235

249-
binner, grouper, _ = self._get_time_grouper(obj)
236+
self.get_binner_for_resample()
237+
grouper = self.grouper
238+
binner = self.binner
239+
obj = self.obj
250240

251241
# Determine if we're downsampling
252242
if axlabels.freq is not None or axlabels.inferred_freq is not None:
@@ -286,8 +276,10 @@ def _resample_timestamps(self, obj):
286276

287277
return result
288278

289-
def _resample_periods(self, obj):
290-
axlabels = obj._get_axis(self.axis)
279+
def _resample_periods(self):
280+
# assumes set_grouper(obj) already called
281+
axlabels = self.ax
282+
obj = self.obj
291283

292284
if len(axlabels) == 0:
293285
new_index = PeriodIndex(data=[], freq=self.freq)

0 commit comments

Comments
 (0)