Skip to content

Commit eb2cd04

Browse files
committed
BUG: GroupBy.size created by TimeGrouper raises AttributeError
1 parent 6f0ccaf commit eb2cd04

File tree

4 files changed

+53
-9
lines changed

4 files changed

+53
-9
lines changed

doc/source/v0.14.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,7 @@ Bug Fixes
260260
``Index`` (:issue:`7464`).
261261
- Bug in ``DataFrame.reset_index`` loses ``tz`` (:issue:`3950`)
262262
- Bug in ``DatetimeIndex.freqstr`` raises ``AttributeError`` when ``freq`` is ``None`` (:issue:`7606`)
263+
- Bug in ``GroupBy.size`` raising ``AttributeError`` when the groupby is created by ``TimeGrouper`` (:issue:`7453`)
263264

264265
- Bug in non-monotonic ``Index.union`` may preserve ``name`` incorrectly (:issue:`7458`)
265266
- Bug in ``DatetimeIndex.intersection`` doesn't preserve timezone (:issue:`4690`)

pandas/core/groupby.py

+19-2
Original file line numberDiff line numberDiff line change
@@ -1643,8 +1643,9 @@ def indices(self):
16431643

16441644
i = 0
16451645
for label, bin in zip(self.binlabels, self.bins):
1646-
if label is not tslib.NaT and i < bin:
1647-
indices[label] = list(range(i, bin))
1646+
if i < bin:
1647+
if label is not tslib.NaT:
1648+
indices[label] = list(range(i, bin))
16481649
i = bin
16491650
return indices
16501651

@@ -1665,6 +1666,22 @@ def levels(self):
16651666
def names(self):
16661667
return [self.binlabels.name]
16671668

1669+
def size(self):
    """
    Compute group sizes

    Returns
    -------
    Series of int64 aligned on ``self.result_index``; labels that have
    no entry in ``self.indices`` get a size of 0.
    """
    # Build the counts in a new dict.  ``self.indices`` must not be
    # overwritten in place, otherwise anything reading it afterwards
    # would see integer counts instead of the lists of positions.
    indices = self.indices
    counts = dict((label, len(indices[label])) for label in indices)
    bin_counts = Series(counts, dtype=np.int64)

    # zero-filled template so labels without any entry still appear
    base = Series(np.zeros(len(self.result_index), dtype=np.int64),
                  index=self.result_index)
    result = base.add(bin_counts, fill_value=0)
    # addition with fill_value changes dtype to float64
    result = result.astype(np.int64)
    return result
1684+
16681685
#----------------------------------------------------------------------
16691686
# cython aggregation
16701687

pandas/tseries/tests/test_resample.py

+18-2
Original file line numberDiff line numberDiff line change
@@ -1326,6 +1326,13 @@ def test_aggregate_normal(self):
13261326
dt_result = getattr(dt_grouped, func)()
13271327
assert_frame_equal(expected, dt_result)
13281328

1329+
# GH 7453
1330+
for func in ['size']:
1331+
expected = getattr(normal_grouped, func)()
1332+
expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
1333+
dt_result = getattr(dt_grouped, func)()
1334+
assert_series_equal(expected, dt_result)
1335+
13291336
"""
13301337
for func in ['first', 'last']:
13311338
expected = getattr(normal_grouped, func)()
@@ -1339,7 +1346,7 @@ def test_aggregate_normal(self):
13391346
dt_result = getattr(dt_grouped, func)(3)
13401347
assert_frame_equal(expected, dt_result)
13411348
"""
1342-
# if TimeGrouper is used included, 'size' 'first','last' and 'nth' doesn't work yet
1349+
# if TimeGrouper is used, 'first', 'last' and 'nth' don't work yet
13431350

13441351
def test_aggregate_with_nat(self):
13451352
# check TimeGrouper's aggregation is identical as normal groupby
@@ -1375,7 +1382,16 @@ def test_aggregate_with_nat(self):
13751382
dt_result = getattr(dt_grouped, func)()
13761383
assert_frame_equal(expected, dt_result)
13771384

1378-
# if NaT is included, 'var', 'std', 'mean', 'size', 'first','last' and 'nth' doesn't work yet
1385+
for func in ['size']:
1386+
normal_result = getattr(normal_grouped, func)()
1387+
pad = Series([0], index=[3])
1388+
expected = normal_result.append(pad)
1389+
expected = expected.sort_index()
1390+
expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
1391+
dt_result = getattr(dt_grouped, func)()
1392+
assert_series_equal(expected, dt_result)
1393+
1394+
# if NaT is included, 'var', 'std', 'mean', 'first', 'last' and 'nth' don't work yet
13791395

13801396

13811397
if __name__ == '__main__':

vb_suite/groupby.py

+15-5
Original file line numberDiff line numberDiff line change
@@ -108,16 +108,26 @@ def f():
108108
# size() speed
109109

110110
setup = common_setup + """
111-
df = DataFrame({'key1': np.random.randint(0, 500, size=100000),
112-
'key2': np.random.randint(0, 100, size=100000),
113-
'value1' : np.random.randn(100000),
114-
'value2' : np.random.randn(100000),
115-
'value3' : np.random.randn(100000)})
111+
n = 100000
112+
offsets = np.random.randint(n, size=n).astype('timedelta64[ns]')
113+
dates = np.datetime64('now') + offsets
114+
df = DataFrame({'key1': np.random.randint(0, 500, size=n),
115+
'key2': np.random.randint(0, 100, size=n),
116+
'value1' : np.random.randn(n),
117+
'value2' : np.random.randn(n),
118+
'value3' : np.random.randn(n),
119+
'dates' : dates})
116120
"""
117121

118122
groupby_multi_size = Benchmark("df.groupby(['key1', 'key2']).size()",
119123
setup, start_date=datetime(2011, 10, 1))
120124

125+
groupby_dt_size = Benchmark("df.groupby(['dates']).size()",
126+
setup, start_date=datetime(2011, 10, 1))
127+
128+
groupby_dt_timegrouper_size = Benchmark("df.groupby(TimeGrouper(key='dates', freq='M')).size()",
129+
setup, start_date=datetime(2011, 10, 1))
130+
121131
#----------------------------------------------------------------------
122132
# count() speed
123133

0 commit comments

Comments
 (0)