Skip to content

Commit d21f44b

Browse files
committed
Merge pull request #7600 from sinhrks/bingrouper
BUG: GroupBy.size created by TimeGrouper raises AttributeError
2 parents 11da541 + eb2cd04 commit d21f44b

File tree

4 files changed

+53
-9
lines changed

4 files changed

+53
-9
lines changed

doc/source/v0.14.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,7 @@ Bug Fixes
261261
``Index`` (:issue:`7464`).
262262
- Bug in ``DataFrame.reset_index`` loses ``tz`` (:issue:`3950`)
263263
- Bug in ``DatetimeIndex.freqstr`` raises ``AttributeError`` when ``freq`` is ``None`` (:issue:`7606`)
264+
- Bug in ``GroupBy.size`` created by ``TimeGrouper`` raises ``AttributeError`` (:issue:`7453`)
264265

265266
- Bug in non-monotonic ``Index.union`` may preserve ``name`` incorrectly (:issue:`7458`)
266267
- Bug in ``DatetimeIndex.intersection`` doesn't preserve timezone (:issue:`4690`)

pandas/core/groupby.py

+19-2
Original file line numberDiff line numberDiff line change
@@ -1671,8 +1671,9 @@ def indices(self):
16711671

16721672
i = 0
16731673
for label, bin in zip(self.binlabels, self.bins):
1674-
if label is not tslib.NaT and i < bin:
1675-
indices[label] = list(range(i, bin))
1674+
if i < bin:
1675+
if label is not tslib.NaT:
1676+
indices[label] = list(range(i, bin))
16761677
i = bin
16771678
return indices
16781679

@@ -1693,6 +1694,22 @@ def levels(self):
16931694
def names(self):
16941695
return [self.binlabels.name]
16951696

1697+
def size(self):
    """
    Compute group sizes

    Starts from a zero-filled int64 Series over ``self.result_index`` and
    adds the number of positions recorded for each label in
    ``self.indices``; labels without an entry keep a size of 0.

    Returns
    -------
    Series of int64
    """
    base = Series(np.zeros(len(self.result_index), dtype=np.int64),
                  index=self.result_index)
    # Build the counts in a fresh dict. Writing the lengths back into the
    # mapping returned by ``self.indices`` would replace its position lists
    # with ints for every other user of that mapping (and break a second
    # call to ``size`` if the mapping is shared or cached).
    counts = dict((label, len(positions))
                  for label, positions in self.indices.items())
    bin_counts = Series(counts, dtype=np.int64)
    result = base.add(bin_counts, fill_value=0)
    # addition with fill_value changes dtype to float64
    result = result.astype(np.int64)
    return result
1712+
16961713
#----------------------------------------------------------------------
16971714
# cython aggregation
16981715

pandas/tseries/tests/test_resample.py

+18-2
Original file line numberDiff line numberDiff line change
@@ -1326,6 +1326,13 @@ def test_aggregate_normal(self):
13261326
dt_result = getattr(dt_grouped, func)()
13271327
assert_frame_equal(expected, dt_result)
13281328

1329+
# GH 7453
1330+
for func in ['size']:
1331+
expected = getattr(normal_grouped, func)()
1332+
expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
1333+
dt_result = getattr(dt_grouped, func)()
1334+
assert_series_equal(expected, dt_result)
1335+
13291336
"""
13301337
for func in ['first', 'last']:
13311338
expected = getattr(normal_grouped, func)()
@@ -1339,7 +1346,7 @@ def test_aggregate_normal(self):
13391346
dt_result = getattr(dt_grouped, func)(3)
13401347
assert_frame_equal(expected, dt_result)
13411348
"""
1342-
# if TimeGrouper is used included, 'size' 'first','last' and 'nth' doesn't work yet
1349+
# when TimeGrouper is used, 'first', 'last' and 'nth' don't work yet
13431350

13441351
def test_aggregate_with_nat(self):
13451352
# check TimeGrouper's aggregation is identical as normal groupby
@@ -1375,7 +1382,16 @@ def test_aggregate_with_nat(self):
13751382
dt_result = getattr(dt_grouped, func)()
13761383
assert_frame_equal(expected, dt_result)
13771384

1378-
# if NaT is included, 'var', 'std', 'mean', 'size', 'first','last' and 'nth' doesn't work yet
1385+
for func in ['size']:
1386+
normal_result = getattr(normal_grouped, func)()
1387+
pad = Series([0], index=[3])
1388+
expected = normal_result.append(pad)
1389+
expected = expected.sort_index()
1390+
expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
1391+
dt_result = getattr(dt_grouped, func)()
1392+
assert_series_equal(expected, dt_result)
1393+
1394+
# if NaT is included, 'var', 'std', 'mean', 'first', 'last' and 'nth' don't work yet
13791395

13801396

13811397
if __name__ == '__main__':

vb_suite/groupby.py

+15-5
Original file line numberDiff line numberDiff line change
@@ -108,16 +108,26 @@ def f():
108108
# size() speed
109109

110110
setup = common_setup + """
111-
df = DataFrame({'key1': np.random.randint(0, 500, size=100000),
112-
'key2': np.random.randint(0, 100, size=100000),
113-
'value1' : np.random.randn(100000),
114-
'value2' : np.random.randn(100000),
115-
'value3' : np.random.randn(100000)})
111+
n = 100000
112+
offsets = np.random.randint(n, size=n).astype('timedelta64[ns]')
113+
dates = np.datetime64('now') + offsets
114+
df = DataFrame({'key1': np.random.randint(0, 500, size=n),
115+
'key2': np.random.randint(0, 100, size=n),
116+
'value1' : np.random.randn(n),
117+
'value2' : np.random.randn(n),
118+
'value3' : np.random.randn(n),
119+
'dates' : dates})
116120
"""
117121

118122
groupby_multi_size = Benchmark("df.groupby(['key1', 'key2']).size()",
119123
setup, start_date=datetime(2011, 10, 1))
120124

125+
groupby_dt_size = Benchmark("df.groupby(['dates']).size()",
126+
setup, start_date=datetime(2011, 10, 1))
127+
128+
groupby_dt_timegrouper_size = Benchmark("df.groupby(TimeGrouper(key='dates', freq='M')).size()",
129+
setup, start_date=datetime(2011, 10, 1))
130+
121131
#----------------------------------------------------------------------
122132
# count() speed
123133

0 commit comments

Comments
 (0)