
BUG: GroupBy.size created by TimeGrouper raises AttributeError #7600


Merged
merged 1 commit into from Jun 30, 2014
1 change: 1 addition & 0 deletions doc/source/v0.14.1.txt
@@ -260,6 +260,7 @@ Bug Fixes
``Index`` (:issue:`7464`).
- Bug in ``DataFrame.reset_index`` loses ``tz`` (:issue:`3950`)
- Bug in ``DatetimeIndex.freqstr`` raises ``AttributeError`` when ``freq`` is ``None`` (:issue:`7606`)
- Bug in ``GroupBy.size`` created by ``TimeGrouper`` raises ``AttributeError`` (:issue:`7453`)

- Bug in non-monotonic ``Index.union`` may preserve ``name`` incorrectly (:issue:`7458`)
- Bug in ``DatetimeIndex.intersection`` doesn't preserve timezone (:issue:`4690`)
21 changes: 19 additions & 2 deletions pandas/core/groupby.py
@@ -1643,8 +1643,9 @@ def indices(self):

        i = 0
        for label, bin in zip(self.binlabels, self.bins):
-           if label is not tslib.NaT and i < bin:
-               indices[label] = list(range(i, bin))
+           if i < bin:
+               if label is not tslib.NaT:
+                   indices[label] = list(range(i, bin))
            i = bin
        return indices

@@ -1665,6 +1666,22 @@ def levels(self):
    def names(self):
        return [self.binlabels.name]

    def size(self):
Contributor commented:

Why are you not using value_counts? This should be very similar to BaseGrouper.size (if not call it directly).

Member (author) replied:

There are two reasons:

  • value_counts cannot fill in intermediate timestamps that are not included in the group key. For example, if the TimeGrouper categorizes [2011-01-01, 2011-01-02, 2011-01-04] with daily frequency, the resulting group keys must be [2011-01-01, 2011-01-02, 2011-01-03, 2011-01-04].
  • BinGrouper doesn't retain the original Index, so there is nothing to pass directly to value_counts. To use value_counts, the labels would have to be rebuilt with logic like:

indices = self.indices
labels = []
for k, v in compat.iteritems(indices):
    labels.extend([k] * len(v))
value_counts(labels)
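To illustrate the first point, a minimal sketch with hypothetical data, using the 0.14-era pd.TimeGrouper API; the padded sizes shown assume the fix in this PR:

import pandas as pd

# Hypothetical series: daily values with a gap on 2011-01-03
s = pd.Series([1, 2, 3], index=pd.to_datetime(['2011-01-01',
                                               '2011-01-02',
                                               '2011-01-04']))

# Grouping at daily frequency pads the missing day into the group keys,
# which value_counts over the observed labels alone could not produce
sizes = s.groupby(pd.TimeGrouper(freq='D')).size()
print(sizes)
# 2011-01-01    1
# 2011-01-02    1
# 2011-01-03    0
# 2011-01-04    1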

"""
Compute group sizes

"""
base = Series(np.zeros(len(self.result_index), dtype=np.int64),
index=self.result_index)
indices = self.indices
for k, v in compat.iteritems(indices):
indices[k] = len(v)
bin_counts = Series(indices, dtype=np.int64)
result = base.add(bin_counts, fill_value=0)
# addition with fill_value changes dtype to float64
result = result.astype(np.int64)
return result
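An aside on the cast at the end of size(): a minimal standalone sketch of the dtype quirk the inline comment refers to (plain Series semantics, not code from this PR):

import numpy as np
from pandas import Series

a = Series([1, 2], index=['x', 'y'], dtype=np.int64)
b = Series([10], index=['y'], dtype=np.int64)

# Aligning on the union of the indexes introduces missing values before
# fill_value is applied, so the result is upcast to float64
summed = a.add(b, fill_value=0)
print(summed.dtype)            # float64

# Casting back restores integer counts, exactly as size() does above
print(summed.astype(np.int64))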

    #----------------------------------------------------------------------
    # cython aggregation

20 changes: 18 additions & 2 deletions pandas/tseries/tests/test_resample.py
@@ -1326,6 +1326,13 @@ def test_aggregate_normal(self):
            dt_result = getattr(dt_grouped, func)()
            assert_frame_equal(expected, dt_result)

        # GH 7453
        for func in ['size']:
            expected = getattr(normal_grouped, func)()
            expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
            dt_result = getattr(dt_grouped, func)()
            assert_series_equal(expected, dt_result)
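For context, the diff omits the fixtures this test uses; the following reconstruction is an assumption (not verbatim from test_resample.py) of roughly what normal_grouped and dt_grouped look like:

import numpy as np
import pandas as pd
from datetime import datetime

data = np.random.randn(20, 4)
normal_df = pd.DataFrame(data, columns=['A', 'B', 'C', 'D'])
normal_df['key'] = [1, 2, 3, 4, 5] * 4

dt_df = pd.DataFrame(data, columns=['A', 'B', 'C', 'D'])
dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), datetime(2013, 1, 3),
                datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4

# Grouping by the integer key and by TimeGrouper at daily frequency
# should partition the rows identically
normal_grouped = normal_df.groupby('key')
dt_grouped = dt_df.groupby(pd.TimeGrouper(key='key', freq='D'))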

"""
for func in ['first', 'last']:
expected = getattr(normal_grouped, func)()
Expand All @@ -1339,7 +1346,7 @@ def test_aggregate_normal(self):
dt_result = getattr(dt_grouped, func)(3)
assert_frame_equal(expected, dt_result)
"""
-        # if TimeGrouper is used included, 'size' 'first','last' and 'nth' doesn't work yet
+        # if TimeGrouper is used included, 'first','last' and 'nth' doesn't work yet

    def test_aggregate_with_nat(self):
        # check TimeGrouper's aggregation is identical as normal groupby
@@ -1375,7 +1382,16 @@ def test_aggregate_with_nat(self):
            dt_result = getattr(dt_grouped, func)()
            assert_frame_equal(expected, dt_result)

-        # if NaT is included, 'var', 'std', 'mean', 'size', 'first','last' and 'nth' doesn't work yet
+        for func in ['size']:
+            normal_result = getattr(normal_grouped, func)()
+            pad = Series([0], index=[3])
+            expected = normal_result.append(pad)
+            expected = expected.sort_index()
+            expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
+            dt_result = getattr(dt_grouped, func)()
+            assert_series_equal(expected, dt_result)
+
+        # if NaT is included, 'var', 'std', 'mean', 'first','last' and 'nth' doesn't work yet
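To make the NaT case concrete, a hypothetical sketch of the behavior this test pins down (assuming the post-fix behavior): rows whose key is NaT are dropped from the groups, and the bin left empty is padded with zero:

import numpy as np
import pandas as pd

# Hypothetical frame: the key that would fall on 2013-01-03 is NaT
df = pd.DataFrame({
    'key': pd.to_datetime(['2013-01-01', '2013-01-02', None,
                           '2013-01-04', '2013-01-05']),
    'value': np.arange(5),
})

sizes = df.groupby(pd.TimeGrouper(key='key', freq='D')).size()
print(sizes)
# 2013-01-01    1
# 2013-01-02    1
# 2013-01-03    0   <- the NaT row is excluded; the empty bin gets 0
# 2013-01-04    1
# 2013-01-05    1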


if __name__ == '__main__':
20 changes: 15 additions & 5 deletions vb_suite/groupby.py
@@ -108,16 +108,26 @@ def f():
# size() speed

setup = common_setup + """
-df = DataFrame({'key1': np.random.randint(0, 500, size=100000),
-                'key2': np.random.randint(0, 100, size=100000),
-                'value1' : np.random.randn(100000),
-                'value2' : np.random.randn(100000),
-                'value3' : np.random.randn(100000)})
+n = 100000
+offsets = np.random.randint(n, size=n).astype('timedelta64[ns]')
+dates = np.datetime64('now') + offsets
+df = DataFrame({'key1': np.random.randint(0, 500, size=n),
+                'key2': np.random.randint(0, 100, size=n),
+                'value1' : np.random.randn(n),
+                'value2' : np.random.randn(n),
+                'value3' : np.random.randn(n),
+                'dates' : dates})
"""

groupby_multi_size = Benchmark("df.groupby(['key1', 'key2']).size()",
                               setup, start_date=datetime(2011, 10, 1))

groupby_dt_size = Benchmark("df.groupby(['dates']).size()",
                            setup, start_date=datetime(2011, 10, 1))

groupby_dt_timegrouper_size = Benchmark("df.groupby(TimeGrouper(key='dates', freq='M')).size()",
                                        setup, start_date=datetime(2011, 10, 1))

#----------------------------------------------------------------------
# count() speed
