Skip to content

BUG: Bug in DataFrameGroupby.transform when transforming with a passed non-sorted key (GH8046) #8049

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 18, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/v0.15.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,7 @@ Bug Fixes
when matching block and manager items, when there's only one block there's no ambiguity (:issue:`7794`)

- Bug in HDFStore iteration when passing a where (:issue:`8014`)

- Bug in DataFrameGroupBy.transform when transforming with a passed non-sorted key (:issue:`8046`)
- Bug in repeated timeseries line and area plot may result in ``ValueError`` or incorrect kind (:issue:`7733`)


Expand Down
35 changes: 20 additions & 15 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,24 @@ def _set_selection_from_grouper(self):
if len(groupers):
self._group_selection = (ax-Index(groupers)).tolist()

def _set_result_index_ordered(self, result):
    """
    Put ``result`` back into the row order of ``self.obj`` and give it
    the index of ``self.obj``.

    Parameters
    ----------
    result : object with a settable ``index`` and a ``sort_index()`` method
        The visible callers pass a ``Series`` or the output of ``concat``.
        It must have as many rows as ``self.obj.index``, because that
        index is assigned to it before returning.
        NOTE(review): presumably the rows arrive laid out group by group,
        in ``self.grouper.result_index`` order -- confirm against callers.

    Returns
    -------
    The re-indexed object.  On the unordered path this is the new object
    returned by ``sort_index()``; on the ordered path it is ``result``
    itself.  In both cases the ``index`` of the passed object is
    overwritten.
    """
    # related GH 8046

    labels = np.asarray(self.grouper.group_info[0])

    # shortcut if we have an already ordered grouper: the labels are
    # non-decreasing, so no reordering is needed.  This is the same
    # condition as Index(labels).is_monotonic, checked without building
    # a throw-away Index.
    already_ordered = bool((labels[1:] >= labels[:-1]).all())

    if not already_ordered:
        # the per-group positions are only needed on this path, so they
        # are looked up here rather than unconditionally
        indices = self.indices
        positions = np.concatenate(
            [indices[v] for v in self.grouper.result_index])

        # label every row with the position taken from self.indices, then
        # sort on those positions to undo the group-wise layout
        result.index = Index(positions)
        result = result.sort_index()

    result.index = self.obj.index
    return result

def _local_dir(self):
    """
    Return a sorted, de-duplicated list of the names reported by
    ``self.obj._local_dir()`` together with those in
    ``self._apply_whitelist``.
    """
    combined = self.obj._local_dir() + list(self._apply_whitelist)
    unique_names = set(combined)
    return sorted(unique_names)

Expand Down Expand Up @@ -2087,7 +2105,6 @@ def _convert_grouper(axis, grouper):
else:
return grouper


class SeriesGroupBy(GroupBy):
_apply_whitelist = _series_apply_whitelist

Expand Down Expand Up @@ -2319,18 +2336,7 @@ def _transform_fast(self, func):
counts = self.count().values
values = np.repeat(values, com._ensure_platform_int(counts))

# the values/counts are repeated according to the group index
indices = self.indices

# shortcut of we have an already ordered grouper
if Index(self.grouper.group_info[0]).is_monotonic:
result = Series(values, index=self.obj.index)
else:
index = Index(np.concatenate([ indices[v] for v in self.grouper.result_index ]))
result = Series(values, index=index).sort_index()
result.index = self.obj.index

return result
return self._set_result_index_ordered(Series(values))

def filter(self, func, dropna=True, *args, **kwargs):
"""
Expand Down Expand Up @@ -2842,8 +2848,7 @@ def _transform_general(self, func, *args, **kwargs):
concat_index = obj.columns if self.axis == 0 else obj.index
concatenated = concat(applied, join_axes=[concat_index],
axis=self.axis, verify_integrity=False)
concatenated.sort_index(inplace=True)
return concatenated
return self._set_result_index_ordered(concatenated)

def transform(self, func, *args, **kwargs):
"""
Expand Down
22 changes: 21 additions & 1 deletion pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -796,6 +796,26 @@ def test_transform(self):
transformed = grouped.transform(lambda x: x * x.sum())
self.assertEqual(transformed[7], 12)

# GH 8046
# make sure that we preserve the input order

df = DataFrame(np.arange(6,dtype='int64').reshape(3,2), columns=["a","b"], index=[0,2,1])
key = [0,0,1]
expected = df.sort_index().groupby(key).transform(lambda x: x-x.mean()).groupby(key).mean()
result = df.groupby(key).transform(lambda x: x-x.mean()).groupby(key).mean()
assert_frame_equal(result, expected)

def demean(arr):
return arr - arr.mean()

people = DataFrame(np.random.randn(5, 5),
columns=['a', 'b', 'c', 'd', 'e'],
index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
key = ['one', 'two', 'one', 'two', 'one']
result = people.groupby(key).transform(demean).groupby(key).mean()
expected = people.groupby(key).apply(demean).groupby(key).mean()
assert_frame_equal(result, expected)

def test_transform_fast(self):

df = DataFrame( { 'id' : np.arange( 100000 ) / 3,
Expand Down Expand Up @@ -2924,7 +2944,7 @@ def __call__(self, x):
lambda x: sum(x),
lambda x: x.sum(),
partial(sum), fn_class()]

expected = df.groupby("foo").agg(sum)
for ecall in equiv_callables:
result = df.groupby('foo').agg(ecall)
Expand Down