Skip to content

Commit 1a47ee4

Browse files
committed
BUG/TST: transform and filter on non-unique index, closes pandas-dev#4620
1 parent 3ebd769 commit 1a47ee4

File tree

3 files changed

+304
-28
lines changed

3 files changed

+304
-28
lines changed

doc/source/release.rst

+3
Original file line numberDiff line numberDiff line change
@@ -762,6 +762,9 @@ Bug Fixes
762762
- Make sure that ``head/tail`` are ``iloc`` based, (:issue:`5370`)
763763
- Fixed bug for ``PeriodIndex`` string representation if there are 1 or 2
764764
elements. (:issue:`5372`)
765+
- The GroupBy methods ``transform`` and ``filter`` can be used on Series
766+
and DataFrames that have repeated (non-unique) indices. (:issue:`4620`)
767+
765768

766769
pandas 0.12.0
767770
-------------

pandas/core/groupby.py

+28-26
Original file line numberDiff line numberDiff line change
@@ -566,6 +566,22 @@ def _concat_objects(self, keys, values, not_indexed_same=False):
566566

567567
return result
568568

569+
def _apply_filter(self, indices, dropna):
570+
if len(indices) == 0:
571+
indices = []
572+
else:
573+
indices = np.sort(np.concatenate(indices))
574+
if dropna:
575+
filtered = self.obj.take(indices)
576+
else:
577+
mask = np.empty(len(self.obj.index), dtype=bool)
578+
mask.fill(False)
579+
mask[indices.astype(int)] = True
580+
# mask fails to broadcast when passed to where; broadcast manually.
581+
mask = np.tile(mask, list(self.obj.shape[1:]) + [1]).T
582+
filtered = self.obj.where(mask) # Fill with NaNs.
583+
return filtered
584+
569585

570586
@Appender(GroupBy.__doc__)
571587
def groupby(obj, by, **kwds):
@@ -1585,14 +1601,13 @@ def transform(self, func, *args, **kwargs):
15851601
group = com.ensure_float(group)
15861602
object.__setattr__(group, 'name', name)
15871603
res = wrapper(group)
1588-
indexer = self.obj.index.get_indexer(group.index)
15891604
if hasattr(res,'values'):
15901605
res = res.values
15911606

15921607
# need to do a safe put here, as the dtype may be different
15931608
# this needs to be an ndarray
15941609
result = Series(result)
1595-
result.loc[indexer] = res
1610+
result.iloc[self.indices[name]] = res
15961611
result = result.values
15971612

15981613
# downcast if we can (and need)
@@ -1630,22 +1645,15 @@ def true_and_notnull(x, *args, **kwargs):
16301645
return b and notnull(b)
16311646

16321647
try:
1633-
indexers = [self.obj.index.get_indexer(group.index) \
1634-
if true_and_notnull(group) else [] \
1635-
for _ , group in self]
1648+
indices = [self.indices[name] if true_and_notnull(group) else []
1649+
for name, group in self]
16361650
except ValueError:
16371651
raise TypeError("the filter must return a boolean result")
16381652
except TypeError:
16391653
raise TypeError("the filter must return a boolean result")
16401654

1641-
if len(indexers) == 0:
1642-
filtered = self.obj.take([]) # because np.concatenate would fail
1643-
else:
1644-
filtered = self.obj.take(np.sort(np.concatenate(indexers)))
1645-
if dropna:
1646-
return filtered
1647-
else:
1648-
return filtered.reindex(self.obj.index) # Fill with NaNs.
1655+
filtered = self._apply_filter(indices, dropna)
1656+
return filtered
16491657

16501658

16511659
class NDFrameGroupBy(GroupBy):
@@ -2125,7 +2133,7 @@ def filter(self, func, dropna=True, *args, **kwargs):
21252133
"""
21262134
from pandas.tools.merge import concat
21272135

2128-
indexers = []
2136+
indices = []
21292137

21302138
obj = self._obj_with_exclusions
21312139
gen = self.grouper.get_iterator(obj, axis=self.axis)
@@ -2146,31 +2154,25 @@ def filter(self, func, dropna=True, *args, **kwargs):
21462154
else:
21472155
res = path(group)
21482156

2149-
def add_indexer():
2150-
indexers.append(self.obj.index.get_indexer(group.index))
2157+
def add_indices():
2158+
indices.append(self.indices[name])
21512159

21522160
# interpret the result of the filter
21532161
if isinstance(res,(bool,np.bool_)):
21542162
if res:
2155-
add_indexer()
2163+
add_indices()
21562164
else:
21572165
if getattr(res,'ndim',None) == 1:
21582166
val = res.ravel()[0]
21592167
if val and notnull(val):
2160-
add_indexer()
2168+
add_indices()
21612169
else:
21622170

21632171
# in theory you could do .all() on the boolean result ?
21642172
raise TypeError("the filter must return a boolean result")
21652173

2166-
if len(indexers) == 0:
2167-
filtered = self.obj.take([]) # because np.concatenate would fail
2168-
else:
2169-
filtered = self.obj.take(np.sort(np.concatenate(indexers)))
2170-
if dropna:
2171-
return filtered
2172-
else:
2173-
return filtered.reindex(self.obj.index) # Fill with NaNs.
2174+
filtered = self._apply_filter(indices, dropna)
2175+
return filtered
21742176

21752177

21762178
class DataFrameGroupBy(NDFrameGroupBy):

0 commit comments

Comments
 (0)