From 1a47ee41d071504b734ac276a8cd7361e4c343aa Mon Sep 17 00:00:00 2001
From: danielballan
Date: Wed, 30 Oct 2013 23:14:36 -0400
Subject: [PATCH] BUG/TST: transform and filter on non-unique index, closes #4620

---
 doc/source/release.rst       |   3 +
 pandas/core/groupby.py       |  54 +++----
 pandas/tests/test_groupby.py | 275 ++++++++++++++++++++++++++++++++++-
 3 files changed, 304 insertions(+), 28 deletions(-)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index 8a9163075fb9c..cd9f688de152d 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -762,6 +762,9 @@ Bug Fixes
   - Make sure that ``head/tail`` are ``iloc`` based, (:issue:`5370`)
   - Fixed bug for ``PeriodIndex`` string representation if there are 1 or 2
     elements. (:issue:`5372`)
+  - The GroupBy methods ``transform`` and ``filter`` can be used on Series
+    and DataFrames that have repeated (non-unique) indices. (:issue:`4620`)
+
 
 pandas 0.12.0
 -------------

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 4beb6ecf1a63b..668c665613c0d 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -566,6 +566,22 @@ def _concat_objects(self, keys, values, not_indexed_same=False):
 
         return result
 
+    def _apply_filter(self, indices, dropna):
+        if len(indices) == 0:
+            indices = []
+        else:
+            indices = np.sort(np.concatenate(indices))
+        if dropna:
+            filtered = self.obj.take(indices)
+        else:
+            mask = np.empty(len(self.obj.index), dtype=bool)
+            mask.fill(False)
+            mask[indices.astype(int)] = True
+            # mask fails to broadcast when passed to where; broadcast manually.
+            mask = np.tile(mask, list(self.obj.shape[1:]) + [1]).T
+            filtered = self.obj.where(mask)  # Fill with NaNs.
+        return filtered
+
 
 @Appender(GroupBy.__doc__)
 def groupby(obj, by, **kwds):
@@ -1585,14 +1601,13 @@ def transform(self, func, *args, **kwargs):
             group = com.ensure_float(group)
             object.__setattr__(group, 'name', name)
             res = wrapper(group)
-            indexer = self.obj.index.get_indexer(group.index)
             if hasattr(res,'values'):
                 res = res.values
 
             # need to do a safe put here, as the dtype may be different
             # this needs to be an ndarray
             result = Series(result)
-            result.loc[indexer] = res
+            result.iloc[self.indices[name]] = res
             result = result.values
 
             # downcast if we can (and need)
@@ -1630,22 +1645,15 @@ def true_and_notnull(x, *args, **kwargs):
             return b and notnull(b)
 
         try:
-            indexers = [self.obj.index.get_indexer(group.index) \
-                    if true_and_notnull(group) else [] \
-                    for _ , group in self]
+            indices = [self.indices[name] if true_and_notnull(group) else []
+                       for name, group in self]
         except ValueError:
             raise TypeError("the filter must return a boolean result")
         except TypeError:
             raise TypeError("the filter must return a boolean result")
 
-        if len(indexers) == 0:
-            filtered = self.obj.take([]) # because np.concatenate would fail
-        else:
-            filtered = self.obj.take(np.sort(np.concatenate(indexers)))
-        if dropna:
-            return filtered
-        else:
-            return filtered.reindex(self.obj.index) # Fill with NaNs.
+        filtered = self._apply_filter(indices, dropna)
+        return filtered
 
 
 class NDFrameGroupBy(GroupBy):
@@ -2125,7 +2133,7 @@ def filter(self, func, dropna=True, *args, **kwargs):
         """
         from pandas.tools.merge import concat
 
-        indexers = []
+        indices = []
 
         obj = self._obj_with_exclusions
         gen = self.grouper.get_iterator(obj, axis=self.axis)
@@ -2146,31 +2154,25 @@ def filter(self, func, dropna=True, *args, **kwargs):
             else:
                 res = path(group)
 
-            def add_indexer():
-                indexers.append(self.obj.index.get_indexer(group.index))
+            def add_indices():
+                indices.append(self.indices[name])
 
             # interpret the result of the filter
             if isinstance(res,(bool,np.bool_)):
                 if res:
-                    add_indexer()
+                    add_indices()
             else:
                 if getattr(res,'ndim',None) == 1:
                     val = res.ravel()[0]
                     if val and notnull(val):
-                        add_indexer()
+                        add_indices()
                 else:
                     # in theory you could do .all() on the boolean result ?
                     raise TypeError("the filter must return a boolean result")
 
-        if len(indexers) == 0:
-            filtered = self.obj.take([]) # because np.concatenate would fail
-        else:
-            filtered = self.obj.take(np.sort(np.concatenate(indexers)))
-        if dropna:
-            return filtered
-        else:
-            return filtered.reindex(self.obj.index) # Fill with NaNs.
+        filtered = self._apply_filter(indices, dropna)
+        return filtered
 
 
 class DataFrameGroupBy(NDFrameGroupBy):
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index f71d7ff9d096b..ca74f46122d88 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -7,8 +7,8 @@
 from datetime import datetime
 from numpy import nan
 
-from pandas import bdate_range
-from pandas.core.index import Index, MultiIndex
+from pandas import bdate_range, Timestamp
+from pandas.core.index import Index, MultiIndex, Int64Index
 from pandas.core.common import rands
 from pandas.core.api import Categorical, DataFrame
 from pandas.core.groupby import SpecificationError, DataError
@@ -2801,6 +2801,277 @@ def test_filter_maintains_ordering(self):
         expected = s.iloc[[1, 2, 4, 7]]
         assert_series_equal(actual, expected)
 
+    def test_filter_and_transform_with_non_unique_int_index(self):
+        # GH4620
+        index = [1, 1, 1, 2, 1, 1, 0, 1]
+        df = DataFrame({'pid' : [1,1,1,2,2,3,3,3],
+                        'tag' : [23,45,62,24,45,34,25,62]}, index=index)
+        grouped_df = df.groupby('tag')
+        ser = df['pid']
+        grouped_ser = ser.groupby(df['tag'])
+        expected_indexes = [1, 2, 4, 7]
+
+        # Filter DataFrame
+        actual = grouped_df.filter(lambda x: len(x) > 1)
+        expected = df.iloc[expected_indexes]
+        assert_frame_equal(actual, expected)
+
+        actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
+        expected = df.copy()
+        expected.iloc[[0, 3, 5, 6]] = np.nan
+        assert_frame_equal(actual, expected)
+
+        # Filter Series
+        actual = grouped_ser.filter(lambda x: len(x) > 1)
+        expected = ser.take(expected_indexes)
+        assert_series_equal(actual, expected)
+
+        actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
+        NA = np.nan
+        expected = Series([NA,1,1,NA,2,NA,NA,3], index, name='pid')
+        # ^ made manually because this can get confusing!
+        assert_series_equal(actual, expected)
+
+        # Transform Series
+        actual = grouped_ser.transform(len)
+        expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index)
+        assert_series_equal(actual, expected)
+
+        # Transform (a column from) DataFrameGroupBy
+        actual = grouped_df.pid.transform(len)
+        assert_series_equal(actual, expected)
+
+    def test_filter_and_transform_with_multiple_non_unique_int_index(self):
+        # GH4620
+        index = [1, 1, 1, 2, 0, 0, 0, 1]
+        df = DataFrame({'pid' : [1,1,1,2,2,3,3,3],
+                        'tag' : [23,45,62,24,45,34,25,62]}, index=index)
+        grouped_df = df.groupby('tag')
+        ser = df['pid']
+        grouped_ser = ser.groupby(df['tag'])
+        expected_indexes = [1, 2, 4, 7]
+
+        # Filter DataFrame
+        actual = grouped_df.filter(lambda x: len(x) > 1)
+        expected = df.iloc[expected_indexes]
+        assert_frame_equal(actual, expected)
+
+        actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
+        expected = df.copy()
+        expected.iloc[[0, 3, 5, 6]] = np.nan
+        assert_frame_equal(actual, expected)
+
+        # Filter Series
+        actual = grouped_ser.filter(lambda x: len(x) > 1)
+        expected = ser.take(expected_indexes)
+        assert_series_equal(actual, expected)
+
+        actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
+        NA = np.nan
+        expected = Series([NA,1,1,NA,2,NA,NA,3], index, name='pid')
+        # ^ made manually because this can get confusing!
+        assert_series_equal(actual, expected)
+
+        # Transform Series
+        actual = grouped_ser.transform(len)
+        expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index)
+        assert_series_equal(actual, expected)
+
+        # Transform (a column from) DataFrameGroupBy
+        actual = grouped_df.pid.transform(len)
+        assert_series_equal(actual, expected)
+
+    def test_filter_and_transform_with_non_unique_float_index(self):
+        # GH4620
+        index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float)
+        df = DataFrame({'pid' : [1,1,1,2,2,3,3,3],
+                        'tag' : [23,45,62,24,45,34,25,62]}, index=index)
+        grouped_df = df.groupby('tag')
+        ser = df['pid']
+        grouped_ser = ser.groupby(df['tag'])
+        expected_indexes = [1, 2, 4, 7]
+
+        # Filter DataFrame
+        actual = grouped_df.filter(lambda x: len(x) > 1)
+        expected = df.iloc[expected_indexes]
+        assert_frame_equal(actual, expected)
+
+        actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
+        expected = df.copy()
+        expected.iloc[[0, 3, 5, 6]] = np.nan
+        assert_frame_equal(actual, expected)
+
+        # Filter Series
+        actual = grouped_ser.filter(lambda x: len(x) > 1)
+        expected = ser.take(expected_indexes)
+        assert_series_equal(actual, expected)
+
+        actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
+        NA = np.nan
+        expected = Series([NA,1,1,NA,2,NA,NA,3], index, name='pid')
+        # ^ made manually because this can get confusing!
+        assert_series_equal(actual, expected)
+
+        # Transform Series
+        actual = grouped_ser.transform(len)
+        expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index)
+        assert_series_equal(actual, expected)
+
+        # Transform (a column from) DataFrameGroupBy
+        actual = grouped_df.pid.transform(len)
+        assert_series_equal(actual, expected)
+
+    def test_filter_and_transform_with_multiple_non_unique_float_index(self):
+        # GH4620
+        index = np.array([1, 1, 1, 2, 0, 0, 0, 1], dtype=float)
+        df = DataFrame({'pid' : [1,1,1,2,2,3,3,3],
+                        'tag' : [23,45,62,24,45,34,25,62]}, index=index)
+        grouped_df = df.groupby('tag')
+        ser = df['pid']
+        grouped_ser = ser.groupby(df['tag'])
+        expected_indexes = [1, 2, 4, 7]
+
+        # Filter DataFrame
+        actual = grouped_df.filter(lambda x: len(x) > 1)
+        expected = df.iloc[expected_indexes]
+        assert_frame_equal(actual, expected)
+
+        actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
+        expected = df.copy()
+        expected.iloc[[0, 3, 5, 6]] = np.nan
+        assert_frame_equal(actual, expected)
+
+        # Filter Series
+        actual = grouped_ser.filter(lambda x: len(x) > 1)
+        expected = ser.take(expected_indexes)
+        assert_series_equal(actual, expected)
+
+        actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
+        NA = np.nan
+        expected = Series([NA,1,1,NA,2,NA,NA,3], index, name='pid')
+        # ^ made manually because this can get confusing!
+        assert_series_equal(actual, expected)
+
+        # Transform Series
+        actual = grouped_ser.transform(len)
+        expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index)
+        assert_series_equal(actual, expected)
+
+        # Transform (a column from) DataFrameGroupBy
+        actual = grouped_df.pid.transform(len)
+        assert_series_equal(actual, expected)
+
+    def test_filter_and_transform_with_non_unique_timestamp_index(self):
+        # GH4620
+        t0 = Timestamp('2013-09-30 00:05:00')
+        t1 = Timestamp('2013-10-30 00:05:00')
+        t2 = Timestamp('2013-11-30 00:05:00')
+        index = [t1, t1, t1, t2, t1, t1, t0, t1]
+        df = DataFrame({'pid' : [1,1,1,2,2,3,3,3],
+                        'tag' : [23,45,62,24,45,34,25,62]}, index=index)
+        grouped_df = df.groupby('tag')
+        ser = df['pid']
+        grouped_ser = ser.groupby(df['tag'])
+        expected_indexes = [1, 2, 4, 7]
+
+        # Filter DataFrame
+        actual = grouped_df.filter(lambda x: len(x) > 1)
+        expected = df.iloc[expected_indexes]
+        assert_frame_equal(actual, expected)
+
+        actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
+        expected = df.copy()
+        expected.iloc[[0, 3, 5, 6]] = np.nan
+        assert_frame_equal(actual, expected)
+
+        # Filter Series
+        actual = grouped_ser.filter(lambda x: len(x) > 1)
+        expected = ser.take(expected_indexes)
+        assert_series_equal(actual, expected)
+
+        actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
+        NA = np.nan
+        expected = Series([NA,1,1,NA,2,NA,NA,3], index, name='pid')
+        # ^ made manually because this can get confusing!
+        assert_series_equal(actual, expected)
+
+        # Transform Series
+        actual = grouped_ser.transform(len)
+        expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index)
+        assert_series_equal(actual, expected)
+
+        # Transform (a column from) DataFrameGroupBy
+        actual = grouped_df.pid.transform(len)
+        assert_series_equal(actual, expected)
+
+    def test_filter_and_transform_with_non_unique_string_index(self):
+        # GH4620
+        index = list('bbbcbbab')
+        df = DataFrame({'pid' : [1,1,1,2,2,3,3,3],
+                        'tag' : [23,45,62,24,45,34,25,62]}, index=index)
+        grouped_df = df.groupby('tag')
+        ser = df['pid']
+        grouped_ser = ser.groupby(df['tag'])
+        expected_indexes = [1, 2, 4, 7]
+
+        # Filter DataFrame
+        actual = grouped_df.filter(lambda x: len(x) > 1)
+        expected = df.iloc[expected_indexes]
+        assert_frame_equal(actual, expected)
+
+        actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
+        expected = df.copy()
+        expected.iloc[[0, 3, 5, 6]] = np.nan
+        assert_frame_equal(actual, expected)
+
+        # Filter Series
+        actual = grouped_ser.filter(lambda x: len(x) > 1)
+        expected = ser.take(expected_indexes)
+        assert_series_equal(actual, expected)
+
+        actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
+        NA = np.nan
+        expected = Series([NA,1,1,NA,2,NA,NA,3], index, name='pid')
+        # ^ made manually because this can get confusing!
+        assert_series_equal(actual, expected)
+
+        # Transform Series
+        actual = grouped_ser.transform(len)
+        expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index)
+        assert_series_equal(actual, expected)
+
+        # Transform (a column from) DataFrameGroupBy
+        actual = grouped_df.pid.transform(len)
+        assert_series_equal(actual, expected)
+
+    def test_index_label_overlaps_location(self):
+        # checking we don't have any label/location confusion in the
+        # wake of GH5375
+        df = DataFrame(list('ABCDE'), index=[2, 0, 2, 1, 1])
+        g = df.groupby(list('ababb'))
+        actual = g.filter(lambda x: len(x) > 2)
+        expected = df.iloc[[1, 3, 4]]
+        assert_frame_equal(actual, expected)
+
+        ser = df[0]
+        g = ser.groupby(list('ababb'))
+        actual = g.filter(lambda x: len(x) > 2)
+        expected = ser.take([1, 3, 4])
+        assert_series_equal(actual, expected)
+
+        # ... and again, with a generic Index of floats
+        df.index = df.index.astype(float)
+        g = df.groupby(list('ababb'))
+        actual = g.filter(lambda x: len(x) > 2)
+        expected = df.iloc[[1, 3, 4]]
+        assert_frame_equal(actual, expected)
+
+        ser = df[0]
+        g = ser.groupby(list('ababb'))
+        actual = g.filter(lambda x: len(x) > 2)
+        expected = ser.take([1, 3, 4])
+        assert_series_equal(actual, expected)
+
+
     def test_groupby_whitelist(self):
         from string import ascii_lowercase
         letters = np.array(list(ascii_lowercase))
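
Usage sketch (editorial note, not part of the patch): a minimal example of the
behavior this change enables, assuming a pandas build that includes the fix.
The 'pid'/'tag' column names mirror the tests above and are illustrative only.

    import numpy as np
    import pandas as pd

    # A frame whose index contains repeated labels, as in the tests above.
    df = pd.DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
                       'tag': [23, 45, 62, 24, 45, 34, 25, 62]},
                      index=[1, 1, 1, 2, 1, 1, 0, 1])
    grouped = df.groupby('tag')

    # transform now writes group results back by position (via self.indices),
    # so repeated index labels no longer cause misalignment.
    group_sizes = grouped['pid'].transform(len)

    # filter keeps rows whose group passes the predicate; with dropna=False
    # the remaining rows are NaN-filled (via _apply_filter) instead of dropped.
    kept = grouped.filter(lambda x: len(x) > 1)
    kept_or_nan = grouped.filter(lambda x: len(x) > 1, dropna=False)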