From f9e6c3ddfb7781356324b3c7d633d0f2993d6a8d Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 12 Sep 2015 11:31:02 -0700 Subject: [PATCH] Fix Series.nunique groupby with object --- doc/source/whatsnew/v0.17.0.txt | 2 +- pandas/core/groupby.py | 12 +++++++++++- pandas/tests/test_groupby.py | 16 ++++++++++++++++ 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 9d8532aa3649a..7fe92b580536a 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -1014,7 +1014,7 @@ Performance Improvements - Development support for benchmarking with the `Air Speed Velocity library `_ (:issue:`8316`) - Added vbench benchmarks for alternative ExcelWriter engines and reading Excel files (:issue:`7171`) - Performance improvements in ``Categorical.value_counts`` (:issue:`10804`) -- Performance improvements in ``SeriesGroupBy.nunique`` and ``SeriesGroupBy.value_counts`` (:issue:`10820`) +- Performance improvements in ``SeriesGroupBy.nunique`` and ``SeriesGroupBy.value_counts`` (:issue:`10820`, :issue:`11077`) - Performance improvements in ``DataFrame.drop_duplicates`` with integer dtypes (:issue:`10917`) - 4x improvement in ``timedelta`` string parsing (:issue:`6755`, :issue:`10426`) - 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index f34fd6e3d2575..8e44480c0c09b 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2565,7 +2565,17 @@ def nunique(self, dropna=True): ids, _, _ = self.grouper.group_info val = self.obj.get_values() - sorter = np.lexsort((val, ids)) + try: + sorter = np.lexsort((val, ids)) + except TypeError: # catches object dtypes + assert val.dtype == object, \ + 'val.dtype must be object, got %s' % val.dtype + val, _ = algos.factorize(val, sort=False) + sorter = np.lexsort((val, ids)) + isnull = lambda a: a == -1 + else: + isnull = com.isnull + ids, val = ids[sorter], val[sorter] # group boundries are where group ids change diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 97b57690ccc49..0336ee2e9b50e 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -5511,6 +5511,22 @@ def test_sort(x): g.apply(test_sort) + def test_nunique_with_object(self): + # GH 11077 + data = pd.DataFrame( + [[100, 1, 'Alice'], + [200, 2, 'Bob'], + [300, 3, 'Charlie'], + [-400, 4, 'Dan'], + [500, 5, 'Edith']], + columns=['amount', 'id', 'name'] + ) + + result = data.groupby(['id', 'amount'])['name'].nunique() + index = MultiIndex.from_arrays([data.id, data.amount]) + expected = pd.Series([1] * 5, name='name', index=index) + tm.assert_series_equal(result, expected) + def assert_fp_equal(a, b): assert (np.abs(a - b) < 1e-12).all()