From ca00c4dd7de4ef4e2ef308656cdbc4a9dd58d5cf Mon Sep 17 00:00:00 2001 From: behzad nouri Date: Sun, 30 Aug 2015 14:47:06 -0400 Subject: [PATCH] PERF: improves performance in SeriesGroupBy.count BUG: closes bug in Series.count when index has nulls --- doc/source/whatsnew/v0.17.0.txt | 2 ++ pandas/core/groupby.py | 9 +++++++++ pandas/core/series.py | 29 +++++++++++++---------------- pandas/lib.pyx | 17 ----------------- pandas/tests/test_series.py | 10 ++++++++++ 5 files changed, 34 insertions(+), 33 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 42752112a64f7..7f56b27f0eab3 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -814,6 +814,8 @@ Bug Fixes - Bug in ``BinGrouper.group_info`` where returned values are not compatible with base class (:issue:`10914`) - Bug in clearing the cache on ``DataFrame.pop`` and a subsequent inplace op (:issue:`10912`) - Bug in indexing with a mixed-integer ``Index`` causing an ``ImportError`` (:issue:`10610`) +- Bug in ``Series.count`` when index has nulls (:issue:`10946`) + - Bug causing ``DataFrame.where`` to not respect the ``axis`` parameter when the frame has a symmetric shape. (:issue:`9736`) - Bug in ``Table.select_column`` where name is not preserved (:issue:`10392`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index f42825a11933b..354c9a6c5579c 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2684,6 +2684,15 @@ def value_counts(self, normalize=False, sort=True, ascending=False, return Series(out, index=mi) + def count(self): + ids, _, ngroups = self.grouper.group_info + val = self.obj.get_values() + + mask = (ids != -1) & ~isnull(val) + out = np.bincount(ids[mask], minlength=ngroups) if ngroups != 0 else [] + + return Series(out, index=self.grouper.result_index, name=self.name) + def _apply_to_column_groupbys(self, func): """ return a pass thru """ return func(self) diff --git a/pandas/core/series.py b/pandas/core/series.py index 2890730956c75..48fe5b6bf2894 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1117,27 +1117,24 @@ def count(self, level=None): ------- nobs : int or Series (if level specified) """ - if level is not None: - mask = notnull(self.values) + from pandas.core.index import _get_na_value - if isinstance(level, compat.string_types): - level = self.index._get_level_number(level) + if level is None: + return notnull(_values_from_object(self)).sum() - level_index = self.index.levels[level] + if isinstance(level, compat.string_types): + level = self.index._get_level_number(level) - if len(self) == 0: - return self._constructor(0, index=level_index)\ - .__finalize__(self) + lev = self.index.levels[level] + lab = np.array(self.index.labels[level], subok=False, copy=True) - # call cython function - max_bin = len(level_index) - labels = com._ensure_int64(self.index.labels[level]) - counts = lib.count_level_1d(mask.view(np.uint8), - labels, max_bin) - return self._constructor(counts, - index=level_index).__finalize__(self) + mask = lab == -1 + if mask.any(): + lab[mask] = cnt = len(lev) + lev = lev.insert(cnt, _get_na_value(lev.dtype.type)) - return notnull(_values_from_object(self)).sum() + out = np.bincount(lab[notnull(self.values)], minlength=len(lev)) + return self._constructor(out, index=lev).__finalize__(self) def mode(self): """Returns the mode(s) of the dataset. diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 07f0c89535a77..720862df97b78 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -1253,23 +1253,6 @@ def lookup_values(ndarray[object] values, dict mapping): return maybe_convert_objects(result) -def count_level_1d(ndarray[uint8_t, cast=True] mask, - ndarray[int64_t] labels, Py_ssize_t max_bin): - cdef: - Py_ssize_t i, n - ndarray[int64_t] counts - - counts = np.zeros(max_bin, dtype='i8') - - n = len(mask) - - for i from 0 <= i < n: - if mask[i]: - counts[labels[i]] += 1 - - return counts - - def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, ndarray[int64_t] labels, Py_ssize_t max_bin): cdef: diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 86eafdf7ca2c8..a4392f3045fbb 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -4740,6 +4740,16 @@ def test_count(self): self.assertEqual(self.ts.count(), np.isfinite(self.ts).sum()) + mi = MultiIndex.from_arrays([list('aabbcc'), [1, 2, 2, nan, 1, 2]]) + ts = Series(np.arange(len(mi)), index=mi) + + left = ts.count(level=1) + right = Series([2, 3, 1], index=[1, 2, nan]) + assert_series_equal(left, right) + + ts.iloc[[0, 3, 5]] = nan + assert_series_equal(ts.count(level=1), right - 1) + def test_dtype(self): self.assertEqual(self.ts.dtype, np.dtype('float64'))