Skip to content

Commit 33723f9

Browse files
behzadnouri authored and jreback committed
PERF: improves performance in SeriesGroupBy.count, pandas-dev#10946
BUG: closes bug in Series.count when index has nulls
1 parent 666540f commit 33723f9

File tree

5 files changed

+37
-36
lines changed

5 files changed

+37
-36
lines changed

doc/source/whatsnew/v0.17.0.txt

+2
Original file line number | Diff line number | Diff line change
@@ -893,6 +893,8 @@ Bug Fixes
893893
- Bug in ``BinGrouper.group_info`` where returned values are not compatible with base class (:issue:`10914`)
894894
- Bug in clearing the cache on ``DataFrame.pop`` and a subsequent inplace op (:issue:`10912`)
895895
- Bug in indexing with a mixed-integer ``Index`` causing an ``ImportError`` (:issue:`10610`)
896+
- Bug in ``Series.count`` when index has nulls (:issue:`10946`)
897+
896898
- Bug causing ``DataFrame.where`` to not respect the ``axis`` parameter when the frame has a symmetric shape. (:issue:`9736`)
897899

898900
- Bug in ``Table.select_column`` where name is not preserved (:issue:`10392`)

pandas/core/groupby.py

+9
Original file line number | Diff line number | Diff line change
@@ -2684,6 +2684,15 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
26842684

26852685
return Series(out, index=mi)
26862686

2687+
def count(self):
2688+
ids, _, ngroups = self.grouper.group_info
2689+
val = self.obj.get_values()
2690+
2691+
mask = (ids != -1) & ~isnull(val)
2692+
out = np.bincount(ids[mask], minlength=ngroups) if ngroups != 0 else []
2693+
2694+
return Series(out, index=self.grouper.result_index, name=self.name)
2695+
26872696
def _apply_to_column_groupbys(self, func):
26882697
""" return a pass thru """
26892698
return func(self)

pandas/core/series.py

+16-19
Original file line number | Diff line number | Diff line change
@@ -1142,27 +1142,24 @@ def count(self, level=None):
11421142
-------
11431143
nobs : int or Series (if level specified)
11441144
"""
1145-
if level is not None:
1146-
mask = notnull(self._values)
1145+
from pandas.core.index import _get_na_value
11471146

1148-
if isinstance(level, compat.string_types):
1149-
level = self.index._get_level_number(level)
1147+
if level is None:
1148+
return notnull(_values_from_object(self)).sum()
11501149

1151-
level_index = self.index.levels[level]
1150+
if isinstance(level, compat.string_types):
1151+
level = self.index._get_level_number(level)
11521152

1153-
if len(self) == 0:
1154-
return self._constructor(0, index=level_index)\
1155-
.__finalize__(self)
1153+
lev = self.index.levels[level]
1154+
lab = np.array(self.index.labels[level], subok=False, copy=True)
11561155

1157-
# call cython function
1158-
max_bin = len(level_index)
1159-
labels = com._ensure_int64(self.index.labels[level])
1160-
counts = lib.count_level_1d(mask.view(np.uint8),
1161-
labels, max_bin)
1162-
return self._constructor(counts,
1163-
index=level_index).__finalize__(self)
1156+
mask = lab == -1
1157+
if mask.any():
1158+
lab[mask] = cnt = len(lev)
1159+
lev = lev.insert(cnt, _get_na_value(lev.dtype.type))
11641160

1165-
return notnull(_values_from_object(self)).sum()
1161+
out = np.bincount(lab[notnull(self.values)], minlength=len(lev))
1162+
return self._constructor(out, index=lev).__finalize__(self)
11661163

11671164
def mode(self):
11681165
"""Returns the mode(s) of the dataset.
@@ -2104,7 +2101,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds):
21042101
21052102
>>> import pandas as pd
21062103
>>> import numpy as np
2107-
>>> series = pd.Series([20, 21, 12], index=['London',
2104+
>>> series = pd.Series([20, 21, 12], index=['London',
21082105
... 'New York','Helsinki'])
21092106
London 20
21102107
New York 21
@@ -2132,7 +2129,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds):
21322129
dtype: int64
21332130
21342131
Define a custom function that needs additional positional
2135-
arguments and pass these additional arguments using the
2132+
arguments and pass these additional arguments using the
21362133
``args`` keyword.
21372134
21382135
>>> def subtract_custom_value(x, custom_value):
@@ -2158,7 +2155,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds):
21582155
Helsinki 87
21592156
dtype: int64
21602157
2161-
Use a function from the Numpy library.
2158+
Use a function from the Numpy library.
21622159
21632160
>>> series.apply(np.log)
21642161
London 2.995732

pandas/lib.pyx

-17
Original file line number | Diff line number | Diff line change
@@ -1253,23 +1253,6 @@ def lookup_values(ndarray[object] values, dict mapping):
12531253
return maybe_convert_objects(result)
12541254

12551255

1256-
def count_level_1d(ndarray[uint8_t, cast=True] mask,
1257-
ndarray[int64_t] labels, Py_ssize_t max_bin):
1258-
cdef:
1259-
Py_ssize_t i, n
1260-
ndarray[int64_t] counts
1261-
1262-
counts = np.zeros(max_bin, dtype='i8')
1263-
1264-
n = len(mask)
1265-
1266-
for i from 0 <= i < n:
1267-
if mask[i]:
1268-
counts[labels[i]] += 1
1269-
1270-
return counts
1271-
1272-
12731256
def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
12741257
ndarray[int64_t] labels, Py_ssize_t max_bin):
12751258
cdef:

pandas/tests/test_series.py

+10
Original file line number | Diff line number | Diff line change
@@ -4907,6 +4907,16 @@ def test_count(self):
49074907

49084908
self.assertEqual(self.ts.count(), np.isfinite(self.ts).sum())
49094909

4910+
mi = MultiIndex.from_arrays([list('aabbcc'), [1, 2, 2, nan, 1, 2]])
4911+
ts = Series(np.arange(len(mi)), index=mi)
4912+
4913+
left = ts.count(level=1)
4914+
right = Series([2, 3, 1], index=[1, 2, nan])
4915+
assert_series_equal(left, right)
4916+
4917+
ts.iloc[[0, 3, 5]] = nan
4918+
assert_series_equal(ts.count(level=1), right - 1)
4919+
49104920
def test_dtype(self):
49114921

49124922
self.assertEqual(self.ts.dtype, np.dtype('float64'))

0 commit comments

Comments
 (0)