Skip to content

Commit 4832e34

Browse files
behzadnouri authored and Nick Eubank committed
PERF: improves performance in SeriesGroupBy.count, pandas-dev#10946
BUG: closes bug in Series.count when index has nulls
1 parent 1cbdcb5 commit 4832e34

File tree

5 files changed

+37
-36
lines changed

5 files changed

+37
-36
lines changed

doc/source/whatsnew/v0.17.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -913,6 +913,8 @@ Bug Fixes
913913
- Bug in ``BinGrouper.group_info`` where returned values are not compatible with base class (:issue:`10914`)
914914
- Bug in clearing the cache on ``DataFrame.pop`` and a subsequent inplace op (:issue:`10912`)
915915
- Bug in indexing with a mixed-integer ``Index`` causing an ``ImportError`` (:issue:`10610`)
916+
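- Bug in ``Series.count`` with ``level`` specified when that index level contains nulls; null labels are now counted under a separate ``NaN`` entry (:issue:`10946`)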
917+
916918
- Bug causing ``DataFrame.where`` to not respect the ``axis`` parameter when the frame has a symmetric shape. (:issue:`9736`)
917919

918920
- Bug in ``Table.select_column`` where name is not preserved (:issue:`10392`)

pandas/core/groupby.py

+9
Original file line numberDiff line numberDiff line change
@@ -2685,6 +2685,15 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
26852685

26862686
return Series(out, index=mi)
26872687

2688+
def count(self):
    """Compute the number of non-null values in each group.

    Returns
    -------
    Series
        Indexed by ``self.grouper.result_index`` and named ``self.name``;
        each entry is the count of non-null observations whose group id
        is not -1.  The values are always of integer dtype, including
        when there are no groups at all.
    """
    ids, _, ngroups = self.grouper.group_info
    val = self.obj.get_values()

    # Rows whose group id is -1 are left out of the tally (presumably
    # rows with a null group key -- confirm against ``group_info``),
    # as are rows whose value is null.
    mask = (ids != -1) & ~isnull(val)

    if ngroups != 0:
        out = np.bincount(ids[mask], minlength=ngroups)
    else:
        # The zero-group case is still special-cased instead of calling
        # ``bincount`` with ``minlength=0``, but it now yields an empty
        # *integer* array rather than a bare list, so the empty result
        # has the same kind of dtype as the non-empty one.
        out = np.zeros(0, dtype=np.intp)

    return Series(out, index=self.grouper.result_index, name=self.name)
2696+
26882697
def _apply_to_column_groupbys(self, func):
26892698
""" return a pass thru """
26902699
return func(self)

pandas/core/series.py

+16-19
Original file line numberDiff line numberDiff line change
@@ -1159,27 +1159,24 @@ def count(self, level=None):
11591159
-------
11601160
nobs : int or Series (if level specified)
11611161
"""
1162-
if level is not None:
1163-
mask = notnull(self._values)
1162+
from pandas.core.index import _get_na_value
11641163

1165-
if isinstance(level, compat.string_types):
1166-
level = self.index._get_level_number(level)
1164+
if level is None:
1165+
return notnull(_values_from_object(self)).sum()
11671166

1168-
level_index = self.index.levels[level]
1167+
if isinstance(level, compat.string_types):
1168+
level = self.index._get_level_number(level)
11691169

1170-
if len(self) == 0:
1171-
return self._constructor(0, index=level_index)\
1172-
.__finalize__(self)
1170+
lev = self.index.levels[level]
1171+
lab = np.array(self.index.labels[level], subok=False, copy=True)
11731172

1174-
# call cython function
1175-
max_bin = len(level_index)
1176-
labels = com._ensure_int64(self.index.labels[level])
1177-
counts = lib.count_level_1d(mask.view(np.uint8),
1178-
labels, max_bin)
1179-
return self._constructor(counts,
1180-
index=level_index).__finalize__(self)
1173+
mask = lab == -1
1174+
if mask.any():
1175+
lab[mask] = cnt = len(lev)
1176+
lev = lev.insert(cnt, _get_na_value(lev.dtype.type))
11811177

1182-
return notnull(_values_from_object(self)).sum()
1178+
out = np.bincount(lab[notnull(self.values)], minlength=len(lev))
1179+
return self._constructor(out, index=lev).__finalize__(self)
11831180

11841181
def mode(self):
11851182
"""Returns the mode(s) of the dataset.
@@ -2121,7 +2118,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds):
21212118
21222119
>>> import pandas as pd
21232120
>>> import numpy as np
2124-
>>> series = pd.Series([20, 21, 12], index=['London',
2121+
>>> series = pd.Series([20, 21, 12], index=['London',
21252122
... 'New York','Helsinki'])
21262123
London 20
21272124
New York 21
@@ -2149,7 +2146,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds):
21492146
dtype: int64
21502147
21512148
Define a custom function that needs additional positional
2152-
arguments and pass these additional arguments using the
2149+
arguments and pass these additional arguments using the
21532150
``args`` keyword.
21542151
21552152
>>> def subtract_custom_value(x, custom_value):
@@ -2175,7 +2172,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds):
21752172
Helsinki 87
21762173
dtype: int64
21772174
2178-
Use a function from the Numpy library.
2175+
Use a function from the Numpy library.
21792176
21802177
>>> series.apply(np.log)
21812178
London 2.995732

pandas/lib.pyx

-17
Original file line numberDiff line numberDiff line change
@@ -1253,23 +1253,6 @@ def lookup_values(ndarray[object] values, dict mapping):
12531253
return maybe_convert_objects(result)
12541254

12551255

1256-
def count_level_1d(ndarray[uint8_t, cast=True] mask,
                   ndarray[int64_t] labels, Py_ssize_t max_bin):
    """
    Count, per label, how many positions have a truthy ``mask`` entry.

    Parameters
    ----------
    mask : ndarray[uint8_t]
        One flag per position; only positions with a non-zero flag are
        counted.
    labels : ndarray[int64_t]
        Bin number for each position.  Read at every index of ``mask``,
        so it must be at least as long as ``mask``.
    max_bin : Py_ssize_t
        Length of the returned array.

    Returns
    -------
    counts : ndarray[int64_t]
        ``counts[k]`` is the number of counted positions labelled ``k``.
    """
    cdef:
        Py_ssize_t i, n
        int64_t lab
        ndarray[int64_t] counts

    counts = np.zeros(max_bin, dtype='i8')

    n = len(mask)

    for i in range(n):
        lab = labels[i]
        # Negative labels (e.g. -1 for a missing value) do not name a
        # bin.  Without this guard they would be used as an index into
        # ``counts`` and be tallied against the wrong entry.
        if mask[i] and lab >= 0:
            counts[lab] += 1

    return counts
1271-
1272-
12731256
def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
12741257
ndarray[int64_t] labels, Py_ssize_t max_bin):
12751258
cdef:

pandas/tests/test_series.py

+10
Original file line numberDiff line numberDiff line change
@@ -4904,6 +4904,16 @@ def test_count(self):
49044904

49054905
self.assertEqual(self.ts.count(), np.isfinite(self.ts).sum())
49064906

4907+
mi = MultiIndex.from_arrays([list('aabbcc'), [1, 2, 2, nan, 1, 2]])
4908+
ts = Series(np.arange(len(mi)), index=mi)
4909+
4910+
left = ts.count(level=1)
4911+
right = Series([2, 3, 1], index=[1, 2, nan])
4912+
assert_series_equal(left, right)
4913+
4914+
ts.iloc[[0, 3, 5]] = nan
4915+
assert_series_equal(ts.count(level=1), right - 1)
4916+
49074917
def test_dtype(self):
49084918

49094919
self.assertEqual(self.ts.dtype, np.dtype('float64'))

0 commit comments

Comments
 (0)