Skip to content

Commit f9e5069

Browse files
kawochen authored and jreback committed
BUG: GH12558 where nulls contributed to normalized value_counts
Closes #12558

Author: Ka Wo Chen <[email protected]>

Closes #12566 from kawochen/BUG-FIX-12558 and squashes the following commits:

d24c59e [Ka Wo Chen] BUG: GH12558 where nulls contributed to normalized value_counts
1 parent f53b0df commit f9e5069

File tree

4 files changed

+32
-4
lines changed

4 files changed

+32
-4
lines changed

doc/source/whatsnew/v0.18.1.txt

+2
Original file line number | Diff line number | Diff line change
@@ -87,3 +87,5 @@ Performance Improvements
8787

8888
Bug Fixes
8989
~~~~~~~~~
90+
91+
- Bug in ``value_counts`` when ``normalize=True`` and ``dropna=True`` where nulls still contributed to the normalized count (:issue:`12558`)

pandas/core/algorithms.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -342,7 +342,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
342342
result = result.sort_values(ascending=ascending)
343343

344344
if normalize:
345-
result = result / float(values.size)
345+
result = result / float(counts.sum())
346346

347347
return result
348348

pandas/core/groupby.py

+13-3
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
callable, map
1212
)
1313
from pandas import compat
14-
14+
from pandas.compat.numpy_compat import _np_version_under1p8
1515
from pandas.core.base import (PandasObject, SelectionMixin, GroupByError,
1616
DataError, SpecificationError)
1717
from pandas.core.categorical import Categorical
@@ -2949,8 +2949,18 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
29492949

29502950
if normalize:
29512951
out = out.astype('float')
2952-
acc = rep(np.diff(np.r_[idx, len(ids)]))
2953-
out /= acc[mask] if dropna else acc
2952+
d = np.diff(np.r_[idx, len(ids)])
2953+
if dropna:
2954+
m = ids[lab == -1]
2955+
if _np_version_under1p8:
2956+
mi, ml = algos.factorize(m)
2957+
d[ml] = d[ml] - np.bincount(mi)
2958+
else:
2959+
np.add.at(d, m, -1)
2960+
acc = rep(d)[mask]
2961+
else:
2962+
acc = rep(d)
2963+
out /= acc
29542964

29552965
if sort and bins is None:
29562966
cat = ids[inc][mask] if dropna else ids[inc]

pandas/tests/test_algos.py

+16
Original file line number | Diff line number | Diff line change
@@ -517,6 +517,22 @@ def test_dropna(self):
517517
pd.Series([10.3, 5., 5., None]).value_counts(dropna=False),
518518
pd.Series([2, 1, 1], index=[5., 10.3, np.nan]))
519519

520+
def test_value_counts_normalized(self):
521+
# GH12558
522+
s = Series([1, 2, np.nan, np.nan, np.nan])
523+
dtypes = (np.float64, np.object, 'M8[ns]')
524+
for t in dtypes:
525+
s_typed = s.astype(t)
526+
result = s_typed.value_counts(normalize=True, dropna=False)
527+
expected = Series([0.6, 0.2, 0.2],
528+
index=Series([np.nan, 2.0, 1.0], dtype=t))
529+
tm.assert_series_equal(result, expected)
530+
531+
result = s_typed.value_counts(normalize=True, dropna=True)
532+
expected = Series([0.5, 0.5],
533+
index=Series([2.0, 1.0], dtype=t))
534+
tm.assert_series_equal(result, expected)
535+
520536

521537
class GroupVarTestMixin(object):
522538

0 commit comments

Comments
 (0)