diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 155dd6f8e13a0..d8578ed604ae3 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -152,6 +152,18 @@ def time_value_counts(self, N, dtype): self.s.value_counts() +class ValueCountsObjectDropNAFalse: + + params = [10 ** 3, 10 ** 4, 10 ** 5] + param_names = ["N"] + + def setup(self, N): + self.s = Series(np.random.randint(0, N, size=10 * N)).astype("object") + + def time_value_counts(self, N): + self.s.value_counts(dropna=False) + + class Mode: params = [[10 ** 3, 10 ** 4, 10 ** 5], ["int", "uint", "float", "object"]] @@ -164,6 +176,18 @@ def time_mode(self, N, dtype): self.s.mode() +class ModeObjectDropNAFalse: + + params = [10 ** 3, 10 ** 4, 10 ** 5] + param_names = ["N"] + + def setup(self, N): + self.s = Series(np.random.randint(0, N, size=10 * N)).astype("object") + + def time_mode(self, N): + self.s.mode(dropna=False) + + class Dir: def setup(self): self.s = Series(index=tm.makeStringIndex(10000)) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 99a66c7e5454b..c909b58b32add 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -240,6 +240,38 @@ Now the float-dtype is respected. Since the common dtype for these DataFrames is *New behavior*: +.. ipython:: python + + res + +.. _whatsnew_140.notable_bug_fixes.value_counts_and_mode_do_not_coerse_to_nan: + +Null-values are no longer coerced to NaN-value in value_counts and mode +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`Series.value_counts` and :meth:`Series.mode` no longer coerce ``None``, ``NaT`` and other null-values to a NaN-value for ``object``-dtype. This behavior is now consistent with ``unique``, ``isin`` and others (:issue:`42688`). + +.. 
ipython:: python + + s = pd.Series([True, None, pd.NaT, None, pd.NaT, None]) + res = s.value_counts(dropna=False) + +Previously, all null-values were replaced by a NaN-value. + +*Previous behavior*: + +.. code-block:: ipython + + In [3]: res + Out[3]: + NaN 5 + True 1 + dtype: int64 + +Now null-values are no longer mangled. + +*New behavior*: + .. ipython:: python res diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index fb8ce79a924a4..e5e64f8dc7b5f 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -31,7 +31,7 @@ dtypes = [('Complex128', 'complex128', 'complex128', @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} -cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, navalue=np.NaN): +cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna): {{else}} cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): {{endif}} @@ -42,7 +42,6 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): # Don't use Py_ssize_t, since table.n_buckets is unsigned khiter_t k - bint is_null {{c_type}} val @@ -61,11 +60,7 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): for i in range(n): val = values[i] - is_null = checknull(val) - if not is_null or not dropna: - # all nas become the same representative: - if is_null: - val = navalue + if not dropna or not checknull(val): k = kh_get_{{ttype}}(table, val) if k != table.n_buckets: table.vals[k] += 1 diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 5431baf493260..23bb4c5d2670c 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -281,5 +281,5 @@ def test_value_counts_with_nan(dropna, index_or_series): if dropna is True: expected = Series([1], index=[True]) else: - expected = Series([2, 1], index=[pd.NA, True]) + expected = Series([1, 1, 
1], index=[True, pd.NA, np.nan]) tm.assert_series_equal(res, expected) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index a10288b2091ca..7c7e9f79a77ae 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -786,12 +786,12 @@ def test_no_reference_cycle(self): del df assert wr() is None - def test_label_indexing_on_nan(self): + def test_label_indexing_on_nan(self, nulls_fixture): # GH 32431 - df = Series([1, "{1,2}", 1, None]) + df = Series([1, "{1,2}", 1, nulls_fixture]) vc = df.value_counts(dropna=False) - result1 = vc.loc[np.nan] - result2 = vc[np.nan] + result1 = vc.loc[nulls_fixture] + result2 = vc[nulls_fixture] expected = 1 assert result1 == expected diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index bdc02ff0aa7a8..937eccf7a0afe 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -453,13 +453,11 @@ def test_mode_stable(self, dtype, writable): def test_modes_with_nans(): - # GH39007 - values = np.array([True, pd.NA, np.nan], dtype=np.object_) - # pd.Na and np.nan will have the same representative: np.nan - # thus we have 2 nans and 1 True + # GH42688, nans aren't mangled + nulls = [pd.NA, np.nan, pd.NaT, None] + values = np.array([True] + nulls * 2, dtype=np.object_) modes = ht.mode(values, False) - assert modes.size == 1 - assert np.isnan(modes[0]) + assert modes.size == len(nulls) def test_unique_label_indices_intp(writable):