From c97b677c95a62a83b4d49640217642fa81005e7e Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Mon, 26 Jul 2021 20:38:05 +0200 Subject: [PATCH 01/11] do not mangle nans in value_count --- pandas/_libs/hashtable_func_helper.pxi.in | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index fb8ce79a924a4..05cf957a86073 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -31,7 +31,7 @@ dtypes = [('Complex128', 'complex128', 'complex128', @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} -cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, navalue=np.NaN): +cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna): {{else}} cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): {{endif}} @@ -63,9 +63,6 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): val = values[i] is_null = checknull(val) if not is_null or not dropna: - # all nas become the same representative: - if is_null: - val = navalue k = kh_get_{{ttype}}(table, val) if k != table.n_buckets: table.vals[k] += 1 From 34b960b5988b06d067c71949adb1942abdfc2168 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 27 Jul 2021 06:58:10 +0200 Subject: [PATCH 02/11] fix test cases which assumed mangled nans --- pandas/tests/base/test_value_counts.py | 2 +- pandas/tests/libs/test_hashtable.py | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 5431baf493260..23bb4c5d2670c 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -281,5 +281,5 @@ def test_value_counts_with_nan(dropna, index_or_series): if dropna is True: expected = Series([1], index=[True]) else: - expected = Series([2, 1], index=[pd.NA, True]) + expected = Series([1, 1, 1], index=[True, pd.NA, np.nan]) tm.assert_series_equal(res, expected) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index bdc02ff0aa7a8..3969177a63044 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -453,13 +453,10 @@ def test_mode_stable(self, dtype, writable): def test_modes_with_nans(): - # GH39007 - values = np.array([True, pd.NA, np.nan], dtype=np.object_) - # pd.Na and np.nan will have the same representative: np.nan - # thus we have 2 nans and 1 True + # GH42688, nans aren't mangled + values = np.array([True, pd.NA, np.nan, pd.NA, np.nan], dtype=np.object_) modes = ht.mode(values, False) - assert modes.size == 1 - assert np.isnan(modes[0]) + assert modes.size == 2 def test_unique_label_indices_intp(writable): From ead1da407a11e7682de8b7ca1a51d18fedbaebf1 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 27 Jul 2021 07:07:15 +0200 Subject: [PATCH 03/11] is_null only if really needed --- pandas/_libs/hashtable_func_helper.pxi.in | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 05cf957a86073..e5e64f8dc7b5f 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -42,7 +42,6 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): # Don't use Py_ssize_t, since table.n_buckets is unsigned khiter_t k - bint is_null {{c_type}} val @@ -61,8 +60,7 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): for i in range(n): val = values[i] - is_null = checknull(val) - if not is_null or not dropna: + if not dropna or not checknull(val): k = kh_get_{{ttype}}(table, val) if k != table.n_buckets: table.vals[k] += 1 From d53c64c99f352940c117754de24f9136e8ebd66e Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 27 Jul 2021 07:13:43 +0200 Subject: [PATCH 04/11] add asv tests --- asv_bench/benchmarks/series_methods.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 155dd6f8e13a0..ea4d3a54a89e8 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -146,12 +146,24 @@ class ValueCounts: param_names = ["N", "dtype"] def setup(self, N, dtype): - self.s = Series(np.random.randint(0, N, size=10 * N)).astype(dtype) + self.s = Series(np.random.randint(0, N, size=10 * N)).astype("object") def time_value_counts(self, N, dtype): self.s.value_counts() +class ValueCountsObjectDropNAFalse: + + params = [10 ** 3, 10 ** 4, 10 ** 5] + param_names = ["N"] + + def setup(self, N): + self.s = Series(np.random.randint(0, N, size=10 * N)).astype("object") + + def time_value_counts(self, N): + self.s.value_counts(dropna=False) + + class Mode: params = [[10 ** 3, 10 ** 4, 10 ** 5], ["int", "uint", "float", "object"]] @@ -164,6 +176,18 @@ def time_mode(self, N, dtype): self.s.mode() +class ModeObjectDropNAFalse: + + params = [10 ** 3, 10 ** 4, 10 ** 5] + param_names = ["N"] + + def setup(self, N): + self.s = Series(np.random.randint(0, N, size=10 * N)).astype("object") + + def time_mode(self, N): + self.s.mode(dropna=False) + + class Dir: def setup(self): self.s = Series(index=tm.makeStringIndex(10000)) From 5b02c52391268824cb278eba0b61ad7160d2e299 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 27 Jul 2021 09:08:15 +0200 Subject: [PATCH 05/11] fix overlooked test case --- pandas/tests/indexing/test_indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index a10288b2091ca..49fb953e9342d 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -790,8 +790,8 @@ def test_label_indexing_on_nan(self): # GH 32431 df = Series([1, "{1,2}", 1, None]) vc = df.value_counts(dropna=False) - result1 = vc.loc[np.nan] - result2 = vc[np.nan] + result1 = vc.loc[None] + result2 = vc[None] expected = 1 assert result1 == expected From 94d33785d7bbc91dd579c4134b0d1062f7b06ee9 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 30 Jul 2021 16:35:02 +0200 Subject: [PATCH 06/11] fixing reverting wrong change of asv-test --- asv_bench/benchmarks/series_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index ea4d3a54a89e8..d8578ed604ae3 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -146,7 +146,7 @@ class ValueCounts: param_names = ["N", "dtype"] def setup(self, N, dtype): - self.s = Series(np.random.randint(0, N, size=10 * N)).astype("object") + self.s = Series(np.random.randint(0, N, size=10 * N)).astype(dtype) def time_value_counts(self, N, dtype): self.s.value_counts() From 55d6f6ef3f3d1d6940de0e7da9581e51b79d8d70 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 30 Jul 2021 16:45:42 +0200 Subject: [PATCH 07/11] adding whatsnew note --- doc/source/whatsnew/v1.4.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 99a66c7e5454b..1719d4d73ba10 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -549,6 +549,7 @@ Missing ^^^^^^^ - Bug in :meth:`DataFrame.fillna` with limit and no method ignores axis='columns' or ``axis = 1`` (:issue:`40989`) - Bug in :meth:`DataFrame.fillna` not replacing missing values when using a dict-like ``value`` and duplicate column names (:issue:`43476`) +- :meth:`Series.value_counts` and :meth:`Series.mode` no longer coerce ``None``, ``NaT`` and other null-values to a NaN-value for ``np.object``-dtype. This behavior is now consistent with ``unique``, ``isin`` and others (:issue:`42688`) - MultiIndex From e3fd597581e4cab60280f89e6139bdeb53b20ee8 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 30 Jul 2021 17:00:02 +0200 Subject: [PATCH 08/11] parametrize test --- pandas/tests/indexing/test_indexing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 49fb953e9342d..7c7e9f79a77ae 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -786,12 +786,12 @@ def test_no_reference_cycle(self): del df assert wr() is None - def test_label_indexing_on_nan(self): + def test_label_indexing_on_nan(self, nulls_fixture): # GH 32431 - df = Series([1, "{1,2}", 1, None]) + df = Series([1, "{1,2}", 1, nulls_fixture]) vc = df.value_counts(dropna=False) - result1 = vc.loc[None] - result2 = vc[None] + result1 = vc.loc[nulls_fixture] + result2 = vc[nulls_fixture] expected = 1 assert result1 == expected From 088b4946fc348f3debdb83e75b72bcfad7279f1f Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 9 Nov 2021 20:53:32 +0100 Subject: [PATCH 09/11] moving whatsnew note to notable fixes --- doc/source/whatsnew/v1.4.0.rst | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 1719d4d73ba10..e281de348a853 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -240,6 +240,38 @@ Now the float-dtype is respected. Since the common dtype for these DataFrames is *New behavior*: +.. ipython:: python + + res + +.. _whatsnew_140.notable_bug_fixes.value_counts_and_mode_do_not_coerse_to_nan: + +Null-values are no longer coerced to NaN-value in value_counts and mode +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`Series.value_counts` and :meth:`Series.mode` no longer coerce ``None``, ``NaT`` and other null-values to a NaN-value for ``np.object``-dtype. This behavior is now consistent with ``unique``, ``isin`` and others (:issue:`42688`). + +.. ipython:: python + + s = pd.Series([True, None, pd.NaT, None, pd.NaT, None]) + res = s.value_counts(dropna=False) + +Previously, all null-values were replaced by a NaN-value. + +*Previous behavior*: + +.. code-block:: ipython + + In [3]: res + Out[3]: + NaN 5 + True 1 + dtype: int64 + +Now null-values are no longer mangled. + +*New behavior*: + .. ipython:: python res @@ -549,7 +581,6 @@ Missing ^^^^^^^ - Bug in :meth:`DataFrame.fillna` with limit and no method ignores axis='columns' or ``axis = 1`` (:issue:`40989`) - Bug in :meth:`DataFrame.fillna` not replacing missing values when using a dict-like ``value`` and duplicate column names (:issue:`43476`) -- :meth:`Series.value_counts` and :meth:`Series.mode` no longer coerce ``None``, ``NaT`` and other null-values to a NaN-value for ``np.object``-dtype. This behavior is now consistent with ``unique``, ``isin`` and others (:issue:`42688`) - MultiIndex From 792cb7b483665bc597f8c972dc45666f1c1ac4db Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 9 Nov 2021 21:10:05 +0100 Subject: [PATCH 10/11] adding pd.NaT to tested null-objects --- pandas/tests/libs/test_hashtable.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 3969177a63044..937eccf7a0afe 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -454,9 +454,10 @@ def test_mode_stable(self, dtype, writable): def test_modes_with_nans(): # GH42688, nans aren't mangled - values = np.array([True, pd.NA, np.nan, pd.NA, np.nan], dtype=np.object_) + nulls = [pd.NA, np.nan, pd.NaT, None] + values = np.array([True] + nulls * 2, dtype=np.object_) modes = ht.mode(values, False) - assert modes.size == 2 + assert modes.size == len(nulls) def test_unique_label_indices_intp(writable): From 89ececeb823cf9c3ffb7612ea8ebe25952f91d1a Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Wed, 10 Nov 2021 10:39:55 +0100 Subject: [PATCH 11/11] fixing title line --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index e281de348a853..c909b58b32add 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -247,7 +247,7 @@ Now the float-dtype is respected. Since the common dtype for these DataFrames is .. _whatsnew_140.notable_bug_fixes.value_counts_and_mode_do_not_coerse_to_nan: Null-values are no longer coerced to NaN-value in value_counts and mode -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :meth:`Series.value_counts` and :meth:`Series.mode` no longer coerce ``None``, ``NaT`` and other null-values to a NaN-value for ``np.object``-dtype. This behavior is now consistent with ``unique``, ``isin`` and others (:issue:`42688`).