From 833ca2daffc8f6649c8aa04175f6d649af2ac4b5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 30 Sep 2023 21:01:59 +0100 Subject: [PATCH 1/6] Implement masked algorithm for mode --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/_libs/hashtable_func_helper.pxi.in | 10 +++++++--- pandas/core/algorithms.py | 5 ++++- pandas/core/arrays/masked.py | 10 ++++++++++ pandas/tests/libs/test_hashtable.py | 10 +++++----- pandas/tests/series/test_reductions.py | 22 ++++++++++++++++++++++ 6 files changed, 49 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 9dc095e6de6ff..81b6b42cd4c06 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -77,7 +77,7 @@ Other enhancements - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) -- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) +- Implement masked algorithms for :meth:`Series.value_counts` and :meth:`Series.mode` (:issue:`54984`, :issue:`55339`) - .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 19acd4acbdee7..336af306d410f 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -404,12 +404,13 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None): cdef: ndarray[htfunc_t] keys ndarray[htfunc_t] modes + ndarray[uint8_t] res_mask = None int64_t[::1] counts int64_t count, _, max_count = -1 - Py_ssize_t nkeys, k, j = 0 + Py_ssize_t nkeys, k, na_counter, j = 0 - keys, counts, _ = value_count(values, dropna, mask=mask) + keys, counts, na_counter = value_count(values, dropna, mask=mask) nkeys = len(keys) modes = np.empty(nkeys, dtype=values.dtype) @@ -440,7 +441,10 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None): modes[j] = keys[k] - return modes[:j + 1] + if na_counter > 0: + res_mask = np.zeros(j+1, dtype=np.bool_) + res_mask[j] = True + return modes[:j + 1], res_mask {{py: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c952178f4c998..81ae53bb8164f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1039,7 +1039,10 @@ def mode( values = _ensure_data(values) - npresult = htable.mode(values, dropna=dropna, mask=mask) + npresult, res_mask = htable.mode(values, dropna=dropna, mask=mask) + if res_mask is not None: + return npresult, res_mask + try: npresult = np.sort(npresult) except TypeError as err: diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 9b85fb0477e6f..3b3073cd5f798 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -69,6 +69,7 @@ from pandas.core.algorithms import ( factorize_array, isin, + mode, take, ) from pandas.core.array_algos import ( @@ -1061,6 +1062,15 @@ def value_counts(self, dropna: bool = True) -> Series: ) return Series(arr, index=index, name="count", copy=False) + def _mode(self, dropna: bool = True) -> Self: + if dropna: + result = mode(self._data, dropna=dropna, mask=self._mask) + res_mask = np.zeros(result.shape, dtype=np.bool_) + else: + result, res_mask = mode(self._data, dropna=dropna, mask=self._mask) + result = type(self)(result, res_mask) + return result[result.argsort()] + @doc(ExtensionArray.equals) def equals(self, other) -> bool: if type(self) != type(other): diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 2c8f4c4149528..e54764f9ac4a6 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -644,13 +644,13 @@ def test_mode(self, dtype, writable): values = np.repeat(np.arange(N).astype(dtype), 5) values[0] = 42 values.flags.writeable = writable - result = ht.mode(values, False) + result = ht.mode(values, False)[0] assert result == 42 def test_mode_stable(self, dtype, writable): values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype) values.flags.writeable = writable - keys = ht.mode(values, False) + keys = ht.mode(values, False)[0] tm.assert_numpy_array_equal(keys, values) @@ -658,7 +658,7 @@ def test_modes_with_nans(): # GH42688, nans aren't mangled nulls = [pd.NA, np.nan, pd.NaT, None] values = np.array([True] + nulls * 2, dtype=np.object_) - modes = ht.mode(values, False) + modes = ht.mode(values, False)[0] assert modes.size == len(nulls) @@ -724,8 +724,8 @@ def test_ismember_no(self, dtype): def test_mode(self, dtype): values = np.array([42, np.nan, np.nan, np.nan], dtype=dtype) - assert ht.mode(values, True) == 42 - assert np.isnan(ht.mode(values, False)) + assert ht.mode(values, True)[0] == 42 + assert np.isnan(ht.mode(values, False)[0]) def test_ismember_tuple_with_nans(): diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index 1e1ac100b21bf..2dbe520672e35 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -29,6 +29,28 @@ def test_mode_extension_dtype(as_period): tm.assert_series_equal(res, ser) +def test_mode_nullable_dtype(any_numeric_ea_dtype): + # GH#55339 + ser = Series([1, 3, 2, pd.NA, 3, 2, pd.NA], dtype=any_numeric_ea_dtype) + result = ser.mode(dropna=False) + expected = Series([2, 3, pd.NA], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + result = ser.mode(dropna=True) + expected = Series([2, 3], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + ser[-1] = pd.NA + + result = ser.mode(dropna=True) + expected = Series([2, 3], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + result = ser.mode(dropna=False) + expected = Series([pd.NA], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + def test_reductions_td64_with_nat(): # GH#8617 ser = Series([0, pd.NaT], dtype="m8[ns]") From 3cf29305678d0a32e91d223d8259918ecb70a7fa Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 30 Sep 2023 21:07:27 +0100 Subject: [PATCH 2/6] Fix mypy --- pandas/core/algorithms.py | 2 +- pandas/core/arrays/masked.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 81ae53bb8164f..ba939d93ff839 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1041,7 +1041,7 @@ def mode( npresult, res_mask = htable.mode(values, dropna=dropna, mask=mask) if res_mask is not None: - return npresult, res_mask + return npresult, res_mask # type: ignore[return-value] try: npresult = np.sort(npresult) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 3b3073cd5f798..287dc7c86db31 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1068,7 +1068,7 @@ def _mode(self, dropna: bool = True) -> Self: res_mask = np.zeros(result.shape, dtype=np.bool_) else: result, res_mask = mode(self._data, dropna=dropna, mask=self._mask) - result = type(self)(result, res_mask) + result = type(self)(result, res_mask) # type: ignore[arg-type] return result[result.argsort()] @doc(ExtensionArray.equals) From 5f6baafc7eff37f25d4a0e6a6e915948f8263cb8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 30 Sep 2023 23:39:08 +0100 Subject: [PATCH 3/6] Update v2.2.0.rst --- doc/source/whatsnew/v2.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 81b6b42cd4c06..caad970cc87a5 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -77,7 +77,7 @@ Other enhancements - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) -- Implement masked algorithms for :meth:`Series.value_counts` and :meth:`Series.mode` (:issue:`54984`, :issue:`55339`) +- Implement masked algorithms for :meth:`Series.value_counts` and :meth:`Series.mode` (:issue:`54984`, :issue:`55340`) - .. --------------------------------------------------------------------------- From 87e7a1ee662c28f6ec9a04bae5cb7a699f893028 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 30 Sep 2023 23:39:23 +0100 Subject: [PATCH 4/6] Update test_reductions.py --- pandas/tests/series/test_reductions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index 2dbe520672e35..13082ee99b47a 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -30,7 +30,7 @@ def test_mode_extension_dtype(as_period): def test_mode_nullable_dtype(any_numeric_ea_dtype): - # GH#55339 + # GH#55340 ser = Series([1, 3, 2, pd.NA, 3, 2, pd.NA], dtype=any_numeric_ea_dtype) result = ser.mode(dropna=False) expected = Series([2, 3, pd.NA], dtype=any_numeric_ea_dtype) From eaf3d13e1456e180bd8df960105ed6fbc57cf85d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 15 Oct 2023 15:45:35 +0200 Subject: [PATCH 5/6] Update v2.2.0.rst --- doc/source/whatsnew/v2.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index caad970cc87a5..310728e54132f 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -77,7 +77,6 @@ Other enhancements - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) -- Implement masked algorithms for :meth:`Series.value_counts` and :meth:`Series.mode` (:issue:`54984`, :issue:`55340`) - .. --------------------------------------------------------------------------- @@ -236,6 +235,7 @@ Other Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Performance improvement for :meth:`Series.value_counts` and :meth:`Series.mode` (:issue:`54984`, :issue:`55340`) - Performance improvement in :func:`concat` with ``axis=1`` and objects with unaligned indexes (:issue:`55084`) - Performance improvement in :func:`to_dict` on converting DataFrame to dictionary (:issue:`50990`) - Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`) From 0ed2479eb91d8ef025b3d306fe3e28b595daa67f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 15 Oct 2023 15:59:42 +0200 Subject: [PATCH 6/6] Update v2.2.0.rst --- doc/source/whatsnew/v2.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 1263373dbb8fa..c7c1bc3427618 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -272,7 +272,7 @@ Other Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- Performance improvement for :meth:`Series.value_counts` and :meth:`Series.mode` (:issue:`54984`, :issue:`55340`) +- Performance improvement for :meth:`Series.value_counts` and :meth:`Series.mode` for masked dtypes (:issue:`54984`, :issue:`55340`) - Performance improvement in :func:`concat` with ``axis=1`` and objects with unaligned indexes (:issue:`55084`) - Performance improvement in :func:`to_dict` on converting DataFrame to dictionary (:issue:`50990`) - Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`)