From 0cf2abe5446ed7abdb3f32d4d945679a2779e685 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 19 Aug 2022 11:55:01 +0200 Subject: [PATCH 1/5] ENH: Support mask in duplicated algorithm --- asv_bench/benchmarks/algorithms.py | 23 ++++++ pandas/_libs/hashtable.pyi | 1 + pandas/_libs/hashtable_func_helper.pxi.in | 90 +++++++++++++++-------- pandas/core/algorithms.py | 3 + 4 files changed, 86 insertions(+), 31 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 0008a589ca71f..e669eee84b354 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -95,6 +95,29 @@ def time_duplicated(self, unique, keep, dtype): self.idx.duplicated(keep=keep) +class DuplicatedMaskedArray: + + params = [ + [True, False], + ["first", "last", False], + ["Int64", "Float64"], + ] + param_names = ["unique", "keep", "dtype"] + + def setup(self, unique, keep, dtype): + N = 10**5 + data = pd.Series(np.arange(N), dtype=dtype) + data[list(range(1, N, 100))] = pd.NA + if not unique: + data = data.repeat(5) + self.ser = data + # cache is_unique + self.ser.is_unique + + def time_duplicated(self, unique, keep, dtype): + self.ser.duplicated(keep=keep) + + class Hashing: def setup_cache(self): N = 10**5 diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index 8500fdf2f602e..c9944da703735 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -183,6 +183,7 @@ class IntpHashTable(HashTable): ... def duplicated( values: np.ndarray, keep: Literal["last", "first", False] = ..., + mask: npt.NDArray[np.bool_] | None = ..., ) -> npt.NDArray[np.bool_]: ... def mode( values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = ... diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index f7c41b32864be..61cdc8a87d2cf 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -118,9 +118,9 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8 @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} -cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'): +cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first', const uint8_t[:] mask=None): {{else}} -cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'): +cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', const uint8_t[:] mask=None): {{endif}} cdef: int ret = 0 @@ -129,10 +129,12 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'): {{else}} PyObject* value {{endif}} - Py_ssize_t i, n = len(values) + Py_ssize_t i, n = len(values), first_na = -1 khiter_t k kh_{{ttype}}_t *table = kh_init_{{ttype}}() ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') + bint seen_na = False, uses_mask = mask is not None + bint seen_multiple_na = False kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT)) @@ -147,9 +149,16 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'): {{endif}} for i in range(n - 1, -1, -1): # equivalent: range(n)[::-1], which cython doesn't like in nogil - value = {{to_c_type}}(values[i]) - kh_put_{{ttype}}(table, value, &ret) - out[i] = ret == 0 + if uses_mask and mask[i]: + if seen_na: + out[i] = True + else: + out[i] = False + seen_na = True + else: + value = {{to_c_type}}(values[i]) + kh_put_{{ttype}}(table, value, &ret) + out[i] = ret == 0 elif keep == 'first': {{if dtype == 'object'}} @@ -158,9 +167,16 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'): with nogil: {{endif}} for i in range(n): - value = {{to_c_type}}(values[i]) - kh_put_{{ttype}}(table, value, &ret) - out[i] = ret == 0 + if uses_mask and mask[i]: + if seen_na: + out[i] = True + else: + out[i] = False + seen_na = True + else: + value = {{to_c_type}}(values[i]) + kh_put_{{ttype}}(table, value, &ret) + out[i] = ret == 0 else: {{if dtype == 'object'}} @@ -169,15 +185,27 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'): with nogil: {{endif}} for i in range(n): - value = {{to_c_type}}(values[i]) - k = kh_get_{{ttype}}(table, value) - if k != table.n_buckets: - out[table.vals[k]] = 1 - out[i] = 1 + if uses_mask and mask[i]: + if not seen_na: + first_na = i + seen_na = True + elif not seen_multiple_na: + out[i] = 1 + out[first_na] = 1 + seen_multiple_na = True + else: + out[i] = 1 + else: - k = kh_put_{{ttype}}(table, value, &ret) - table.vals[k] = i - out[i] = 0 + value = {{to_c_type}}(values[i]) + k = kh_get_{{ttype}}(table, value) + if k != table.n_buckets: + out[table.vals[k]] = 1 + out[i] = 1 + else: + k = kh_put_{{ttype}}(table, value, &ret) + table.vals[k] = i + out[i] = 0 kh_destroy_{{ttype}}(table) return out @@ -301,37 +329,37 @@ cpdef value_count(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=N raise TypeError(values.dtype) -cpdef duplicated(ndarray[htfunc_t] values, object keep="first"): +cpdef duplicated(ndarray[htfunc_t] values, object keep="first", const uint8_t[:] mask=None): if htfunc_t is object: - return duplicated_object(values, keep) + return duplicated_object(values, keep, mask=mask) elif htfunc_t is int8_t: - return duplicated_int8(values, keep) + return duplicated_int8(values, keep, mask=mask) elif htfunc_t is int16_t: - return duplicated_int16(values, keep) + return duplicated_int16(values, keep, mask=mask) elif htfunc_t is int32_t: - return duplicated_int32(values, keep) + return duplicated_int32(values, keep, mask=mask) elif htfunc_t is int64_t: - return duplicated_int64(values, keep) + return duplicated_int64(values, keep, mask=mask) elif htfunc_t is uint8_t: - return duplicated_uint8(values, keep) + return duplicated_uint8(values, keep, mask=mask) elif htfunc_t is uint16_t: - return duplicated_uint16(values, keep) + return duplicated_uint16(values, keep, mask=mask) elif htfunc_t is uint32_t: - return duplicated_uint32(values, keep) + return duplicated_uint32(values, keep, mask=mask) elif htfunc_t is uint64_t: - return duplicated_uint64(values, keep) + return duplicated_uint64(values, keep, mask=mask) elif htfunc_t is float64_t: - return duplicated_float64(values, keep) + return duplicated_float64(values, keep, mask=mask) elif htfunc_t is float32_t: - return duplicated_float32(values, keep) + return duplicated_float32(values, keep, mask=mask) elif htfunc_t is complex128_t: - return duplicated_complex128(values, keep) + return duplicated_complex128(values, keep, mask=mask) elif htfunc_t is complex64_t: - return duplicated_complex64(values, keep) + return duplicated_complex64(values, keep, mask=mask) else: raise TypeError(values.dtype) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2e6737b2e61aa..b258f8236e5cc 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1041,6 +1041,9 @@ def duplicated( ------- duplicated : ndarray[bool] """ + if hasattr(values, "dtype") and isinstance(values.dtype, BaseMaskedDtype): + return htable.duplicated(values._data, keep=keep, mask=values._mask) + values = _ensure_data(values) return htable.duplicated(values, keep=keep) From b40deb1d55c964b1fe161e90b3fb3ef6b7eac81f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 19 Aug 2022 13:32:15 +0200 Subject: [PATCH 2/5] Fix mypy --- pandas/core/algorithms.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b258f8236e5cc..4740c52a2563d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1042,6 +1042,7 @@ def duplicated( duplicated : ndarray[bool] """ if hasattr(values, "dtype") and isinstance(values.dtype, BaseMaskedDtype): + values = cast("BaseMaskedArray", values) return htable.duplicated(values._data, keep=keep, mask=values._mask) values = _ensure_data(values) From c5226a44d6b5ceeebc622f8b14c4fc94fc5dcefb Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 6 Sep 2022 22:49:07 +0200 Subject: [PATCH 3/5] Add tests --- pandas/tests/series/methods/test_duplicated.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/series/methods/test_duplicated.py b/pandas/tests/series/methods/test_duplicated.py index 1c547ee99efed..e6d36b6d56fee 100644 --- a/pandas/tests/series/methods/test_duplicated.py +++ b/pandas/tests/series/methods/test_duplicated.py @@ -2,6 +2,7 @@ import pytest from pandas import ( + NA, Categorical, Series, ) @@ -50,3 +51,15 @@ def test_duplicated_categorical_bool_na(nulls_fixture): result = ser.duplicated() expected = Series([False, False, True, True, False]) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "keep, vals", + [("last", [True, False]), ("first", [False, True]), (False, [True, True])], +) +def test_duplicated_mask(keep, vals): + # GH#48150 + ser = Series([1, 2, NA, NA], dtype="Int64") + result = ser.duplicated(keep=keep) + expected = Series([False, False] + vals) + tm.assert_series_equal(result, expected) From a1ae1d8392b14a870993f602df647082348d3dc7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 7 Sep 2022 21:03:39 +0200 Subject: [PATCH 4/5] Improve test --- pandas/tests/series/methods/test_duplicated.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/methods/test_duplicated.py b/pandas/tests/series/methods/test_duplicated.py index e6d36b6d56fee..197e2003c6df9 100644 --- a/pandas/tests/series/methods/test_duplicated.py +++ b/pandas/tests/series/methods/test_duplicated.py @@ -55,11 +55,15 @@ def test_duplicated_categorical_bool_na(nulls_fixture): @pytest.mark.parametrize( "keep, vals", - [("last", [True, False]), ("first", [False, True]), (False, [True, True])], + [ + ("last", [True, True, False]), + ("first", [False, True, True]), + (False, [True, True, True]), + ], ) def test_duplicated_mask(keep, vals): # GH#48150 - ser = Series([1, 2, NA, NA], dtype="Int64") + ser = Series([1, 2, NA, NA, NA], dtype="Int64") result = ser.duplicated(keep=keep) expected = Series([False, False] + vals) tm.assert_series_equal(result, expected) From a334865dcdae78afc78a2abde29baeca99e33a5e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 12 Sep 2022 22:55:17 +0200 Subject: [PATCH 5/5] Fix bug --- pandas/_libs/hashtable_func_helper.pxi.in | 1 + pandas/tests/series/methods/test_duplicated.py | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 61cdc8a87d2cf..68e253fd03620 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -189,6 +189,7 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons if not seen_na: first_na = i seen_na = True + out[i] = 0 elif not seen_multiple_na: out[i] = 1 out[first_na] = 1 diff --git a/pandas/tests/series/methods/test_duplicated.py b/pandas/tests/series/methods/test_duplicated.py index 197e2003c6df9..29a523a4468f6 100644 --- a/pandas/tests/series/methods/test_duplicated.py +++ b/pandas/tests/series/methods/test_duplicated.py @@ -67,3 +67,12 @@ def test_duplicated_mask(keep, vals): result = ser.duplicated(keep=keep) expected = Series([False, False] + vals) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("keep", ["last", "first", False]) +def test_duplicated_mask_no_duplicated_na(keep): + # GH#48150 + ser = Series([1, 2, NA], dtype="Int64") + result = ser.duplicated(keep=keep) + expected = Series([False, False, False]) + tm.assert_series_equal(result, expected)