From c614bce9e1f64c98573b4d06cfa83ccc93f57d30 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 22 Sep 2023 22:29:14 -0400 Subject: [PATCH 01/14] PERF: Series.duplicated for pyarrow timestamp and duration types --- asv_bench/benchmarks/algorithms.py | 13 ++++++++++++- pandas/core/algorithms.py | 10 +++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 2584e1f13853a..3ff3d3093a56c 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -72,7 +72,16 @@ class Duplicated: params = [ [True, False], ["first", "last", False], - ["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"], + [ + "int", + "uint", + "float", + "string", + "datetime64[ns]", + "datetime64[ns, tz]", + "timestamp[ms][pyarrow]", + "duration[s][pyarrow]", + ], ] param_names = ["unique", "keep", "dtype"] @@ -87,6 +96,8 @@ def setup(self, unique, keep, dtype): "datetime64[ns, tz]": pd.date_range( "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" ), + "timestamp[ms][pyarrow]": pd.Index(np.arange(N), dtype=dtype), + "duration[s][pyarrow]": pd.Index(np.arange(N), dtype=dtype), }[dtype] if not unique: data = data.repeat(5) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1d74bb8b83e4e..7acca0f3e8bbe 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1000,9 +1000,13 @@ def duplicated( duplicated : ndarray[bool] """ if hasattr(values, "dtype"): - if isinstance(values.dtype, ArrowDtype) and values.dtype.kind in "ifub": - values = values._to_masked() # type: ignore[union-attr] - + if isinstance(values.dtype, ArrowDtype): + if values.dtype.kind in "ifub": + values = values._to_masked() # type: ignore[union-attr] + else: + values = ( + values._maybe_convert_datelike_array() # type: ignore[union-attr] + ) if isinstance(values.dtype, BaseMaskedDtype): values = cast("BaseMaskedArray", values) return htable.duplicated(values._data, keep=keep, mask=values._mask) From b508d13c1b5685ecc9013ea24ce3a24d3d4ab7e7 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 22 Sep 2023 22:32:08 -0400 Subject: [PATCH 02/14] whatsnew --- doc/source/whatsnew/v2.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 930e03ae7d75a..d3dc7b5ff49b6 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -239,6 +239,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`) - Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`) - Performance improvement in :meth:`Index.difference` (:issue:`55108`) +- Performance improvement in :meth:`Series.duplicated` for pyarrow timestamp and duration dtypes (:issue:`55255`) - Performance improvement when indexing with more than 4 keys (:issue:`54550`) - Performance improvement when localizing time to UTC (:issue:`55241`) From bbf5c6e0e30f625861d3b71640c5bd5c23bb0db5 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 23 Sep 2023 06:52:59 -0400 Subject: [PATCH 03/14] fix setup --- asv_bench/benchmarks/algorithms.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 3ff3d3093a56c..031e383dca35d 100644 --- a/asv_bench/benchmarks/algorithms.py +++ 
b/asv_bench/benchmarks/algorithms.py @@ -96,8 +96,12 @@ def setup(self, unique, keep, dtype): "datetime64[ns, tz]": pd.date_range( "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" ), - "timestamp[ms][pyarrow]": pd.Index(np.arange(N), dtype=dtype), - "duration[s][pyarrow]": pd.Index(np.arange(N), dtype=dtype), + "timestamp[ms][pyarrow]": pd.Index( + np.arange(N), dtype="timestamp[ms][pyarrow]" + ), + "duration[s][pyarrow]": pd.Index( + np.arange(N), dtype="duration[s][pyarrow]" + ), }[dtype] if not unique: data = data.repeat(5) From ccdec9e9f0b85580c888e134b5fb4a561d0c103b Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 26 Sep 2023 07:05:08 -0400 Subject: [PATCH 04/14] add ExtensionArray.duplicated --- asv_bench/benchmarks/algorithms.py | 7 ++-- doc/source/whatsnew/v2.2.0.rst | 3 +- pandas/core/algorithms.py | 49 ++++++++++++++++++-------- pandas/core/arrays/arrow/array.py | 25 +++++++++++++ pandas/core/arrays/base.py | 26 ++++++++++++++ pandas/core/arrays/masked.py | 8 +++++ pandas/tests/extension/base/methods.py | 12 +++++++ 7 files changed, 111 insertions(+), 19 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 031e383dca35d..2c86fb02da46d 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -1,6 +1,7 @@ from importlib import import_module import numpy as np +import pyarrow as pa import pandas as pd @@ -97,11 +98,9 @@ def setup(self, unique, keep, dtype): "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" ), "timestamp[ms][pyarrow]": pd.Index( - np.arange(N), dtype="timestamp[ms][pyarrow]" - ), - "duration[s][pyarrow]": pd.Index( - np.arange(N), dtype="duration[s][pyarrow]" + np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms")) ), + "string[pyarrow]": tm.makeStringIndex(N).astype(pd.ArrowDtype(pa.string)), }[dtype] if not unique: data = data.repeat(5) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index d3dc7b5ff49b6..a358add3d3a9b 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -76,6 +76,7 @@ Other enhancements - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. 
(:issue:`54480`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) +- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`) - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - @@ -239,7 +240,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`) - Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`) - Performance improvement in :meth:`Index.difference` (:issue:`55108`) -- Performance improvement in :meth:`Series.duplicated` for pyarrow timestamp and duration dtypes (:issue:`55255`) +- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`) - Performance improvement when indexing with more than 4 keys (:issue:`54550`) - Performance improvement when localizing time to UTC (:issue:`55241`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 7acca0f3e8bbe..c0b39d7438884 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -55,7 +55,6 @@ ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ( - ArrowDtype, BaseMaskedDtype, CategoricalDtype, ExtensionDtype, @@ -984,9 +983,11 @@ def duplicated( """ Return boolean ndarray denoting duplicate values. + Dispatches to ExtensionArray.duplicated for extension arrays. + Parameters ---------- - values : nd.array, ExtensionArray or Series + values : np.ndarray or ExtensionArray Array over which to check for duplicate values. keep : {'first', 'last', False}, default 'first' - ``first`` : Mark duplicates as ``True`` except for the first @@ -999,20 +1000,40 @@ def duplicated( ------- duplicated : ndarray[bool] """ - if hasattr(values, "dtype"): - if isinstance(values.dtype, ArrowDtype): - if values.dtype.kind in "ifub": - values = values._to_masked() # type: ignore[union-attr] - else: - values = ( - values._maybe_convert_datelike_array() # type: ignore[union-attr] - ) - if isinstance(values.dtype, BaseMaskedDtype): - values = cast("BaseMaskedArray", values) - return htable.duplicated(values._data, keep=keep, mask=values._mask) + if isinstance(values, ABCExtensionArray): + # dispatch to extension dtype's duplicated + return values.duplicated(keep=keep) + + return duplicated_array(values, keep=keep) + + +def duplicated_array( + values: ArrayLike, + keep: Literal["first", "last", False] = "first", + mask: npt.NDArray[np.bool_] | None = None, +) -> npt.NDArray[np.bool_]: + """ + Return boolean ndarray denoting duplicate values. + Parameters + ---------- + values : np.ndarray or ExtensionArray + Array over which to check for duplicate values. + keep : {'first', 'last', False}, default 'first' + - ``first`` : Mark duplicates as ``True`` except for the first + occurrence. + - ``last`` : Mark duplicates as ``True`` except for the last + occurrence. + - False : Mark all duplicates as ``True``. 
+ mask : ndarray[bool] + array indicating which elements to exclude from checking + + Returns + ------- + duplicated : ndarray[bool] + """ values = _ensure_data(values) - return htable.duplicated(values, keep=keep) + return htable.duplicated(values, keep=keep, mask=mask) def mode( diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4b79d0dbb683e..ffec291d04d04 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -42,6 +42,7 @@ from pandas.core.dtypes.missing import isna from pandas.core import ( + algorithms as algos, missing, roperator, ) @@ -1289,6 +1290,30 @@ def to_numpy( result[~mask] = data[~mask]._pa_array.to_numpy() return result + @doc(ExtensionArray.duplicated) + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + pa_type = self._pa_array.type + if pa.types.is_floating(pa_type) or pa.types.is_integer(pa_type): + values = self.to_numpy(na_value=0) + elif pa.types.is_boolean(pa_type): + values = self.to_numpy(na_value=False) + elif pa.types.is_temporal(pa_type): + if pa_type.bit_width == 32: + pa_type = pa.int32() + else: + pa_type = pa.int64() + arr = self.astype(ArrowDtype(pa_type)) + values = arr.to_numpy(na_value=0) + else: + # factorize the values to avoid the performance penalty of + # converting to object dtype + values = self.factorize()[0] + + mask = self.isna() if self._hasna else None + return algos.duplicated_array(values, keep=keep, mask=mask) + def unique(self) -> Self: """ Compute the ArrowExtensionArray of unique values. diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 933944dbd4632..40013a2cffde5 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -61,6 +61,7 @@ roperator, ) from pandas.core.algorithms import ( + duplicated_array, factorize_array, isin, map_array, @@ -125,6 +126,7 @@ class ExtensionArray: astype copy dropna + duplicated factorize fillna equals @@ -1116,6 +1118,30 @@ def dropna(self) -> Self: # error: Unsupported operand type for ~ ("ExtensionArray") return self[~self.isna()] # type: ignore[operator] + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + """ + Return boolean ndarray denoting duplicate values. + + Parameters + ---------- + keep : {'first', 'last', False}, default 'first' + - ``first`` : Mark duplicates as ``True`` except for the first occurrence. + - ``last`` : Mark duplicates as ``True`` except for the last occurrence. + - False : Mark all duplicates as ``True``. + + Returns + ------- + ndarray[bool] + + Examples + -------- + >>> pd.array([1, 1, 2, 3, 3], dtype="Int64").duplicated() + array([False, True, False, False, True]) + """ + return duplicated_array(values=self, keep=keep) + def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: """ Shift values by desired number. diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index fdcbe67bbc371..5829b1f4a92e4 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -952,6 +952,14 @@ def copy(self) -> Self: mask = self._mask.copy() return self._simple_new(data, mask) + @doc(ExtensionArray.duplicated) + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + values = self._data + mask = self._mask + return algos.duplicated_array(values, keep=keep, mask=mask) + def unique(self) -> Self: """ Compute the BaseMaskedArray of unique values. 
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 4e0bc8d804bab..e10c6ef9a7018 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -248,6 +248,18 @@ def test_sort_values_frame(self, data_for_sorting, ascending): ) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("keep", ["first", "last", False]) + def test_duplicated(self, data, keep): + arr = data.take([0, 1, 0, 1]) + result = arr.duplicated(keep=keep) + if keep == "first": + expected = np.array([False, False, True, True]) + elif keep == "last": + expected = np.array([True, True, False, False]) + else: + expected = np.array([True, True, True, True]) + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("box", [pd.Series, lambda x: x]) @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique]) def test_unique(self, data, box, method): From f1ebefafcb8fce0deae2b4ee1249c109c701ce67 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 26 Sep 2023 17:46:07 -0400 Subject: [PATCH 05/14] fix --- asv_bench/benchmarks/algorithms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 2c86fb02da46d..0d53e338be511 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -81,7 +81,7 @@ class Duplicated: "datetime64[ns]", "datetime64[ns, tz]", "timestamp[ms][pyarrow]", - "duration[s][pyarrow]", + "string[pyarrow]", ], ] param_names = ["unique", "keep", "dtype"] @@ -100,7 +100,7 @@ def setup(self, unique, keep, dtype): "timestamp[ms][pyarrow]": pd.Index( np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms")) ), - "string[pyarrow]": tm.makeStringIndex(N).astype(pd.ArrowDtype(pa.string)), + "string[pyarrow]": tm.makeStringIndex(N).astype(pd.ArrowDtype(pa.string())), }[dtype] if not unique: data = data.repeat(5) From d01074566b6aff35619db97780645633d2c6f184 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 26 Sep 2023 18:05:25 -0400 Subject: [PATCH 06/14] simplify --- pandas/core/algorithms.py | 30 ------------------------------ pandas/core/arrays/arrow/array.py | 2 +- pandas/core/arrays/base.py | 4 ++-- pandas/core/arrays/masked.py | 2 +- pandas/core/base.py | 5 ++++- 5 files changed, 8 insertions(+), 35 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c0b39d7438884..168a952c04916 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -978,36 +978,6 @@ def value_counts_arraylike( def duplicated( - values: ArrayLike, keep: Literal["first", "last", False] = "first" -) -> npt.NDArray[np.bool_]: - """ - Return boolean ndarray denoting duplicate values. - - Dispatches to ExtensionArray.duplicated for extension arrays. - - Parameters - ---------- - values : np.ndarray or ExtensionArray - Array over which to check for duplicate values. - keep : {'first', 'last', False}, default 'first' - - ``first`` : Mark duplicates as ``True`` except for the first - occurrence. - - ``last`` : Mark duplicates as ``True`` except for the last - occurrence. - - False : Mark all duplicates as ``True``. 
- - Returns - ------- - duplicated : ndarray[bool] - """ - if isinstance(values, ABCExtensionArray): - # dispatch to extension dtype's duplicated - return values.duplicated(keep=keep) - - return duplicated_array(values, keep=keep) - - -def duplicated_array( values: ArrayLike, keep: Literal["first", "last", False] = "first", mask: npt.NDArray[np.bool_] | None = None, diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index ffec291d04d04..0579aa3760531 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1312,7 +1312,7 @@ def duplicated( values = self.factorize()[0] mask = self.isna() if self._hasna else None - return algos.duplicated_array(values, keep=keep, mask=mask) + return algos.duplicated(values, keep=keep, mask=mask) def unique(self) -> Self: """ diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 40013a2cffde5..7ccea91eb27db 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -61,7 +61,7 @@ roperator, ) from pandas.core.algorithms import ( - duplicated_array, + duplicated, factorize_array, isin, map_array, @@ -1140,7 +1140,7 @@ def duplicated( >>> pd.array([1, 1, 2, 3, 3], dtype="Int64").duplicated() array([False, True, False, False, True]) """ - return duplicated_array(values=self, keep=keep) + return duplicated(values=self, keep=keep) def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: """ diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 5829b1f4a92e4..819a4370e5510 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -958,7 +958,7 @@ def duplicated( ) -> npt.NDArray[np.bool_]: values = self._data mask = self._mask - return algos.duplicated_array(values, keep=keep, mask=mask) + return algos.duplicated(values, keep=keep, mask=mask) def unique(self) -> Self: """ diff --git a/pandas/core/base.py b/pandas/core/base.py index 3026189e747bb..d4421560bcea7 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1365,7 +1365,10 @@ def drop_duplicates(self, *, keep: DropKeep = "first"): @final def _duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: - return algorithms.duplicated(self._values, keep=keep) + arr = self._values + if isinstance(arr, ExtensionArray): + return arr.duplicated(keep=keep) + return algorithms.duplicated(arr, keep=keep) def _arith_method(self, other, op): res_name = ops.get_op_result_name(self, other) From 4c9e4dd8c3ec46705e5c36f6b841ec546654e554 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 26 Sep 2023 21:52:02 -0400 Subject: [PATCH 07/14] add SparseArray.duplicated --- pandas/core/arrays/sparse/array.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 00cbe1286c195..2d76715bd834b 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -28,6 +28,7 @@ from pandas._libs.tslibs import NaT from pandas.compat.numpy import function as nv from pandas.errors import PerformanceWarning +from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level from pandas.util._validators import ( validate_bool_kwarg, @@ -830,6 +831,18 @@ def _first_fill_value_loc(self): diff = np.r_[np.diff(indices), 2] return indices[(diff > 1).argmax()] + 1 + @doc(ExtensionArray.duplicated) + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + null_mask = 
self.isna() + duplicated_isna = algos.duplicated(null_mask[null_mask], keep=keep) + duplicated_notna = algos.duplicated(self.sp_values, keep=keep) + result = np.zeros(len(self), dtype=np.bool_) + np.putmask(result, null_mask, duplicated_isna) + np.putmask(result, ~null_mask, duplicated_notna) + return result + def unique(self) -> Self: uniques = algos.unique(self.sp_values) if len(self.sp_values) != len(self): From 0af7e27e7e3531ea4a7893259ebdd7da4742c5bd Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 27 Sep 2023 05:46:50 -0400 Subject: [PATCH 08/14] simplify --- pandas/core/arrays/sparse/array.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 2d76715bd834b..cb15711a3281d 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -835,13 +835,7 @@ def _first_fill_value_loc(self): def duplicated( self, keep: Literal["first", "last", False] = "first" ) -> npt.NDArray[np.bool_]: - null_mask = self.isna() - duplicated_isna = algos.duplicated(null_mask[null_mask], keep=keep) - duplicated_notna = algos.duplicated(self.sp_values, keep=keep) - result = np.zeros(len(self), dtype=np.bool_) - np.putmask(result, null_mask, duplicated_isna) - np.putmask(result, ~null_mask, duplicated_notna) - return result + return algos.duplicated(np.asarray(self), keep=keep) def unique(self) -> Self: uniques = algos.unique(self.sp_values) From 6aca6e507e65d326ae944b6438d392b1da2c44b2 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 27 Sep 2023 05:53:42 -0400 Subject: [PATCH 09/14] docs --- doc/source/reference/extensions.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 83f830bb11198..e412793a328a3 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -49,6 +49,7 @@ objects. 
api.extensions.ExtensionArray.copy api.extensions.ExtensionArray.view api.extensions.ExtensionArray.dropna + api.extensions.ExtensionArray.duplicated api.extensions.ExtensionArray.equals api.extensions.ExtensionArray.factorize api.extensions.ExtensionArray.fillna From 8690245252115eb060ee2b7f85464f91b7c3c685 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 27 Sep 2023 17:58:49 -0400 Subject: [PATCH 10/14] pass mask --- pandas/core/arrays/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 7ccea91eb27db..5d7b333f98fc7 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1140,7 +1140,7 @@ def duplicated( >>> pd.array([1, 1, 2, 3, 3], dtype="Int64").duplicated() array([False, True, False, False, True]) """ - return duplicated(values=self, keep=keep) + return duplicated(values=self, keep=keep, mask=self.isna()) def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: """ From c5acd57e2ed2f002f75bb935b469e79193a03b83 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 27 Sep 2023 18:45:56 -0400 Subject: [PATCH 11/14] mypy --- pandas/core/arrays/base.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 5d7b333f98fc7..1f990acd936f6 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1140,7 +1140,12 @@ def duplicated( >>> pd.array([1, 1, 2, 3, 3], dtype="Int64").duplicated() array([False, True, False, False, True]) """ - return duplicated(values=self, keep=keep, mask=self.isna()) + null_mask = self.isna() + if isinstance(null_mask, np.ndarray): + mask = null_mask + else: + mask = None + return duplicated(values=self, keep=keep, mask=mask) def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: """ From cc7cf1454246b366aa6fa57fd9c76fe8603faa4a Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 28 Sep 2023 18:21:23 -0400 Subject: [PATCH 12/14] use mask --- pandas/core/arrays/base.py | 6 +----- pandas/core/arrays/sparse/array.py | 4 +++- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 1f990acd936f6..c06bf7366447b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1140,11 +1140,7 @@ def duplicated( >>> pd.array([1, 1, 2, 3, 3], dtype="Int64").duplicated() array([False, True, False, False, True]) """ - null_mask = self.isna() - if isinstance(null_mask, np.ndarray): - mask = null_mask - else: - mask = None + mask = self.isna().astype(np.bool_, copy=False) return duplicated(values=self, keep=keep, mask=mask) def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index cb15711a3281d..608468da486b5 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -835,7 +835,9 @@ def _first_fill_value_loc(self): def duplicated( self, keep: Literal["first", "last", False] = "first" ) -> npt.NDArray[np.bool_]: - return algos.duplicated(np.asarray(self), keep=keep) + values = np.asarray(self) + mask = np.asarray(self.isna()) + return algos.duplicated(values, keep=keep, mask=mask) def unique(self) -> Self: uniques = algos.unique(self.sp_values) From 1bdbe6e8393f60aa483604b6fb8d0cdf6450b6c9 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 29 Sep 2023 21:36:23 -0400 Subject: [PATCH 13/14] add optional to 
docstring --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 168a952c04916..8c14d8c030ee3 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -995,7 +995,7 @@ def duplicated( - ``last`` : Mark duplicates as ``True`` except for the last occurrence. - False : Mark all duplicates as ``True``. - mask : ndarray[bool] + mask : ndarray[bool], optional array indicating which elements to exclude from checking Returns From 12cec9bb1a353b54d93b84c6f2b64a5eba595d52 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 1 Oct 2023 19:40:15 -0400 Subject: [PATCH 14/14] revert asv change --- asv_bench/benchmarks/algorithms.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 0d53e338be511..192f19c36b47d 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -81,7 +81,7 @@ class Duplicated: "datetime64[ns]", "datetime64[ns, tz]", "timestamp[ms][pyarrow]", - "string[pyarrow]", + "duration[s][pyarrow]", ], ] param_names = ["unique", "keep", "dtype"] @@ -100,7 +100,9 @@ def setup(self, unique, keep, dtype): "timestamp[ms][pyarrow]": pd.Index( np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms")) ), - "string[pyarrow]": tm.makeStringIndex(N).astype(pd.ArrowDtype(pa.string())), + "duration[s][pyarrow]": pd.Index( + np.arange(N), dtype=pd.ArrowDtype(pa.duration("s")) + ), }[dtype] if not unique: data = data.repeat(5)
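
A minimal usage sketch of the ``ExtensionArray.duplicated`` method introduced by this series (a hedged illustration, not part of the patches: it assumes a pandas build that includes these commits plus an installed pyarrow; the ``Int64`` case mirrors the docstring example added in ``pandas/core/arrays/base.py``, and the Arrow constructions follow the asv benchmark setup)::

    import numpy as np
    import pyarrow as pa

    import pandas as pd

    # Masked (Int64) array: hashes the raw integers together with the NA mask.
    # Mirrors the example in the new ExtensionArray.duplicated docstring.
    arr = pd.array([1, 1, 2, 3, 3], dtype="Int64")
    print(arr.duplicated())             # [False  True False False  True]
    print(arr.duplicated(keep="last"))  # [ True False False  True False]
    print(arr.duplicated(keep=False))   # [ True  True False  True  True]

    # Arrow timestamp data, constructed the same way as the asv benchmark case.
    N = 10
    ts = pd.Index(np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms"))).repeat(2)
    print(pd.Series(ts).duplicated().sum())  # 10: second occurrence of each value

    # Arrow duration data takes the same temporal branch in
    # ArrowExtensionArray.duplicated (cast to int64, then masked hashing).
    dur = pd.Series(np.arange(N), dtype=pd.ArrowDtype(pa.duration("s")))
    print(dur.duplicated().any())       # False: all values unique

With these patches, ``Series.duplicated`` dispatches to ``ExtensionArray.duplicated``, so Arrow-backed temporal data is hashed as int64 values plus an NA mask rather than being boxed to objects, which is where the reported speedup comes from.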