
Commit 3bf0f64

ENH/PERF: add ExtensionArray.duplicated (#55255)
* PERF: Series.duplicated for pyarrow timestamp and duration types
* whatsnew
* fix setup
* add ExtensionArray.duplicated
* fix
* simplify
* add SparseArray.duplicated
* simplify
* docs
* pass mask
* mypy
* use mask
* add optional to docstring
* revert asv change
1 parent 6e6a683 commit 3bf0f64
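
In short: ExtensionArray gains a duplicated method, Series.duplicated and Index.duplicated dispatch to it, and the NA mask is passed down to the hashtable so pyarrow-backed data no longer round-trips through object dtype. A minimal sketch of the resulting behavior, assuming a pandas build containing this commit with pyarrow installed (outputs follow the documented keep semantics):

import pandas as pd

# Extension arrays now expose .duplicated() directly
arr = pd.array([1, 1, 2, 3, 3], dtype="Int64")
print(arr.duplicated())  # array([False,  True, False, False,  True])

# Series.duplicated dispatches to the array's implementation; this is
# where the pyarrow timestamp/duration speedup comes from
s = pd.Series([1, 1, 2], dtype="int64[pyarrow]")
print(s.duplicated())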

File tree: 10 files changed (+112, -14 lines)

asv_bench/benchmarks/algorithms.py (+17, -1)

@@ -1,6 +1,7 @@
 from importlib import import_module

 import numpy as np
+import pyarrow as pa

 import pandas as pd

@@ -72,7 +73,16 @@ class Duplicated:
     params = [
         [True, False],
         ["first", "last", False],
-        ["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"],
+        [
+            "int",
+            "uint",
+            "float",
+            "string",
+            "datetime64[ns]",
+            "datetime64[ns, tz]",
+            "timestamp[ms][pyarrow]",
+            "duration[s][pyarrow]",
+        ],
     ]
     param_names = ["unique", "keep", "dtype"]

@@ -87,6 +97,12 @@ def setup(self, unique, keep, dtype):
             "datetime64[ns, tz]": pd.date_range(
                 "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
             ),
+            "timestamp[ms][pyarrow]": pd.Index(
+                np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms"))
+            ),
+            "duration[s][pyarrow]": pd.Index(
+                np.arange(N), dtype=pd.ArrowDtype(pa.duration("s"))
+            ),
         }[dtype]
         if not unique:
             data = data.repeat(5)

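For reference, the new benchmark parameters boil down to data like the following; a rough sketch mirroring setup() above (N here is illustrative, the benchmark defines its own):

import numpy as np
import pandas as pd
import pyarrow as pa

N = 10**5  # illustrative size
ts = pd.Index(np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms")))
dur = pd.Index(np.arange(N), dtype=pd.ArrowDtype(pa.duration("s")))

# the non-unique variant repeats each value, as in setup()
pd.Series(ts.repeat(5)).duplicated()  # exercises the new pyarrow path
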
doc/source/reference/extensions.rst (+1)

@@ -49,6 +49,7 @@ objects.
    api.extensions.ExtensionArray.copy
    api.extensions.ExtensionArray.view
    api.extensions.ExtensionArray.dropna
+   api.extensions.ExtensionArray.duplicated
    api.extensions.ExtensionArray.equals
    api.extensions.ExtensionArray.factorize
    api.extensions.ExtensionArray.fillna

doc/source/whatsnew/v2.2.0.rst (+2)

@@ -76,6 +76,7 @@ Other enhancements

 - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
 - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
+- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
 - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
 - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
 -
@@ -241,6 +242,7 @@ Performance improvements
 - Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`)
 - Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`)
 - Performance improvement in :meth:`Index.difference` (:issue:`55108`)
+- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`)
 - Performance improvement when indexing with more than 4 keys (:issue:`54550`)
 - Performance improvement when localizing time to UTC (:issue:`55241`)

pandas/core/algorithms.py (+7, -12)

@@ -55,7 +55,6 @@
 )
 from pandas.core.dtypes.concat import concat_compat
 from pandas.core.dtypes.dtypes import (
-    ArrowDtype,
     BaseMaskedDtype,
     CategoricalDtype,
     ExtensionDtype,
@@ -979,36 +978,32 @@ def value_counts_arraylike(


 def duplicated(
-    values: ArrayLike, keep: Literal["first", "last", False] = "first"
+    values: ArrayLike,
+    keep: Literal["first", "last", False] = "first",
+    mask: npt.NDArray[np.bool_] | None = None,
 ) -> npt.NDArray[np.bool_]:
     """
     Return boolean ndarray denoting duplicate values.

     Parameters
     ----------
-    values : nd.array, ExtensionArray or Series
+    values : np.ndarray or ExtensionArray
         Array over which to check for duplicate values.
     keep : {'first', 'last', False}, default 'first'
         - ``first`` : Mark duplicates as ``True`` except for the first
           occurrence.
         - ``last`` : Mark duplicates as ``True`` except for the last
           occurrence.
         - False : Mark all duplicates as ``True``.
+    mask : ndarray[bool], optional
+        array indicating which elements to exclude from checking

     Returns
     -------
     duplicated : ndarray[bool]
     """
-    if hasattr(values, "dtype"):
-        if isinstance(values.dtype, ArrowDtype) and values.dtype.kind in "ifub":
-            values = values._to_masked()  # type: ignore[union-attr]
-
-        if isinstance(values.dtype, BaseMaskedDtype):
-            values = cast("BaseMaskedArray", values)
-            return htable.duplicated(values._data, keep=keep, mask=values._mask)
-
     values = _ensure_data(values)
-    return htable.duplicated(values, keep=keep)
+    return htable.duplicated(values, keep=keep, mask=mask)


 def mode(

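The rewritten algorithms.duplicated takes the NA mask from the caller instead of special-casing masked and Arrow arrays itself. A rough sketch of the call pattern using the internal function directly (internal API, shown only to illustrate the mask argument; exact NA semantics are those of the hashtable):

import numpy as np
from pandas.core import algorithms

values = np.array([0, 0, 1, 1], dtype=np.int64)
# position 1 holds a sentinel 0 standing in for a missing value
mask = np.array([False, True, False, False])

# with the mask, the sentinel is treated as NA rather than a real 0
print(algorithms.duplicated(values, keep="first", mask=mask))
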
pandas/core/arrays/arrow/array.py (+25)

@@ -42,6 +42,7 @@
 from pandas.core.dtypes.missing import isna

 from pandas.core import (
+    algorithms as algos,
     missing,
     roperator,
 )
@@ -1289,6 +1290,30 @@ def to_numpy(
             result[~mask] = data[~mask]._pa_array.to_numpy()
         return result

+    @doc(ExtensionArray.duplicated)
+    def duplicated(
+        self, keep: Literal["first", "last", False] = "first"
+    ) -> npt.NDArray[np.bool_]:
+        pa_type = self._pa_array.type
+        if pa.types.is_floating(pa_type) or pa.types.is_integer(pa_type):
+            values = self.to_numpy(na_value=0)
+        elif pa.types.is_boolean(pa_type):
+            values = self.to_numpy(na_value=False)
+        elif pa.types.is_temporal(pa_type):
+            if pa_type.bit_width == 32:
+                pa_type = pa.int32()
+            else:
+                pa_type = pa.int64()
+            arr = self.astype(ArrowDtype(pa_type))
+            values = arr.to_numpy(na_value=0)
+        else:
+            # factorize the values to avoid the performance penalty of
+            # converting to object dtype
+            values = self.factorize()[0]
+
+        mask = self.isna() if self._hasna else None
+        return algos.duplicated(values, keep=keep, mask=mask)
+
     def unique(self) -> Self:
         """
         Compute the ArrowExtensionArray of unique values.

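The temporal branch above reinterprets timestamps and durations as integers before hashing, which avoids the object-dtype conversion that made these dtypes slow. A small illustrative example (not from the commit; output follows the usual duplicated semantics):

from datetime import datetime

import pandas as pd
import pyarrow as pa

arr = pd.array(
    [datetime(2021, 1, 1), datetime(2021, 1, 1), None],
    dtype=pd.ArrowDtype(pa.timestamp("us")),
)
print(arr.duplicated())  # the repeated timestamp is flagged as a duplicate
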
pandas/core/arrays/base.py (+27)

@@ -61,6 +61,7 @@
     roperator,
 )
 from pandas.core.algorithms import (
+    duplicated,
     factorize_array,
     isin,
     map_array,
@@ -125,6 +126,7 @@ class ExtensionArray:
     astype
     copy
     dropna
+    duplicated
     factorize
     fillna
     equals
@@ -1116,6 +1118,31 @@ def dropna(self) -> Self:
         # error: Unsupported operand type for ~ ("ExtensionArray")
         return self[~self.isna()]  # type: ignore[operator]

+    def duplicated(
+        self, keep: Literal["first", "last", False] = "first"
+    ) -> npt.NDArray[np.bool_]:
+        """
+        Return boolean ndarray denoting duplicate values.
+
+        Parameters
+        ----------
+        keep : {'first', 'last', False}, default 'first'
+            - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
+            - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
+            - False : Mark all duplicates as ``True``.
+
+        Returns
+        -------
+        ndarray[bool]
+
+        Examples
+        --------
+        >>> pd.array([1, 1, 2, 3, 3], dtype="Int64").duplicated()
+        array([False,  True, False, False,  True])
+        """
+        mask = self.isna().astype(np.bool_, copy=False)
+        return duplicated(values=self, keep=keep, mask=mask)
+
     def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray:
         """
         Shift values by desired number.

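The docstring example covers the default keep="first"; for the other keep options on the same data, the expected results derived from the documented semantics are:

import pandas as pd

arr = pd.array([1, 1, 2, 3, 3], dtype="Int64")
arr.duplicated(keep="first")  # array([False,  True, False, False,  True])
arr.duplicated(keep="last")   # array([ True, False, False,  True, False])
arr.duplicated(keep=False)    # array([ True,  True, False,  True,  True])
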
pandas/core/arrays/masked.py (+8)

@@ -952,6 +952,14 @@ def copy(self) -> Self:
         mask = self._mask.copy()
         return self._simple_new(data, mask)

+    @doc(ExtensionArray.duplicated)
+    def duplicated(
+        self, keep: Literal["first", "last", False] = "first"
+    ) -> npt.NDArray[np.bool_]:
+        values = self._data
+        mask = self._mask
+        return algos.duplicated(values, keep=keep, mask=mask)
+
     def unique(self) -> Self:
         """
         Compute the BaseMaskedArray of unique values.

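For masked arrays the values and the NA mask already live in separate buffers, so they are handed to the shared algorithm as-is. A small illustrative example with missing values (NA handling assumed to match Series.duplicated, where repeated NAs count as duplicates):

import pandas as pd

arr = pd.array([1, pd.NA, 1, pd.NA], dtype="Int64")
print(arr.duplicated())  # the second 1 and the second NA are flagged
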
pandas/core/arrays/sparse/array.py (+9)

@@ -28,6 +28,7 @@
 from pandas._libs.tslibs import NaT
 from pandas.compat.numpy import function as nv
 from pandas.errors import PerformanceWarning
+from pandas.util._decorators import doc
 from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import (
     validate_bool_kwarg,
@@ -830,6 +831,14 @@ def _first_fill_value_loc(self):
         diff = np.r_[np.diff(indices), 2]
         return indices[(diff > 1).argmax()] + 1

+    @doc(ExtensionArray.duplicated)
+    def duplicated(
+        self, keep: Literal["first", "last", False] = "first"
+    ) -> npt.NDArray[np.bool_]:
+        values = np.asarray(self)
+        mask = np.asarray(self.isna())
+        return algos.duplicated(values, keep=keep, mask=mask)
+
     def unique(self) -> Self:
         uniques = algos.unique(self.sp_values)
         if len(self.sp_values) != len(self):

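SparseArray takes the simple route: densify, compute the NA mask, and defer to the shared algorithm. An illustrative call (a sketch, not part of the commit):

import pandas as pd

sp = pd.arrays.SparseArray([0, 0, 1, 1, 0])
print(sp.duplicated())            # repeats of the fill value are duplicates too
print(sp.duplicated(keep=False))
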
pandas/core/base.py (+4, -1)

@@ -1365,7 +1365,10 @@ def drop_duplicates(self, *, keep: DropKeep = "first"):

     @final
     def _duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]:
-        return algorithms.duplicated(self._values, keep=keep)
+        arr = self._values
+        if isinstance(arr, ExtensionArray):
+            return arr.duplicated(keep=keep)
+        return algorithms.duplicated(arr, keep=keep)

     def _arith_method(self, other, op):
         res_name = ops.get_op_result_name(self, other)

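With this dispatch, Series.duplicated and Index.duplicated call the extension array's own method whenever the underlying values are an ExtensionArray. A quick sketch (dtype strings as in pandas 2.x; output not shown):

import pandas as pd

s = pd.Series([1, 1, None], dtype="int64[pyarrow]")
s.duplicated()    # routed through ArrowExtensionArray.duplicated

idx = pd.Index([1, 1, 2], dtype="Int64")
idx.duplicated()  # routed through BaseMaskedArray.duplicated
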
pandas/tests/extension/base/methods.py (+12)

@@ -248,6 +248,18 @@ def test_sort_values_frame(self, data_for_sorting, ascending):
         )
         tm.assert_frame_equal(result, expected)

+    @pytest.mark.parametrize("keep", ["first", "last", False])
+    def test_duplicated(self, data, keep):
+        arr = data.take([0, 1, 0, 1])
+        result = arr.duplicated(keep=keep)
+        if keep == "first":
+            expected = np.array([False, False, True, True])
+        elif keep == "last":
+            expected = np.array([True, True, False, False])
+        else:
+            expected = np.array([True, True, True, True])
+        tm.assert_numpy_array_equal(result, expected)
+
     @pytest.mark.parametrize("box", [pd.Series, lambda x: x])
     @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique])
     def test_unique(self, data, box, method):

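The new base test is inherited by every concrete extension-array test module, so each ExtensionArray implementation gets coverage of duplicated automatically. To run only these cases, something along these lines should work (a sketch; invoke pytest however your environment prefers):

import pytest

pytest.main(["pandas/tests/extension", "-k", "test_duplicated"])
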