ENH/PERF: add ExtensionArray.duplicated #55255

Merged Oct 3, 2023 (20 commits). Showing changes from 5 commits.
9 changes: 4 additions & 5 deletions asv_bench/benchmarks/algorithms.py
@@ -1,6 +1,7 @@
 from importlib import import_module

 import numpy as np
+import pyarrow as pa

 import pandas as pd

@@ -80,7 +81,7 @@ class Duplicated:
             "datetime64[ns]",
             "datetime64[ns, tz]",
             "timestamp[ms][pyarrow]",
-            "duration[s][pyarrow]",
+            "string[pyarrow]",
Member:
Why get rid of duration here?

I think at some point "string" on L80 will become redundant with "string[pyarrow]" here, so maybe add a TODO(3.0) to get rid of one when that happens?

Member Author:

The original version of this PR only targeted pyarrow timestamp and duration types. The current version improves perf for a larger number of pyarrow types, so I thought I'd add a non-temporal type as well. I can add duration back if you want.

Member:

I'm fine either way, just curious. Usually for ASVs we throw the kitchen sink at it, but there are downsides to that.

         ],
     ]
     param_names = ["unique", "keep", "dtype"]
@@ -97,11 +98,9 @@ def setup(self, unique, keep, dtype):
                 "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
             ),
             "timestamp[ms][pyarrow]": pd.Index(
-                np.arange(N), dtype="timestamp[ms][pyarrow]"
-            ),
-            "duration[s][pyarrow]": pd.Index(
-                np.arange(N), dtype="duration[s][pyarrow]"
+                np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms"))
             ),
+            "string[pyarrow]": tm.makeStringIndex(N).astype(pd.ArrowDtype(pa.string())),
         }[dtype]
         if not unique:
             data = data.repeat(5)
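
Illustrative sketch, not part of the diff: a rough standalone equivalent of the benchmark's new string[pyarrow] case (pandas >= 2.2 assumed; the list comprehension stands in for the internal tm.makeStringIndex helper).

import timeit

import pandas as pd
import pyarrow as pa

# Build a string[pyarrow] index like the benchmark's setup(), then time
# duplicated() on the non-unique variant (every value repeated 5 times).
N = 10**5
idx = pd.Index([f"s{i}" for i in range(N)], dtype=pd.ArrowDtype(pa.string()))
idx = idx.repeat(5)
print(timeit.timeit(lambda: idx.duplicated(keep="first"), number=10))
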
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v2.2.0.rst
@@ -76,6 +76,7 @@ Other enhancements

 - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
 - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
+- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
 - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
 -

@@ -239,7 +240,7 @@ Performance improvements
 - Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`)
 - Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`)
 - Performance improvement in :meth:`Index.difference` (:issue:`55108`)
-- Performance improvement in :meth:`Series.duplicated` for pyarrow timestamp and duration dtypes (:issue:`55255`)
+- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`)
 - Performance improvement when indexing with more than 4 keys (:issue:`54550`)
 - Performance improvement when localizing time to UTC (:issue:`55241`)

23 changes: 7 additions & 16 deletions pandas/core/algorithms.py
@@ -55,7 +55,6 @@
 )
 from pandas.core.dtypes.concat import concat_compat
 from pandas.core.dtypes.dtypes import (
-    ArrowDtype,
     BaseMaskedDtype,
     CategoricalDtype,
     ExtensionDtype,
@@ -979,40 +978,32 @@ def value_counts_arraylike(


 def duplicated(
-    values: ArrayLike, keep: Literal["first", "last", False] = "first"
+    values: ArrayLike,
+    keep: Literal["first", "last", False] = "first",
+    mask: npt.NDArray[np.bool_] | None = None,
 ) -> npt.NDArray[np.bool_]:
"""
Return boolean ndarray denoting duplicate values.

Parameters
----------
values : nd.array, ExtensionArray or Series
values : np.ndarray or ExtensionArray
Array over which to check for duplicate values.
keep : {'first', 'last', False}, default 'first'
- ``first`` : Mark duplicates as ``True`` except for the first
occurrence.
- ``last`` : Mark duplicates as ``True`` except for the last
occurrence.
- False : Mark all duplicates as ``True``.
mask : ndarray[bool]
array indicating which elements to exclude from checking

Returns
-------
duplicated : ndarray[bool]
"""
-    if hasattr(values, "dtype"):
-        if isinstance(values.dtype, ArrowDtype):
-            if values.dtype.kind in "ifub":
-                values = values._to_masked()  # type: ignore[union-attr]
-            else:
-                values = (
-                    values._maybe_convert_datelike_array()  # type: ignore[union-attr]
-                )
-        if isinstance(values.dtype, BaseMaskedDtype):
-            values = cast("BaseMaskedArray", values)
-            return htable.duplicated(values._data, keep=keep, mask=values._mask)
-
     values = _ensure_data(values)
-    return htable.duplicated(values, keep=keep)
+    return htable.duplicated(values, keep=keep, mask=mask)


 def mode(
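
Illustrative sketch, not part of the diff: what the new mask argument does at the algorithms level. Positions flagged in mask are excluded from hashing and compared as one NA-like group, so the first flagged slot is kept and later ones are marked as duplicates (internal API; assumes a build that includes this PR).

import numpy as np

from pandas.core import algorithms as algos

values = np.array([10, 20, 10, 0, 0], dtype=np.int64)
mask = np.array([False, False, False, True, True])
# The two masked slots are treated as NAs: the first is kept, the second
# is flagged as a duplicate, regardless of the underlying 0 values.
print(algos.duplicated(values, keep="first", mask=mask))
# expected: [False False  True False  True]
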
25 changes: 25 additions & 0 deletions pandas/core/arrays/arrow/array.py
@@ -42,6 +42,7 @@
from pandas.core.dtypes.missing import isna

from pandas.core import (
algorithms as algos,
missing,
roperator,
)
@@ -1289,6 +1290,30 @@ def to_numpy(
result[~mask] = data[~mask]._pa_array.to_numpy()
return result

@doc(ExtensionArray.duplicated)
def duplicated(
self, keep: Literal["first", "last", False] = "first"
) -> npt.NDArray[np.bool_]:
pa_type = self._pa_array.type
if pa.types.is_floating(pa_type) or pa.types.is_integer(pa_type):
values = self.to_numpy(na_value=0)
elif pa.types.is_boolean(pa_type):
values = self.to_numpy(na_value=False)
elif pa.types.is_temporal(pa_type):
if pa_type.bit_width == 32:
pa_type = pa.int32()
else:
pa_type = pa.int64()
arr = self.astype(ArrowDtype(pa_type))
values = arr.to_numpy(na_value=0)
else:
# factorize the values to avoid the performance penalty of
# converting to object dtype
values = self.factorize()[0]

mask = self.isna() if self._hasna else None
return algos.duplicated(values, keep=keep, mask=mask)

def unique(self) -> Self:
"""
Compute the ArrowExtensionArray of unique values.
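
Illustrative example, not part of the diff: the dispatch above converts numeric, boolean, and temporal arrays to NumPy cheaply and falls back to factorized codes for everything else (such as strings), so no path round-trips through object dtype (pandas >= 2.2 assumed).

import pandas as pd
import pyarrow as pa

# Strings take the factorize fallback; NA positions travel via `mask`.
arr = pd.array(["a", "b", "a", None, None], dtype=pd.ArrowDtype(pa.string()))
print(arr.duplicated(keep="first"))  # [False False  True False  True]
print(arr.duplicated(keep=False))    # [ True False  True  True  True]
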
26 changes: 26 additions & 0 deletions pandas/core/arrays/base.py
@@ -61,6 +61,7 @@
roperator,
)
from pandas.core.algorithms import (
duplicated,
factorize_array,
isin,
map_array,
@@ -125,6 +126,7 @@ class ExtensionArray:
astype
copy
dropna
duplicated
factorize
fillna
equals
@@ -1116,6 +1118,30 @@ def dropna(self) -> Self:
# error: Unsupported operand type for ~ ("ExtensionArray")
return self[~self.isna()] # type: ignore[operator]

def duplicated(
self, keep: Literal["first", "last", False] = "first"
) -> npt.NDArray[np.bool_]:
"""
Return boolean ndarray denoting duplicate values.

Parameters
----------
keep : {'first', 'last', False}, default 'first'
- ``first`` : Mark duplicates as ``True`` except for the first occurrence.
- ``last`` : Mark duplicates as ``True`` except for the last occurrence.
- False : Mark all duplicates as ``True``.

Returns
-------
ndarray[bool]

Examples
--------
>>> pd.array([1, 1, 2, 3, 3], dtype="Int64").duplicated()
array([False, True, False, False, True])
"""
return duplicated(values=self, keep=keep)

def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray:
"""
Shift values by desired number.
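
Illustrative example, not part of the diff: how the three keep modes differ on the docstring's own data (pandas >= 2.2 assumed).

import pandas as pd

arr = pd.array([1, 1, 2, 3, 3], dtype="Int64")
print(arr.duplicated(keep="first"))  # [False  True False False  True]
print(arr.duplicated(keep="last"))   # [ True False False  True False]
print(arr.duplicated(keep=False))    # [ True  True False  True  True]
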
8 changes: 8 additions & 0 deletions pandas/core/arrays/masked.py
@@ -952,6 +952,14 @@ def copy(self) -> Self:
mask = self._mask.copy()
return self._simple_new(data, mask)

@doc(ExtensionArray.duplicated)
def duplicated(
self, keep: Literal["first", "last", False] = "first"
) -> npt.NDArray[np.bool_]:
values = self._data
mask = self._mask
return algos.duplicated(values, keep=keep, mask=mask)

def unique(self) -> Self:
"""
Compute the BaseMaskedArray of unique values.
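
Illustrative example, not part of the diff: the masked implementation hands _data and _mask straight to the hashtable, so no conversion or copy is needed, and NA values are treated as duplicates of one another (pandas >= 2.2 assumed).

import pandas as pd

arr = pd.array([1, pd.NA, 1, pd.NA], dtype="Int64")
print(arr.duplicated())  # [False False  True  True]
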
5 changes: 4 additions & 1 deletion pandas/core/base.py
@@ -1365,7 +1365,10 @@ def drop_duplicates(self, *, keep: DropKeep = "first"):

     @final
     def _duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]:
-        return algorithms.duplicated(self._values, keep=keep)
+        arr = self._values
+        if isinstance(arr, ExtensionArray):
+            return arr.duplicated(keep=keep)
+        return algorithms.duplicated(arr, keep=keep)

     def _arith_method(self, other, op):
         res_name = ops.get_op_result_name(self, other)
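
Illustrative example, not part of the diff: with this dispatch, Series.duplicated and Index.duplicated on any extension dtype route through the array's own duplicated implementation, which third-party ExtensionArrays may override (pandas >= 2.2 assumed).

import pandas as pd

s = pd.Series([1.5, 1.5, None], dtype="float64[pyarrow]")
# Series.duplicated -> IndexOpsMixin._duplicated -> ArrowExtensionArray.duplicated
print(s.duplicated().to_numpy())  # [False  True False]
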
12 changes: 12 additions & 0 deletions pandas/tests/extension/base/methods.py
@@ -248,6 +248,18 @@ def test_sort_values_frame(self, data_for_sorting, ascending):
)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("keep", ["first", "last", False])
def test_duplicated(self, data, keep):
arr = data.take([0, 1, 0, 1])
result = arr.duplicated(keep=keep)
if keep == "first":
expected = np.array([False, False, True, True])
elif keep == "last":
expected = np.array([True, True, False, False])
else:
expected = np.array([True, True, True, True])
tm.assert_numpy_array_equal(result, expected)

@pytest.mark.parametrize("box", [pd.Series, lambda x: x])
@pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique])
def test_unique(self, data, box, method):
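
Illustrative sketch, not part of the diff: a third-party extension array that reuses pandas' shared extension tests picks up test_duplicated automatically by inheriting the methods tests. The class name here is hypothetical; fixtures such as data are supplied by the test package's conftest, as in pandas' own suites.

from pandas.tests.extension import base


class TestMyArrayMethods(base.BaseMethodsTests):
    # No override needed: test_duplicated runs against the `data` fixture
    # defined in this package's conftest.
    pass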