Skip to content

Commit 235113e

Browse files
authored
PERF: Improve performance for df.duplicated with one column subset (#45534)
1 parent 461772a commit 235113e

File tree

4 files changed

+27
-19
lines changed

4 files changed

+27
-19
lines changed

asv_bench/benchmarks/frame_methods.py

+3
Original file line number | Diff line number | Diff line change
@@ -611,6 +611,9 @@ def time_frame_duplicated(self):
611611
def time_frame_duplicated_wide(self):
612612
self.df2.duplicated()
613613

614+
def time_frame_duplicated_subset(self):
615+
self.df.duplicated(subset=["a"])
616+
614617

615618
class XS:
616619

doc/source/whatsnew/v1.5.0.rst

+1-1
Original file line number | Diff line number | Diff line change
@@ -154,7 +154,7 @@ Other Deprecations
154154

155155
Performance improvements
156156
~~~~~~~~~~~~~~~~~~~~~~~~
157-
-
157+
- Performance improvement in :meth:`DataFrame.duplicated` when subset consists of only one column (:issue:`45236`)
158158
-
159159

160160
.. ---------------------------------------------------------------------------

pandas/core/frame.py

+21-16
Original file line number | Diff line number | Diff line change
@@ -6252,22 +6252,27 @@ def f(vals) -> tuple[np.ndarray, int]:
62526252
# Verify all columns in subset exist in the queried dataframe
62536253
# Otherwise, raise a KeyError, same as if you try to __getitem__ with a
62546254
# key that doesn't exist.
6255-
diff = Index(subset).difference(self.columns)
6256-
if not diff.empty:
6257-
raise KeyError(diff)
6258-
6259-
vals = (col.values for name, col in self.items() if name in subset)
6260-
labels, shape = map(list, zip(*map(f, vals)))
6261-
6262-
ids = get_group_index(
6263-
labels,
6264-
# error: Argument 1 to "tuple" has incompatible type "List[_T]";
6265-
# expected "Iterable[int]"
6266-
tuple(shape), # type: ignore[arg-type]
6267-
sort=False,
6268-
xnull=False,
6269-
)
6270-
result = self._constructor_sliced(duplicated(ids, keep), index=self.index)
6255+
diff = set(subset) - set(self.columns)
6256+
if diff:
6257+
raise KeyError(Index(diff))
6258+
6259+
if len(subset) == 1 and self.columns.is_unique:
6260+
# GH#45236 This is faster than get_group_index below
6261+
result = self[subset[0]].duplicated(keep)
6262+
result.name = None
6263+
else:
6264+
vals = (col.values for name, col in self.items() if name in subset)
6265+
labels, shape = map(list, zip(*map(f, vals)))
6266+
6267+
ids = get_group_index(
6268+
labels,
6269+
# error: Argument 1 to "tuple" has incompatible type "List[_T]";
6270+
# expected "Iterable[int]"
6271+
tuple(shape), # type: ignore[arg-type]
6272+
sort=False,
6273+
xnull=False,
6274+
)
6275+
result = self._constructor_sliced(duplicated(ids, keep), index=self.index)
62716276
return result.__finalize__(self, method="duplicated")
62726277

62736278
# ----------------------------------------------------------------------

pandas/tests/frame/methods/test_duplicated.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,7 @@ def test_duplicated_keep(keep, expected):
6262
],
6363
)
6464
def test_duplicated_nan_none(keep, expected):
65-
df = DataFrame({"C": [np.nan, 3, 3, None, np.nan]}, dtype=object)
65+
df = DataFrame({"C": [np.nan, 3, 3, None, np.nan], "x": 1}, dtype=object)
6666

6767
result = df.duplicated(keep=keep)
6868
tm.assert_series_equal(result, expected)
@@ -109,5 +109,5 @@ def test_frame_datetime64_duplicated():
109109
assert (-result).all()
110110

111111
tst = DataFrame({"date": dates})
112-
result = tst.duplicated()
112+
result = tst.date.duplicated()
113113
assert (-result).all()

0 commit comments

Comments
 (0)