Skip to content

Commit 614939a

Browse files
authored
ENH: Allow Iterable[Hashable] in drop_duplicates
1 parent fb6842d commit 614939a

File tree

3 files changed

+49
-13
lines changed

3 files changed

+49
-13
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ Other enhancements
5353
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
5454
- Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
5555
- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
56+
- Support passing an :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`)
5657
- Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`)
5758
- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
5859

pandas/core/frame.py

+10-13
Original file line numberDiff line numberDiff line change
@@ -6406,7 +6406,7 @@ def dropna(
64066406
64076407
thresh : int, optional
64086408
Require that many non-NA values. Cannot be combined with how.
6409-
subset : column label or sequence of labels, optional
6409+
subset : column label or iterable of labels, optional
64106410
Labels along other axis to consider, e.g. if you are dropping rows
64116411
these would be a list of columns to include.
64126412
inplace : bool, default False
@@ -6536,7 +6536,7 @@ def dropna(
65366536
@overload
65376537
def drop_duplicates(
65386538
self,
6539-
subset: Hashable | Sequence[Hashable] | None = ...,
6539+
subset: Hashable | Iterable[Hashable] | None = ...,
65406540
*,
65416541
keep: DropKeep = ...,
65426542
inplace: Literal[True],
@@ -6546,7 +6546,7 @@ def drop_duplicates(
65466546
@overload
65476547
def drop_duplicates(
65486548
self,
6549-
subset: Hashable | Sequence[Hashable] | None = ...,
6549+
subset: Hashable | Iterable[Hashable] | None = ...,
65506550
*,
65516551
keep: DropKeep = ...,
65526552
inplace: Literal[False] = ...,
@@ -6556,7 +6556,7 @@ def drop_duplicates(
65566556
@overload
65576557
def drop_duplicates(
65586558
self,
6559-
subset: Hashable | Sequence[Hashable] | None = ...,
6559+
subset: Hashable | Iterable[Hashable] | None = ...,
65606560
*,
65616561
keep: DropKeep = ...,
65626562
inplace: bool = ...,
@@ -6565,7 +6565,7 @@ def drop_duplicates(
65656565

65666566
def drop_duplicates(
65676567
self,
6568-
subset: Hashable | Sequence[Hashable] | None = None,
6568+
subset: Hashable | Iterable[Hashable] | None = None,
65696569
*,
65706570
keep: DropKeep = "first",
65716571
inplace: bool = False,
@@ -6579,7 +6579,7 @@ def drop_duplicates(
65796579
65806580
Parameters
65816581
----------
6582-
subset : column label or sequence of labels, optional
6582+
subset : column label or iterable of labels, optional
65836583
Only consider certain columns for identifying duplicates, by
65846584
default use all of the columns.
65856585
keep : {'first', 'last', ``False``}, default 'first'
@@ -6669,7 +6669,7 @@ def drop_duplicates(
66696669

66706670
def duplicated(
66716671
self,
6672-
subset: Hashable | Sequence[Hashable] | None = None,
6672+
subset: Hashable | Iterable[Hashable] | None = None,
66736673
keep: DropKeep = "first",
66746674
) -> Series:
66756675
"""
@@ -6679,7 +6679,7 @@ def duplicated(
66796679
66806680
Parameters
66816681
----------
6682-
subset : column label or sequence of labels, optional
6682+
subset : column label or iterable of labels, optional
66836683
Only consider certain columns for identifying duplicates, by
66846684
default use all of the columns.
66856685
keep : {'first', 'last', False}, default 'first'
@@ -6771,10 +6771,7 @@ def f(vals) -> tuple[np.ndarray, int]:
67716771
return labels.astype("i8"), len(shape)
67726772

67736773
if subset is None:
6774-
# https://github.com/pandas-dev/pandas/issues/28770
6775-
# Incompatible types in assignment (expression has type "Index", variable
6776-
# has type "Sequence[Any]")
6777-
subset = self.columns # type: ignore[assignment]
6774+
subset = self.columns
67786775
elif (
67796776
not np.iterable(subset)
67806777
or isinstance(subset, str)
@@ -6795,7 +6792,7 @@ def f(vals) -> tuple[np.ndarray, int]:
67956792

67966793
if len(subset) == 1 and self.columns.is_unique:
67976794
# GH#45236 This is faster than get_group_index below
6798-
result = self[subset[0]].duplicated(keep)
6795+
result = self[next(iter(subset))].duplicated(keep)
67996796
result.name = None
68006797
else:
68016798
vals = (col.values for name, col in self.items() if name in subset)

pandas/tests/frame/methods/test_drop_duplicates.py

+38
Original file line numberDiff line numberDiff line change
@@ -476,3 +476,41 @@ def test_drop_duplicates_non_boolean_ignore_index(arg):
476476
msg = '^For argument "ignore_index" expected type bool, received type .*.$'
477477
with pytest.raises(ValueError, match=msg):
478478
df.drop_duplicates(ignore_index=arg)
479+
480+
481+
def test_drop_duplicates_set():
482+
# GH#59237
483+
df = DataFrame(
484+
{
485+
"AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
486+
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
487+
"C": [1, 1, 2, 2, 2, 2, 1, 2],
488+
"D": range(8),
489+
}
490+
)
491+
# single column
492+
result = df.drop_duplicates({"AAA"})
493+
expected = df[:2]
494+
tm.assert_frame_equal(result, expected)
495+
496+
result = df.drop_duplicates({"AAA"}, keep="last")
497+
expected = df.loc[[6, 7]]
498+
tm.assert_frame_equal(result, expected)
499+
500+
result = df.drop_duplicates({"AAA"}, keep=False)
501+
expected = df.loc[[]]
502+
tm.assert_frame_equal(result, expected)
503+
assert len(result) == 0
504+
505+
# multi column
506+
expected = df.loc[[0, 1, 2, 3]]
507+
result = df.drop_duplicates({"AAA", "B"})
508+
tm.assert_frame_equal(result, expected)
509+
510+
result = df.drop_duplicates({"AAA", "B"}, keep="last")
511+
expected = df.loc[[0, 5, 6, 7]]
512+
tm.assert_frame_equal(result, expected)
513+
514+
result = df.drop_duplicates({"AAA", "B"}, keep=False)
515+
expected = df.loc[[0]]
516+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)