
Commit 5549a54

SaturnFromTitan authored and SeeminSyed committed
CLN: Remove redundant tests for .duplicated and .drop_duplicates in tests/base (pandas-dev#32487)
1 parent 1675ac5 commit 5549a54

File tree

4 files changed, +86 -126 lines


pandas/conftest.py

+9
@@ -425,6 +425,15 @@ def nselect_method(request):
     return request.param


+@pytest.fixture(params=["first", "last", False])
+def keep(request):
+    """
+    Valid values for the 'keep' parameter used in
+    .duplicated or .drop_duplicates
+    """
+    return request.param
+
+
 @pytest.fixture(params=["left", "right", "both", "neither"])
 def closed(request):
     """

pandas/tests/base/test_ops.py

-102
@@ -570,108 +570,6 @@ def test_factorize(self, index_or_series_obj, sort):
         tm.assert_numpy_array_equal(result_codes, expected_codes)
         tm.assert_index_equal(result_uniques, expected_uniques)

-    def test_duplicated_drop_duplicates_index(self):
-        # GH 4060
-        for original in self.objs:
-            if isinstance(original, Index):
-
-                # special case
-                if original.is_boolean():
-                    result = original.drop_duplicates()
-                    expected = Index([False, True], name="a")
-                    tm.assert_index_equal(result, expected)
-                    continue
-
-                # original doesn't have duplicates
-                expected = np.array([False] * len(original), dtype=bool)
-                duplicated = original.duplicated()
-                tm.assert_numpy_array_equal(duplicated, expected)
-                assert duplicated.dtype == bool
-                result = original.drop_duplicates()
-                tm.assert_index_equal(result, original)
-                assert result is not original
-
-                # has_duplicates
-                assert not original.has_duplicates
-
-                # create repeated values, 3rd and 5th values are duplicated
-                idx = original[list(range(len(original))) + [5, 3]]
-                expected = np.array([False] * len(original) + [True, True], dtype=bool)
-                duplicated = idx.duplicated()
-                tm.assert_numpy_array_equal(duplicated, expected)
-                assert duplicated.dtype == bool
-                tm.assert_index_equal(idx.drop_duplicates(), original)
-
-                base = [False] * len(idx)
-                base[3] = True
-                base[5] = True
-                expected = np.array(base)
-
-                duplicated = idx.duplicated(keep="last")
-                tm.assert_numpy_array_equal(duplicated, expected)
-                assert duplicated.dtype == bool
-                result = idx.drop_duplicates(keep="last")
-                tm.assert_index_equal(result, idx[~expected])
-
-                base = [False] * len(original) + [True, True]
-                base[3] = True
-                base[5] = True
-                expected = np.array(base)
-
-                duplicated = idx.duplicated(keep=False)
-                tm.assert_numpy_array_equal(duplicated, expected)
-                assert duplicated.dtype == bool
-                result = idx.drop_duplicates(keep=False)
-                tm.assert_index_equal(result, idx[~expected])
-
-                with pytest.raises(
-                    TypeError,
-                    match=r"drop_duplicates\(\) got an unexpected keyword argument",
-                ):
-                    idx.drop_duplicates(inplace=True)
-
-            else:
-                expected = Series(
-                    [False] * len(original), index=original.index, name="a"
-                )
-                tm.assert_series_equal(original.duplicated(), expected)
-                result = original.drop_duplicates()
-                tm.assert_series_equal(result, original)
-                assert result is not original
-
-                idx = original.index[list(range(len(original))) + [5, 3]]
-                values = original._values[list(range(len(original))) + [5, 3]]
-                s = Series(values, index=idx, name="a")
-
-                expected = Series(
-                    [False] * len(original) + [True, True], index=idx, name="a"
-                )
-                tm.assert_series_equal(s.duplicated(), expected)
-                tm.assert_series_equal(s.drop_duplicates(), original)
-
-                base = [False] * len(idx)
-                base[3] = True
-                base[5] = True
-                expected = Series(base, index=idx, name="a")
-
-                tm.assert_series_equal(s.duplicated(keep="last"), expected)
-                tm.assert_series_equal(
-                    s.drop_duplicates(keep="last"), s[~np.array(base)]
-                )
-
-                base = [False] * len(original) + [True, True]
-                base[3] = True
-                base[5] = True
-                expected = Series(base, index=idx, name="a")
-
-                tm.assert_series_equal(s.duplicated(keep=False), expected)
-                tm.assert_series_equal(
-                    s.drop_duplicates(keep=False), s[~np.array(base)]
-                )
-
-                s.drop_duplicates(inplace=True)
-                tm.assert_series_equal(s, original)
-
     def test_drop_duplicates_series_vs_dataframe(self):
         # GH 14192
         df = pd.DataFrame(

pandas/tests/indexes/test_common.py

+57 -24
@@ -302,32 +302,65 @@ def test_pickle(self, indices):
         assert indices.equals(unpickled)
         indices.name = original_name

-    @pytest.mark.parametrize("keep", ["first", "last", False])
-    def test_duplicated(self, indices, keep):
-        if not len(indices) or isinstance(indices, (MultiIndex, RangeIndex)):
-            # MultiIndex tested separately in:
-            # tests/indexes/multi/test_unique_and_duplicates
-            pytest.skip("Skip check for empty Index, MultiIndex, RangeIndex")
-
+    def test_drop_duplicates(self, indices, keep):
+        if isinstance(indices, MultiIndex):
+            pytest.skip("MultiIndex is tested separately")
+        if isinstance(indices, RangeIndex):
+            pytest.skip(
+                "RangeIndex is tested in test_drop_duplicates_no_duplicates"
+                " as it cannot hold duplicates"
+            )
+        if len(indices) == 0:
+            pytest.skip(
+                "empty index is tested in test_drop_duplicates_no_duplicates"
+                " as it cannot hold duplicates"
+            )
+
+        # make unique index
         holder = type(indices)
+        unique_values = list(set(indices))
+        unique_idx = holder(unique_values)
+
+        # make duplicated index
+        n = len(unique_idx)
+        duplicated_selection = np.random.choice(n, int(n * 1.5))
+        idx = holder(unique_idx.values[duplicated_selection])
+
+        # Series.duplicated is tested separately
+        expected_duplicated = (
+            pd.Series(duplicated_selection).duplicated(keep=keep).values
+        )
+        tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected_duplicated)
+
+        # Series.drop_duplicates is tested separately
+        expected_dropped = holder(pd.Series(idx).drop_duplicates(keep=keep))
+        tm.assert_index_equal(idx.drop_duplicates(keep=keep), expected_dropped)
+
+    def test_drop_duplicates_no_duplicates(self, indices):
+        if isinstance(indices, MultiIndex):
+            pytest.skip("MultiIndex is tested separately")

-        idx = holder(indices)
-        if idx.has_duplicates:
-            # We are testing the duplicated-method here, so we need to know
-            # exactly which indices are duplicate and how (for the result).
-            # This is not possible if "idx" has duplicates already, which we
-            # therefore remove. This is seemingly circular, as drop_duplicates
-            # invokes duplicated, but in the end, it all works out because we
-            # cross-check with Series.duplicated, which is tested separately.
-            idx = idx.drop_duplicates()
-
-        n, k = len(idx), 10
-        duplicated_selection = np.random.choice(n, k * n)
-        expected = pd.Series(duplicated_selection).duplicated(keep=keep).values
-        idx = holder(idx.values[duplicated_selection])
-
-        result = idx.duplicated(keep=keep)
-        tm.assert_numpy_array_equal(result, expected)
+        # make unique index
+        if isinstance(indices, RangeIndex):
+            # RangeIndex cannot have duplicates
+            unique_idx = indices
+        else:
+            holder = type(indices)
+            unique_values = list(set(indices))
+            unique_idx = holder(unique_values)
+
+        # check on unique index
+        expected_duplicated = np.array([False] * len(unique_idx), dtype="bool")
+        tm.assert_numpy_array_equal(unique_idx.duplicated(), expected_duplicated)
+        result_dropped = unique_idx.drop_duplicates()
+        tm.assert_index_equal(result_dropped, unique_idx)
+        # validate shallow copy
+        assert result_dropped is not unique_idx
+
+    def test_drop_duplicates_inplace(self, indices):
+        msg = r"drop_duplicates\(\) got an unexpected keyword argument"
+        with pytest.raises(TypeError, match=msg):
+            indices.drop_duplicates(inplace=True)

     def test_has_duplicates(self, indices):
         holder = type(indices)
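
The rewritten Index tests derive their expected results from Series.duplicated and Series.drop_duplicates, which have their own test coverage, instead of hand-building boolean masks as the removed test did. A small illustration of that cross-check on concrete data (the values here are made up for the example, not taken from the commit):

    import numpy as np
    import pandas as pd
    import pandas._testing as tm

    # Positions drawn with replacement from a unique index give a known
    # duplication pattern; Series.duplicated on the positions predicts
    # Index.duplicated on the resulting index.
    positions = np.array([0, 1, 1, 3, 0])
    idx = pd.Index(list("abcde"))[positions]

    expected = pd.Series(positions).duplicated(keep="last").values
    tm.assert_numpy_array_equal(idx.duplicated(keep="last"), expected)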

pandas/tests/series/methods/test_drop_duplicates.py

+20
@@ -44,6 +44,26 @@ def test_drop_duplicates_bool(keep, expected):
     tm.assert_series_equal(sc, tc[~expected])


+@pytest.mark.parametrize("values", [[], list(range(5))])
+def test_drop_duplicates_no_duplicates(any_numpy_dtype, keep, values):
+    tc = Series(values, dtype=np.dtype(any_numpy_dtype))
+    expected = Series([False] * len(tc), dtype="bool")
+
+    if tc.dtype == "bool":
+        # 0 -> False and 1-> True
+        # any other value would be duplicated
+        tc = tc[:2]
+        expected = expected[:2]
+
+    tm.assert_series_equal(tc.duplicated(keep=keep), expected)
+
+    result_dropped = tc.drop_duplicates(keep=keep)
+    tm.assert_series_equal(result_dropped, tc)
+
+    # validate shallow copy
+    assert result_dropped is not tc
+
+
 class TestSeriesDropDuplicates:
     @pytest.mark.parametrize(
         "dtype",

0 commit comments