Skip to content

CLN: Remove redundant tests for .duplicated and .drop_duplicates in tests/base #32487

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

9 changes: 9 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,15 @@ def nselect_method(request):
return request.param


@pytest.fixture(params=["first", "last", False])
def keep(request):
    """
    Fixture parametrized over the accepted values of the 'keep' argument
    of .duplicated and .drop_duplicates: "first", "last" and False.
    """
    keep_value = request.param
    return keep_value


@pytest.fixture(params=["left", "right", "both", "neither"])
def closed(request):
"""
Expand Down
102 changes: 0 additions & 102 deletions pandas/tests/base/test_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,108 +594,6 @@ def test_factorize_repeated(self):
expected = o[5:10].append(o[:5])
tm.assert_index_equal(uniques, expected, check_names=False)

def test_duplicated_drop_duplicates_index(self):
# GH 4060
for original in self.objs:
if isinstance(original, Index):

# special case
if original.is_boolean():
result = original.drop_duplicates()
expected = Index([False, True], name="a")
tm.assert_index_equal(result, expected)
continue

# original doesn't have duplicates
expected = np.array([False] * len(original), dtype=bool)
duplicated = original.duplicated()
tm.assert_numpy_array_equal(duplicated, expected)
assert duplicated.dtype == bool
result = original.drop_duplicates()
tm.assert_index_equal(result, original)
assert result is not original
Comment on lines -609 to -616
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now tested in test_drop_duplicates_no_duplicates in tests/indexes/test_common.py, see below


# has_duplicates
assert not original.has_duplicates

# create repeated values: the elements at positions 5 and 3 are appended again
idx = original[list(range(len(original))) + [5, 3]]
expected = np.array([False] * len(original) + [True, True], dtype=bool)
duplicated = idx.duplicated()
tm.assert_numpy_array_equal(duplicated, expected)
assert duplicated.dtype == bool
tm.assert_index_equal(idx.drop_duplicates(), original)

base = [False] * len(idx)
base[3] = True
base[5] = True
expected = np.array(base)

duplicated = idx.duplicated(keep="last")
tm.assert_numpy_array_equal(duplicated, expected)
assert duplicated.dtype == bool
result = idx.drop_duplicates(keep="last")
tm.assert_index_equal(result, idx[~expected])

base = [False] * len(original) + [True, True]
base[3] = True
base[5] = True
expected = np.array(base)

duplicated = idx.duplicated(keep=False)
tm.assert_numpy_array_equal(duplicated, expected)
assert duplicated.dtype == bool
result = idx.drop_duplicates(keep=False)
tm.assert_index_equal(result, idx[~expected])
Comment on lines -618 to -649
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tested in test_drop_duplicates in tests/indexes/test_common.py which was extended and refactored, see below


with pytest.raises(
TypeError,
match=r"drop_duplicates\(\) got an unexpected keyword argument",
):
idx.drop_duplicates(inplace=True)
Comment on lines -651 to -655
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now tested in test_drop_duplicates_inplace in tests/indexes/test_common.py, see below


else:
expected = Series(
[False] * len(original), index=original.index, name="a"
)
tm.assert_series_equal(original.duplicated(), expected)
result = original.drop_duplicates()
tm.assert_series_equal(result, original)
assert result is not original
Comment on lines -658 to -664
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now tested in test_drop_duplicates_no_duplicates in tests/series/methods/test_drop_duplicates.py, see below


idx = original.index[list(range(len(original))) + [5, 3]]
values = original._values[list(range(len(original))) + [5, 3]]
s = Series(values, index=idx, name="a")

expected = Series(
[False] * len(original) + [True, True], index=idx, name="a"
)
tm.assert_series_equal(s.duplicated(), expected)
tm.assert_series_equal(s.drop_duplicates(), original)

base = [False] * len(idx)
base[3] = True
base[5] = True
expected = Series(base, index=idx, name="a")

tm.assert_series_equal(s.duplicated(keep="last"), expected)
tm.assert_series_equal(
s.drop_duplicates(keep="last"), s[~np.array(base)]
)

base = [False] * len(original) + [True, True]
base[3] = True
base[5] = True
expected = Series(base, index=idx, name="a")

tm.assert_series_equal(s.duplicated(keep=False), expected)
tm.assert_series_equal(
s.drop_duplicates(keep=False), s[~np.array(base)]
)

s.drop_duplicates(inplace=True)
tm.assert_series_equal(s, original)
Comment on lines -666 to -697
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


def test_drop_duplicates_series_vs_dataframe(self):
# GH 14192
df = pd.DataFrame(
Expand Down
81 changes: 57 additions & 24 deletions pandas/tests/indexes/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,32 +302,65 @@ def test_pickle(self, indices):
assert indices.equals(unpickled)
indices.name = original_name

@pytest.mark.parametrize("keep", ["first", "last", False])
def test_duplicated(self, indices, keep):
if not len(indices) or isinstance(indices, (MultiIndex, RangeIndex)):
# MultiIndex tested separately in:
# tests/indexes/multi/test_unique_and_duplicates
pytest.skip("Skip check for empty Index, MultiIndex, RangeIndex")

def test_drop_duplicates(self, indices, keep):
    """
    Check Index.duplicated and Index.drop_duplicates on an index that
    contains duplicates, for the given value of ``keep``.

    The expected results are derived from Series.duplicated and
    Series.drop_duplicates, which are tested separately.

    MultiIndex, RangeIndex and empty indexes are skipped here.
    """
    if isinstance(indices, MultiIndex):
        pytest.skip("MultiIndex is tested separately")
    if isinstance(indices, RangeIndex):
        pytest.skip(
            "RangeIndex is tested in test_drop_duplicates_no_duplicates"
            " as it cannot hold duplicates"
        )
    # explicit length check: the truth value of an Index is ambiguous
    if len(indices) == 0:
        pytest.skip(
            "empty index is tested in test_drop_duplicates_no_duplicates"
            " as it cannot hold duplicates"
        )

    # make unique index
    holder = type(indices)
    unique_values = list(set(indices))
    unique_idx = holder(unique_values)

    # make duplicated index
    # Use a locally seeded generator instead of the global np.random state
    # so that a failing run can be reproduced exactly.
    n = len(unique_idx)
    rng = np.random.RandomState(2)
    duplicated_selection = rng.choice(n, int(n * 1.5))
    idx = holder(unique_idx.values[duplicated_selection])

    # Series.duplicated is tested separately
    expected_duplicated = (
        pd.Series(duplicated_selection).duplicated(keep=keep).values
    )
    tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected_duplicated)

    # Series.drop_duplicates is tested separately
    expected_dropped = holder(pd.Series(idx).drop_duplicates(keep=keep))
    tm.assert_index_equal(idx.drop_duplicates(keep=keep), expected_dropped)

def test_drop_duplicates_no_duplicates(self, indices):
if isinstance(indices, MultiIndex):
pytest.skip("MultiIndex is tested separately")

idx = holder(indices)
if idx.has_duplicates:
# We are testing the duplicated-method here, so we need to know
# exactly which indices are duplicate and how (for the result).
# This is not possible if "idx" has duplicates already, which we
# therefore remove. This is seemingly circular, as drop_duplicates
# invokes duplicated, but in the end, it all works out because we
# cross-check with Series.duplicated, which is tested separately.
idx = idx.drop_duplicates()

n, k = len(idx), 10
duplicated_selection = np.random.choice(n, k * n)
expected = pd.Series(duplicated_selection).duplicated(keep=keep).values
idx = holder(idx.values[duplicated_selection])

result = idx.duplicated(keep=keep)
tm.assert_numpy_array_equal(result, expected)
# make unique index
if isinstance(indices, RangeIndex):
# RangeIndex cannot have duplicates
unique_idx = indices
else:
holder = type(indices)
unique_values = list(set(indices))
unique_idx = holder(unique_values)

# check on unique index
expected_duplicated = np.array([False] * len(unique_idx), dtype="bool")
tm.assert_numpy_array_equal(unique_idx.duplicated(), expected_duplicated)
result_dropped = unique_idx.drop_duplicates()
tm.assert_index_equal(result_dropped, unique_idx)
# validate shallow copy
assert result_dropped is not unique_idx

def test_drop_duplicates_inplace(self, indices):
    """
    Index.drop_duplicates does not accept an ``inplace`` keyword;
    passing it must raise TypeError.
    """
    expected_message = r"drop_duplicates\(\) got an unexpected keyword argument"
    with pytest.raises(TypeError, match=expected_message):
        indices.drop_duplicates(inplace=True)

def test_has_duplicates(self, indices):
holder = type(indices)
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/series/methods/test_drop_duplicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,26 @@ def test_drop_duplicates_bool(keep, expected):
tm.assert_series_equal(sc, tc[~expected])


@pytest.mark.parametrize("values", [[], list(range(5))])
def test_drop_duplicates_no_duplicates(any_numpy_dtype, keep, values):
    """
    For a Series without duplicates, .duplicated is all-False and
    .drop_duplicates returns an equal Series that is a different object.
    """
    ser = Series(values, dtype=np.dtype(any_numpy_dtype))
    not_duplicated = Series([False] * len(ser), dtype="bool")

    if ser.dtype == "bool":
        # Casting to bool maps 0 -> False and 1 -> True; every further
        # value would repeat one of those, so keep at most two elements.
        ser = ser[:2]
        not_duplicated = not_duplicated[:2]

    duplicated_mask = ser.duplicated(keep=keep)
    tm.assert_series_equal(duplicated_mask, not_duplicated)

    deduplicated = ser.drop_duplicates(keep=keep)
    tm.assert_series_equal(deduplicated, ser)

    # the result must be a new object, not the input Series itself
    assert deduplicated is not ser


class TestSeriesDropDuplicates:
@pytest.mark.parametrize(
"dtype",
Expand Down