Skip to content

TST: Add tests for duplicated and drop_duplicates #32575

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Apr 6, 2020
72 changes: 67 additions & 5 deletions pandas/tests/indexes/categorical/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from pandas.core.dtypes.dtypes import CategoricalDtype

import pandas as pd
from pandas import Categorical, IntervalIndex
from pandas import Categorical, IntervalIndex, Series
import pandas._testing as tm
from pandas.core.indexes.api import CategoricalIndex, Index

Expand Down Expand Up @@ -353,16 +353,78 @@ def test_is_monotonic(self, data, non_lexsorted_data):
assert c.is_monotonic_decreasing is False

def test_has_duplicates(self):
    """Check that ``is_unique`` and ``has_duplicates`` are exact opposites."""
    # Each case: (values, categories, expected ``is_unique``).
    cases = [
        # The same value three times.
        ([0, 0, 0], None, False),
        # Neither value is a listed category; the test expects the two
        # entries to count as duplicates of each other.
        ([0, 1], [2, 3], False),
        # Only the leading 0 is outside the categories, so every entry
        # is expected to be distinct.
        ([0, 1, 2, 3], [1, 2, 3], True),
    ]
    for values, categories, unique in cases:
        idx = CategoricalIndex(values, categories=categories, name="foo")
        assert idx.is_unique is unique
        assert idx.has_duplicates is (not unique)

def _test_drop_duplicates(self, idx, keep, expected, index):
    """Check ``duplicated`` and ``drop_duplicates`` for several ``keep`` values.

    Parameters
    ----------
    idx : Index
        The index under test.
    keep : iterable
        Values passed as ``keep=`` to ``duplicated``/``drop_duplicates``.
    expected : iterable of boolean arrays
        For each ``keep`` value, the mask ``idx.duplicated`` must return.
    index : iterable
        For each ``keep`` value, the labels the de-duplicated ``Series``
        is expected to retain.

    The three iterables are walked in lockstep with ``zip``.
    """
    for keep_option, dup_mask, kept_labels in zip(keep, expected, index):
        tm.assert_numpy_array_equal(idx.duplicated(keep=keep_option), dup_mask)

        # Entries not flagged as duplicates are the ones that must survive.
        survivors = idx[~dup_mask]

        deduped_index = idx.drop_duplicates(keep=keep_option)
        tm.assert_index_equal(deduped_index, survivors)

        # Same operation via Series, to check Index/Series agreement.
        deduped_series = Series(idx).drop_duplicates(keep=keep_option)
        tm.assert_series_equal(deduped_series, Series(survivors, kept_labels))

def test_drop_duplicates(self):
    """Exercise ``duplicated``/``drop_duplicates`` on ``CategoricalIndex``.

    Each scenario supplies, for ``keep`` in ``("first", "last", False)``,
    the expected duplicate mask and the labels the equivalent ``Series``
    should retain; the checks themselves live in
    ``self._test_drop_duplicates``.
    """
    keep = ["first", "last", False]

    # One value repeated three times, against two different category sets.
    categories = [[1, 2, 3], list("abc")]
    expected = [
        np.array([False, True, True]),
        np.array([True, True, False]),
        np.array([True, True, True]),
    ]
    index = [[0], [2], np.empty(shape=(0), dtype=int)]
    for c in categories:
        idx = CategoricalIndex([1, 1, 1], categories=c, name="foo")
        self._test_drop_duplicates(idx, keep, expected, index)

    # Three entries with no duplicates expected: every mask is all-False
    # and every label survives.
    categories = ["a", "b", "c"]
    idx = CategoricalIndex([2, "a", "b"], categories=categories, name="foo")
    # Builtin ``bool`` rather than the ``np.bool`` alias, which was
    # deprecated in NumPy 1.20 and removed in NumPy 1.24.
    expected = np.zeros(shape=(3, 3), dtype=bool)
    index = [[0, 1, 2], [0, 1, 2], [0, 1, 2]]
    self._test_drop_duplicates(idx, keep, expected, index)

    # "b" appears twice; which copy is flagged depends on ``keep``.
    idx = CategoricalIndex(list("abb"), categories=categories, name="foo")
    expected = [
        np.array([False, False, True]),
        np.array([False, True, False]),
        np.array([False, True, True]),
    ]
    index = [[0, 1], [0, 2], [0]]
    self._test_drop_duplicates(idx, keep, expected, index)

def test_unique(self):
    """Check the values and categories produced by de-duplicating.

    The expectations below encode that the result keeps only the
    categories that are actually present, and that values outside the
    categories show up as a single ``NaN`` entry.
    """
    # A repeated in-category value collapses to one entry.
    idx = CategoricalIndex([1, 1, 1], categories=[1, 2, 3])
    want = CategoricalIndex([1], categories=[1])
    tm.assert_index_equal(idx.unique(), want)

    # NOTE(review): this check uses ``drop_duplicates`` rather than
    # ``unique`` -- confirm it is meant to live in this test.
    idx = CategoricalIndex([0, 0, 0], name="foo")
    want = CategoricalIndex([0], name="foo")
    tm.assert_index_equal(idx.drop_duplicates(), want)

    # A repeated out-of-category value: expected to leave one NaN and no
    # categories.
    idx = CategoricalIndex([1, 1, 1], categories=list("abc"))
    want = CategoricalIndex([np.nan], categories=[])
    tm.assert_index_equal(idx.unique(), want)

    # Mixed: the trailing "a" is not among the integer categories.
    idx = CategoricalIndex([1, 2, "a"], categories=[1, 2, 3])
    want = CategoricalIndex([1, 2, np.nan], categories=[1, 2])
    tm.assert_index_equal(idx.unique(), want)

    # Mixed: the leading 2 is not among the string categories.
    idx = CategoricalIndex([2, "a", "b"], categories=list("abc"))
    want = CategoricalIndex([np.nan, "a", "b"], categories=["a", "b"])
    tm.assert_index_equal(idx.unique(), want)

def test_repr_roundtrip(self):
Expand Down
53 changes: 32 additions & 21 deletions pandas/tests/indexes/datetimes/test_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,9 +259,12 @@ def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture)
tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
assert ordered.freq is None

def test_drop_duplicates_metadata(self):
@pytest.mark.parametrize(
"freq", ["D", "3D", "-3D", "H", "2H", "-2H", "T", "2T", "S", "-3S"]
)
def test_drop_duplicates_metadata(self, freq):
# GH 10115
idx = pd.date_range("2011-01-01", "2011-01-31", freq="D", name="idx")
idx = pd.date_range("2011-01-01", freq=freq, periods=10, name="idx")
result = idx.drop_duplicates()
tm.assert_index_equal(idx, result)
assert idx.freq == result.freq
Expand All @@ -272,26 +275,34 @@ def test_drop_duplicates_metadata(self):
tm.assert_index_equal(idx, result)
assert result.freq is None

def test_drop_duplicates(self):
@pytest.mark.parametrize(
"freq", ["D", "3D", "-3D", "H", "2H", "-2H", "T", "2T", "S", "-3S"]
)
@pytest.mark.parametrize(
"keep, expected, index",
[
("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)),
("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)),
(
False,
np.concatenate(([True] * 5, [False] * 5, [True] * 5)),
np.arange(5, 10),
),
],
)
def test_drop_duplicates(self, freq, keep, expected, index):
# to check Index/Series compat
base = pd.date_range("2011-01-01", "2011-01-31", freq="D", name="idx")
idx = base.append(base[:5])

res = idx.drop_duplicates()
tm.assert_index_equal(res, base)
res = Series(idx).drop_duplicates()
tm.assert_series_equal(res, Series(base))

res = idx.drop_duplicates(keep="last")
exp = base[5:].append(base[:5])
tm.assert_index_equal(res, exp)
res = Series(idx).drop_duplicates(keep="last")
tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36)))

res = idx.drop_duplicates(keep=False)
tm.assert_index_equal(res, base[5:])
res = Series(idx).drop_duplicates(keep=False)
tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31)))
idx = pd.date_range("2011-01-01", freq=freq, periods=10, name="idx")
idx = idx.append(idx[:5])

tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected)
expected = idx[~expected]

result = idx.drop_duplicates(keep=keep)
tm.assert_index_equal(result, expected)

result = Series(idx).drop_duplicates(keep=keep)
tm.assert_series_equal(result, Series(expected, index=index))

@pytest.mark.parametrize(
"freq",
Expand Down
49 changes: 28 additions & 21 deletions pandas/tests/indexes/period/test_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,10 @@ def test_value_counts_unique(self):

tm.assert_index_equal(idx.unique(), exp_idx)

def test_drop_duplicates_metadata(self):
@pytest.mark.parametrize("freq", ["D", "3D", "H", "2H", "T", "2T", "S", "3S"])
def test_drop_duplicates_metadata(self, freq):
# GH 10115
idx = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx")
idx = pd.period_range("2011-01-01", periods=10, freq=freq, name="idx")
result = idx.drop_duplicates()
tm.assert_index_equal(idx, result)
assert idx.freq == result.freq
Expand All @@ -93,26 +94,32 @@ def test_drop_duplicates_metadata(self):
tm.assert_index_equal(idx, result)
assert idx.freq == result.freq

def test_drop_duplicates(self):
@pytest.mark.parametrize("freq", ["D", "3D", "H", "2H", "T", "2T", "S", "3S"])
@pytest.mark.parametrize(
"keep, expected, index",
[
("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)),
("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)),
(
False,
np.concatenate(([True] * 5, [False] * 5, [True] * 5)),
np.arange(5, 10),
),
],
)
def test_drop_duplicates(self, freq, keep, expected, index):
# to check Index/Series compat
base = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx")
idx = base.append(base[:5])

res = idx.drop_duplicates()
tm.assert_index_equal(res, base)
res = Series(idx).drop_duplicates()
tm.assert_series_equal(res, Series(base))

res = idx.drop_duplicates(keep="last")
exp = base[5:].append(base[:5])
tm.assert_index_equal(res, exp)
res = Series(idx).drop_duplicates(keep="last")
tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36)))

res = idx.drop_duplicates(keep=False)
tm.assert_index_equal(res, base[5:])
res = Series(idx).drop_duplicates(keep=False)
tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31)))
idx = pd.period_range("2011-01-01", periods=10, freq=freq, name="idx")
idx = idx.append(idx[:5])

tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected)
expected = idx[~expected]

result = idx.drop_duplicates(keep=keep)
tm.assert_index_equal(result, expected)

result = Series(idx).drop_duplicates(keep=keep)
tm.assert_series_equal(result, Series(expected, index=index))

def test_order_compat(self):
def _check_freq(index, expected_index):
Expand Down
53 changes: 32 additions & 21 deletions pandas/tests/indexes/timedeltas/test_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,9 +136,12 @@ def test_order(self):
tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
assert ordered.freq is None

def test_drop_duplicates_metadata(self):
@pytest.mark.parametrize(
"freq", ["D", "3D", "-3D", "H", "2H", "-2H", "T", "2T", "S", "-3S"]
)
def test_drop_duplicates_metadata(self, freq):
# GH 10115
idx = pd.timedelta_range("1 day", "31 day", freq="D", name="idx")
idx = pd.timedelta_range("1 day", periods=10, freq=freq, name="idx")
result = idx.drop_duplicates()
tm.assert_index_equal(idx, result)
assert idx.freq == result.freq
Expand All @@ -149,26 +152,34 @@ def test_drop_duplicates_metadata(self):
tm.assert_index_equal(idx, result)
assert result.freq is None

def test_drop_duplicates(self):
@pytest.mark.parametrize(
"freq", ["D", "3D", "-3D", "H", "2H", "-2H", "T", "2T", "S", "-3S"]
)
@pytest.mark.parametrize(
"keep, expected, index",
[
("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)),
("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)),
(
False,
np.concatenate(([True] * 5, [False] * 5, [True] * 5)),
np.arange(5, 10),
),
],
)
def test_drop_duplicates(self, freq, keep, expected, index):
# to check Index/Series compat
base = pd.timedelta_range("1 day", "31 day", freq="D", name="idx")
idx = base.append(base[:5])

res = idx.drop_duplicates()
tm.assert_index_equal(res, base)
res = Series(idx).drop_duplicates()
tm.assert_series_equal(res, Series(base))

res = idx.drop_duplicates(keep="last")
exp = base[5:].append(base[:5])
tm.assert_index_equal(res, exp)
res = Series(idx).drop_duplicates(keep="last")
tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36)))

res = idx.drop_duplicates(keep=False)
tm.assert_index_equal(res, base[5:])
res = Series(idx).drop_duplicates(keep=False)
tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31)))
idx = pd.timedelta_range("1 day", periods=10, freq=freq, name="idx")
idx = idx.append(idx[:5])

tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected)
expected = idx[~expected]

result = idx.drop_duplicates(keep=keep)
tm.assert_index_equal(result, expected)

result = Series(idx).drop_duplicates(keep=keep)
tm.assert_series_equal(result, Series(expected, index=index))

@pytest.mark.parametrize(
"freq", ["D", "3D", "-3D", "H", "2H", "-2H", "T", "2T", "S", "-3S"]
Expand Down