TST: Add tests for duplicated and drop_duplicates #32575

Merged · 14 commits · Apr 6, 2020
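
All of the new tests below exercise the same relationship: for a given `keep` value, `drop_duplicates(keep=...)` returns exactly the elements whose `duplicated(keep=...)` mask is False. A minimal standalone sketch of that relationship (illustrative only, using one of the categorical cases from the diff):

```python
import pandas as pd

# Sketch of the invariant the parametrized tests check: drop_duplicates(keep=k)
# keeps exactly the rows where duplicated(keep=k) is False.
idx = pd.CategoricalIndex(list("abb"), categories=list("abc"), name="foo")

for keep in ("first", "last", False):
    mask = idx.duplicated(keep=keep)  # boolean ndarray flagging duplicate positions
    assert idx.drop_duplicates(keep=keep).equals(idx[~mask])
```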
75 changes: 70 additions & 5 deletions pandas/tests/indexes/categorical/test_category.py
@@ -292,16 +292,81 @@ def test_is_monotonic(self, data, non_lexsorted_data):
assert c.is_monotonic_decreasing is False

def test_has_duplicates(self):

idx = CategoricalIndex([0, 0, 0], name="foo")
assert idx.is_unique is False
assert idx.has_duplicates is True

def test_drop_duplicates(self):
idx = CategoricalIndex([0, 1], categories=[2, 3], name="foo")
assert idx.is_unique is False
assert idx.has_duplicates is True

idx = CategoricalIndex([0, 0, 0], name="foo")
expected = CategoricalIndex([0], name="foo")
tm.assert_index_equal(idx.drop_duplicates(), expected)
idx = CategoricalIndex([0, 1, 2, 3], categories=[1, 2, 3], name="foo")
assert idx.is_unique is True
assert idx.has_duplicates is False

@pytest.mark.parametrize(
"data, categories, expected",
[
(
[1, 1, 1],
[1, 2, 3],
{
"first": np.array([False, True, True]),
"last": np.array([True, True, False]),
False: np.array([True, True, True]),
},
),
(
[1, 1, 1],
list("abc"),
{
"first": np.array([False, True, True]),
"last": np.array([True, True, False]),
False: np.array([True, True, True]),
},
),
(
[2, "a", "b"],
list("abc"),
{
"first": np.zeros(shape=(3), dtype=np.bool),
"last": np.zeros(shape=(3), dtype=np.bool),
False: np.zeros(shape=(3), dtype=np.bool),
},
),
(
list("abb"),
list("abc"),
{
"first": np.array([False, False, True]),
"last": np.array([False, True, False]),
False: np.array([False, True, True]),
},
),
],
)
def test_drop_duplicates(self, data, categories, expected):

idx = CategoricalIndex(data, categories=categories, name="foo")
for keep, e in expected.items():
tm.assert_numpy_array_equal(idx.duplicated(keep=keep), e)
e = idx[~e]
result = idx.drop_duplicates(keep=keep)
tm.assert_index_equal(result, e)

@pytest.mark.parametrize(
"data, categories, expected_data, expected_categories",
[
([1, 1, 1], [1, 2, 3], [1], [1]),
([1, 1, 1], list("abc"), [np.nan], []),
([1, 2, "a"], [1, 2, 3], [1, 2, np.nan], [1, 2]),
([2, "a", "b"], list("abc"), [np.nan, "a", "b"], ["a", "b"]),
],
)
def test_unique(self, data, categories, expected_data, expected_categories):

idx = CategoricalIndex(data, categories=categories)
expected = CategoricalIndex(expected_data, categories=expected_categories)
tm.assert_index_equal(idx.unique(), expected)

def test_repr_roundtrip(self):
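
The `test_unique` parametrization above relies on two behaviours: values not listed in `categories` are coerced to NaN, and `unique()` drops categories that never appear. One of the parametrized cases written out by hand, assuming `pandas._testing` as imported in the test module:

```python
import numpy as np
import pandas as pd
import pandas._testing as tm

# Mirrors the ([1, 2, "a"], [1, 2, 3], [1, 2, np.nan], [1, 2]) case above:
# "a" is not a category, so it becomes NaN, and the unused category 3 is
# dropped from the result of unique().
idx = pd.CategoricalIndex([1, 2, "a"], categories=[1, 2, 3])
expected = pd.CategoricalIndex([1, 2, np.nan], categories=[1, 2])
tm.assert_index_equal(idx.unique(), expected)
```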
9 changes: 9 additions & 0 deletions pandas/tests/indexes/conftest.py
@@ -16,3 +16,12 @@ def sort(request):
in the Index setops methods.
"""
return request.param


@pytest.fixture(params=["D", "3D", "-3D", "H", "2H", "-2H", "T", "2T", "S", "-3S"])
def freq_sample(request):
"""
Valid values for the 'freq' parameter used to create date_range and
timedelta_range.
"""
return request.param
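
Since `freq_sample` is defined in `pandas/tests/indexes/conftest.py`, pytest injects it by name into any test under that directory, and each such test runs once per frequency string. A hypothetical consumer (the test name below is made up for illustration):

```python
import pandas as pd

# Hypothetical test, not from this PR: pytest supplies `freq_sample` from the
# conftest fixture, so this body runs once for each of the ten frequency strings.
def test_range_roundtrips_freq(freq_sample):
    idx = pd.date_range("2011-01-01", periods=10, freq=freq_sample)
    assert idx.freq == freq_sample
```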
71 changes: 26 additions & 45 deletions pandas/tests/indexes/datetimes/test_ops.py
@@ -264,9 +264,9 @@ def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture)
tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
assert ordered.freq is None

def test_drop_duplicates_metadata(self):
def test_drop_duplicates_metadata(self, freq_sample):
# GH 10115
idx = pd.date_range("2011-01-01", "2011-01-31", freq="D", name="idx")
idx = pd.date_range("2011-01-01", freq=freq_sample, periods=10, name="idx")
result = idx.drop_duplicates()
tm.assert_index_equal(idx, result)
assert idx.freq == result.freq
@@ -277,57 +277,38 @@ def test_drop_duplicates_metadata(self):
tm.assert_index_equal(idx, result)
assert result.freq is None

def test_drop_duplicates(self):
@pytest.mark.parametrize(
"keep, expected, index",
[
("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)),
("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)),
(
False,
np.concatenate(([True] * 5, [False] * 5, [True] * 5)),
np.arange(5, 10),
),
],
)
def test_drop_duplicates(self, freq_sample, keep, expected, index):
# to check Index/Series compat
base = pd.date_range("2011-01-01", "2011-01-31", freq="D", name="idx")
idx = base.append(base[:5])
idx = pd.date_range("2011-01-01", freq=freq_sample, periods=10, name="idx")
idx = idx.append(idx[:5])

res = idx.drop_duplicates()
tm.assert_index_equal(res, base)
res = Series(idx).drop_duplicates()
tm.assert_series_equal(res, Series(base))
tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected)
expected = idx[~expected]

res = idx.drop_duplicates(keep="last")
exp = base[5:].append(base[:5])
tm.assert_index_equal(res, exp)
res = Series(idx).drop_duplicates(keep="last")
tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36)))
result = idx.drop_duplicates(keep=keep)
tm.assert_index_equal(result, expected)

res = idx.drop_duplicates(keep=False)
tm.assert_index_equal(res, base[5:])
res = Series(idx).drop_duplicates(keep=False)
tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31)))
result = Series(idx).drop_duplicates(keep=keep)
tm.assert_series_equal(result, Series(expected, index=index))

@pytest.mark.parametrize(
"freq",
[
"A",
"2A",
"-2A",
"Q",
"-1Q",
"M",
"-1M",
"D",
"3D",
"-3D",
"W",
"-1W",
"H",
"2H",
"-2H",
"T",
"2T",
"S",
"-3S",
],
)
def test_infer_freq(self, freq):
def test_infer_freq(self, freq_sample):
# GH 11018
idx = pd.date_range("2011-01-01 09:00:00", freq=freq, periods=10)
idx = pd.date_range("2011-01-01 09:00:00", freq=freq_sample, periods=10)
result = pd.DatetimeIndex(idx.asi8, freq="infer")
tm.assert_index_equal(idx, result)
assert result.freq == freq
assert result.freq == freq_sample

def test_nat(self, tz_naive_fixture):
tz = tz_naive_fixture
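
The expected masks in the parametrization above follow from how the index is built: ten unique timestamps followed by a copy of the first five, so each `keep` mode flags a different block of positions, and `index` is the set of labels the Series variant keeps. A worked check of the three masks, using a fixed daily frequency for illustration:

```python
import numpy as np
import pandas as pd

# Ten unique timestamps plus a copy of the first five, as in the test above.
idx = pd.date_range("2011-01-01", periods=10, freq="D", name="idx")
idx = idx.append(idx[:5])

# keep="first": only the appended copies (positions 10-14) count as duplicates.
assert (idx.duplicated(keep="first") == np.concatenate(([False] * 10, [True] * 5))).all()
# keep="last": the first occurrences of the repeated five (positions 0-4) do.
assert (idx.duplicated(keep="last") == np.concatenate(([True] * 5, [False] * 10))).all()
# keep=False: every repeated value is flagged, leaving only positions 5-9.
assert (
    idx.duplicated(keep=False) == np.concatenate(([True] * 5, [False] * 5, [True] * 5))
).all()
```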
49 changes: 28 additions & 21 deletions pandas/tests/indexes/period/test_ops.py
@@ -81,9 +81,10 @@ def test_value_counts_unique(self):

tm.assert_index_equal(idx.unique(), exp_idx)

def test_drop_duplicates_metadata(self):
@pytest.mark.parametrize("freq", ["D", "3D", "H", "2H", "T", "2T", "S", "3S"])
def test_drop_duplicates_metadata(self, freq):
# GH 10115
idx = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx")
idx = pd.period_range("2011-01-01", periods=10, freq=freq, name="idx")
result = idx.drop_duplicates()
tm.assert_index_equal(idx, result)
assert idx.freq == result.freq
@@ -93,26 +94,32 @@ def test_drop_duplicates_metadata(self):
tm.assert_index_equal(idx, result)
assert idx.freq == result.freq

def test_drop_duplicates(self):
@pytest.mark.parametrize("freq", ["D", "3D", "H", "2H", "T", "2T", "S", "3S"])
@pytest.mark.parametrize(
"keep, expected, index",
[
("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)),
("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)),
(
False,
np.concatenate(([True] * 5, [False] * 5, [True] * 5)),
np.arange(5, 10),
),
],
)
def test_drop_duplicates(self, freq, keep, expected, index):
# to check Index/Series compat
base = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx")
idx = base.append(base[:5])

res = idx.drop_duplicates()
tm.assert_index_equal(res, base)
res = Series(idx).drop_duplicates()
tm.assert_series_equal(res, Series(base))

res = idx.drop_duplicates(keep="last")
exp = base[5:].append(base[:5])
tm.assert_index_equal(res, exp)
res = Series(idx).drop_duplicates(keep="last")
tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36)))

res = idx.drop_duplicates(keep=False)
tm.assert_index_equal(res, base[5:])
res = Series(idx).drop_duplicates(keep=False)
tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31)))
idx = pd.period_range("2011-01-01", periods=10, freq=freq, name="idx")
idx = idx.append(idx[:5])

tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected)
expected = idx[~expected]

result = idx.drop_duplicates(keep=keep)
tm.assert_index_equal(result, expected)

result = Series(idx).drop_duplicates(keep=keep)
tm.assert_series_equal(result, Series(expected, index=index))

def test_order_compat(self):
def _check_freq(index, expected_index):
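
One detail worth noting when comparing the period version of `test_drop_duplicates_metadata` with the datetime and timedelta versions: a `PeriodIndex` carries its freq as part of its dtype, so the freq survives the append/drop round trip, while the `DatetimeIndex` test ends with `assert result.freq is None`. A short sketch of that contrast, with a daily frequency chosen for illustration:

```python
import pandas as pd

# PeriodIndex: freq is part of the dtype, so append + drop_duplicates keeps it.
pidx = pd.period_range("2011-01-01", periods=10, freq="D", name="idx")
assert pidx.append(pidx[:5]).drop_duplicates().freq == pidx.freq

# DatetimeIndex: appending duplicates discards the inferred freq, and
# drop_duplicates does not restore it.
didx = pd.date_range("2011-01-01", periods=10, freq="D", name="idx")
assert didx.append(didx[:5]).drop_duplicates().freq is None
```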
50 changes: 26 additions & 24 deletions pandas/tests/indexes/timedeltas/test_ops.py
@@ -134,9 +134,9 @@ def test_order(self):
tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
assert ordered.freq is None

def test_drop_duplicates_metadata(self):
def test_drop_duplicates_metadata(self, freq_sample):
# GH 10115
idx = pd.timedelta_range("1 day", "31 day", freq="D", name="idx")
idx = pd.timedelta_range("1 day", periods=10, freq=freq_sample, name="idx")
result = idx.drop_duplicates()
tm.assert_index_equal(idx, result)
assert idx.freq == result.freq
@@ -147,36 +147,38 @@ def test_drop_duplicates_metadata(self):
tm.assert_index_equal(idx, result)
assert result.freq is None

def test_drop_duplicates(self):
@pytest.mark.parametrize(
"keep, expected, index",
[
("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)),
("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)),
(
False,
np.concatenate(([True] * 5, [False] * 5, [True] * 5)),
np.arange(5, 10),
),
],
)
def test_drop_duplicates(self, freq_sample, keep, expected, index):
# to check Index/Series compat
base = pd.timedelta_range("1 day", "31 day", freq="D", name="idx")
idx = base.append(base[:5])
idx = pd.timedelta_range("1 day", periods=10, freq=freq_sample, name="idx")
idx = idx.append(idx[:5])

res = idx.drop_duplicates()
tm.assert_index_equal(res, base)
res = Series(idx).drop_duplicates()
tm.assert_series_equal(res, Series(base))
tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected)
expected = idx[~expected]

res = idx.drop_duplicates(keep="last")
exp = base[5:].append(base[:5])
tm.assert_index_equal(res, exp)
res = Series(idx).drop_duplicates(keep="last")
tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36)))
result = idx.drop_duplicates(keep=keep)
tm.assert_index_equal(result, expected)

res = idx.drop_duplicates(keep=False)
tm.assert_index_equal(res, base[5:])
res = Series(idx).drop_duplicates(keep=False)
tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31)))
result = Series(idx).drop_duplicates(keep=keep)
tm.assert_series_equal(result, Series(expected, index=index))

@pytest.mark.parametrize(
"freq", ["D", "3D", "-3D", "H", "2H", "-2H", "T", "2T", "S", "-3S"]
)
def test_infer_freq(self, freq):
def test_infer_freq(self, freq_sample):
# GH#11018
idx = pd.timedelta_range("1", freq=freq, periods=10)
idx = pd.timedelta_range("1", freq=freq_sample, periods=10)
result = pd.TimedeltaIndex(idx.asi8, freq="infer")
tm.assert_index_equal(idx, result)
assert result.freq == freq
assert result.freq == freq_sample

def test_repeat(self):
index = pd.timedelta_range("1 days", periods=2, freq="D")
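
Finally, the reason the Series variants pass an explicit `index` (np.arange(0, 10), np.arange(5, 15), np.arange(5, 10)): `Series.drop_duplicates` keeps the original integer labels of the surviving rows instead of resetting them. A sketch using the timedelta case with a fixed daily frequency:

```python
import numpy as np
import pandas as pd

# Labels 0..14: ten unique timedeltas followed by a copy of the first five.
idx = pd.timedelta_range("1 day", periods=10, freq="D", name="idx")
ser = pd.Series(idx.append(idx[:5]))

# keep="last" drops labels 0-4 (their later copies at 10-14 win), so the
# surviving labels are exactly np.arange(5, 15), matching the parametrization.
result = ser.drop_duplicates(keep="last")
assert list(result.index) == list(np.arange(5, 15))
```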