Skip to content

Commit 0e382f2

Browse files
mproszewska and jbrockmendel
authored and committed
TST: Add tests for duplicated and drop_duplicates (pandas-dev#32575)
1 parent 047e5d7 commit 0e382f2

File tree

5 files changed

+159
-95
lines changed

5 files changed

+159
-95
lines changed

pandas/tests/indexes/categorical/test_category.py

+70-5
Original file line numberDiff line numberDiff line change
@@ -292,16 +292,81 @@ def test_is_monotonic(self, data, non_lexsorted_data):
292292
assert c.is_monotonic_decreasing is False
293293

294294
def test_has_duplicates(self):
295-
296295
idx = CategoricalIndex([0, 0, 0], name="foo")
297296
assert idx.is_unique is False
298297
assert idx.has_duplicates is True
299298

300-
def test_drop_duplicates(self):
299+
idx = CategoricalIndex([0, 1], categories=[2, 3], name="foo")
300+
assert idx.is_unique is False
301+
assert idx.has_duplicates is True
301302

302-
idx = CategoricalIndex([0, 0, 0], name="foo")
303-
expected = CategoricalIndex([0], name="foo")
304-
tm.assert_index_equal(idx.drop_duplicates(), expected)
303+
idx = CategoricalIndex([0, 1, 2, 3], categories=[1, 2, 3], name="foo")
304+
assert idx.is_unique is True
305+
assert idx.has_duplicates is False
306+
307+
@pytest.mark.parametrize(
308+
"data, categories, expected",
309+
[
310+
(
311+
[1, 1, 1],
312+
[1, 2, 3],
313+
{
314+
"first": np.array([False, True, True]),
315+
"last": np.array([True, True, False]),
316+
False: np.array([True, True, True]),
317+
},
318+
),
319+
(
320+
[1, 1, 1],
321+
list("abc"),
322+
{
323+
"first": np.array([False, True, True]),
324+
"last": np.array([True, True, False]),
325+
False: np.array([True, True, True]),
326+
},
327+
),
328+
(
329+
[2, "a", "b"],
330+
list("abc"),
331+
{
332+
"first": np.zeros(shape=(3), dtype=np.bool),
333+
"last": np.zeros(shape=(3), dtype=np.bool),
334+
False: np.zeros(shape=(3), dtype=np.bool),
335+
},
336+
),
337+
(
338+
list("abb"),
339+
list("abc"),
340+
{
341+
"first": np.array([False, False, True]),
342+
"last": np.array([False, True, False]),
343+
False: np.array([False, True, True]),
344+
},
345+
),
346+
],
347+
)
348+
def test_drop_duplicates(self, data, categories, expected):
349+
350+
idx = CategoricalIndex(data, categories=categories, name="foo")
351+
for keep, e in expected.items():
352+
tm.assert_numpy_array_equal(idx.duplicated(keep=keep), e)
353+
e = idx[~e]
354+
result = idx.drop_duplicates(keep=keep)
355+
tm.assert_index_equal(result, e)
356+
357+
@pytest.mark.parametrize(
358+
"data, categories, expected_data, expected_categories",
359+
[
360+
([1, 1, 1], [1, 2, 3], [1], [1]),
361+
([1, 1, 1], list("abc"), [np.nan], []),
362+
([1, 2, "a"], [1, 2, 3], [1, 2, np.nan], [1, 2]),
363+
([2, "a", "b"], list("abc"), [np.nan, "a", "b"], ["a", "b"]),
364+
],
365+
)
366+
def test_unique(self, data, categories, expected_data, expected_categories):
367+
368+
idx = CategoricalIndex(data, categories=categories)
369+
expected = CategoricalIndex(expected_data, categories=expected_categories)
305370
tm.assert_index_equal(idx.unique(), expected)
306371

307372
def test_repr_roundtrip(self):

pandas/tests/indexes/conftest.py

+9
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,12 @@ def sort(request):
1616
in the Index setops methods.
1717
"""
1818
return request.param
19+
20+
21+
@pytest.fixture(params=["D", "3D", "-3D", "H", "2H", "-2H", "T", "2T", "S", "-3S"])
22+
def freq_sample(request):
23+
"""
24+
Valid values for 'freq' parameter used to create date_range and
25+
timedelta_range.
26+
"""
27+
return request.param

pandas/tests/indexes/datetimes/test_ops.py

+26-45
Original file line numberDiff line numberDiff line change
@@ -264,9 +264,9 @@ def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture)
264264
tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
265265
assert ordered.freq is None
266266

267-
def test_drop_duplicates_metadata(self):
267+
def test_drop_duplicates_metadata(self, freq_sample):
268268
# GH 10115
269-
idx = pd.date_range("2011-01-01", "2011-01-31", freq="D", name="idx")
269+
idx = pd.date_range("2011-01-01", freq=freq_sample, periods=10, name="idx")
270270
result = idx.drop_duplicates()
271271
tm.assert_index_equal(idx, result)
272272
assert idx.freq == result.freq
@@ -277,57 +277,38 @@ def test_drop_duplicates_metadata(self):
277277
tm.assert_index_equal(idx, result)
278278
assert result.freq is None
279279

280-
def test_drop_duplicates(self):
280+
@pytest.mark.parametrize(
281+
"keep, expected, index",
282+
[
283+
("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)),
284+
("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)),
285+
(
286+
False,
287+
np.concatenate(([True] * 5, [False] * 5, [True] * 5)),
288+
np.arange(5, 10),
289+
),
290+
],
291+
)
292+
def test_drop_duplicates(self, freq_sample, keep, expected, index):
281293
# to check Index/Series compat
282-
base = pd.date_range("2011-01-01", "2011-01-31", freq="D", name="idx")
283-
idx = base.append(base[:5])
294+
idx = pd.date_range("2011-01-01", freq=freq_sample, periods=10, name="idx")
295+
idx = idx.append(idx[:5])
284296

285-
res = idx.drop_duplicates()
286-
tm.assert_index_equal(res, base)
287-
res = Series(idx).drop_duplicates()
288-
tm.assert_series_equal(res, Series(base))
297+
tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected)
298+
expected = idx[~expected]
289299

290-
res = idx.drop_duplicates(keep="last")
291-
exp = base[5:].append(base[:5])
292-
tm.assert_index_equal(res, exp)
293-
res = Series(idx).drop_duplicates(keep="last")
294-
tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36)))
300+
result = idx.drop_duplicates(keep=keep)
301+
tm.assert_index_equal(result, expected)
295302

296-
res = idx.drop_duplicates(keep=False)
297-
tm.assert_index_equal(res, base[5:])
298-
res = Series(idx).drop_duplicates(keep=False)
299-
tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31)))
303+
result = Series(idx).drop_duplicates(keep=keep)
304+
tm.assert_series_equal(result, Series(expected, index=index))
300305

301-
@pytest.mark.parametrize(
302-
"freq",
303-
[
304-
"A",
305-
"2A",
306-
"-2A",
307-
"Q",
308-
"-1Q",
309-
"M",
310-
"-1M",
311-
"D",
312-
"3D",
313-
"-3D",
314-
"W",
315-
"-1W",
316-
"H",
317-
"2H",
318-
"-2H",
319-
"T",
320-
"2T",
321-
"S",
322-
"-3S",
323-
],
324-
)
325-
def test_infer_freq(self, freq):
306+
def test_infer_freq(self, freq_sample):
326307
# GH 11018
327-
idx = pd.date_range("2011-01-01 09:00:00", freq=freq, periods=10)
308+
idx = pd.date_range("2011-01-01 09:00:00", freq=freq_sample, periods=10)
328309
result = pd.DatetimeIndex(idx.asi8, freq="infer")
329310
tm.assert_index_equal(idx, result)
330-
assert result.freq == freq
311+
assert result.freq == freq_sample
331312

332313
def test_nat(self, tz_naive_fixture):
333314
tz = tz_naive_fixture

pandas/tests/indexes/period/test_ops.py

+28-21
Original file line numberDiff line numberDiff line change
@@ -81,9 +81,10 @@ def test_value_counts_unique(self):
8181

8282
tm.assert_index_equal(idx.unique(), exp_idx)
8383

84-
def test_drop_duplicates_metadata(self):
84+
@pytest.mark.parametrize("freq", ["D", "3D", "H", "2H", "T", "2T", "S", "3S"])
85+
def test_drop_duplicates_metadata(self, freq):
8586
# GH 10115
86-
idx = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx")
87+
idx = pd.period_range("2011-01-01", periods=10, freq=freq, name="idx")
8788
result = idx.drop_duplicates()
8889
tm.assert_index_equal(idx, result)
8990
assert idx.freq == result.freq
@@ -93,26 +94,32 @@ def test_drop_duplicates_metadata(self):
9394
tm.assert_index_equal(idx, result)
9495
assert idx.freq == result.freq
9596

96-
def test_drop_duplicates(self):
97+
@pytest.mark.parametrize("freq", ["D", "3D", "H", "2H", "T", "2T", "S", "3S"])
98+
@pytest.mark.parametrize(
99+
"keep, expected, index",
100+
[
101+
("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)),
102+
("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)),
103+
(
104+
False,
105+
np.concatenate(([True] * 5, [False] * 5, [True] * 5)),
106+
np.arange(5, 10),
107+
),
108+
],
109+
)
110+
def test_drop_duplicates(self, freq, keep, expected, index):
97111
# to check Index/Series compat
98-
base = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx")
99-
idx = base.append(base[:5])
100-
101-
res = idx.drop_duplicates()
102-
tm.assert_index_equal(res, base)
103-
res = Series(idx).drop_duplicates()
104-
tm.assert_series_equal(res, Series(base))
105-
106-
res = idx.drop_duplicates(keep="last")
107-
exp = base[5:].append(base[:5])
108-
tm.assert_index_equal(res, exp)
109-
res = Series(idx).drop_duplicates(keep="last")
110-
tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36)))
111-
112-
res = idx.drop_duplicates(keep=False)
113-
tm.assert_index_equal(res, base[5:])
114-
res = Series(idx).drop_duplicates(keep=False)
115-
tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31)))
112+
idx = pd.period_range("2011-01-01", periods=10, freq=freq, name="idx")
113+
idx = idx.append(idx[:5])
114+
115+
tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected)
116+
expected = idx[~expected]
117+
118+
result = idx.drop_duplicates(keep=keep)
119+
tm.assert_index_equal(result, expected)
120+
121+
result = Series(idx).drop_duplicates(keep=keep)
122+
tm.assert_series_equal(result, Series(expected, index=index))
116123

117124
def test_order_compat(self):
118125
def _check_freq(index, expected_index):

pandas/tests/indexes/timedeltas/test_ops.py

+26-24
Original file line numberDiff line numberDiff line change
@@ -134,9 +134,9 @@ def test_order(self):
134134
tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
135135
assert ordered.freq is None
136136

137-
def test_drop_duplicates_metadata(self):
137+
def test_drop_duplicates_metadata(self, freq_sample):
138138
# GH 10115
139-
idx = pd.timedelta_range("1 day", "31 day", freq="D", name="idx")
139+
idx = pd.timedelta_range("1 day", periods=10, freq=freq_sample, name="idx")
140140
result = idx.drop_duplicates()
141141
tm.assert_index_equal(idx, result)
142142
assert idx.freq == result.freq
@@ -147,36 +147,38 @@ def test_drop_duplicates_metadata(self):
147147
tm.assert_index_equal(idx, result)
148148
assert result.freq is None
149149

150-
def test_drop_duplicates(self):
150+
@pytest.mark.parametrize(
151+
"keep, expected, index",
152+
[
153+
("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)),
154+
("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)),
155+
(
156+
False,
157+
np.concatenate(([True] * 5, [False] * 5, [True] * 5)),
158+
np.arange(5, 10),
159+
),
160+
],
161+
)
162+
def test_drop_duplicates(self, freq_sample, keep, expected, index):
151163
# to check Index/Series compat
152-
base = pd.timedelta_range("1 day", "31 day", freq="D", name="idx")
153-
idx = base.append(base[:5])
164+
idx = pd.timedelta_range("1 day", periods=10, freq=freq_sample, name="idx")
165+
idx = idx.append(idx[:5])
154166

155-
res = idx.drop_duplicates()
156-
tm.assert_index_equal(res, base)
157-
res = Series(idx).drop_duplicates()
158-
tm.assert_series_equal(res, Series(base))
167+
tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected)
168+
expected = idx[~expected]
159169

160-
res = idx.drop_duplicates(keep="last")
161-
exp = base[5:].append(base[:5])
162-
tm.assert_index_equal(res, exp)
163-
res = Series(idx).drop_duplicates(keep="last")
164-
tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36)))
170+
result = idx.drop_duplicates(keep=keep)
171+
tm.assert_index_equal(result, expected)
165172

166-
res = idx.drop_duplicates(keep=False)
167-
tm.assert_index_equal(res, base[5:])
168-
res = Series(idx).drop_duplicates(keep=False)
169-
tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31)))
173+
result = Series(idx).drop_duplicates(keep=keep)
174+
tm.assert_series_equal(result, Series(expected, index=index))
170175

171-
@pytest.mark.parametrize(
172-
"freq", ["D", "3D", "-3D", "H", "2H", "-2H", "T", "2T", "S", "-3S"]
173-
)
174-
def test_infer_freq(self, freq):
176+
def test_infer_freq(self, freq_sample):
175177
# GH#11018
176-
idx = pd.timedelta_range("1", freq=freq, periods=10)
178+
idx = pd.timedelta_range("1", freq=freq_sample, periods=10)
177179
result = pd.TimedeltaIndex(idx.asi8, freq="infer")
178180
tm.assert_index_equal(idx, result)
179-
assert result.freq == freq
181+
assert result.freq == freq_sample
180182

181183
def test_repeat(self):
182184
index = pd.timedelta_range("1 days", periods=2, freq="D")

0 commit comments

Comments
 (0)