|
3 | 3 | import numpy as np
|
4 | 4 | import pytest
|
5 | 5 |
|
6 |
| -from pandas import DataFrame, Series |
| 6 | +from pandas import DataFrame |
7 | 7 | import pandas.util.testing as tm
|
8 | 8 |
|
9 | 9 |
|
10 | 10 | @pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]])
|
11 |
| -def test_duplicated_with_misspelled_column_name(subset): |
| 11 | +def test_drop_duplicates_with_misspelled_column_name(subset): |
12 | 12 | # GH 19730
|
13 | 13 | df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
|
14 | 14 | msg = re.escape("Index(['a'], dtype='object')")
|
15 | 15 |
|
16 |
| - with pytest.raises(KeyError, match=msg): |
17 |
| - df.duplicated(subset) |
18 |
| - |
19 | 16 | with pytest.raises(KeyError, match=msg):
|
20 | 17 | df.drop_duplicates(subset)
|
21 | 18 |
|
22 | 19 |
|
23 |
| -@pytest.mark.slow |
24 |
| -def test_duplicated_do_not_fail_on_wide_dataframes(): |
25 |
| - # gh-21524 |
26 |
| - # Given the wide dataframe with a lot of columns |
27 |
| - # with different (important!) values |
28 |
| - data = { |
29 |
| - "col_{0:02d}".format(i): np.random.randint(0, 1000, 30000) for i in range(100) |
30 |
| - } |
31 |
| - df = DataFrame(data).T |
32 |
| - result = df.duplicated() |
33 |
| - |
34 |
| - # Then duplicates produce the bool Series as a result and don't fail during |
35 |
| - # calculation. Actual values doesn't matter here, though usually it's all |
36 |
| - # False in this case |
37 |
| - assert isinstance(result, Series) |
38 |
| - assert result.dtype == np.bool |
39 |
| - |
40 |
| - |
41 |
| -@pytest.mark.parametrize( |
42 |
| - "keep, expected", |
43 |
| - [ |
44 |
| - ("first", Series([False, False, True, False, True])), |
45 |
| - ("last", Series([True, True, False, False, False])), |
46 |
| - (False, Series([True, True, True, False, True])), |
47 |
| - ], |
48 |
| -) |
49 |
| -def test_duplicated_keep(keep, expected): |
50 |
| - df = DataFrame({"A": [0, 1, 1, 2, 0], "B": ["a", "b", "b", "c", "a"]}) |
51 |
| - |
52 |
| - result = df.duplicated(keep=keep) |
53 |
| - tm.assert_series_equal(result, expected) |
54 |
| - |
55 |
| - |
56 |
| -@pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal") |
57 |
| -@pytest.mark.parametrize( |
58 |
| - "keep, expected", |
59 |
| - [ |
60 |
| - ("first", Series([False, False, True, False, True])), |
61 |
| - ("last", Series([True, True, False, False, False])), |
62 |
| - (False, Series([True, True, True, False, True])), |
63 |
| - ], |
64 |
| -) |
65 |
| -def test_duplicated_nan_none(keep, expected): |
66 |
| - df = DataFrame({"C": [np.nan, 3, 3, None, np.nan]}, dtype=object) |
67 |
| - |
68 |
| - result = df.duplicated(keep=keep) |
69 |
| - tm.assert_series_equal(result, expected) |
70 |
| - |
71 |
| - |
72 |
| -@pytest.mark.parametrize("keep", ["first", "last", False]) |
73 |
| -@pytest.mark.parametrize("subset", [None, ["A", "B"], "A"]) |
74 |
| -def test_duplicated_subset(subset, keep): |
75 |
| - df = DataFrame( |
76 |
| - { |
77 |
| - "A": [0, 1, 1, 2, 0], |
78 |
| - "B": ["a", "b", "b", "c", "a"], |
79 |
| - "C": [np.nan, 3, 3, None, np.nan], |
80 |
| - } |
81 |
| - ) |
82 |
| - |
83 |
| - if subset is None: |
84 |
| - subset = list(df.columns) |
85 |
| - elif isinstance(subset, str): |
86 |
| - # need to have a DataFrame, not a Series |
87 |
| - # -> select columns with singleton list, not string |
88 |
| - subset = [subset] |
89 |
| - |
90 |
| - expected = df[subset].duplicated(keep=keep) |
91 |
| - result = df.duplicated(keep=keep, subset=subset) |
92 |
| - tm.assert_series_equal(result, expected) |
93 |
| - |
94 |
| - |
95 | 20 | def test_drop_duplicates():
|
96 | 21 | df = DataFrame(
|
97 | 22 | {
|
@@ -188,17 +113,6 @@ def test_drop_duplicates():
|
188 | 113 | assert df.duplicated(keep=keep).sum() == 0
|
189 | 114 |
|
190 | 115 |
|
191 |
| -def test_duplicated_on_empty_frame(): |
192 |
| - # GH 25184 |
193 |
| - |
194 |
| - df = DataFrame(columns=["a", "b"]) |
195 |
| - dupes = df.duplicated("a") |
196 |
| - |
197 |
| - result = df[dupes] |
198 |
| - expected = df.copy() |
199 |
| - tm.assert_frame_equal(result, expected) |
200 |
| - |
201 |
| - |
202 | 116 | def test_drop_duplicates_with_duplicate_column_names():
|
203 | 117 | # GH17836
|
204 | 118 | df = DataFrame([[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"])
|
|
0 commit comments