TST: Add tests for duplicated and drop_duplicates (pandas-dev#32575)

mproszewska · jbrockmendel · commit 0e382f2f305e · 2020-04-06T17:36:54.000-07:00
diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py
@@ -292,16 +292,81 @@ def test_is_monotonic(self, data, non_lexsorted_data):
         assert c.is_monotonic_decreasing is False
 
     def test_has_duplicates(self):
-
         idx = CategoricalIndex([0, 0, 0], name="foo")
         assert idx.is_unique is False
         assert idx.has_duplicates is True
 
-    def test_drop_duplicates(self):
+        idx = CategoricalIndex([0, 1], categories=[2, 3], name="foo")
+        assert idx.is_unique is False
+        assert idx.has_duplicates is True
 
-        idx = CategoricalIndex([0, 0, 0], name="foo")
-        expected = CategoricalIndex([0], name="foo")
-        tm.assert_index_equal(idx.drop_duplicates(), expected)
+        idx = CategoricalIndex([0, 1, 2, 3], categories=[1, 2, 3], name="foo")
+        assert idx.is_unique is True
+        assert idx.has_duplicates is False
+
+    @pytest.mark.parametrize(
+        "data, categories, expected",
+        [
+            (
+                [1, 1, 1],
+                [1, 2, 3],
+                {
+                    "first": np.array([False, True, True]),
+                    "last": np.array([True, True, False]),
+                    False: np.array([True, True, True]),
+                },
+            ),
+            (
+                [1, 1, 1],
+                list("abc"),
+                {
+                    "first": np.array([False, True, True]),
+                    "last": np.array([True, True, False]),
+                    False: np.array([True, True, True]),
+                },
+            ),
+            (
+                [2, "a", "b"],
+                list("abc"),
+                {
+                    "first": np.zeros(shape=(3), dtype=np.bool),
+                    "last": np.zeros(shape=(3), dtype=np.bool),
+                    False: np.zeros(shape=(3), dtype=np.bool),
+                },
+            ),
+            (
+                list("abb"),
+                list("abc"),
+                {
+                    "first": np.array([False, False, True]),
+                    "last": np.array([False, True, False]),
+                    False: np.array([False, True, True]),
+                },
+            ),
+        ],
+    )
+    def test_drop_duplicates(self, data, categories, expected):
+
+        idx = CategoricalIndex(data, categories=categories, name="foo")
+        for keep, e in expected.items():
+            tm.assert_numpy_array_equal(idx.duplicated(keep=keep), e)
+            e = idx[~e]
+            result = idx.drop_duplicates(keep=keep)
+            tm.assert_index_equal(result, e)
+
+    @pytest.mark.parametrize(
+        "data, categories, expected_data, expected_categories",
+        [
+            ([1, 1, 1], [1, 2, 3], [1], [1]),
+            ([1, 1, 1], list("abc"), [np.nan], []),
+            ([1, 2, "a"], [1, 2, 3], [1, 2, np.nan], [1, 2]),
+            ([2, "a", "b"], list("abc"), [np.nan, "a", "b"], ["a", "b"]),
+        ],
+    )
+    def test_unique(self, data, categories, expected_data, expected_categories):
+
+        idx = CategoricalIndex(data, categories=categories)
+        expected = CategoricalIndex(expected_data, categories=expected_categories)
         tm.assert_index_equal(idx.unique(), expected)
 
     def test_repr_roundtrip(self):
diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py
@@ -16,3 +16,12 @@ def sort(request):
         in in the Index setops methods.
     """
     return request.param
+
+
+@pytest.fixture(params=["D", "3D", "-3D", "H", "2H", "-2H", "T", "2T", "S", "-3S"])
+def freq_sample(request):
+    """
+    Valid values for 'freq' parameter used to create date_range and
+    timedelta_range.
+    """
+    return request.param
diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py
@@ -264,9 +264,9 @@ def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture)
         tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
         assert ordered.freq is None
 
-    def test_drop_duplicates_metadata(self):
+    def test_drop_duplicates_metadata(self, freq_sample):
         # GH 10115
-        idx = pd.date_range("2011-01-01", "2011-01-31", freq="D", name="idx")
+        idx = pd.date_range("2011-01-01", freq=freq_sample, periods=10, name="idx")
         result = idx.drop_duplicates()
         tm.assert_index_equal(idx, result)
         assert idx.freq == result.freq
@@ -277,57 +277,38 @@ def test_drop_duplicates_metadata(self):
         tm.assert_index_equal(idx, result)
         assert result.freq is None
 
-    def test_drop_duplicates(self):
+    @pytest.mark.parametrize(
+        "keep, expected, index",
+        [
+            ("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)),
+            ("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)),
+            (
+                False,
+                np.concatenate(([True] * 5, [False] * 5, [True] * 5)),
+                np.arange(5, 10),
+            ),
+        ],
+    )
+    def test_drop_duplicates(self, freq_sample, keep, expected, index):
         # to check Index/Series compat
-        base = pd.date_range("2011-01-01", "2011-01-31", freq="D", name="idx")
-        idx = base.append(base[:5])
+        idx = pd.date_range("2011-01-01", freq=freq_sample, periods=10, name="idx")
+        idx = idx.append(idx[:5])
 
-        res = idx.drop_duplicates()
-        tm.assert_index_equal(res, base)
-        res = Series(idx).drop_duplicates()
-        tm.assert_series_equal(res, Series(base))
+        tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected)
+        expected = idx[~expected]
 
-        res = idx.drop_duplicates(keep="last")
-        exp = base[5:].append(base[:5])
-        tm.assert_index_equal(res, exp)
-        res = Series(idx).drop_duplicates(keep="last")
-        tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36)))
+        result = idx.drop_duplicates(keep=keep)
+        tm.assert_index_equal(result, expected)
 
-        res = idx.drop_duplicates(keep=False)
-        tm.assert_index_equal(res, base[5:])
-        res = Series(idx).drop_duplicates(keep=False)
-        tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31)))
+        result = Series(idx).drop_duplicates(keep=keep)
+        tm.assert_series_equal(result, Series(expected, index=index))
 
-    @pytest.mark.parametrize(
-        "freq",
-        [
-            "A",
-            "2A",
-            "-2A",
-            "Q",
-            "-1Q",
-            "M",
-            "-1M",
-            "D",
-            "3D",
-            "-3D",
-            "W",
-            "-1W",
-            "H",
-            "2H",
-            "-2H",
-            "T",
-            "2T",
-            "S",
-            "-3S",
-        ],
-    )
-    def test_infer_freq(self, freq):
+    def test_infer_freq(self, freq_sample):
         # GH 11018
-        idx = pd.date_range("2011-01-01 09:00:00", freq=freq, periods=10)
+        idx = pd.date_range("2011-01-01 09:00:00", freq=freq_sample, periods=10)
         result = pd.DatetimeIndex(idx.asi8, freq="infer")
         tm.assert_index_equal(idx, result)
-        assert result.freq == freq
+        assert result.freq == freq_sample
 
     def test_nat(self, tz_naive_fixture):
         tz = tz_naive_fixture
diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py
@@ -81,9 +81,10 @@ def test_value_counts_unique(self):
 
         tm.assert_index_equal(idx.unique(), exp_idx)
 
-    def test_drop_duplicates_metadata(self):
+    @pytest.mark.parametrize("freq", ["D", "3D", "H", "2H", "T", "2T", "S", "3S"])
+    def test_drop_duplicates_metadata(self, freq):
         # GH 10115
-        idx = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx")
+        idx = pd.period_range("2011-01-01", periods=10, freq=freq, name="idx")
         result = idx.drop_duplicates()
         tm.assert_index_equal(idx, result)
         assert idx.freq == result.freq
@@ -93,26 +94,32 @@ def test_drop_duplicates_metadata(self):
         tm.assert_index_equal(idx, result)
         assert idx.freq == result.freq
 
-    def test_drop_duplicates(self):
+    @pytest.mark.parametrize("freq", ["D", "3D", "H", "2H", "T", "2T", "S", "3S"])
+    @pytest.mark.parametrize(
+        "keep, expected, index",
+        [
+            ("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)),
+            ("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)),
+            (
+                False,
+                np.concatenate(([True] * 5, [False] * 5, [True] * 5)),
+                np.arange(5, 10),
+            ),
+        ],
+    )
+    def test_drop_duplicates(self, freq, keep, expected, index):
         # to check Index/Series compat
-        base = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx")
-        idx = base.append(base[:5])
-
-        res = idx.drop_duplicates()
-        tm.assert_index_equal(res, base)
-        res = Series(idx).drop_duplicates()
-        tm.assert_series_equal(res, Series(base))
-
-        res = idx.drop_duplicates(keep="last")
-        exp = base[5:].append(base[:5])
-        tm.assert_index_equal(res, exp)
-        res = Series(idx).drop_duplicates(keep="last")
-        tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36)))
-
-        res = idx.drop_duplicates(keep=False)
-        tm.assert_index_equal(res, base[5:])
-        res = Series(idx).drop_duplicates(keep=False)
-        tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31)))
+        idx = pd.period_range("2011-01-01", periods=10, freq=freq, name="idx")
+        idx = idx.append(idx[:5])
+
+        tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected)
+        expected = idx[~expected]
+
+        result = idx.drop_duplicates(keep=keep)
+        tm.assert_index_equal(result, expected)
+
+        result = Series(idx).drop_duplicates(keep=keep)
+        tm.assert_series_equal(result, Series(expected, index=index))
 
     def test_order_compat(self):
         def _check_freq(index, expected_index):
diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py
@@ -134,9 +134,9 @@ def test_order(self):
             tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
             assert ordered.freq is None
 
-    def test_drop_duplicates_metadata(self):
+    def test_drop_duplicates_metadata(self, freq_sample):
         # GH 10115
-        idx = pd.timedelta_range("1 day", "31 day", freq="D", name="idx")
+        idx = pd.timedelta_range("1 day", periods=10, freq=freq_sample, name="idx")
         result = idx.drop_duplicates()
         tm.assert_index_equal(idx, result)
         assert idx.freq == result.freq
@@ -147,36 +147,38 @@ def test_drop_duplicates_metadata(self):
         tm.assert_index_equal(idx, result)
         assert result.freq is None
 
-    def test_drop_duplicates(self):
+    @pytest.mark.parametrize(
+        "keep, expected, index",
+        [
+            ("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)),
+            ("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)),
+            (
+                False,
+                np.concatenate(([True] * 5, [False] * 5, [True] * 5)),
+                np.arange(5, 10),
+            ),
+        ],
+    )
+    def test_drop_duplicates(self, freq_sample, keep, expected, index):
         # to check Index/Series compat
-        base = pd.timedelta_range("1 day", "31 day", freq="D", name="idx")
-        idx = base.append(base[:5])
+        idx = pd.timedelta_range("1 day", periods=10, freq=freq_sample, name="idx")
+        idx = idx.append(idx[:5])
 
-        res = idx.drop_duplicates()
-        tm.assert_index_equal(res, base)
-        res = Series(idx).drop_duplicates()
-        tm.assert_series_equal(res, Series(base))
+        tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected)
+        expected = idx[~expected]
 
-        res = idx.drop_duplicates(keep="last")
-        exp = base[5:].append(base[:5])
-        tm.assert_index_equal(res, exp)
-        res = Series(idx).drop_duplicates(keep="last")
-        tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36)))
+        result = idx.drop_duplicates(keep=keep)
+        tm.assert_index_equal(result, expected)
 
-        res = idx.drop_duplicates(keep=False)
-        tm.assert_index_equal(res, base[5:])
-        res = Series(idx).drop_duplicates(keep=False)
-        tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31)))
+        result = Series(idx).drop_duplicates(keep=keep)
+        tm.assert_series_equal(result, Series(expected, index=index))
 
-    @pytest.mark.parametrize(
-        "freq", ["D", "3D", "-3D", "H", "2H", "-2H", "T", "2T", "S", "-3S"]
-    )
-    def test_infer_freq(self, freq):
+    def test_infer_freq(self, freq_sample):
         # GH#11018
-        idx = pd.timedelta_range("1", freq=freq, periods=10)
+        idx = pd.timedelta_range("1", freq=freq_sample, periods=10)
         result = pd.TimedeltaIndex(idx.asi8, freq="infer")
         tm.assert_index_equal(idx, result)
-        assert result.freq == freq
+        assert result.freq == freq_sample
 
     def test_repeat(self):
         index = pd.timedelta_range("1 days", periods=2, freq="D")