From e3cdb6eb7c80e3323b938b505f0968a7f9390c99 Mon Sep 17 00:00:00 2001
From: tmnhat2001 <tmnhat2001@users.noreply.github.com>
Date: Wed, 1 Nov 2017 23:49:40 -0400
Subject: [PATCH 1/6] Add test_drop_duplicates for Categorical dtypes

---
 pandas/tests/test_categorical.py | 143 +++++++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)

diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index 6366aae8ccdf6..3aefce3ef3e9e 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -797,6 +797,149 @@ def test_set_categories_inplace(self):
         cat.set_categories(['a', 'b', 'c', 'd'], inplace=True)
         tm.assert_index_equal(cat.categories, pd.Index(['a', 'b', 'c', 'd']))
 
+    @pytest.mark.parametrize(
+            "input1, input2, cat_array",
+            [
+                (
+                    np.array([1, 2, 3, 3], dtype=np.dtype('int_')),
+                    np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('int_')),
+                    np.array([1, 2, 3, 4, 5], dtype=np.dtype('int_'))
+                ),
+                (
+                    np.array([1, 2, 3, 3], dtype=np.dtype('uint')),
+                    np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('uint')),
+                    np.array([1, 2, 3, 4, 5], dtype=np.dtype('uint'))
+                ),
+                (
+                    np.array([1, 2, 3, 3], dtype=np.dtype('float_')),
+                    np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('float_')),
+                    np.array([1, 2, 3, 4, 5], dtype=np.dtype('float_'))
+                ),
+                (
+                    np.array(
+                        [1, 2, 3, 3], dtype=np.dtype('unicode_')
+                    ),
+                    np.array(
+                        [1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('unicode_')
+                    ),
+                    np.array(
+                        [1, 2, 3, 4, 5], dtype=np.dtype('unicode_')
+                    )
+                ),
+                (
+                    np.array(
+                        [
+                            '2017-01-01 10:00:00', '2017-02-01 10:00:00',
+                            '2017-03-01 10:00:00', '2017-03-01 10:00:00'
+                        ],
+                        dtype='datetime64'
+                    ),
+                    np.array(
+                        [
+                            '2017-01-01 10:00:00', '2017-02-01 10:00:00',
+                            '2017-03-01 10:00:00', '2017-05-01 10:00:00',
+                            '2017-03-01 10:00:00', '2017-02-01 10:00:00',
+                            '2017-04-01 10:00:00'
+                        ],
+                        dtype='datetime64'
+                    ),
+                    np.array(
+                        [
+                            '2017-01-01 10:00:00', '2017-02-01 10:00:00',
+                            '2017-03-01 10:00:00', '2017-04-01 10:00:00',
+                            '2017-05-01 10:00:00'
+                        ],
+                        dtype='datetime64'
+                    )
+                ),
+                (
+                    pd.to_timedelta(['1 days', '2 days', '3 days', '3 days'],
+                                    unit="D"),
+                    pd.to_timedelta(['1 days', '2 days', '3 days', '5 days',
+                                     '3 days', '2 days', '4 days'], unit="D"),
+                    pd.timedelta_range("1 days", periods=5, freq="D")
+                )
+            ]
+        )
+    @pytest.mark.parametrize("is_ordered", [True, False])
+    def test_drop_duplicates_non_bool(self, input1, input2,
+                                      cat_array, is_ordered):
+        # Test case 1
+        tc1 = Series(Categorical(input1, categories=cat_array,
+                                 ordered=is_ordered))
+        expected = Series([False, False, False, True])
+        tm.assert_series_equal(tc1.duplicated(), expected)
+        tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected])
+        sc = tc1.copy()
+        sc.drop_duplicates(inplace=True)
+        tm.assert_series_equal(sc, tc1[~expected])
+
+        expected = Series([False, False, True, False])
+        tm.assert_series_equal(tc1.duplicated(keep='last'), expected)
+        tm.assert_series_equal(tc1.drop_duplicates(keep='last'),
+                               tc1[~expected])
+        sc = tc1.copy()
+        sc.drop_duplicates(keep='last', inplace=True)
+        tm.assert_series_equal(sc, tc1[~expected])
+
+        expected = Series([False, False, True, True])
+        tm.assert_series_equal(tc1.duplicated(keep=False), expected)
+        tm.assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected])
+        sc = tc1.copy()
+        sc.drop_duplicates(keep=False, inplace=True)
+        tm.assert_series_equal(sc, tc1[~expected])
+
+        # Test case 2
+        tc2 = Series(Categorical(input2, categories=cat_array,
+                                 ordered=is_ordered))
+        expected = Series([False, False, False, False, True, True, False])
+        tm.assert_series_equal(tc2.duplicated(), expected)
+        tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected])
+        sc = tc2.copy()
+        sc.drop_duplicates(inplace=True)
+        tm.assert_series_equal(sc, tc2[~expected])
+
+        expected = Series([False, True, True, False, False, False, False])
+        tm.assert_series_equal(tc2.duplicated(keep='last'), expected)
+        tm.assert_series_equal(tc2.drop_duplicates(keep='last'),
+                               tc2[~expected])
+        sc = tc2.copy()
+        sc.drop_duplicates(keep='last', inplace=True)
+        tm.assert_series_equal(sc, tc2[~expected])
+
+        expected = Series([False, True, True, False, True, True, False])
+        tm.assert_series_equal(tc2.duplicated(keep=False), expected)
+        tm.assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected])
+        sc = tc2.copy()
+        sc.drop_duplicates(keep=False, inplace=True)
+        tm.assert_series_equal(sc, tc2[~expected])
+
+    @pytest.mark.parametrize("is_ordered", [True, False])
+    def test_drop_duplicates_bool(self, is_ordered):
+        tc = Series(Categorical([True, False, True, False],
+                                categories=[True, False], ordered=is_ordered))
+
+        expected = Series([False, False, True, True])
+        tm.assert_series_equal(tc.duplicated(), expected)
+        tm.assert_series_equal(tc.drop_duplicates(), tc[~expected])
+        sc = tc.copy()
+        sc.drop_duplicates(inplace=True)
+        tm.assert_series_equal(sc, tc[~expected])
+
+        expected = Series([True, True, False, False])
+        tm.assert_series_equal(tc.duplicated(keep='last'), expected)
+        tm.assert_series_equal(tc.drop_duplicates(keep='last'), tc[~expected])
+        sc = tc.copy()
+        sc.drop_duplicates(keep='last', inplace=True)
+        tm.assert_series_equal(sc, tc[~expected])
+
+        expected = Series([True, True, True, True])
+        tm.assert_series_equal(tc.duplicated(keep=False), expected)
+        tm.assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected])
+        sc = tc.copy()
+        sc.drop_duplicates(keep=False, inplace=True)
+        tm.assert_series_equal(sc, tc[~expected])
+
     def test_describe(self):
         # string type
         desc = self.factor.describe()

From 289ad35e21ba0d5b3068acd99b16713976c7c67d Mon Sep 17 00:00:00 2001
From: tmnhat2001 <tmnhat2001@users.noreply.github.com>
Date: Wed, 1 Nov 2017 23:56:53 -0400
Subject: [PATCH 2/6] Fix PEP8 continuation-line indentation in parametrize args

---
 pandas/tests/test_categorical.py | 116 +++++++++++++++----------------
 1 file changed, 58 insertions(+), 58 deletions(-)

diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index 3aefce3ef3e9e..722f8d23a2ea9 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -798,69 +798,69 @@ def test_set_categories_inplace(self):
         tm.assert_index_equal(cat.categories, pd.Index(['a', 'b', 'c', 'd']))
 
     @pytest.mark.parametrize(
-            "input1, input2, cat_array",
-            [
-                (
-                    np.array([1, 2, 3, 3], dtype=np.dtype('int_')),
-                    np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('int_')),
-                    np.array([1, 2, 3, 4, 5], dtype=np.dtype('int_'))
-                ),
-                (
-                    np.array([1, 2, 3, 3], dtype=np.dtype('uint')),
-                    np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('uint')),
-                    np.array([1, 2, 3, 4, 5], dtype=np.dtype('uint'))
+        "input1, input2, cat_array",
+        [
+            (
+                np.array([1, 2, 3, 3], dtype=np.dtype('int_')),
+                np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('int_')),
+                np.array([1, 2, 3, 4, 5], dtype=np.dtype('int_'))
+            ),
+            (
+                np.array([1, 2, 3, 3], dtype=np.dtype('uint')),
+                np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('uint')),
+                np.array([1, 2, 3, 4, 5], dtype=np.dtype('uint'))
+            ),
+            (
+                np.array([1, 2, 3, 3], dtype=np.dtype('float_')),
+                np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('float_')),
+                np.array([1, 2, 3, 4, 5], dtype=np.dtype('float_'))
+            ),
+            (
+                np.array(
+                    [1, 2, 3, 3], dtype=np.dtype('unicode_')
                 ),
-                (
-                    np.array([1, 2, 3, 3], dtype=np.dtype('float_')),
-                    np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('float_')),
-                    np.array([1, 2, 3, 4, 5], dtype=np.dtype('float_'))
+                np.array(
+                    [1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('unicode_')
                 ),
-                (
-                    np.array(
-                        [1, 2, 3, 3], dtype=np.dtype('unicode_')
-                    ),
-                    np.array(
-                        [1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('unicode_')
-                    ),
-                    np.array(
-                        [1, 2, 3, 4, 5], dtype=np.dtype('unicode_')
-                    )
+                np.array(
+                    [1, 2, 3, 4, 5], dtype=np.dtype('unicode_')
+                )
+            ),
+            (
+                np.array(
+                    [
+                        '2017-01-01 10:00:00', '2017-02-01 10:00:00',
+                        '2017-03-01 10:00:00', '2017-03-01 10:00:00'
+                    ],
+                    dtype='datetime64'
                 ),
-                (
-                    np.array(
-                        [
-                            '2017-01-01 10:00:00', '2017-02-01 10:00:00',
-                            '2017-03-01 10:00:00', '2017-03-01 10:00:00'
-                        ],
-                        dtype='datetime64'
-                    ),
-                    np.array(
-                        [
-                            '2017-01-01 10:00:00', '2017-02-01 10:00:00',
-                            '2017-03-01 10:00:00', '2017-05-01 10:00:00',
-                            '2017-03-01 10:00:00', '2017-02-01 10:00:00',
-                            '2017-04-01 10:00:00'
-                        ],
-                        dtype='datetime64'
-                    ),
-                    np.array(
-                        [
-                            '2017-01-01 10:00:00', '2017-02-01 10:00:00',
-                            '2017-03-01 10:00:00', '2017-04-01 10:00:00',
-                            '2017-05-01 10:00:00'
-                        ],
-                        dtype='datetime64'
-                    )
+                np.array(
+                    [
+                        '2017-01-01 10:00:00', '2017-02-01 10:00:00',
+                        '2017-03-01 10:00:00', '2017-05-01 10:00:00',
+                        '2017-03-01 10:00:00', '2017-02-01 10:00:00',
+                        '2017-04-01 10:00:00'
+                    ],
+                    dtype='datetime64'
                 ),
-                (
-                    pd.to_timedelta(['1 days', '2 days', '3 days', '3 days'],
-                                    unit="D"),
-                    pd.to_timedelta(['1 days', '2 days', '3 days', '5 days',
-                                     '3 days', '2 days', '4 days'], unit="D"),
-                    pd.timedelta_range("1 days", periods=5, freq="D")
+                np.array(
+                    [
+                        '2017-01-01 10:00:00', '2017-02-01 10:00:00',
+                        '2017-03-01 10:00:00', '2017-04-01 10:00:00',
+                        '2017-05-01 10:00:00'
+                    ],
+                    dtype='datetime64'
                 )
-            ]
-        )
+            ),
+            (
+                pd.to_timedelta(['1 days', '2 days', '3 days', '3 days'],
+                                unit="D"),
+                pd.to_timedelta(['1 days', '2 days', '3 days', '5 days',
+                                 '3 days', '2 days', '4 days'], unit="D"),
+                pd.timedelta_range("1 days", periods=5, freq="D")
+            )
+        ]
+    )
     @pytest.mark.parametrize("is_ordered", [True, False])
     def test_drop_duplicates_non_bool(self, input1, input2,
                                       cat_array, is_ordered):

From e1dc3c9475b7da494f511bd92243285662931edb Mon Sep 17 00:00:00 2001
From: tmnhat2001 <tmnhat2001@gmail.com>
Date: Thu, 2 Nov 2017 22:02:38 -0400
Subject: [PATCH 3/6] Parameterize dtypes

---
 pandas/tests/test_categorical.py | 74 +++++---------------------------
 1 file changed, 10 insertions(+), 64 deletions(-)

diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index 722f8d23a2ea9..d77f8c52437e5 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -798,75 +798,19 @@ def test_set_categories_inplace(self):
         tm.assert_index_equal(cat.categories, pd.Index(['a', 'b', 'c', 'd']))
 
     @pytest.mark.parametrize(
-        "input1, input2, cat_array",
-        [
-            (
-                np.array([1, 2, 3, 3], dtype=np.dtype('int_')),
-                np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('int_')),
-                np.array([1, 2, 3, 4, 5], dtype=np.dtype('int_'))
-            ),
-            (
-                np.array([1, 2, 3, 3], dtype=np.dtype('uint')),
-                np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('uint')),
-                np.array([1, 2, 3, 4, 5], dtype=np.dtype('uint'))
-            ),
-            (
-                np.array([1, 2, 3, 3], dtype=np.dtype('float_')),
-                np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('float_')),
-                np.array([1, 2, 3, 4, 5], dtype=np.dtype('float_'))
-            ),
-            (
-                np.array(
-                    [1, 2, 3, 3], dtype=np.dtype('unicode_')
-                ),
-                np.array(
-                    [1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('unicode_')
-                ),
-                np.array(
-                    [1, 2, 3, 4, 5], dtype=np.dtype('unicode_')
-                )
-            ),
-            (
-                np.array(
-                    [
-                        '2017-01-01 10:00:00', '2017-02-01 10:00:00',
-                        '2017-03-01 10:00:00', '2017-03-01 10:00:00'
-                    ],
-                    dtype='datetime64'
-                ),
-                np.array(
-                    [
-                        '2017-01-01 10:00:00', '2017-02-01 10:00:00',
-                        '2017-03-01 10:00:00', '2017-05-01 10:00:00',
-                        '2017-03-01 10:00:00', '2017-02-01 10:00:00',
-                        '2017-04-01 10:00:00'
-                    ],
-                    dtype='datetime64'
-                ),
-                np.array(
-                    [
-                        '2017-01-01 10:00:00', '2017-02-01 10:00:00',
-                        '2017-03-01 10:00:00', '2017-04-01 10:00:00',
-                        '2017-05-01 10:00:00'
-                    ],
-                    dtype='datetime64'
-                )
-            ),
-            (
-                pd.to_timedelta(['1 days', '2 days', '3 days', '3 days'],
-                                unit="D"),
-                pd.to_timedelta(['1 days', '2 days', '3 days', '5 days',
-                                 '3 days', '2 days', '4 days'], unit="D"),
-                pd.timedelta_range("1 days", periods=5, freq="D")
-            )
-        ]
+        "dtype",
+        ["int_", "uint", "float_",
+         "unicode_", "datetime64[h]", "timedelta64[h]"]
     )
     @pytest.mark.parametrize("is_ordered", [True, False])
-    def test_drop_duplicates_non_bool(self, input1, input2,
-                                      cat_array, is_ordered):
+    def test_drop_duplicates_non_bool(self, dtype, is_ordered):
+        cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))
+
         # Test case 1
+        input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype))
         tc1 = Series(Categorical(input1, categories=cat_array,
                                  ordered=is_ordered))
+
         expected = Series([False, False, False, True])
         tm.assert_series_equal(tc1.duplicated(), expected)
         tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected])
@@ -890,8 +834,10 @@ def test_drop_duplicates_non_bool(self, input1, input2,
         tm.assert_series_equal(sc, tc1[~expected])
 
         # Test case 2
+        input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype))
         tc2 = Series(Categorical(input2, categories=cat_array,
                                  ordered=is_ordered))
+        
         expected = Series([False, False, False, False, True, True, False])
         tm.assert_series_equal(tc2.duplicated(), expected)
         tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected])

From 20c41e38f15ba525ae833a04dfb09b79e5f7ca0f Mon Sep 17 00:00:00 2001
From: tmnhat2001 <tmnhat2001@gmail.com>
Date: Thu, 2 Nov 2017 23:24:22 -0400
Subject: [PATCH 4/6] Re-indent whitespace-only line after tc2 construction

---
 pandas/tests/test_categorical.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index d77f8c52437e5..e78b22f949e08 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -837,7 +837,7 @@ def test_drop_duplicates_non_bool(self, dtype, is_ordered):
         input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype))
         tc2 = Series(Categorical(input2, categories=cat_array,
                                  ordered=is_ordered))
-        
+                                  
         expected = Series([False, False, False, False, True, True, False])
         tm.assert_series_equal(tc2.duplicated(), expected)
         tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected])

From 7b7e577884daabc92e6c8f88d8254b2b9006d812 Mon Sep 17 00:00:00 2001
From: tmnhat2001 <tmnhat2001@gmail.com>
Date: Fri, 3 Nov 2017 20:13:22 -0400
Subject: [PATCH 5/6] Fix PEP8: rewrap tc2 construction, drop trailing whitespace

---
 pandas/tests/test_categorical.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index e78b22f949e08..d35960c7d53b4 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -835,9 +835,10 @@ def test_drop_duplicates_non_bool(self, dtype, is_ordered):
 
         # Test case 2
         input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype))
-        tc2 = Series(Categorical(input2, categories=cat_array,
-                                 ordered=is_ordered))
-                                  
+        tc2 = Series(Categorical(
+            input2, categories=cat_array, ordered=is_ordered)
+        )
+
         expected = Series([False, False, False, False, True, True, False])
         tm.assert_series_equal(tc2.duplicated(), expected)
         tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected])

From f9c65c77119779e1b397319b94f3e9340a2a9bd3 Mon Sep 17 00:00:00 2001
From: tmnhat2001 <tmnhat2001@gmail.com>
Date: Sun, 5 Nov 2017 21:46:35 -0500
Subject: [PATCH 6/6] Use datetime64[D] and mark it as xfail

---
 pandas/tests/test_categorical.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index d35960c7d53b4..b77e2d1dcda8a 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -799,8 +799,9 @@ def test_set_categories_inplace(self):
 
     @pytest.mark.parametrize(
         "dtype",
-        ["int_", "uint", "float_",
-         "unicode_", "datetime64[h]", "timedelta64[h]"]
+        ["int_", "uint", "float_", "unicode_", "timedelta64[h]",
+         pytest.param("datetime64[D]",
+                      marks=pytest.mark.xfail(reason="issue7996"))]
     )
     @pytest.mark.parametrize("is_ordered", [True, False])
     def test_drop_duplicates_non_bool(self, dtype, is_ordered):