From e3cdb6eb7c80e3323b938b505f0968a7f9390c99 Mon Sep 17 00:00:00 2001 From: tmnhat2001 Date: Wed, 1 Nov 2017 23:49:40 -0400 Subject: [PATCH 1/6] Add test_drop_duplicates for Categorical dtypes --- pandas/tests/test_categorical.py | 143 +++++++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 6366aae8ccdf6..3aefce3ef3e9e 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -797,6 +797,149 @@ def test_set_categories_inplace(self): cat.set_categories(['a', 'b', 'c', 'd'], inplace=True) tm.assert_index_equal(cat.categories, pd.Index(['a', 'b', 'c', 'd'])) + @pytest.mark.parametrize( + "input1, input2, cat_array", + [ + ( + np.array([1, 2, 3, 3], dtype=np.dtype('int_')), + np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('int_')), + np.array([1, 2, 3, 4, 5], dtype=np.dtype('int_')) + ), + ( + np.array([1, 2, 3, 3], dtype=np.dtype('uint')), + np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('uint')), + np.array([1, 2, 3, 4, 5], dtype=np.dtype('uint')) + ), + ( + np.array([1, 2, 3, 3], dtype=np.dtype('float_')), + np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('float_')), + np.array([1, 2, 3, 4, 5], dtype=np.dtype('float_')) + ), + ( + np.array( + [1, 2, 3, 3], dtype=np.dtype('unicode_') + ), + np.array( + [1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('unicode_') + ), + np.array( + [1, 2, 3, 4, 5], dtype=np.dtype('unicode_') + ) + ), + ( + np.array( + [ + '2017-01-01 10:00:00', '2017-02-01 10:00:00', + '2017-03-01 10:00:00', '2017-03-01 10:00:00' + ], + dtype='datetime64' + ), + np.array( + [ + '2017-01-01 10:00:00', '2017-02-01 10:00:00', + '2017-03-01 10:00:00', '2017-05-01 10:00:00', + '2017-03-01 10:00:00', '2017-02-01 10:00:00', + '2017-04-01 10:00:00' + ], + dtype='datetime64' + ), + np.array( + [ + '2017-01-01 10:00:00', '2017-02-01 10:00:00', + '2017-03-01 10:00:00', '2017-04-01 10:00:00', + '2017-05-01 10:00:00' + ], + dtype='datetime64' + ) + ), + ( + pd.to_timedelta(['1 days', '2 days', '3 days', '3 days'], + unit="D"), + pd.to_timedelta(['1 days', '2 days', '3 days', '5 days', + '3 days', '2 days', '4 days'], unit="D"), + pd.timedelta_range("1 days", periods=5, freq="D") + ) + ] + ) + @pytest.mark.parametrize("is_ordered", [True, False]) + def test_drop_duplicates_non_bool(self, input1, input2, + cat_array, is_ordered): + # Test case 1 + tc1 = Series(Categorical(input1, categories=cat_array, + ordered=is_ordered)) + expected = Series([False, False, False, True]) + tm.assert_series_equal(tc1.duplicated(), expected) + tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected]) + sc = tc1.copy() + sc.drop_duplicates(inplace=True) + tm.assert_series_equal(sc, tc1[~expected]) + + expected = Series([False, False, True, False]) + tm.assert_series_equal(tc1.duplicated(keep='last'), expected) + tm.assert_series_equal(tc1.drop_duplicates(keep='last'), + tc1[~expected]) + sc = tc1.copy() + sc.drop_duplicates(keep='last', inplace=True) + tm.assert_series_equal(sc, tc1[~expected]) + + expected = Series([False, False, True, True]) + tm.assert_series_equal(tc1.duplicated(keep=False), expected) + tm.assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected]) + sc = tc1.copy() + sc.drop_duplicates(keep=False, inplace=True) + tm.assert_series_equal(sc, tc1[~expected]) + + # Test case 2 + tc2 = Series(Categorical(input2, categories=cat_array, + ordered=is_ordered)) + expected = Series([False, False, False, False, True, True, False]) + tm.assert_series_equal(tc2.duplicated(), expected) + tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected]) + sc = tc2.copy() + sc.drop_duplicates(inplace=True) + tm.assert_series_equal(sc, tc2[~expected]) + + expected = Series([False, True, True, False, False, False, False]) + tm.assert_series_equal(tc2.duplicated(keep='last'), expected) + tm.assert_series_equal(tc2.drop_duplicates(keep='last'), + tc2[~expected]) + sc = tc2.copy() + sc.drop_duplicates(keep='last', inplace=True) + tm.assert_series_equal(sc, tc2[~expected]) + + expected = Series([False, True, True, False, True, True, False]) + tm.assert_series_equal(tc2.duplicated(keep=False), expected) + tm.assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected]) + sc = tc2.copy() + sc.drop_duplicates(keep=False, inplace=True) + tm.assert_series_equal(sc, tc2[~expected]) + + @pytest.mark.parametrize("is_ordered", [True, False]) + def test_drop_duplicates_bool(self, is_ordered): + tc = Series(Categorical([True, False, True, False], + categories=[True, False], ordered=is_ordered)) + + expected = Series([False, False, True, True]) + tm.assert_series_equal(tc.duplicated(), expected) + tm.assert_series_equal(tc.drop_duplicates(), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(inplace=True) + tm.assert_series_equal(sc, tc[~expected]) + + expected = Series([True, True, False, False]) + tm.assert_series_equal(tc.duplicated(keep='last'), expected) + tm.assert_series_equal(tc.drop_duplicates(keep='last'), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(keep='last', inplace=True) + tm.assert_series_equal(sc, tc[~expected]) + + expected = Series([True, True, True, True]) + tm.assert_series_equal(tc.duplicated(keep=False), expected) + tm.assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(keep=False, inplace=True) + tm.assert_series_equal(sc, tc[~expected]) + def test_describe(self): # string type desc = self.factor.describe() From 289ad35e21ba0d5b3068acd99b16713976c7c67d Mon Sep 17 00:00:00 2001 From: tmnhat2001 Date: Wed, 1 Nov 2017 23:56:53 -0400 Subject: [PATCH 2/6] Fix PEP8 issue --- pandas/tests/test_categorical.py | 116 +++++++++++++++---------------- 1 file changed, 58 insertions(+), 58 deletions(-) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 3aefce3ef3e9e..722f8d23a2ea9 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -798,69 +798,69 @@ def test_set_categories_inplace(self): tm.assert_index_equal(cat.categories, pd.Index(['a', 'b', 'c', 'd'])) @pytest.mark.parametrize( - "input1, input2, cat_array", - [ - ( - np.array([1, 2, 3, 3], dtype=np.dtype('int_')), - np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('int_')), - np.array([1, 2, 3, 4, 5], dtype=np.dtype('int_')) - ), - ( - np.array([1, 2, 3, 3], dtype=np.dtype('uint')), - np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('uint')), - np.array([1, 2, 3, 4, 5], dtype=np.dtype('uint')) + "input1, input2, cat_array", + [ + ( + np.array([1, 2, 3, 3], dtype=np.dtype('int_')), + np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('int_')), + np.array([1, 2, 3, 4, 5], dtype=np.dtype('int_')) + ), + ( + np.array([1, 2, 3, 3], dtype=np.dtype('uint')), + np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('uint')), + np.array([1, 2, 3, 4, 5], dtype=np.dtype('uint')) + ), + ( + np.array([1, 2, 3, 3], dtype=np.dtype('float_')), + np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('float_')), + np.array([1, 2, 3, 4, 5], dtype=np.dtype('float_')) + ), + ( + np.array( + [1, 2, 3, 3], dtype=np.dtype('unicode_') ), - ( - np.array([1, 2, 3, 3], dtype=np.dtype('float_')), - np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('float_')), - np.array([1, 2, 3, 4, 5], dtype=np.dtype('float_')) + np.array( + [1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('unicode_') ), - ( - np.array( - [1, 2, 3, 3], dtype=np.dtype('unicode_') - ), - np.array( - [1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('unicode_') - ), - np.array( - [1, 2, 3, 4, 5], dtype=np.dtype('unicode_') - ) + np.array( + [1, 2, 3, 4, 5], dtype=np.dtype('unicode_') + ) + ), + ( + np.array( + [ + '2017-01-01 10:00:00', '2017-02-01 10:00:00', + '2017-03-01 10:00:00', '2017-03-01 10:00:00' + ], + dtype='datetime64' ), - ( - np.array( - [ - '2017-01-01 10:00:00', '2017-02-01 10:00:00', - '2017-03-01 10:00:00', '2017-03-01 10:00:00' - ], - dtype='datetime64' - ), - np.array( - [ - '2017-01-01 10:00:00', '2017-02-01 10:00:00', - '2017-03-01 10:00:00', '2017-05-01 10:00:00', - '2017-03-01 10:00:00', '2017-02-01 10:00:00', - '2017-04-01 10:00:00' - ], - dtype='datetime64' - ), - np.array( - [ - '2017-01-01 10:00:00', '2017-02-01 10:00:00', - '2017-03-01 10:00:00', '2017-04-01 10:00:00', - '2017-05-01 10:00:00' - ], - dtype='datetime64' - ) + np.array( + [ + '2017-01-01 10:00:00', '2017-02-01 10:00:00', + '2017-03-01 10:00:00', '2017-05-01 10:00:00', + '2017-03-01 10:00:00', '2017-02-01 10:00:00', + '2017-04-01 10:00:00' + ], + dtype='datetime64' ), - ( - pd.to_timedelta(['1 days', '2 days', '3 days', '3 days'], - unit="D"), - pd.to_timedelta(['1 days', '2 days', '3 days', '5 days', - '3 days', '2 days', '4 days'], unit="D"), - pd.timedelta_range("1 days", periods=5, freq="D") + np.array( + [ + '2017-01-01 10:00:00', '2017-02-01 10:00:00', + '2017-03-01 10:00:00', '2017-04-01 10:00:00', + '2017-05-01 10:00:00' + ], + dtype='datetime64' ) - ] - ) + ), + ( + pd.to_timedelta(['1 days', '2 days', '3 days', '3 days'], + unit="D"), + pd.to_timedelta(['1 days', '2 days', '3 days', '5 days', + '3 days', '2 days', '4 days'], unit="D"), + pd.timedelta_range("1 days", periods=5, freq="D") + ) + ] + ) @pytest.mark.parametrize("is_ordered", [True, False]) def test_drop_duplicates_non_bool(self, input1, input2, cat_array, is_ordered): From e1dc3c9475b7da494f511bd92243285662931edb Mon Sep 17 00:00:00 2001 From: tmnhat2001 Date: Thu, 2 Nov 2017 22:02:38 -0400 Subject: [PATCH 3/6] Parameterize dtypes --- pandas/tests/test_categorical.py | 74 +++++--------------------------- 1 file changed, 10 insertions(+), 64 deletions(-) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 722f8d23a2ea9..d77f8c52437e5 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -798,75 +798,19 @@ def test_set_categories_inplace(self): tm.assert_index_equal(cat.categories, pd.Index(['a', 'b', 'c', 'd'])) @pytest.mark.parametrize( - "input1, input2, cat_array", - [ - ( - np.array([1, 2, 3, 3], dtype=np.dtype('int_')), - np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('int_')), - np.array([1, 2, 3, 4, 5], dtype=np.dtype('int_')) - ), - ( - np.array([1, 2, 3, 3], dtype=np.dtype('uint')), - np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('uint')), - np.array([1, 2, 3, 4, 5], dtype=np.dtype('uint')) - ), - ( - np.array([1, 2, 3, 3], dtype=np.dtype('float_')), - np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('float_')), - np.array([1, 2, 3, 4, 5], dtype=np.dtype('float_')) - ), - ( - np.array( - [1, 2, 3, 3], dtype=np.dtype('unicode_') - ), - np.array( - [1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('unicode_') - ), - np.array( - [1, 2, 3, 4, 5], dtype=np.dtype('unicode_') - ) - ), - ( - np.array( - [ - '2017-01-01 10:00:00', '2017-02-01 10:00:00', - '2017-03-01 10:00:00', '2017-03-01 10:00:00' - ], - dtype='datetime64' - ), - np.array( - [ - '2017-01-01 10:00:00', '2017-02-01 10:00:00', - '2017-03-01 10:00:00', '2017-05-01 10:00:00', - '2017-03-01 10:00:00', '2017-02-01 10:00:00', - '2017-04-01 10:00:00' - ], - dtype='datetime64' - ), - np.array( - [ - '2017-01-01 10:00:00', '2017-02-01 10:00:00', - '2017-03-01 10:00:00', '2017-04-01 10:00:00', - '2017-05-01 10:00:00' - ], - dtype='datetime64' - ) - ), - ( - pd.to_timedelta(['1 days', '2 days', '3 days', '3 days'], - unit="D"), - pd.to_timedelta(['1 days', '2 days', '3 days', '5 days', - '3 days', '2 days', '4 days'], unit="D"), - pd.timedelta_range("1 days", periods=5, freq="D") - ) - ] + "dtype", + ["int_", "uint", "float_", + "unicode_", "datetime64[h]", "timedelta64[h]"] ) @pytest.mark.parametrize("is_ordered", [True, False]) - def test_drop_duplicates_non_bool(self, input1, input2, - cat_array, is_ordered): + def test_drop_duplicates_non_bool(self, dtype, is_ordered): + cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) + # Test case 1 + input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype)) tc1 = Series(Categorical(input1, categories=cat_array, ordered=is_ordered)) + expected = Series([False, False, False, True]) tm.assert_series_equal(tc1.duplicated(), expected) tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected]) @@ -890,8 +834,10 @@ def test_drop_duplicates_non_bool(self, input1, input2, tm.assert_series_equal(sc, tc1[~expected]) # Test case 2 + input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype)) tc2 = Series(Categorical(input2, categories=cat_array, ordered=is_ordered)) + expected = Series([False, False, False, False, True, True, False]) tm.assert_series_equal(tc2.duplicated(), expected) tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected]) From 20c41e38f15ba525ae833a04dfb09b79e5f7ca0f Mon Sep 17 00:00:00 2001 From: tmnhat2001 Date: Thu, 2 Nov 2017 23:24:22 -0400 Subject: [PATCH 4/6] Fix PEP8 issue --- pandas/tests/test_categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index d77f8c52437e5..e78b22f949e08 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -837,7 +837,7 @@ def test_drop_duplicates_non_bool(self, dtype, is_ordered): input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype)) tc2 = Series(Categorical(input2, categories=cat_array, ordered=is_ordered)) - + expected = Series([False, False, False, False, True, True, False]) tm.assert_series_equal(tc2.duplicated(), expected) tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected]) From 7b7e577884daabc92e6c8f88d8254b2b9006d812 Mon Sep 17 00:00:00 2001 From: tmnhat2001 Date: Fri, 3 Nov 2017 20:13:22 -0400 Subject: [PATCH 5/6] Fix PEP8 issue --- pandas/tests/test_categorical.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index e78b22f949e08..d35960c7d53b4 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -835,9 +835,10 @@ def test_drop_duplicates_non_bool(self, dtype, is_ordered): # Test case 2 input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype)) - tc2 = Series(Categorical(input2, categories=cat_array, - ordered=is_ordered)) - + tc2 = Series(Categorical( + input2, categories=cat_array, ordered=is_ordered) + ) + expected = Series([False, False, False, False, True, True, False]) tm.assert_series_equal(tc2.duplicated(), expected) tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected]) From f9c65c77119779e1b397319b94f3e9340a2a9bd3 Mon Sep 17 00:00:00 2001 From: tmnhat2001 Date: Sun, 5 Nov 2017 21:46:35 -0500 Subject: [PATCH 6/6] Use datetime64[D] and mark it as xfail --- pandas/tests/test_categorical.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index d35960c7d53b4..b77e2d1dcda8a 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -799,8 +799,9 @@ def test_set_categories_inplace(self): @pytest.mark.parametrize( "dtype", - ["int_", "uint", "float_", - "unicode_", "datetime64[h]", "timedelta64[h]"] + ["int_", "uint", "float_", "unicode_", "timedelta64[h]", + pytest.param("datetime64[D]", + marks=pytest.mark.xfail(reason="issue7996"))] ) @pytest.mark.parametrize("is_ordered", [True, False]) def test_drop_duplicates_non_bool(self, dtype, is_ordered):