From ca11b9fab740ce558b2a91b864cc2fa9e16a107b Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Fri, 6 Mar 2020 11:13:49 +0100 Subject: [PATCH 1/7] remove redundant duplicated test from tests/base/test_ops.py --- pandas/tests/base/test_ops.py | 102 ---------------------------------- 1 file changed, 102 deletions(-) diff --git a/pandas/tests/base/test_ops.py b/pandas/tests/base/test_ops.py index 8f48d0a3e8378..dc7f85de15de3 100644 --- a/pandas/tests/base/test_ops.py +++ b/pandas/tests/base/test_ops.py @@ -594,108 +594,6 @@ def test_factorize_repeated(self): expected = o[5:10].append(o[:5]) tm.assert_index_equal(uniques, expected, check_names=False) - def test_duplicated_drop_duplicates_index(self): - # GH 4060 - for original in self.objs: - if isinstance(original, Index): - - # special case - if original.is_boolean(): - result = original.drop_duplicates() - expected = Index([False, True], name="a") - tm.assert_index_equal(result, expected) - continue - - # original doesn't have duplicates - expected = np.array([False] * len(original), dtype=bool) - duplicated = original.duplicated() - tm.assert_numpy_array_equal(duplicated, expected) - assert duplicated.dtype == bool - result = original.drop_duplicates() - tm.assert_index_equal(result, original) - assert result is not original - - # has_duplicates - assert not original.has_duplicates - - # create repeated values, 3rd and 5th values are duplicated - idx = original[list(range(len(original))) + [5, 3]] - expected = np.array([False] * len(original) + [True, True], dtype=bool) - duplicated = idx.duplicated() - tm.assert_numpy_array_equal(duplicated, expected) - assert duplicated.dtype == bool - tm.assert_index_equal(idx.drop_duplicates(), original) - - base = [False] * len(idx) - base[3] = True - base[5] = True - expected = np.array(base) - - duplicated = idx.duplicated(keep="last") - tm.assert_numpy_array_equal(duplicated, expected) - assert duplicated.dtype == bool - result = idx.drop_duplicates(keep="last") - tm.assert_index_equal(result, idx[~expected]) - - base = [False] * len(original) + [True, True] - base[3] = True - base[5] = True - expected = np.array(base) - - duplicated = idx.duplicated(keep=False) - tm.assert_numpy_array_equal(duplicated, expected) - assert duplicated.dtype == bool - result = idx.drop_duplicates(keep=False) - tm.assert_index_equal(result, idx[~expected]) - - with pytest.raises( - TypeError, - match=r"drop_duplicates\(\) got an unexpected keyword argument", - ): - idx.drop_duplicates(inplace=True) - - else: - expected = Series( - [False] * len(original), index=original.index, name="a" - ) - tm.assert_series_equal(original.duplicated(), expected) - result = original.drop_duplicates() - tm.assert_series_equal(result, original) - assert result is not original - - idx = original.index[list(range(len(original))) + [5, 3]] - values = original._values[list(range(len(original))) + [5, 3]] - s = Series(values, index=idx, name="a") - - expected = Series( - [False] * len(original) + [True, True], index=idx, name="a" - ) - tm.assert_series_equal(s.duplicated(), expected) - tm.assert_series_equal(s.drop_duplicates(), original) - - base = [False] * len(idx) - base[3] = True - base[5] = True - expected = Series(base, index=idx, name="a") - - tm.assert_series_equal(s.duplicated(keep="last"), expected) - tm.assert_series_equal( - s.drop_duplicates(keep="last"), s[~np.array(base)] - ) - - base = [False] * len(original) + [True, True] - base[3] = True - base[5] = True - expected = Series(base, index=idx, name="a") - - tm.assert_series_equal(s.duplicated(keep=False), expected) - tm.assert_series_equal( - s.drop_duplicates(keep=False), s[~np.array(base)] - ) - - s.drop_duplicates(inplace=True) - tm.assert_series_equal(s, original) - def test_drop_duplicates_series_vs_dataframe(self): # GH 14192 df = pd.DataFrame( From 0dd7da21af73c7ffd4adf0edad80c1a8b538ba62 Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Thu, 12 Mar 2020 17:31:07 +0100 Subject: [PATCH 2/7] added new test case for Series.duplicated and Series.drop_duplicates on a Series without duplicated values --- .../series/methods/test_drop_duplicates.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py index 2d052505d5ecc..f1fdb2abdd578 100644 --- a/pandas/tests/series/methods/test_drop_duplicates.py +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -44,6 +44,27 @@ def test_drop_duplicates_bool(keep, expected): tm.assert_series_equal(sc, tc[~expected]) +@pytest.mark.parametrize("keep", ["first", "last", False]) +@pytest.mark.parametrize("values", [[], list(range(5))]) +def test_drop_duplicates_no_duplicates(any_numpy_dtype, keep, values): + tc = Series(values, dtype=np.dtype(any_numpy_dtype)) + expected = Series([False] * len(tc), dtype="bool") + + if tc.dtype == "bool": + # 0 -> False and 1-> True + # any other value would be duplicated + tc = tc[:2] + expected = expected[:2] + + tm.assert_series_equal(tc.duplicated(keep=keep), expected) + + result_dropped = tc.drop_duplicates(keep=keep) + tm.assert_series_equal(result_dropped, tc) + + # validate shallow copy + assert result_dropped is not tc + + class TestSeriesDropDuplicates: @pytest.mark.parametrize( "dtype", From f286bc75314b06c5b4f0d61c800f2d7f673cab19 Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Thu, 12 Mar 2020 18:08:05 +0100 Subject: [PATCH 3/7] testing duplicated for empty indices as well --- pandas/tests/indexes/test_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index c6ba5c9d61e9e..be071c55af828 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -304,10 +304,10 @@ def test_pickle(self, indices): @pytest.mark.parametrize("keep", ["first", "last", False]) def test_duplicated(self, indices, keep): - if not len(indices) or isinstance(indices, (MultiIndex, RangeIndex)): + if isinstance(indices, (MultiIndex, RangeIndex)): # MultiIndex tested separately in: # tests/indexes/multi/test_unique_and_duplicates - pytest.skip("Skip check for empty Index, MultiIndex, RangeIndex") + pytest.skip("Skip check for MultiIndex, RangeIndex") holder = type(indices) From b5e599cac5974dc76b929123197195bfd3e59fcb Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Thu, 12 Mar 2020 18:55:03 +0100 Subject: [PATCH 4/7] extended/enhanced tests for Index.duplicated and Index.drop_duplicates in tests/indexes/test_common.py --- pandas/tests/indexes/test_common.py | 46 ++++++++++++++++++----------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index be071c55af828..867a7c905e325 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -303,31 +303,41 @@ def test_pickle(self, indices): indices.name = original_name @pytest.mark.parametrize("keep", ["first", "last", False]) - def test_duplicated(self, indices, keep): + def test_duplicated_and_drop_duplicates(self, indices, keep): if isinstance(indices, (MultiIndex, RangeIndex)): # MultiIndex tested separately in: # tests/indexes/multi/test_unique_and_duplicates pytest.skip("Skip check for MultiIndex, RangeIndex") + # make unique index holder = type(indices) + unique_values = list(set(indices)) + unique_idx = holder(unique_values) - idx = holder(indices) - if idx.has_duplicates: - # We are testing the duplicated-method here, so we need to know - # exactly which indices are duplicate and how (for the result). - # This is not possible if "idx" has duplicates already, which we - # therefore remove. This is seemingly circular, as drop_duplicates - # invokes duplicated, but in the end, it all works out because we - # cross-check with Series.duplicated, which is tested separately. - idx = idx.drop_duplicates() - - n, k = len(idx), 10 - duplicated_selection = np.random.choice(n, k * n) - expected = pd.Series(duplicated_selection).duplicated(keep=keep).values - idx = holder(idx.values[duplicated_selection]) - - result = idx.duplicated(keep=keep) - tm.assert_numpy_array_equal(result, expected) + # check on unique index + expected_duplicated = np.array([False] * len(unique_idx), dtype="bool") + tm.assert_numpy_array_equal( + unique_idx.duplicated(keep=keep), expected_duplicated + ) + result_dropped = unique_idx.drop_duplicates(keep=keep) + tm.assert_index_equal(result_dropped, unique_idx) + # validate shallow copy + assert result_dropped is not unique_idx + + # make duplicated index + n = len(unique_idx) + duplicated_selection = np.random.choice(n, int(n * 1.5)) + idx = holder(unique_idx.values[duplicated_selection]) + + # Series.duplicated is tested separately + expected_duplicated = ( + pd.Series(duplicated_selection).duplicated(keep=keep).values + ) + tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected_duplicated) + + # Series.drop_duplicates is tested separately + expected_dropped = holder(pd.Series(idx).drop_duplicates(keep=keep)) + tm.assert_index_equal(idx.drop_duplicates(keep=keep), expected_dropped) def test_has_duplicates(self, indices): holder = type(indices) From 0329b57cafc5cfbe5381912923b37819c71dba8f Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Thu, 12 Mar 2020 19:07:54 +0100 Subject: [PATCH 5/7] split the test cases --- pandas/tests/indexes/test_common.py | 49 ++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 867a7c905e325..bb7363dfa50b3 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -303,27 +303,20 @@ def test_pickle(self, indices): indices.name = original_name @pytest.mark.parametrize("keep", ["first", "last", False]) - def test_duplicated_and_drop_duplicates(self, indices, keep): - if isinstance(indices, (MultiIndex, RangeIndex)): - # MultiIndex tested separately in: - # tests/indexes/multi/test_unique_and_duplicates - pytest.skip("Skip check for MultiIndex, RangeIndex") + def test_drop_duplicates(self, indices, keep): + if isinstance(indices, MultiIndex): + pytest.skip("MultiIndex is tested separately") + if isinstance(indices, RangeIndex): + pytest.skip( + "RangeIndex is tested in test_drop_duplicates_no_duplicates" + " as it cannot hold duplicates" + ) # make unique index holder = type(indices) unique_values = list(set(indices)) unique_idx = holder(unique_values) - # check on unique index - expected_duplicated = np.array([False] * len(unique_idx), dtype="bool") - tm.assert_numpy_array_equal( - unique_idx.duplicated(keep=keep), expected_duplicated - ) - result_dropped = unique_idx.drop_duplicates(keep=keep) - tm.assert_index_equal(result_dropped, unique_idx) - # validate shallow copy - assert result_dropped is not unique_idx - # make duplicated index n = len(unique_idx) duplicated_selection = np.random.choice(n, int(n * 1.5)) @@ -339,6 +332,32 @@ def test_duplicated_and_drop_duplicates(self, indices, keep): expected_dropped = holder(pd.Series(idx).drop_duplicates(keep=keep)) tm.assert_index_equal(idx.drop_duplicates(keep=keep), expected_dropped) + def test_drop_duplicates_no_duplicates(self, indices): + if isinstance(indices, MultiIndex): + pytest.skip("MultiIndex is tested separately") + + # make unique index + if isinstance(indices, RangeIndex): + # RangeIndex cannot have duplicates + unique_idx = indices + else: + holder = type(indices) + unique_values = list(set(indices)) + unique_idx = holder(unique_values) + + # check on unique index + expected_duplicated = np.array([False] * len(unique_idx), dtype="bool") + tm.assert_numpy_array_equal(unique_idx.duplicated(), expected_duplicated) + result_dropped = unique_idx.drop_duplicates() + tm.assert_index_equal(result_dropped, unique_idx) + # validate shallow copy + assert result_dropped is not unique_idx + + def test_drop_duplicates_inplace(self, indices): + msg = r"drop_duplicates\(\) got an unexpected keyword argument" + with pytest.raises(TypeError, match=msg): + indices.drop_duplicates(inplace=True) + def test_has_duplicates(self, indices): holder = type(indices) if not len(indices) or isinstance(indices, (MultiIndex, RangeIndex)): From 29c02fcbc78d9c9eb7366e3de9b0b33d67f9cc0b Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Thu, 12 Mar 2020 19:12:51 +0100 Subject: [PATCH 6/7] added keep fixture --- pandas/conftest.py | 9 +++++++++ pandas/tests/indexes/test_common.py | 1 - pandas/tests/series/methods/test_drop_duplicates.py | 1 - 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index dcfc523315c8b..d8f96021cdb15 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -425,6 +425,15 @@ def nselect_method(request): return request.param +@pytest.fixture(params=["first", "last", False]) +def keep(request): + """ + Valid values for the 'keep' parameter used in + .duplicated or .drop_duplicates + """ + return request.param + + @pytest.fixture(params=["left", "right", "both", "neither"]) def closed(request): """ diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index bb7363dfa50b3..422558199ae60 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -302,7 +302,6 @@ def test_pickle(self, indices): assert indices.equals(unpickled) indices.name = original_name - @pytest.mark.parametrize("keep", ["first", "last", False]) def test_drop_duplicates(self, indices, keep): if isinstance(indices, MultiIndex): pytest.skip("MultiIndex is tested separately") diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py index f1fdb2abdd578..54f32f979232d 100644 --- a/pandas/tests/series/methods/test_drop_duplicates.py +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -44,7 +44,6 @@ def test_drop_duplicates_bool(keep, expected): tm.assert_series_equal(sc, tc[~expected]) -@pytest.mark.parametrize("keep", ["first", "last", False]) @pytest.mark.parametrize("values", [[], list(range(5))]) def test_drop_duplicates_no_duplicates(any_numpy_dtype, keep, values): tc = Series(values, dtype=np.dtype(any_numpy_dtype)) From 57afc6bba902ff8c04b6fe9018dbc03be413cafc Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Fri, 13 Mar 2020 16:55:31 +0100 Subject: [PATCH 7/7] fixing some broken tests --- pandas/tests/indexes/test_common.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 422558199ae60..6f0920c11a6e6 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -310,6 +310,11 @@ def test_drop_duplicates(self, indices, keep): "RangeIndex is tested in test_drop_duplicates_no_duplicates" " as it cannot hold duplicates" ) + if len(indices) == 0: + pytest.skip( + "empty index is tested in test_drop_duplicates_no_duplicates" + " as it cannot hold duplicates" + ) # make unique index holder = type(indices)