diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py
index 543edc6b66ff2..83fe21fd20bfe 100644
--- a/pandas/tests/indexes/categorical/test_category.py
+++ b/pandas/tests/indexes/categorical/test_category.py
@@ -292,16 +292,81 @@ def test_is_monotonic(self, data, non_lexsorted_data):
         assert c.is_monotonic_decreasing is False
 
     def test_has_duplicates(self):
-
         idx = CategoricalIndex([0, 0, 0], name="foo")
         assert idx.is_unique is False
         assert idx.has_duplicates is True
 
-    def test_drop_duplicates(self):
+        idx = CategoricalIndex([0, 1], categories=[2, 3], name="foo")
+        assert idx.is_unique is False
+        assert idx.has_duplicates is True
 
-        idx = CategoricalIndex([0, 0, 0], name="foo")
-        expected = CategoricalIndex([0], name="foo")
-        tm.assert_index_equal(idx.drop_duplicates(), expected)
+        idx = CategoricalIndex([0, 1, 2, 3], categories=[1, 2, 3], name="foo")
+        assert idx.is_unique is True
+        assert idx.has_duplicates is False
+
+    @pytest.mark.parametrize(
+        "data, categories, expected",
+        [
+            (
+                [1, 1, 1],
+                [1, 2, 3],
+                {
+                    "first": np.array([False, True, True]),
+                    "last": np.array([True, True, False]),
+                    False: np.array([True, True, True]),
+                },
+            ),
+            (
+                [1, 1, 1],
+                list("abc"),
+                {
+                    "first": np.array([False, True, True]),
+                    "last": np.array([True, True, False]),
+                    False: np.array([True, True, True]),
+                },
+            ),
+            (
+                [2, "a", "b"],
+                list("abc"),
+                {
+                    "first": np.zeros(shape=(3), dtype=np.bool),
+                    "last": np.zeros(shape=(3), dtype=np.bool),
+                    False: np.zeros(shape=(3), dtype=np.bool),
+                },
+            ),
+            (
+                list("abb"),
+                list("abc"),
+                {
+                    "first": np.array([False, False, True]),
+                    "last": np.array([False, True, False]),
+                    False: np.array([False, True, True]),
+                },
+            ),
+        ],
+    )
+    def test_drop_duplicates(self, data, categories, expected):
+
+        idx = CategoricalIndex(data, categories=categories, name="foo")
+        for keep, e in expected.items():
+            tm.assert_numpy_array_equal(idx.duplicated(keep=keep), e)
+            e = idx[~e]
+            result = idx.drop_duplicates(keep=keep)
+            tm.assert_index_equal(result, e)
+
+    @pytest.mark.parametrize(
+        "data, categories, expected_data, expected_categories",
+        [
+            ([1, 1, 1], [1, 2, 3], [1], [1]),
+            ([1, 1, 1], list("abc"), [np.nan], []),
+            ([1, 2, "a"], [1, 2, 3], [1, 2, np.nan], [1, 2]),
+            ([2, "a", "b"], list("abc"), [np.nan, "a", "b"], ["a", "b"]),
+        ],
+    )
+    def test_unique(self, data, categories, expected_data, expected_categories):
+
+        idx = CategoricalIndex(data, categories=categories)
+        expected = CategoricalIndex(expected_data, categories=expected_categories)
         tm.assert_index_equal(idx.unique(), expected)
 
     def test_repr_roundtrip(self):
diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py
index a9fb228073ab4..fb17e1df6341b 100644
--- a/pandas/tests/indexes/conftest.py
+++ b/pandas/tests/indexes/conftest.py
@@ -16,3 +16,12 @@ def sort(request):
     in in the Index setops methods.
     """
     return request.param
+
+
+@pytest.fixture(params=["D", "3D", "-3D", "H", "2H", "-2H", "T", "2T", "S", "-3S"])
+def freq_sample(request):
+    """
+    Valid values for 'freq' parameter used to create date_range and
+    timedelta_range.
+    """
+    return request.param
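Illustrative sketch (not part of the patch) of the behaviour the expected arrays in the CategoricalIndex cases above encode, using only public pandas API:

import pandas as pd

# Values absent from `categories` are coerced to NaN, so this index holds
# [NaN, "a", "b"] and contains no duplicates under any `keep` policy.
ci = pd.CategoricalIndex([2, "a", "b"], categories=list("abc"))
assert not ci.duplicated(keep="first").any()
assert not ci.duplicated(keep=False).any()

# unique() also drops categories that no longer appear in the data,
# which is why expected_categories above is ["a", "b"] rather than "abc".
assert list(ci.unique().categories) == ["a", "b"]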
+ """ + return request.param diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index cbf6b7b63bd50..c55b0481c1041 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -264,9 +264,9 @@ def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture) tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) assert ordered.freq is None - def test_drop_duplicates_metadata(self): + def test_drop_duplicates_metadata(self, freq_sample): # GH 10115 - idx = pd.date_range("2011-01-01", "2011-01-31", freq="D", name="idx") + idx = pd.date_range("2011-01-01", freq=freq_sample, periods=10, name="idx") result = idx.drop_duplicates() tm.assert_index_equal(idx, result) assert idx.freq == result.freq @@ -277,57 +277,38 @@ def test_drop_duplicates_metadata(self): tm.assert_index_equal(idx, result) assert result.freq is None - def test_drop_duplicates(self): + @pytest.mark.parametrize( + "keep, expected, index", + [ + ("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)), + ("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)), + ( + False, + np.concatenate(([True] * 5, [False] * 5, [True] * 5)), + np.arange(5, 10), + ), + ], + ) + def test_drop_duplicates(self, freq_sample, keep, expected, index): # to check Index/Series compat - base = pd.date_range("2011-01-01", "2011-01-31", freq="D", name="idx") - idx = base.append(base[:5]) + idx = pd.date_range("2011-01-01", freq=freq_sample, periods=10, name="idx") + idx = idx.append(idx[:5]) - res = idx.drop_duplicates() - tm.assert_index_equal(res, base) - res = Series(idx).drop_duplicates() - tm.assert_series_equal(res, Series(base)) + tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected) + expected = idx[~expected] - res = idx.drop_duplicates(keep="last") - exp = base[5:].append(base[:5]) - tm.assert_index_equal(res, exp) - res = Series(idx).drop_duplicates(keep="last") - tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) + result = idx.drop_duplicates(keep=keep) + tm.assert_index_equal(result, expected) - res = idx.drop_duplicates(keep=False) - tm.assert_index_equal(res, base[5:]) - res = Series(idx).drop_duplicates(keep=False) - tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + result = Series(idx).drop_duplicates(keep=keep) + tm.assert_series_equal(result, Series(expected, index=index)) - @pytest.mark.parametrize( - "freq", - [ - "A", - "2A", - "-2A", - "Q", - "-1Q", - "M", - "-1M", - "D", - "3D", - "-3D", - "W", - "-1W", - "H", - "2H", - "-2H", - "T", - "2T", - "S", - "-3S", - ], - ) - def test_infer_freq(self, freq): + def test_infer_freq(self, freq_sample): # GH 11018 - idx = pd.date_range("2011-01-01 09:00:00", freq=freq, periods=10) + idx = pd.date_range("2011-01-01 09:00:00", freq=freq_sample, periods=10) result = pd.DatetimeIndex(idx.asi8, freq="infer") tm.assert_index_equal(idx, result) - assert result.freq == freq + assert result.freq == freq_sample def test_nat(self, tz_naive_fixture): tz = tz_naive_fixture diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 196946e696c8d..fc44226f9d72f 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -81,9 +81,10 @@ def test_value_counts_unique(self): tm.assert_index_equal(idx.unique(), exp_idx) - def test_drop_duplicates_metadata(self): + @pytest.mark.parametrize("freq", ["D", "3D", "H", "2H", "T", "2T", "S", 
"3S"]) + def test_drop_duplicates_metadata(self, freq): # GH 10115 - idx = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx") + idx = pd.period_range("2011-01-01", periods=10, freq=freq, name="idx") result = idx.drop_duplicates() tm.assert_index_equal(idx, result) assert idx.freq == result.freq @@ -93,26 +94,32 @@ def test_drop_duplicates_metadata(self): tm.assert_index_equal(idx, result) assert idx.freq == result.freq - def test_drop_duplicates(self): + @pytest.mark.parametrize("freq", ["D", "3D", "H", "2H", "T", "2T", "S", "3S"]) + @pytest.mark.parametrize( + "keep, expected, index", + [ + ("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)), + ("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)), + ( + False, + np.concatenate(([True] * 5, [False] * 5, [True] * 5)), + np.arange(5, 10), + ), + ], + ) + def test_drop_duplicates(self, freq, keep, expected, index): # to check Index/Series compat - base = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx") - idx = base.append(base[:5]) - - res = idx.drop_duplicates() - tm.assert_index_equal(res, base) - res = Series(idx).drop_duplicates() - tm.assert_series_equal(res, Series(base)) - - res = idx.drop_duplicates(keep="last") - exp = base[5:].append(base[:5]) - tm.assert_index_equal(res, exp) - res = Series(idx).drop_duplicates(keep="last") - tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) - - res = idx.drop_duplicates(keep=False) - tm.assert_index_equal(res, base[5:]) - res = Series(idx).drop_duplicates(keep=False) - tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + idx = pd.period_range("2011-01-01", periods=10, freq=freq, name="idx") + idx = idx.append(idx[:5]) + + tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected) + expected = idx[~expected] + + result = idx.drop_duplicates(keep=keep) + tm.assert_index_equal(result, expected) + + result = Series(idx).drop_duplicates(keep=keep) + tm.assert_series_equal(result, Series(expected, index=index)) def test_order_compat(self): def _check_freq(index, expected_index): diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index 4af5df6e2cc55..aa1bf997fc66b 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -134,9 +134,9 @@ def test_order(self): tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) assert ordered.freq is None - def test_drop_duplicates_metadata(self): + def test_drop_duplicates_metadata(self, freq_sample): # GH 10115 - idx = pd.timedelta_range("1 day", "31 day", freq="D", name="idx") + idx = pd.timedelta_range("1 day", periods=10, freq=freq_sample, name="idx") result = idx.drop_duplicates() tm.assert_index_equal(idx, result) assert idx.freq == result.freq @@ -147,36 +147,38 @@ def test_drop_duplicates_metadata(self): tm.assert_index_equal(idx, result) assert result.freq is None - def test_drop_duplicates(self): + @pytest.mark.parametrize( + "keep, expected, index", + [ + ("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)), + ("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)), + ( + False, + np.concatenate(([True] * 5, [False] * 5, [True] * 5)), + np.arange(5, 10), + ), + ], + ) + def test_drop_duplicates(self, freq_sample, keep, expected, index): # to check Index/Series compat - base = pd.timedelta_range("1 day", "31 day", freq="D", name="idx") - idx = base.append(base[:5]) + idx = pd.timedelta_range("1 day", 
diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py
index 4af5df6e2cc55..aa1bf997fc66b 100644
--- a/pandas/tests/indexes/timedeltas/test_ops.py
+++ b/pandas/tests/indexes/timedeltas/test_ops.py
@@ -134,9 +134,9 @@ def test_order(self):
         tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
         assert ordered.freq is None
 
-    def test_drop_duplicates_metadata(self):
+    def test_drop_duplicates_metadata(self, freq_sample):
         # GH 10115
-        idx = pd.timedelta_range("1 day", "31 day", freq="D", name="idx")
+        idx = pd.timedelta_range("1 day", periods=10, freq=freq_sample, name="idx")
         result = idx.drop_duplicates()
         tm.assert_index_equal(idx, result)
         assert idx.freq == result.freq
@@ -147,36 +147,38 @@ def test_drop_duplicates_metadata(self):
         tm.assert_index_equal(idx, result)
         assert result.freq is None
 
-    def test_drop_duplicates(self):
+    @pytest.mark.parametrize(
+        "keep, expected, index",
+        [
+            ("first", np.concatenate(([False] * 10, [True] * 5)), np.arange(0, 10)),
+            ("last", np.concatenate(([True] * 5, [False] * 10)), np.arange(5, 15)),
+            (
+                False,
+                np.concatenate(([True] * 5, [False] * 5, [True] * 5)),
+                np.arange(5, 10),
+            ),
+        ],
+    )
+    def test_drop_duplicates(self, freq_sample, keep, expected, index):
         # to check Index/Series compat
-        base = pd.timedelta_range("1 day", "31 day", freq="D", name="idx")
-        idx = base.append(base[:5])
+        idx = pd.timedelta_range("1 day", periods=10, freq=freq_sample, name="idx")
+        idx = idx.append(idx[:5])
 
-        res = idx.drop_duplicates()
-        tm.assert_index_equal(res, base)
-        res = Series(idx).drop_duplicates()
-        tm.assert_series_equal(res, Series(base))
+        tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected)
+        expected = idx[~expected]
 
-        res = idx.drop_duplicates(keep="last")
-        exp = base[5:].append(base[:5])
-        tm.assert_index_equal(res, exp)
-        res = Series(idx).drop_duplicates(keep="last")
-        tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36)))
+        result = idx.drop_duplicates(keep=keep)
+        tm.assert_index_equal(result, expected)
 
-        res = idx.drop_duplicates(keep=False)
-        tm.assert_index_equal(res, base[5:])
-        res = Series(idx).drop_duplicates(keep=False)
-        tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31)))
+        result = Series(idx).drop_duplicates(keep=keep)
+        tm.assert_series_equal(result, Series(expected, index=index))
 
-    @pytest.mark.parametrize(
-        "freq", ["D", "3D", "-3D", "H", "2H", "-2H", "T", "2T", "S", "-3S"]
-    )
-    def test_infer_freq(self, freq):
+    def test_infer_freq(self, freq_sample):
         # GH#11018
-        idx = pd.timedelta_range("1", freq=freq, periods=10)
+        idx = pd.timedelta_range("1", freq=freq_sample, periods=10)
         result = pd.TimedeltaIndex(idx.asi8, freq="infer")
         tm.assert_index_equal(idx, result)
-        assert result.freq == freq
+        assert result.freq == freq_sample
 
     def test_repeat(self):
         index = pd.timedelta_range("1 days", periods=2, freq="D")
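And a brief sketch (not from the patch) of the freq="infer" round trip that test_infer_freq exercises for each freq_sample value, here with a negative frequency:

import pandas as pd

tdi = pd.timedelta_range("1 day", freq="-3D", periods=10)

# Rebuilding from the raw int64 values with freq="infer" should recover both
# the values and the original (here negative) frequency.
roundtrip = pd.TimedeltaIndex(tdi.asi8, freq="infer")
pd.testing.assert_index_equal(tdi, roundtrip)
assert roundtrip.freq == "-3D"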