diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index 0bc234dcb39aa..177d10cdbf615 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -6,7 +6,7 @@ from pandas.errors import PerformanceWarning import pandas as pd -from pandas import DataFrame, Index, MultiIndex +from pandas import DataFrame, Index, MultiIndex, Series, Timestamp import pandas._testing as tm @@ -258,3 +258,162 @@ def test_drop_non_empty_list(self, index, drop_labels): # GH# 21494 with pytest.raises(KeyError, match="not found in axis"): pd.DataFrame(index=index).drop(drop_labels) + + def test_mixed_depth_drop(self): + arrays = [ + ["a", "top", "top", "routine1", "routine1", "routine2"], + ["", "OD", "OD", "result1", "result2", "result1"], + ["", "wx", "wy", "", "", ""], + ] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + df = DataFrame(np.random.randn(4, 6), columns=index) + + result = df.drop("a", axis=1) + expected = df.drop([("a", "", "")], axis=1) + tm.assert_frame_equal(expected, result) + + result = df.drop(["top"], axis=1) + expected = df.drop([("top", "OD", "wx")], axis=1) + expected = expected.drop([("top", "OD", "wy")], axis=1) + tm.assert_frame_equal(expected, result) + + result = df.drop(("top", "OD", "wx"), axis=1) + expected = df.drop([("top", "OD", "wx")], axis=1) + tm.assert_frame_equal(expected, result) + + expected = df.drop([("top", "OD", "wy")], axis=1) + expected = df.drop("top", axis=1) + + result = df.drop("result1", level=1, axis=1) + expected = df.drop( + [("routine1", "result1", ""), ("routine2", "result1", "")], axis=1 + ) + tm.assert_frame_equal(expected, result) + + def test_drop_multiindex_other_level_nan(self): + # GH#12754 + df = ( + DataFrame( + { + "A": ["one", "one", "two", "two"], + "B": [np.nan, 0.0, 1.0, 2.0], + "C": ["a", "b", "c", "c"], + "D": [1, 2, 3, 4], + } + ) + .set_index(["A", "B", "C"]) + .sort_index() + ) + result = df.drop("c", level="C") + expected = DataFrame( + [2, 1], + columns=["D"], + index=pd.MultiIndex.from_tuples( + [("one", 0.0, "b"), ("one", np.nan, "a")], names=["A", "B", "C"] + ), + ) + tm.assert_frame_equal(result, expected) + + def test_drop_nonunique(self): + df = DataFrame( + [ + ["x-a", "x", "a", 1.5], + ["x-a", "x", "a", 1.2], + ["z-c", "z", "c", 3.1], + ["x-a", "x", "a", 4.1], + ["x-b", "x", "b", 5.1], + ["x-b", "x", "b", 4.1], + ["x-b", "x", "b", 2.2], + ["y-a", "y", "a", 1.2], + ["z-b", "z", "b", 2.1], + ], + columns=["var1", "var2", "var3", "var4"], + ) + + grp_size = df.groupby("var1").size() + drop_idx = grp_size.loc[grp_size == 1] + + idf = df.set_index(["var1", "var2", "var3"]) + + # it works! GH#2101 + result = idf.drop(drop_idx.index, level=0).reset_index() + expected = df[-df.var1.isin(drop_idx.index)] + + result.index = expected.index + + tm.assert_frame_equal(result, expected) + + def test_drop_level(self): + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + frame = DataFrame( + np.random.randn(10, 3), + index=index, + columns=Index(["A", "B", "C"], name="exp"), + ) + + result = frame.drop(["bar", "qux"], level="first") + expected = frame.iloc[[0, 1, 2, 5, 6]] + tm.assert_frame_equal(result, expected) + + result = frame.drop(["two"], level="second") + expected = frame.iloc[[0, 2, 3, 6, 7, 9]] + tm.assert_frame_equal(result, expected) + + result = frame.T.drop(["bar", "qux"], axis=1, level="first") + expected = frame.iloc[[0, 1, 2, 5, 6]].T + tm.assert_frame_equal(result, expected) + + result = frame.T.drop(["two"], axis=1, level="second") + expected = frame.iloc[[0, 2, 3, 6, 7, 9]].T + tm.assert_frame_equal(result, expected) + + def test_drop_level_nonunique_datetime(self): + # GH#12701 + idx = Index([2, 3, 4, 4, 5], name="id") + idxdt = pd.to_datetime( + [ + "201603231400", + "201603231500", + "201603231600", + "201603231600", + "201603231700", + ] + ) + df = DataFrame(np.arange(10).reshape(5, 2), columns=list("ab"), index=idx) + df["tstamp"] = idxdt + df = df.set_index("tstamp", append=True) + ts = Timestamp("201603231600") + assert df.index.is_unique is False + + result = df.drop(ts, level="tstamp") + expected = df.loc[idx != 4] + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("box", [Series, DataFrame]) + def test_drop_tz_aware_timestamp_across_dst(self, box): + # GH#21761 + start = Timestamp("2017-10-29", tz="Europe/Berlin") + end = Timestamp("2017-10-29 04:00:00", tz="Europe/Berlin") + index = pd.date_range(start, end, freq="15min") + data = box(data=[1] * len(index), index=index) + result = data.drop(start) + expected_start = Timestamp("2017-10-29 00:15:00", tz="Europe/Berlin") + expected_idx = pd.date_range(expected_start, end, freq="15min") + expected = box(data=[1] * len(expected_idx), index=expected_idx) + tm.assert_equal(result, expected) + + def test_drop_preserve_names(self): + index = MultiIndex.from_arrays( + [[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]], names=["one", "two"] + ) + + df = DataFrame(np.random.randn(6, 3), index=index) + + result = df.drop([(0, 2)]) + assert result.index.names == ("one", "two") diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 3b3ae074c774a..d9da059eb9e9c 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -500,6 +500,34 @@ def test_contains_with_missing_value(self): assert np.nan not in idx assert (1, np.nan) in idx + def test_multiindex_contains_dropped(self): + # GH#19027 + # test that dropped MultiIndex levels are not in the MultiIndex + # despite continuing to be in the MultiIndex's levels + idx = MultiIndex.from_product([[1, 2], [3, 4]]) + assert 2 in idx + idx = idx.drop(2) + + # drop implementation keeps 2 in the levels + assert 2 in idx.levels[0] + # but it should no longer be in the index itself + assert 2 not in idx + + # also applies to strings + idx = MultiIndex.from_product([["a", "b"], ["c", "d"]]) + assert "a" in idx + idx = idx.drop("a") + assert "a" in idx.levels[0] + assert "a" not in idx + + def test_contains_td64_level(self): + # GH#24570 + tx = pd.timedelta_range("09:30:00", "16:00:00", freq="30 min") + idx = MultiIndex.from_arrays([tx, np.arange(len(tx))]) + assert tx[0] in idx + assert "element_not_exit" not in idx + assert "0 day 09:30:00" in idx + def test_timestamp_multiindex_indexer(): # https://github.com/pandas-dev/pandas/issues/26944 diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 0064187a94265..5e5fcd3db88d8 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -26,26 +26,6 @@ def test_multiindex_perf_warn(self): with tm.assert_produces_warning(PerformanceWarning): df.loc[(0,)] - def test_multiindex_contains_dropped(self): - # GH 19027 - # test that dropped MultiIndex levels are not in the MultiIndex - # despite continuing to be in the MultiIndex's levels - idx = MultiIndex.from_product([[1, 2], [3, 4]]) - assert 2 in idx - idx = idx.drop(2) - - # drop implementation keeps 2 in the levels - assert 2 in idx.levels[0] - # but it should no longer be in the index itself - assert 2 not in idx - - # also applies to strings - idx = MultiIndex.from_product([["a", "b"], ["c", "d"]]) - assert "a" in idx - idx = idx.drop("a") - assert "a" in idx.levels[0] - assert "a" not in idx - def test_indexing_over_hashtable_size_cutoff(self): n = 10000 @@ -85,14 +65,6 @@ def test_multi_nan_indexing(self): ) tm.assert_frame_equal(result, expected) - def test_contains(self): - # GH 24570 - tx = pd.timedelta_range("09:30:00", "16:00:00", freq="30 min") - idx = MultiIndex.from_arrays([tx, np.arange(len(tx))]) - assert tx[0] in idx - assert "element_not_exit" not in idx - assert "0 day 09:30:00" in idx - def test_nested_tuples_duplicates(self): # GH#30892 diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 84279d874bae1..0fdcc513ee126 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1252,92 +1252,6 @@ def test_level_with_tuples(self): tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) - def test_mixed_depth_drop(self): - arrays = [ - ["a", "top", "top", "routine1", "routine1", "routine2"], - ["", "OD", "OD", "result1", "result2", "result1"], - ["", "wx", "wy", "", "", ""], - ] - - tuples = sorted(zip(*arrays)) - index = MultiIndex.from_tuples(tuples) - df = DataFrame(randn(4, 6), columns=index) - - result = df.drop("a", axis=1) - expected = df.drop([("a", "", "")], axis=1) - tm.assert_frame_equal(expected, result) - - result = df.drop(["top"], axis=1) - expected = df.drop([("top", "OD", "wx")], axis=1) - expected = expected.drop([("top", "OD", "wy")], axis=1) - tm.assert_frame_equal(expected, result) - - result = df.drop(("top", "OD", "wx"), axis=1) - expected = df.drop([("top", "OD", "wx")], axis=1) - tm.assert_frame_equal(expected, result) - - expected = df.drop([("top", "OD", "wy")], axis=1) - expected = df.drop("top", axis=1) - - result = df.drop("result1", level=1, axis=1) - expected = df.drop( - [("routine1", "result1", ""), ("routine2", "result1", "")], axis=1 - ) - tm.assert_frame_equal(expected, result) - - def test_drop_multiindex_other_level_nan(self): - # GH 12754 - df = ( - DataFrame( - { - "A": ["one", "one", "two", "two"], - "B": [np.nan, 0.0, 1.0, 2.0], - "C": ["a", "b", "c", "c"], - "D": [1, 2, 3, 4], - } - ) - .set_index(["A", "B", "C"]) - .sort_index() - ) - result = df.drop("c", level="C") - expected = DataFrame( - [2, 1], - columns=["D"], - index=pd.MultiIndex.from_tuples( - [("one", 0.0, "b"), ("one", np.nan, "a")], names=["A", "B", "C"] - ), - ) - tm.assert_frame_equal(result, expected) - - def test_drop_nonunique(self): - df = DataFrame( - [ - ["x-a", "x", "a", 1.5], - ["x-a", "x", "a", 1.2], - ["z-c", "z", "c", 3.1], - ["x-a", "x", "a", 4.1], - ["x-b", "x", "b", 5.1], - ["x-b", "x", "b", 4.1], - ["x-b", "x", "b", 2.2], - ["y-a", "y", "a", 1.2], - ["z-b", "z", "b", 2.1], - ], - columns=["var1", "var2", "var3", "var4"], - ) - - grp_size = df.groupby("var1").size() - drop_idx = grp_size.loc[grp_size == 1] - - idf = df.set_index(["var1", "var2", "var3"]) - - # it works! #2101 - result = idf.drop(drop_idx.index, level=0).reset_index() - expected = df[-df.var1.isin(drop_idx.index)] - - result.index = expected.index - - tm.assert_frame_equal(result, expected) - def test_mixed_depth_pop(self): arrays = [ ["a", "top", "top", "routine1", "routine1", "routine2"], @@ -1380,68 +1294,6 @@ def test_reindex_level_partial_selection(self): result = self.frame.T.loc[:, ["foo", "qux"]] tm.assert_frame_equal(result, expected.T) - def test_drop_level(self): - result = self.frame.drop(["bar", "qux"], level="first") - expected = self.frame.iloc[[0, 1, 2, 5, 6]] - tm.assert_frame_equal(result, expected) - - result = self.frame.drop(["two"], level="second") - expected = self.frame.iloc[[0, 2, 3, 6, 7, 9]] - tm.assert_frame_equal(result, expected) - - result = self.frame.T.drop(["bar", "qux"], axis=1, level="first") - expected = self.frame.iloc[[0, 1, 2, 5, 6]].T - tm.assert_frame_equal(result, expected) - - result = self.frame.T.drop(["two"], axis=1, level="second") - expected = self.frame.iloc[[0, 2, 3, 6, 7, 9]].T - tm.assert_frame_equal(result, expected) - - def test_drop_level_nonunique_datetime(self): - # GH 12701 - idx = Index([2, 3, 4, 4, 5], name="id") - idxdt = pd.to_datetime( - [ - "201603231400", - "201603231500", - "201603231600", - "201603231600", - "201603231700", - ] - ) - df = DataFrame(np.arange(10).reshape(5, 2), columns=list("ab"), index=idx) - df["tstamp"] = idxdt - df = df.set_index("tstamp", append=True) - ts = Timestamp("201603231600") - assert df.index.is_unique is False - - result = df.drop(ts, level="tstamp") - expected = df.loc[idx != 4] - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("box", [Series, DataFrame]) - def test_drop_tz_aware_timestamp_across_dst(self, box): - # GH 21761 - start = Timestamp("2017-10-29", tz="Europe/Berlin") - end = Timestamp("2017-10-29 04:00:00", tz="Europe/Berlin") - index = pd.date_range(start, end, freq="15min") - data = box(data=[1] * len(index), index=index) - result = data.drop(start) - expected_start = Timestamp("2017-10-29 00:15:00", tz="Europe/Berlin") - expected_idx = pd.date_range(expected_start, end, freq="15min") - expected = box(data=[1] * len(expected_idx), index=expected_idx) - tm.assert_equal(result, expected) - - def test_drop_preserve_names(self): - index = MultiIndex.from_arrays( - [[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]], names=["one", "two"] - ) - - df = DataFrame(np.random.randn(6, 3), index=index) - - result = df.drop([(0, 2)]) - assert result.index.names == ("one", "two") - def test_unicode_repr_level_names(self): index = MultiIndex.from_tuples([(0, 0), (1, 1)], names=["\u0394", "i1"])