diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index bee8025275b42..be2b73b8ff8e5 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -3,7 +3,6 @@ timedelta, ) import inspect -from itertools import permutations import numpy as np import pytest @@ -302,23 +301,24 @@ def test_reindex_limit(self): expected = DataFrame(exp_data) tm.assert_frame_equal(result, expected) - def test_reindex_level(self): - icol = ["jim", "joe", "jolie"] - - def verify_first_level(df, level, idx, check_index_type=True): - def f(val): - return np.nonzero((df[level] == val).to_numpy())[0] - - i = np.concatenate(list(map(f, idx))) - left = df.set_index(icol).reindex(idx, level=level) - right = df.iloc[i].set_index(icol) - tm.assert_frame_equal(left, right, check_index_type=check_index_type) - - def verify(df, level, idx, indexer, check_index_type=True): - left = df.set_index(icol).reindex(idx, level=level) - right = df.iloc[indexer].set_index(icol) - tm.assert_frame_equal(left, right, check_index_type=check_index_type) - + @pytest.mark.parametrize( + "idx, check_index_type", + [ + [["C", "B", "A"], True], + [["F", "C", "A", "D"], True], + [["A"], True], + [["A", "B", "C"], True], + [["C", "A", "B"], True], + [["C", "B"], True], + [["C", "A"], True], + [["A", "B"], True], + [["B", "A", "C"], True], + # reindex by these causes different MultiIndex levels + [["D", "F"], False], + [["A", "C", "B"], False], + ], + ) + def test_reindex_level_verify_first_level(self, idx, check_index_type): df = DataFrame( { "jim": list("B" * 4 + "A" * 2 + "C" * 3), @@ -327,35 +327,40 @@ def verify(df, level, idx, indexer, check_index_type=True): "joline": np.random.randint(0, 1000, 9), } ) + icol = ["jim", "joe", "jolie"] - target = [ - ["C", "B", "A"], - ["F", "C", "A", "D"], - ["A"], - ["A", "B", "C"], - ["C", "A", "B"], - ["C", "B"], - ["C", "A"], - ["A", "B"], - ["B", "A", "C"], - ] - - for idx in target: - verify_first_level(df, "jim", idx) - - # reindex by these causes different MultiIndex levels - for idx in [["D", "F"], ["A", "C", "B"]]: - verify_first_level(df, "jim", idx, check_index_type=False) + def f(val): + return np.nonzero((df["jim"] == val).to_numpy())[0] - verify(df, "joe", list("abcde"), [3, 2, 1, 0, 5, 4, 8, 7, 6]) - verify(df, "joe", list("abcd"), [3, 2, 1, 0, 5, 8, 7, 6]) - verify(df, "joe", list("abc"), [3, 2, 1, 8, 7, 6]) - verify(df, "joe", list("eca"), [1, 3, 4, 6, 8]) - verify(df, "joe", list("edc"), [0, 1, 4, 5, 6]) - verify(df, "joe", list("eadbc"), [3, 0, 2, 1, 4, 5, 8, 7, 6]) - verify(df, "joe", list("edwq"), [0, 4, 5]) - verify(df, "joe", list("wq"), [], check_index_type=False) + i = np.concatenate(list(map(f, idx))) + left = df.set_index(icol).reindex(idx, level="jim") + right = df.iloc[i].set_index(icol) + tm.assert_frame_equal(left, right, check_index_type=check_index_type) + @pytest.mark.parametrize( + "idx", + [ + ("mid",), + ("mid", "btm"), + ("mid", "btm", "top"), + ("mid",), + ("mid", "top"), + ("mid", "top", "btm"), + ("btm",), + ("btm", "mid"), + ("btm", "mid", "top"), + ("btm",), + ("btm", "top"), + ("btm", "top", "mid"), + ("top",), + ("top", "mid"), + ("top", "mid", "btm"), + ("top",), + ("top", "btm"), + ("top", "btm", "mid"), + ], + ) + def test_reindex_level_verify_first_level_repeats(self, idx): df = DataFrame( { "jim": ["mid"] * 5 + ["btm"] * 8 + ["top"] * 7, @@ -379,22 +384,86 @@ def verify(df, level, idx, indexer, check_index_type=True): "joline": np.random.randn(20).round(3) * 10, } ) + icol = ["jim", "joe", "jolie"] - for idx in permutations(df["jim"].unique()): - for i in range(3): - verify_first_level(df, "jim", idx[: i + 1]) - - i = [2, 3, 4, 0, 1, 8, 9, 5, 6, 7, 10, 11, 12, 13, 14, 18, 19, 15, 16, 17] - verify(df, "joe", ["1st", "2nd", "3rd"], i) + def f(val): + return np.nonzero((df["jim"] == val).to_numpy())[0] - i = [0, 1, 2, 3, 4, 10, 11, 12, 5, 6, 7, 8, 9, 15, 16, 17, 18, 19, 13, 14] - verify(df, "joe", ["3rd", "2nd", "1st"], i) + i = np.concatenate(list(map(f, idx))) + left = df.set_index(icol).reindex(idx, level="jim") + right = df.iloc[i].set_index(icol) + tm.assert_frame_equal(left, right) - i = [0, 1, 5, 6, 7, 10, 11, 12, 18, 19, 15, 16, 17] - verify(df, "joe", ["2nd", "3rd"], i) + @pytest.mark.parametrize( + "idx, indexer", + [ + [ + ["1st", "2nd", "3rd"], + [2, 3, 4, 0, 1, 8, 9, 5, 6, 7, 10, 11, 12, 13, 14, 18, 19, 15, 16, 17], + ], + [ + ["3rd", "2nd", "1st"], + [0, 1, 2, 3, 4, 10, 11, 12, 5, 6, 7, 8, 9, 15, 16, 17, 18, 19, 13, 14], + ], + [["2nd", "3rd"], [0, 1, 5, 6, 7, 10, 11, 12, 18, 19, 15, 16, 17]], + [["3rd", "1st"], [0, 1, 2, 3, 4, 10, 11, 12, 8, 9, 15, 16, 17, 13, 14]], + ], + ) + def test_reindex_level_verify_repeats(self, idx, indexer): + df = DataFrame( + { + "jim": ["mid"] * 5 + ["btm"] * 8 + ["top"] * 7, + "joe": ["3rd"] * 2 + + ["1st"] * 3 + + ["2nd"] * 3 + + ["1st"] * 2 + + ["3rd"] * 3 + + ["1st"] * 2 + + ["3rd"] * 3 + + ["2nd"] * 2, + # this needs to be jointly unique with jim and joe or + # reindexing will fail ~1.5% of the time, this works + # out to needing unique groups of same size as joe + "jolie": np.concatenate( + [ + np.random.choice(1000, x, replace=False) + for x in [2, 3, 3, 2, 3, 2, 3, 2] + ] + ), + "joline": np.random.randn(20).round(3) * 10, + } + ) + icol = ["jim", "joe", "jolie"] + left = df.set_index(icol).reindex(idx, level="joe") + right = df.iloc[indexer].set_index(icol) + tm.assert_frame_equal(left, right) - i = [0, 1, 2, 3, 4, 10, 11, 12, 8, 9, 15, 16, 17, 13, 14] - verify(df, "joe", ["3rd", "1st"], i) + @pytest.mark.parametrize( + "idx, indexer, check_index_type", + [ + [list("abcde"), [3, 2, 1, 0, 5, 4, 8, 7, 6], True], + [list("abcd"), [3, 2, 1, 0, 5, 8, 7, 6], True], + [list("abc"), [3, 2, 1, 8, 7, 6], True], + [list("eca"), [1, 3, 4, 6, 8], True], + [list("edc"), [0, 1, 4, 5, 6], True], + [list("eadbc"), [3, 0, 2, 1, 4, 5, 8, 7, 6], True], + [list("edwq"), [0, 4, 5], True], + [list("wq"), [], False], + ], + ) + def test_reindex_level_verify(self, idx, indexer, check_index_type): + df = DataFrame( + { + "jim": list("B" * 4 + "A" * 2 + "C" * 3), + "joe": list("abcdeabcd")[::-1], + "jolie": [10, 20, 30] * 3, + "joline": np.random.randint(0, 1000, 9), + } + ) + icol = ["jim", "joe", "jolie"] + left = df.set_index(icol).reindex(idx, level="joe") + right = df.iloc[indexer].set_index(icol) + tm.assert_frame_equal(left, right, check_index_type=check_index_type) def test_non_monotonic_reindex_methods(self): dr = date_range("2013-08-01", periods=6, freq="B") diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index e73885ebcc2c8..005d2600b2bae 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -751,21 +751,14 @@ def test_unstack_multi_level_rows_and_cols(self): expected = df.unstack(["i3"]).unstack(["i2"]) tm.assert_frame_equal(result, expected) - def test_unstack_nan_index1(self): + @pytest.mark.parametrize("idx", [("jim", "joe"), ("joe", "jim")]) + @pytest.mark.parametrize("lev", list(range(2))) + def test_unstack_nan_index1(self, idx, lev): # GH7466 def cast(val): val_str = "" if val != val else val return f"{val_str:1}" - def verify(df): - mk_list = lambda a: list(a) if isinstance(a, tuple) else [a] - rows, cols = df.notna().values.nonzero() - for i, j in zip(rows, cols): - left = sorted(df.iloc[i, j].split(".")) - right = mk_list(df.index[i]) + mk_list(df.columns[j]) - right = sorted(map(cast, right)) - assert left == right - df = DataFrame( { "jim": ["a", "b", np.nan, "d"], @@ -778,12 +771,24 @@ def verify(df): right = df.set_index(["joe", "jim"]).unstack()["jolie"].T tm.assert_frame_equal(left, right) - for idx in itertools.permutations(df.columns[:2]): - mi = df.set_index(list(idx)) - for lev in range(2): - udf = mi.unstack(level=lev) - assert udf.notna().values.sum() == len(df) - verify(udf["jolie"]) + mi = df.set_index(list(idx)) + udf = mi.unstack(level=lev) + assert udf.notna().values.sum() == len(df) + mk_list = lambda a: list(a) if isinstance(a, tuple) else [a] + rows, cols = udf["jolie"].notna().values.nonzero() + for i, j in zip(rows, cols): + left = sorted(udf["jolie"].iloc[i, j].split(".")) + right = mk_list(udf["jolie"].index[i]) + mk_list(udf["jolie"].columns[j]) + right = sorted(map(cast, right)) + assert left == right + + @pytest.mark.parametrize("idx", itertools.permutations(["1st", "2nd", "3rd"])) + @pytest.mark.parametrize("lev", list(range(3))) + @pytest.mark.parametrize("col", ["4th", "5th"]) + def test_unstack_nan_index_repeats(self, idx, lev, col): + def cast(val): + val_str = "" if val != val else val + return f"{val_str:1}" df = DataFrame( { @@ -830,13 +835,16 @@ def verify(df): df.apply(lambda r: ".".join(map(cast, r.iloc[::-1])), axis=1), ) - for idx in itertools.permutations(["1st", "2nd", "3rd"]): - mi = df.set_index(list(idx)) - for lev in range(3): - udf = mi.unstack(level=lev) - assert udf.notna().values.sum() == 2 * len(df) - for col in ["4th", "5th"]: - verify(udf[col]) + mi = df.set_index(list(idx)) + udf = mi.unstack(level=lev) + assert udf.notna().values.sum() == 2 * len(df) + mk_list = lambda a: list(a) if isinstance(a, tuple) else [a] + rows, cols = udf[col].notna().values.nonzero() + for i, j in zip(rows, cols): + left = sorted(udf[col].iloc[i, j].split(".")) + right = mk_list(udf[col].index[i]) + mk_list(udf[col].columns[j]) + right = sorted(map(cast, right)) + assert left == right def test_unstack_nan_index2(self): # GH7403