|
2 | 2 | import pytest
|
3 | 3 |
|
4 | 4 | import pandas as pd
|
5 |
| -from pandas import CategoricalDtype, DataFrame, IntervalIndex, MultiIndex, Series |
| 5 | +from pandas import CategoricalDtype, DataFrame, Index, IntervalIndex, MultiIndex, Series |
6 | 6 | import pandas._testing as tm
|
7 | 7 |
|
8 | 8 |
|
9 | 9 | class TestDataFrameSortIndex:
|
| 10 | + def test_sort_index_and_reconstruction_doc_example(self): |
| 11 | + # doc example |
| 12 | + df = DataFrame( |
| 13 | + {"value": [1, 2, 3, 4]}, |
| 14 | + index=MultiIndex( |
| 15 | + levels=[["a", "b"], ["bb", "aa"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] |
| 16 | + ), |
| 17 | + ) |
| 18 | + assert df.index.is_lexsorted() |
| 19 | + assert not df.index.is_monotonic |
| 20 | + |
| 21 | + # sort it |
| 22 | + expected = DataFrame( |
| 23 | + {"value": [2, 1, 4, 3]}, |
| 24 | + index=MultiIndex( |
| 25 | + levels=[["a", "b"], ["aa", "bb"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] |
| 26 | + ), |
| 27 | + ) |
| 28 | + result = df.sort_index() |
| 29 | + assert result.index.is_lexsorted() |
| 30 | + assert result.index.is_monotonic |
| 31 | + |
| 32 | + tm.assert_frame_equal(result, expected) |
| 33 | + |
| 34 | + # reconstruct |
| 35 | + result = df.sort_index().copy() |
| 36 | + result.index = result.index._sort_levels_monotonic() |
| 37 | + assert result.index.is_lexsorted() |
| 38 | + assert result.index.is_monotonic |
| 39 | + |
| 40 | + tm.assert_frame_equal(result, expected) |
| 41 | + |
| 42 | + def test_sort_index_non_existent_label_multiindex(self): |
| 43 | + # GH#12261 |
| 44 | + df = DataFrame(0, columns=[], index=MultiIndex.from_product([[], []])) |
| 45 | + df.loc["b", "2"] = 1 |
| 46 | + df.loc["a", "3"] = 1 |
| 47 | + result = df.sort_index().index.is_monotonic |
| 48 | + assert result is True |
| 49 | + |
| 50 | + def test_sort_index_reorder_on_ops(self): |
| 51 | + # GH#15687 |
| 52 | + df = DataFrame( |
| 53 | + np.random.randn(8, 2), |
| 54 | + index=MultiIndex.from_product( |
| 55 | + [["a", "b"], ["big", "small"], ["red", "blu"]], |
| 56 | + names=["letter", "size", "color"], |
| 57 | + ), |
| 58 | + columns=["near", "far"], |
| 59 | + ) |
| 60 | + df = df.sort_index() |
| 61 | + |
| 62 | + def my_func(group): |
| 63 | + group.index = ["newz", "newa"] |
| 64 | + return group |
| 65 | + |
| 66 | + result = df.groupby(level=["letter", "size"]).apply(my_func).sort_index() |
| 67 | + expected = MultiIndex.from_product( |
| 68 | + [["a", "b"], ["big", "small"], ["newa", "newz"]], |
| 69 | + names=["letter", "size", None], |
| 70 | + ) |
| 71 | + |
| 72 | + tm.assert_index_equal(result.index, expected) |
| 73 | + |
| 74 | + def test_sort_index_nan_multiindex(self): |
| 75 | + # GH#14784 |
| 76 | + # incorrect sorting w.r.t. nans |
| 77 | + tuples = [[12, 13], [np.nan, np.nan], [np.nan, 3], [1, 2]] |
| 78 | + mi = MultiIndex.from_tuples(tuples) |
| 79 | + |
| 80 | + df = DataFrame(np.arange(16).reshape(4, 4), index=mi, columns=list("ABCD")) |
| 81 | + s = Series(np.arange(4), index=mi) |
| 82 | + |
| 83 | + df2 = DataFrame( |
| 84 | + { |
| 85 | + "date": pd.DatetimeIndex( |
| 86 | + [ |
| 87 | + "20121002", |
| 88 | + "20121007", |
| 89 | + "20130130", |
| 90 | + "20130202", |
| 91 | + "20130305", |
| 92 | + "20121002", |
| 93 | + "20121207", |
| 94 | + "20130130", |
| 95 | + "20130202", |
| 96 | + "20130305", |
| 97 | + "20130202", |
| 98 | + "20130305", |
| 99 | + ] |
| 100 | + ), |
| 101 | + "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], |
| 102 | + "whole_cost": [ |
| 103 | + 1790, |
| 104 | + np.nan, |
| 105 | + 280, |
| 106 | + 259, |
| 107 | + np.nan, |
| 108 | + 623, |
| 109 | + 90, |
| 110 | + 312, |
| 111 | + np.nan, |
| 112 | + 301, |
| 113 | + 359, |
| 114 | + 801, |
| 115 | + ], |
| 116 | + "cost": [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12], |
| 117 | + } |
| 118 | + ).set_index(["date", "user_id"]) |
| 119 | + |
| 120 | + # sorting frame, default nan position is last |
| 121 | + result = df.sort_index() |
| 122 | + expected = df.iloc[[3, 0, 2, 1], :] |
| 123 | + tm.assert_frame_equal(result, expected) |
| 124 | + |
| 125 | + # sorting frame, nan position last |
| 126 | + result = df.sort_index(na_position="last") |
| 127 | + expected = df.iloc[[3, 0, 2, 1], :] |
| 128 | + tm.assert_frame_equal(result, expected) |
| 129 | + |
| 130 | + # sorting frame, nan position first |
| 131 | + result = df.sort_index(na_position="first") |
| 132 | + expected = df.iloc[[1, 2, 3, 0], :] |
| 133 | + tm.assert_frame_equal(result, expected) |
| 134 | + |
| 135 | + # sorting frame with removed rows |
| 136 | + result = df2.dropna().sort_index() |
| 137 | + expected = df2.sort_index().dropna() |
| 138 | + tm.assert_frame_equal(result, expected) |
| 139 | + |
| 140 | + # sorting series, default nan position is last |
| 141 | + result = s.sort_index() |
| 142 | + expected = s.iloc[[3, 0, 2, 1]] |
| 143 | + tm.assert_series_equal(result, expected) |
| 144 | + |
| 145 | + # sorting series, nan position last |
| 146 | + result = s.sort_index(na_position="last") |
| 147 | + expected = s.iloc[[3, 0, 2, 1]] |
| 148 | + tm.assert_series_equal(result, expected) |
| 149 | + |
| 150 | + # sorting series, nan position first |
| 151 | + result = s.sort_index(na_position="first") |
| 152 | + expected = s.iloc[[1, 2, 3, 0]] |
| 153 | + tm.assert_series_equal(result, expected) |
| 154 | + |
10 | 155 | def test_sort_index_nan(self):
|
11 | 156 | # GH#3917
|
12 | 157 |
|
@@ -318,3 +463,196 @@ def test_sort_index_ignore_index_multi_index(
|
318 | 463 |
|
319 | 464 | tm.assert_frame_equal(result_df, expected_df)
|
320 | 465 | tm.assert_frame_equal(df, DataFrame(original_dict, index=mi))
|
| 466 | + |
| 467 | + def test_sort_index_categorical_multiindex(self): |
| 468 | + # GH#15058 |
| 469 | + df = DataFrame( |
| 470 | + { |
| 471 | + "a": range(6), |
| 472 | + "l1": pd.Categorical( |
| 473 | + ["a", "a", "b", "b", "c", "c"], |
| 474 | + categories=["c", "a", "b"], |
| 475 | + ordered=True, |
| 476 | + ), |
| 477 | + "l2": [0, 1, 0, 1, 0, 1], |
| 478 | + } |
| 479 | + ) |
| 480 | + result = df.set_index(["l1", "l2"]).sort_index() |
| 481 | + expected = DataFrame( |
| 482 | + [4, 5, 0, 1, 2, 3], |
| 483 | + columns=["a"], |
| 484 | + index=MultiIndex( |
| 485 | + levels=[ |
| 486 | + pd.CategoricalIndex( |
| 487 | + ["c", "a", "b"], |
| 488 | + categories=["c", "a", "b"], |
| 489 | + ordered=True, |
| 490 | + name="l1", |
| 491 | + dtype="category", |
| 492 | + ), |
| 493 | + [0, 1], |
| 494 | + ], |
| 495 | + codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], |
| 496 | + names=["l1", "l2"], |
| 497 | + ), |
| 498 | + ) |
| 499 | + tm.assert_frame_equal(result, expected) |
| 500 | + |
| 501 | + def test_sort_index_and_reconstruction(self): |
| 502 | + |
| 503 | + # GH#15622 |
| 504 | + # lexsortedness should be identical |
| 505 | + # across MultiIndex construction methods |
| 506 | + |
| 507 | + df = DataFrame([[1, 1], [2, 2]], index=list("ab")) |
| 508 | + expected = DataFrame( |
| 509 | + [[1, 1], [2, 2], [1, 1], [2, 2]], |
| 510 | + index=MultiIndex.from_tuples( |
| 511 | + [(0.5, "a"), (0.5, "b"), (0.8, "a"), (0.8, "b")] |
| 512 | + ), |
| 513 | + ) |
| 514 | + assert expected.index.is_lexsorted() |
| 515 | + |
| 516 | + result = DataFrame( |
| 517 | + [[1, 1], [2, 2], [1, 1], [2, 2]], |
| 518 | + index=MultiIndex.from_product([[0.5, 0.8], list("ab")]), |
| 519 | + ) |
| 520 | + result = result.sort_index() |
| 521 | + assert result.index.is_lexsorted() |
| 522 | + assert result.index.is_monotonic |
| 523 | + |
| 524 | + tm.assert_frame_equal(result, expected) |
| 525 | + |
| 526 | + result = DataFrame( |
| 527 | + [[1, 1], [2, 2], [1, 1], [2, 2]], |
| 528 | + index=MultiIndex( |
| 529 | + levels=[[0.5, 0.8], ["a", "b"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] |
| 530 | + ), |
| 531 | + ) |
| 532 | + result = result.sort_index() |
| 533 | + assert result.index.is_lexsorted() |
| 534 | + |
| 535 | + tm.assert_frame_equal(result, expected) |
| 536 | + |
| 537 | + concatted = pd.concat([df, df], keys=[0.8, 0.5]) |
| 538 | + result = concatted.sort_index() |
| 539 | + |
| 540 | + assert result.index.is_lexsorted() |
| 541 | + assert result.index.is_monotonic |
| 542 | + |
| 543 | + tm.assert_frame_equal(result, expected) |
| 544 | + |
| 545 | + # GH#14015 |
| 546 | + df = DataFrame( |
| 547 | + [[1, 2], [6, 7]], |
| 548 | + columns=MultiIndex.from_tuples( |
| 549 | + [(0, "20160811 12:00:00"), (0, "20160809 12:00:00")], |
| 550 | + names=["l1", "Date"], |
| 551 | + ), |
| 552 | + ) |
| 553 | + |
| 554 | + df.columns.set_levels( |
| 555 | + pd.to_datetime(df.columns.levels[1]), level=1, inplace=True |
| 556 | + ) |
| 557 | + assert not df.columns.is_lexsorted() |
| 558 | + assert not df.columns.is_monotonic |
| 559 | + result = df.sort_index(axis=1) |
| 560 | + assert result.columns.is_lexsorted() |
| 561 | + assert result.columns.is_monotonic |
| 562 | + result = df.sort_index(axis=1, level=1) |
| 563 | + assert result.columns.is_lexsorted() |
| 564 | + assert result.columns.is_monotonic |
| 565 | + |
| 566 | + # TODO: better name, de-duplicate with test_sort_index_level above |
| 567 | + def test_sort_index_level2(self): |
| 568 | + mi = MultiIndex( |
| 569 | + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], |
| 570 | + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], |
| 571 | + names=["first", "second"], |
| 572 | + ) |
| 573 | + frame = DataFrame( |
| 574 | + np.random.randn(10, 3), |
| 575 | + index=mi, |
| 576 | + columns=Index(["A", "B", "C"], name="exp"), |
| 577 | + ) |
| 578 | + |
| 579 | + df = frame.copy() |
| 580 | + df.index = np.arange(len(df)) |
| 581 | + |
| 582 | + # axis=1 |
| 583 | + |
| 584 | + # series |
| 585 | + a_sorted = frame["A"].sort_index(level=0) |
| 586 | + |
| 587 | + # preserve names |
| 588 | + assert a_sorted.index.names == frame.index.names |
| 589 | + |
| 590 | + # inplace |
| 591 | + rs = frame.copy() |
| 592 | + rs.sort_index(level=0, inplace=True) |
| 593 | + tm.assert_frame_equal(rs, frame.sort_index(level=0)) |
| 594 | + |
| 595 | + def test_sort_index_level_large_cardinality(self): |
| 596 | + |
| 597 | + # GH#2684 (int64) |
| 598 | + index = MultiIndex.from_arrays([np.arange(4000)] * 3) |
| 599 | + df = DataFrame(np.random.randn(4000), index=index, dtype=np.int64) |
| 600 | + |
| 601 | + # it works! |
| 602 | + result = df.sort_index(level=0) |
| 603 | + assert result.index.lexsort_depth == 3 |
| 604 | + |
| 605 | + # GH#2684 (int32) |
| 606 | + index = MultiIndex.from_arrays([np.arange(4000)] * 3) |
| 607 | + df = DataFrame(np.random.randn(4000), index=index, dtype=np.int32) |
| 608 | + |
| 609 | + # it works! |
| 610 | + result = df.sort_index(level=0) |
| 611 | + assert (result.dtypes.values == df.dtypes.values).all() |
| 612 | + assert result.index.lexsort_depth == 3 |
| 613 | + |
| 614 | + def test_sort_index_level_by_name(self): |
| 615 | + mi = MultiIndex( |
| 616 | + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], |
| 617 | + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], |
| 618 | + names=["first", "second"], |
| 619 | + ) |
| 620 | + frame = DataFrame( |
| 621 | + np.random.randn(10, 3), |
| 622 | + index=mi, |
| 623 | + columns=Index(["A", "B", "C"], name="exp"), |
| 624 | + ) |
| 625 | + |
| 626 | + frame.index.names = ["first", "second"] |
| 627 | + result = frame.sort_index(level="second") |
| 628 | + expected = frame.sort_index(level=1) |
| 629 | + tm.assert_frame_equal(result, expected) |
| 630 | + |
| 631 | + def test_sort_index_level_mixed(self): |
| 632 | + mi = MultiIndex( |
| 633 | + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], |
| 634 | + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], |
| 635 | + names=["first", "second"], |
| 636 | + ) |
| 637 | + frame = DataFrame( |
| 638 | + np.random.randn(10, 3), |
| 639 | + index=mi, |
| 640 | + columns=Index(["A", "B", "C"], name="exp"), |
| 641 | + ) |
| 642 | + |
| 643 | + sorted_before = frame.sort_index(level=1) |
| 644 | + |
| 645 | + df = frame.copy() |
| 646 | + df["foo"] = "bar" |
| 647 | + sorted_after = df.sort_index(level=1) |
| 648 | + tm.assert_frame_equal(sorted_before, sorted_after.drop(["foo"], axis=1)) |
| 649 | + |
| 650 | + dft = frame.T |
| 651 | + sorted_before = dft.sort_index(level=1, axis=1) |
| 652 | + dft["foo", "three"] = "bar" |
| 653 | + |
| 654 | + sorted_after = dft.sort_index(level=1, axis=1) |
| 655 | + tm.assert_frame_equal( |
| 656 | + sorted_before.drop([("foo", "three")], axis=1), |
| 657 | + sorted_after.drop([("foo", "three")], axis=1), |
| 658 | + ) |
0 commit comments