From 47c67d6099c5b66c9621e36ffcb8bf42d45e590f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 15 Mar 2017 19:15:14 -0400 Subject: [PATCH 01/11] BUG: construct MultiIndex identically from levels/labels when concatting closes #15622 closes #15687 closes #14015 closes #13431 --- asv_bench/benchmarks/timeseries.py | 5 +- doc/source/whatsnew/v0.20.0.txt | 74 +++++++++++++++- pandas/core/frame.py | 18 ++-- pandas/core/groupby.py | 9 +- pandas/core/reshape.py | 9 +- pandas/core/series.py | 18 +++- pandas/core/sorting.py | 5 ++ pandas/indexes/multi.py | 52 ++++++++++- pandas/tests/indexes/test_multi.py | 53 +++++++++++ pandas/tests/series/test_analytics.py | 2 +- pandas/tests/test_multilevel.py | 122 ++++++++++++++++++++++++++ pandas/tests/tools/test_hashing.py | 29 ++++++ pandas/tests/tools/test_pivot.py | 3 +- 13 files changed, 375 insertions(+), 24 deletions(-) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 6e9ef4b10273c..dfe3f0ef87c11 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -292,7 +292,10 @@ def setup(self): self.rng3 = date_range(start='1/1/2000', periods=1500000, freq='S') self.ts3 = Series(1, index=self.rng3) - def time_sort_index(self): + def time_sort_index_monotonic(self): + self.ts2.sort_index() + + def time_sort_index_non_monotonic(self): self.ts.sort_index() def time_timeseries_slice_minutely(self): diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index cb9e2496757ef..279b93f75a08e 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -714,6 +714,78 @@ If indicated, a deprecation warning will be issued if you reference that module. "pandas._hash", "pandas.tools.libhash", "" "pandas._window", "pandas.core.libwindow", "" +.. 
_whatsnew_0200.api_breaking.sort_index: + +DataFrame.sort_index changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In certain cases, calling ``.sort_index()`` on a MultiIndexed DataFrame would return the *same* DataFrame without seeming to sort. +This would happen with a ``lexsorted``, but non-montonic levels. (:issue:`15622`, :issue:`15687`, :issue:`14015`, :issue:`13431`) + +This is UNCHANGED between versions, but showing for illustration purposes: + +.. ipython:: python + + df = DataFrame(np.arange(6), columns=['value'], index=MultiIndex.from_product([list('BA'), range(3)])) + df + +.. ipython:: python + + df.index.is_lexsorted() + df.index.is_monotonic + +Sorting works as expected + +.. ipython:: python + + df.sort_index() + +.. ipython:: python + + df.sort_index().index.is_lexsorted() + df.sort_index().index.is_monotonic + +However, this example, which has a monotonic level, doesn't behave as desired. + +.. ipython:: python + df = pd.DataFrame({'value': [1, 2, 3, 4]}, + index=pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], + labels=[[0, 0, 1, 1], [0, 1, 0, 1]])) + +Previous Behavior: + +.. ipython:: python + + In [11]: df.sort_index() + Out[11]: + value + a bb 1 + aa 2 + b bb 3 + aa 4 + + In [14]: df.sort_index().index.is_lexsorted() + Out[14]: True + + In [15]: df.sort_index().index.is_monotonic + Out[15]: False + +New Behavior: + +.. ipython:: python + + df.sort_index() + df.sort_index().index.is_lexsorted() + df.sort_index().index.is_monotonic + +Previous Behavior: + +.. code-block:: ipython + +New Behavior: + +.. ipython:: python + .. _whatsnew_0200.api_breaking.groupby_describe: @@ -965,7 +1037,7 @@ Performance Improvements - Improve performance of ``pd.core.groupby.GroupBy.apply`` when the applied function used the ``.name`` attribute of the group DataFrame (:issue:`15062`). - Improved performance of ``iloc`` indexing with a list or array (:issue:`15504`). - +- Improved performance of ``Series.sort_index()`` with a monotonic index (:issue:`15694`) .. 
_whatsnew_0200.bug_fixes: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f6199be2d1fc9..e3379261625f2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3322,6 +3322,10 @@ def trans(v): def sort_index(self, axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True, by=None): + + # TODO: this can be combined with Series.sort_index impl as + # almost identical + inplace = validate_bool_kwarg(inplace, 'inplace') # 10726 if by is not None: @@ -3335,8 +3339,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, axis = self._get_axis_number(axis) labels = self._get_axis(axis) - # sort by the index - if level is not None: + if level: new_axis, indexer = labels.sortlevel(level, ascending=ascending, sort_remaining=sort_remaining) @@ -3346,17 +3349,15 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, # make sure that the axis is lexsorted to start # if not we need to reconstruct to get the correct indexer - if not labels.is_lexsorted(): - labels = MultiIndex.from_tuples(labels.values) + labels = labels._reconstruct(sort=True) indexer = lexsort_indexer(labels.labels, orders=ascending, na_position=na_position) else: from pandas.core.sorting import nargsort - # GH11080 - Check monotonic-ness before sort an index - # if monotonic (already sorted), return None or copy() according - # to 'inplace' + # Check monotonic-ness before sort an index + # GH11080 if ((ascending and labels.is_monotonic_increasing) or (not ascending and labels.is_monotonic_decreasing)): if inplace: @@ -3367,8 +3368,9 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, indexer = nargsort(labels, kind=kind, ascending=ascending, na_position=na_position) + baxis = self._get_block_manager_axis(axis) new_data = self._data.take(indexer, - axis=self._get_block_manager_axis(axis), + axis=baxis, convert=False, verify=False) if inplace: diff --git 
a/pandas/core/groupby.py b/pandas/core/groupby.py index fe764a099bb63..add2987b8f452 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1882,6 +1882,13 @@ def get_group_levels(self): 'ohlc': lambda *args: ['open', 'high', 'low', 'close'] } + def _is_builtin_func(self, arg): + """ + if we define an builtin function for this argument, return it, + otherwise return the arg + """ + return SelectionMixin._builtin_table.get(arg, arg) + def _get_cython_function(self, kind, how, values, is_numeric): dtype_str = values.dtype.name @@ -2107,7 +2114,7 @@ def _aggregate_series_fast(self, obj, func): # avoids object / Series creation overhead dummy = obj._get_values(slice(None, 0)).to_dense() indexer = get_group_index_sorter(group_index, ngroups) - obj = obj.take(indexer, convert=False) + obj = obj.take(indexer, convert=False).to_dense() group_index = algorithms.take_nd( group_index, indexer, allow_fill=False) grouper = lib.SeriesGrouper(obj, func, group_index, ngroups, diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index c7e06d63fbda9..b03c3d77928c7 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -22,8 +22,8 @@ from pandas.sparse.libsparse import IntIndex from pandas.core.categorical import Categorical, _factorize_from_iterable -from pandas.core.sorting import (get_group_index, compress_group_index, - decons_obs_group_ids) +from pandas.core.sorting import (get_group_index, get_compressed_ids, + compress_group_index, decons_obs_group_ids) import pandas.core.algorithms as algos from pandas._libs import algos as _algos, reshape as _reshape @@ -496,11 +496,6 @@ def _unstack_frame(obj, level, fill_value=None): return unstacker.get_result() -def get_compressed_ids(labels, sizes): - ids = get_group_index(labels, sizes, sort=True, xnull=False) - return compress_group_index(ids, sort=True) - - def stack(frame, level=-1, dropna=True): """ Convert DataFrame to Series with multi-level Index. 
Columns become the diff --git a/pandas/core/series.py b/pandas/core/series.py index d6a1a9d98faf4..8cc957f4e2f7f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1751,17 +1751,31 @@ def _try_kind_sort(arr): def sort_index(self, axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True): + # TODO: this can be combined with DataFrame.sort_index impl as + # almost identical inplace = validate_bool_kwarg(inplace, 'inplace') axis = self._get_axis_number(axis) index = self.index - if level is not None: + + if level: new_index, indexer = index.sortlevel(level, ascending=ascending, sort_remaining=sort_remaining) elif isinstance(index, MultiIndex): from pandas.core.sorting import lexsort_indexer - indexer = lexsort_indexer(index.labels, orders=ascending) + labels = index._reconstruct(sort=True) + indexer = lexsort_indexer(labels.labels, orders=ascending) else: from pandas.core.sorting import nargsort + + # Check monotonic-ness before sort an index + # GH11080 + if ((ascending and index.is_monotonic_increasing) or + (not ascending and index.is_monotonic_decreasing)): + if inplace: + return + else: + return self.copy() + indexer = nargsort(index, kind=kind, ascending=ascending, na_position=na_position) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 205d0d94d2ec3..ea131e66cb833 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -93,6 +93,11 @@ def maybe_lift(lab, size): # pormote nan values return loop(list(labels), list(shape)) +def get_compressed_ids(labels, sizes): + ids = get_group_index(labels, sizes, sort=True, xnull=False) + return compress_group_index(ids, sort=True) + + def is_int64_overflow_possible(shape): the_prod = long(1) for x in shape: diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index f12b10ae682fa..0f4b6810b0f54 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1171,9 +1171,57 @@ def from_product(cls, iterables, 
sortorder=None, names=None): labels, levels = _factorize_from_iterables(iterables) labels = cartesian_product(labels) + return MultiIndex(levels, labels, sortorder=sortorder, names=names) - return MultiIndex(levels=levels, labels=labels, sortorder=sortorder, - names=names) + def _reconstruct(self, sort=False): + """ + reconstruct the MultiIndex + + The MultiIndex will have the same outward appearance (e.g. values) + and will also .equals() + + Parameters + ---------- + sort: boolean, default False + monotonically sort the levels + + Returns + ------- + MultiIndex + + """ + new_levels = [] + new_labels = [] + + if sort: + + if self.is_monotonic: + return self + + for lev, lab in zip(self.levels, self.labels): + + if lev.is_monotonic: + new_levels.append(lev) + new_labels.append(lab) + continue + + # indexer to reorder the levels + indexer = lev.argsort() + lev = lev.take(indexer) + + # indexer to reorder the labels + ri = lib.get_reverse_indexer(indexer, len(indexer)) + lab = algos.take_1d(ri, lab) + + new_levels.append(lev) + new_labels.append(lab) + + else: + return self + + return MultiIndex(new_levels, new_labels, + names=self.names, sortorder=self.sortorder, + verify_integrity=False) @property def nlevels(self): diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 470526043234f..d78b3f8d49b10 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -2411,6 +2411,59 @@ def test_is_monotonic(self): self.assertFalse(i.is_monotonic) + def test_reconstruct_sort(self): + + # starts off lexsorted & monotonic + mi = MultiIndex.from_arrays([ + ['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3] + ]) + assert mi.is_lexsorted() + assert mi.is_monotonic + + recons = mi._reconstruct(sort=True) + assert recons.is_lexsorted() + assert recons.is_monotonic + assert mi is recons + + assert mi.equals(recons) + assert Index(mi.values).equals(Index(recons.values)) + + recons = mi._reconstruct(sort=False) + assert 
recons.is_lexsorted() + assert recons.is_monotonic + assert mi is recons + + assert mi.equals(recons) + assert Index(mi.values).equals(Index(recons.values)) + + # cannot convert to lexsorted + mi = pd.MultiIndex.from_tuples([('z', 'a'), ('x', 'a'), ('y', 'b'), + ('x', 'b'), ('y', 'a'), ('z', 'b')], + names=['one', 'two']) + assert not mi.is_lexsorted() + assert not mi.is_monotonic + + recons = mi._reconstruct(sort=True) + assert not recons.is_lexsorted() + assert not recons.is_monotonic + + assert mi.equals(recons) + assert Index(mi.values).equals(Index(recons.values)) + + # cannot convert to lexsorted + mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]], + labels=[[0, 1, 0, 2], [2, 0, 0, 1]], + names=['col1', 'col2']) + assert not mi.is_lexsorted() + assert not mi.is_monotonic + + recons = mi._reconstruct(sort=True) + assert not recons.is_lexsorted() + assert not recons.is_monotonic + + assert mi.equals(recons) + assert Index(mi.values).equals(Index(recons.values)) + def test_isin(self): values = [('foo', 2), ('bar', 3), ('quux', 4)] diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 732142f1bce9a..a682e8643d251 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1526,7 +1526,7 @@ def test_unstack(self): labels=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]]) expected = DataFrame({'bar': s.values}, index=exp_index).sort_index(level=0) - unstacked = s.unstack(0) + unstacked = s.unstack(0).sort_index() assert_frame_equal(unstacked, expected) # GH5873 diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 5584c1ac6a239..92b20767e7e9c 100755 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2438,6 +2438,30 @@ def test_getitem_slice_not_sorted(self): expected = df.reindex(columns=df.columns[:3]) tm.assert_frame_equal(result, expected) + def test_frame_getitem_not_sorted2(self): + # 13431 + df = DataFrame({'col1': ['b', 'd', 
'b', 'a'], + 'col2': [3, 1, 1, 2], + 'data': ['one', 'two', 'three', 'four']}) + + df2 = df.set_index(['col1', 'col2']) + df2_original = df2.copy() + + df2.index.set_levels(['b', 'd', 'a'], level='col1', inplace=True) + df2.index.set_labels([0, 1, 0, 2], level='col1', inplace=True) + assert not df2.index.is_lexsorted() + assert not df2.index.is_monotonic + + assert df2_original.index.equals(df2.index) + expected = df2.sort_index() + assert not expected.index.is_lexsorted() + assert expected.index.is_monotonic + + result = df2.sort_index(level=0) + assert not result.index.is_lexsorted() + assert result.index.is_monotonic + tm.assert_frame_equal(result, expected) + def test_frame_getitem_not_sorted(self): df = self.frame.T df['foo', 'four'] = 'foo' @@ -2474,3 +2498,101 @@ def test_series_getitem_not_sorted(self): expected.index = expected.index.droplevel(0) tm.assert_series_equal(result, expected) tm.assert_series_equal(result2, expected) + + def test_sort_index_and_reconstruction(self): + + # 15622 + # lexsortedness should be identical + # across MultiIndex consruction methods + + df = DataFrame([[1, 1], [2, 2]], index=list('ab')) + expected = DataFrame([[1, 1], [2, 2], [1, 1], [2, 2]], + index=MultiIndex.from_tuples([(0.5, 'a'), + (0.5, 'b'), + (0.8, 'a'), + (0.8, 'b')])) + assert expected.index.is_lexsorted() + + result = DataFrame( + [[1, 1], [2, 2], [1, 1], [2, 2]], + index=MultiIndex.from_product([[0.5, 0.8], list('ab')])) + result = result.sort_index() + assert result.index.is_lexsorted() + assert result.index.is_monotonic + + tm.assert_frame_equal(result, expected) + + result = DataFrame( + [[1, 1], [2, 2], [1, 1], [2, 2]], + index=MultiIndex(levels=[[0.5, 0.8], ['a', 'b']], + labels=[[0, 0, 1, 1], [0, 1, 0, 1]])) + result = result.sort_index() + assert result.index.is_lexsorted() + + tm.assert_frame_equal(result, expected) + + concatted = pd.concat([df, df], keys=[0.8, 0.5]) + result = concatted.sort_index() + + # this will be monotonic, but not lexsorted! 
+ assert not result.index.is_lexsorted() + assert result.index.is_monotonic + + tm.assert_frame_equal(result, expected) + + # 14015 + df = DataFrame([[1, 2], [6, 7]], + columns=MultiIndex.from_tuples( + [(0, '20160811 12:00:00'), + (0, '20160809 12:00:00')], + names=['l1', 'Date'])) + + df.columns.set_levels(pd.to_datetime(df.columns.levels[1]), + level=1, + inplace=True) + assert not df.columns.is_lexsorted() + assert not df.columns.is_monotonic + result = df.sort_index(axis=1) + assert result.columns.is_lexsorted() + assert result.columns.is_monotonic + result = df.sort_index(axis=1, level=1) + assert result.columns.is_lexsorted() + assert result.columns.is_monotonic + + # doc example + df = DataFrame({'value': [1, 2, 3, 4]}, + index=MultiIndex( + levels=[['a', 'b'], ['bb', 'aa']], + labels=[[0, 0, 1, 1], [1, 0, 1, 0]])) + result = df.sort_index() + expected = DataFrame({'value': [2, 1, 4, 3]}, + index=MultiIndex( + levels=[['a', 'b'], ['aa', 'bb']], + labels=[[0, 0, 1, 1], [1, 0, 1, 0]])) + tm.assert_frame_equal(result, expected) + + def test_sort_index_reorder_on_ops(self): + # 15687 + df = pd.DataFrame( + np.random.randn(8, 2), + index=MultiIndex.from_product( + [['a', 'b'], + ['big', 'small'], + ['red', 'blu']], + names=['letter', 'size', 'color']), + columns=['near', 'far']) + df = df.sort_index() + + def my_func(group): + group.index = ['newz', 'newa'] + return group + + result = df.groupby(level=['letter', 'size']).apply( + my_func).sort_index() + expected = MultiIndex.from_product( + [['a', 'b'], + ['big', 'small'], + ['newa', 'newz']], + names=['letter', 'size', None]) + + tm.assert_index_equal(result.index, expected) diff --git a/pandas/tests/tools/test_hashing.py b/pandas/tests/tools/test_hashing.py index 9bed0d428bc41..17a1fb1a7d525 100644 --- a/pandas/tests/tools/test_hashing.py +++ b/pandas/tests/tools/test_hashing.py @@ -87,6 +87,35 @@ def test_multiindex_unique(self): result = hash_pandas_object(mi) self.assertTrue(result.is_unique) + def 
test_multiindex_objects(self): + mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]], + labels=[[0, 1, 0, 2], [2, 0, 0, 1]], + names=['col1', 'col2']) + recons = mi._reconstruct(sort=True) + + # these are equal + assert mi.equals(recons) + assert Index(mi.values).equals(Index(recons.values)) + + # _hashed_values and hash_pandas_object(..., index=False) + # equivalency + expected = hash_pandas_object( + mi, index=False).values + result = mi._hashed_values + tm.assert_numpy_array_equal(result, expected) + + expected = hash_pandas_object( + recons, index=False).values + result = recons._hashed_values + tm.assert_numpy_array_equal(result, expected) + + expected = mi._hashed_values + result = recons._hashed_values + + # values should match, but in different order + tm.assert_numpy_array_equal(np.sort(result), + np.sort(expected)) + def test_hash_pandas_object(self): for obj in [Series([1, 2, 3]), diff --git a/pandas/tests/tools/test_pivot.py b/pandas/tests/tools/test_pivot.py index 4502f232c6d9c..c8dfaf5e29bc6 100644 --- a/pandas/tests/tools/test_pivot.py +++ b/pandas/tests/tools/test_pivot.py @@ -2,6 +2,7 @@ import numpy as np +from collections import OrderedDict import pandas as pd from pandas import (DataFrame, Series, Index, MultiIndex, Grouper, date_range, concat) @@ -513,7 +514,7 @@ def test_pivot_columns_lexsorted(self): self.assertTrue(pivoted.columns.is_monotonic) def test_pivot_complex_aggfunc(self): - f = {'D': ['std'], 'E': ['sum']} + f = OrderedDict([('D', ['std']), ('E', ['sum'])]) expected = self.data.groupby(['A', 'B']).agg(f).unstack('B') result = self.data.pivot_table(index='A', columns='B', aggfunc=f) From 7be8941a4fbddf22ba6bb3103b5e631dc2af46f6 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 16 Mar 2017 16:01:41 -0400 Subject: [PATCH 02/11] incorrectly raising KeyError rather than UnsortedIndexError, caught by doc-example --- pandas/indexes/multi.py | 7 ++++--- pandas/tests/indexes/test_multi.py | 24 ++++++++++++++++++++++++ 2 files changed, 
28 insertions(+), 3 deletions(-) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 0f4b6810b0f54..8e13aa37335fc 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1792,9 +1792,10 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): def _partial_tup_index(self, tup, side='left'): if len(tup) > self.lexsort_depth: - raise KeyError('Key length (%d) was greater than MultiIndex' - ' lexsort depth (%d)' % - (len(tup), self.lexsort_depth)) + raise UnsortedIndexError( + 'Key length (%d) was greater than MultiIndex' + ' lexsort depth (%d)' % + (len(tup), self.lexsort_depth)) n = len(tup) start, end = 0, len(self) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index d78b3f8d49b10..5515e809b7944 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -2752,6 +2752,30 @@ def test_unsortedindex(self): with assertRaises(KeyError): df.loc(axis=0)['q', :] + def test_unsortedindex_doc_examples(self): + # http://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex # noqa + dfm = DataFrame({'jim': [0, 0, 1, 1], + 'joe': ['x', 'x', 'z', 'y'], + 'jolie': np.random.rand(4)}) + + dfm = dfm.set_index(['jim', 'joe']) + with tm.assert_produces_warning(PerformanceWarning): + dfm.loc[(1, 'z')] + + with pytest.raises(UnsortedIndexError): + dfm.loc[(0, 'y'):(1, 'z')] + + assert not dfm.index.is_lexsorted() + assert dfm.index.lexsort_depth == 1 + + # sort it + dfm = dfm.sort_index() + dfm.loc[(1, 'z')] + dfm.loc[(0, 'y'):(1, 'z')] + + assert dfm.index.is_lexsorted() + assert dfm.index.lexsort_depth == 2 + def test_tuples_with_name_string(self): # GH 15110 and GH 14848 From b234bdb4303aa8a8692e4d9512bb0a07d3070ad9 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 16 Mar 2017 11:31:19 -0400 Subject: [PATCH 03/11] support for removing unused levels (internally) xref #2770 --- pandas/indexes/multi.py | 53 +++++++++++++++++++++++++++--- 
pandas/tests/indexes/test_multi.py | 49 ++++++++++++++++++++++----- pandas/tests/test_multilevel.py | 22 +++++++++++-- 3 files changed, 108 insertions(+), 16 deletions(-) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 8e13aa37335fc..b1119d5327f10 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1173,7 +1173,7 @@ def from_product(cls, iterables, sortorder=None, names=None): labels = cartesian_product(labels) return MultiIndex(levels, labels, sortorder=sortorder, names=names) - def _reconstruct(self, sort=False): + def _reconstruct(self, sort=False, remove_unused=False): """ reconstruct the MultiIndex @@ -1184,21 +1184,33 @@ def _reconstruct(self, sort=False): ---------- sort: boolean, default False monotonically sort the levels + remove_unused: boolean, default False + remove unsued levels Returns ------- MultiIndex """ + + if sort and remove_unused: + raise ValueError("only support one of sort / remove_unused") + + if not (sort or remove_unused): + raise ValueError("must supply one of sort / remove_unsued") + + levels = self.levels + labels = self.labels + new_levels = [] new_labels = [] if sort: - if self.is_monotonic: + if self.is_lexsorted() and self.is_monotonic: return self - for lev, lab in zip(self.levels, self.labels): + for lev, lab in zip(levels, labels): if lev.is_monotonic: new_levels.append(lev) @@ -1216,8 +1228,39 @@ def _reconstruct(self, sort=False): new_levels.append(lev) new_labels.append(lab) - else: - return self + elif remove_unused: + + changed = np.zeros(self.nlevels, dtype=bool) + for i, (lev, lab) in enumerate(zip(levels, labels)): + + uniques = np.sort(algos.unique(lab)) + + # nothing unused + if len(uniques) == len(lev): + new_levels.append(lev) + new_labels.append(lab) + changed[i] = True + continue + + unused = list(reversed(sorted(set( + np.arange(len(lev))) - set(uniques)))) + + # new levels are simple + lev = lev.take(uniques) + + # new labels, we remove the unsued + # by decrementing the 
labels for that value + # prob a better way + for u in unused: + + lab = np.where(lab > u, lab - 1, lab) + + new_levels.append(lev) + new_labels.append(lab) + + # nothing changed + if not changed.any(): + return self return MultiIndex(new_levels, new_labels, names=self.names, sortorder=self.sortorder, diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 5515e809b7944..af30b295ee9af 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -2411,6 +2411,18 @@ def test_is_monotonic(self): self.assertFalse(i.is_monotonic) + def test_reconstruct_api(self): + + mi = MultiIndex.from_arrays([ + ['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3] + ]) + + with pytest.raises(ValueError): + mi._reconstruct() + + with pytest.raises(ValueError): + mi._reconstruct(sort=True, remove_unused=True) + def test_reconstruct_sort(self): # starts off lexsorted & monotonic @@ -2428,14 +2440,6 @@ def test_reconstruct_sort(self): assert mi.equals(recons) assert Index(mi.values).equals(Index(recons.values)) - recons = mi._reconstruct(sort=False) - assert recons.is_lexsorted() - assert recons.is_monotonic - assert mi is recons - - assert mi.equals(recons) - assert Index(mi.values).equals(Index(recons.values)) - # cannot convert to lexsorted mi = pd.MultiIndex.from_tuples([('z', 'a'), ('x', 'a'), ('y', 'b'), ('x', 'b'), ('y', 'a'), ('z', 'b')], @@ -2464,6 +2468,35 @@ def test_reconstruct_sort(self): assert mi.equals(recons) assert Index(mi.values).equals(Index(recons.values)) + def test_reconstruct_remove_unused(self): + # xref to GH 2770 + df = DataFrame([['deleteMe', 1, 9], + ['keepMe', 2, 9], + ['keepMeToo', 3, 9]], + columns=['first', 'second', 'third']) + df2 = df.set_index(['first', 'second'], drop=False) + df2 = df2[df2['first'] != 'deleteMe'] + + # removed levels are there + expected = MultiIndex(levels=[['deleteMe', 'keepMe', 'keepMeToo'], + [1, 2, 3]], + labels=[[1, 2], [1, 2]], + names=['first', 'second']) + result = 
df2.index + tm.assert_index_equal(result, expected) + + expected = MultiIndex(levels=[['keepMe', 'keepMeToo'], + [2, 3]], + labels=[[0, 1], [0, 1]], + names=['first', 'second']) + result = df2.index._reconstruct(remove_unused=True) + tm.assert_index_equal(result, expected) + + # idempotent + result2 = result._reconstruct(remove_unused=True) + tm.assert_index_equal(result2, expected) + assert result2 is result + def test_isin(self): values = [('foo', 2), ('bar', 3), ('quux', 4)] diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 92b20767e7e9c..d378b67b25505 100755 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2559,16 +2559,32 @@ def test_sort_index_and_reconstruction(self): assert result.columns.is_lexsorted() assert result.columns.is_monotonic + def test_sort_index_and_reconstruction_doc_example(self): # doc example df = DataFrame({'value': [1, 2, 3, 4]}, index=MultiIndex( levels=[['a', 'b'], ['bb', 'aa']], - labels=[[0, 0, 1, 1], [1, 0, 1, 0]])) - result = df.sort_index() + labels=[[0, 0, 1, 1], [0, 1, 0, 1]])) + assert df.index.is_lexsorted() + assert not df.index.is_monotonic + + # sort it expected = DataFrame({'value': [2, 1, 4, 3]}, index=MultiIndex( levels=[['a', 'b'], ['aa', 'bb']], - labels=[[0, 0, 1, 1], [1, 0, 1, 0]])) + labels=[[0, 0, 1, 1], [0, 1, 0, 1]])) + result = df.sort_index() + assert not result.index.is_lexsorted() + assert result.index.is_monotonic + + tm.assert_frame_equal(result, expected) + + # reconstruct + result = df.sort_index().copy() + result.index = result.index._reconstruct(sort=True) + assert result.index.is_lexsorted() + assert result.index.is_monotonic + tm.assert_frame_equal(result, expected) def test_sort_index_reorder_on_ops(self): From 269cb3bbbc172e15c0280c199f16c8ffe10c2cfb Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 22 Mar 2017 19:51:41 -0400 Subject: [PATCH 04/11] small doc updates --- doc/source/advanced.rst | 22 +++++++++++----------- 
doc/source/whatsnew/v0.20.0.txt | 11 +---------- pandas/core/sorting.py | 16 ++++++++++++++++ pandas/indexes/multi.py | 12 ++++++++---- 4 files changed, 36 insertions(+), 25 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index f380070ddac79..04448974e659b 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -136,7 +136,7 @@ can find yourself working with hierarchically-indexed data without creating a may wish to generate your own ``MultiIndex`` when preparing the data set. Note that how the index is displayed by be controlled using the -``multi_sparse`` option in ``pandas.set_printoptions``: +``multi_sparse`` option in ``pandas.set_option()``: .. ipython:: python @@ -288,7 +288,7 @@ As usual, **both sides** of the slicers are included as this is label indexing. .. code-block:: python - df.loc[(slice('A1','A3'),.....),:] + df.loc[(slice('A1','A3'),.....), :] rather than this: @@ -317,43 +317,43 @@ Basic multi-index slicing using slices, lists, and labels. .. ipython:: python - dfmi.loc[(slice('A1','A3'),slice(None), ['C1','C3']),:] + dfmi.loc[(slice('A1','A3'), slice(None), ['C1', 'C3']), :] You can use a ``pd.IndexSlice`` to have a more natural syntax using ``:`` rather than using ``slice(None)`` .. ipython:: python idx = pd.IndexSlice - dfmi.loc[idx[:,:,['C1','C3']],idx[:,'foo']] + dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']] It is possible to perform quite complicated selections using this method on multiple axes at the same time. .. ipython:: python - dfmi.loc['A1',(slice(None),'foo')] - dfmi.loc[idx[:,:,['C1','C3']],idx[:,'foo']] + dfmi.loc['A1', (slice(None), 'foo')] + dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']] Using a boolean indexer you can provide selection related to the *values*. .. 
ipython:: python - mask = dfmi[('a','foo')]>200 - dfmi.loc[idx[mask,:,['C1','C3']],idx[:,'foo']] + mask = dfmi[('a', 'foo')] > 200 + dfmi.loc[idx[mask, :, ['C1', 'C3']], idx[:, 'foo']] You can also specify the ``axis`` argument to ``.loc`` to interpret the passed slicers on a single axis. .. ipython:: python - dfmi.loc(axis=0)[:,:,['C1','C3']] + dfmi.loc(axis=0)[:, :, ['C1', 'C3']] Furthermore you can *set* the values using these methods .. ipython:: python df2 = dfmi.copy() - df2.loc(axis=0)[:,:,['C1','C3']] = -10 + df2.loc(axis=0)[:, :, ['C1', 'C3']] = -10 df2 You can use a right-hand-side of an alignable object as well. @@ -361,7 +361,7 @@ You can use a right-hand-side of an alignable object as well. .. ipython:: python df2 = dfmi.copy() - df2.loc[idx[:,:,['C1','C3']],:] = df2*1000 + df2.loc[idx[:, :, ['C1', 'C3']], :] = df2 * 1000 df2 .. _advanced.xs: diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 279b93f75a08e..24fceae5e47f2 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -720,7 +720,7 @@ DataFrame.sort_index changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In certain cases, calling ``.sort_index()`` on a MultiIndexed DataFrame would return the *same* DataFrame without seeming to sort. -This would happen with a ``lexsorted``, but non-montonic levels. (:issue:`15622`, :issue:`15687`, :issue:`14015`, :issue:`13431`) +This would happen with a ``lexsorted``, but non-monotonic levels. (:issue:`15622`, :issue:`15687`, :issue:`14015`, :issue:`13431`) This is UNCHANGED between versions, but showing for illustration purposes: @@ -778,15 +778,6 @@ New Behavior: df.sort_index().index.is_lexsorted() df.sort_index().index.is_monotonic -Previous Behavior: - -.. code-block:: ipython - -New Behavior: - -.. ipython:: python - - .. 
_whatsnew_0200.api_breaking.groupby_describe: Groupby Describe Formatting diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index ea131e66cb833..e56a4f50de134 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -94,6 +94,22 @@ def maybe_lift(lab, size): # pormote nan values def get_compressed_ids(labels, sizes): + """ + + Group_index is offsets into cartesian product of all possible labels. This + space can be huge, so this function compresses it, by computing offsets + (comp_ids) into the list of unique labels (obs_group_ids). + + Parameters + ---------- + labels : list of label arrays + sizes : list of size of the levels + + Returns + ------- + tuple of (comp_ids, obs_group_ids) + + """ ids = get_group_index(labels, sizes, sort=True, xnull=False) return compress_group_index(ids, sort=True) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index b1119d5327f10..7165301f9659c 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1175,10 +1175,14 @@ def from_product(cls, iterables, sortorder=None, names=None): def _reconstruct(self, sort=False, remove_unused=False): """ - reconstruct the MultiIndex + create a new MultiIndex from the current to provide either: + - monotonically sorted items IN the levels + - removing unused levels (meaning that they are not expressed + in the labels) - The MultiIndex will have the same outward appearance (e.g. values) - and will also .equals() + The resulting MultiIndex will have the same outward + appearance, meaning the same .values and ordering. It will also + be .equals() to the original. 
Parameters ---------- @@ -1189,7 +1193,7 @@ def _reconstruct(self, sort=False, remove_unused=False): Returns ------- - MultiIndex + new MultiIndex """ From 3c4ca22fa5bd190d00f79867d4f99d712da97f93 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 23 Mar 2017 19:36:49 -0400 Subject: [PATCH 05/11] add degenerate test case --- pandas/tests/test_multilevel.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index d378b67b25505..70e1daa4d6f55 100755 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -11,6 +11,7 @@ from pandas.core.index import Index, MultiIndex from pandas import Panel, DataFrame, Series, notnull, isnull, Timestamp +from pandas.core.common import UnsortedIndexError from pandas.types.common import is_float_dtype, is_integer_dtype import pandas.core.common as com import pandas.util.testing as tm @@ -2612,3 +2613,23 @@ def my_func(group): names=['letter', 'size', None]) tm.assert_index_equal(result.index, expected) + + def test_sort_non_lexsorted(self): + # degenerate case where we sort but don't + # have a satisfying result :< + + idx = MultiIndex([['A', 'B', 'C'], + ['c', 'b', 'a']], + [[0, 1, 2, 0, 1, 2], + [0, 2, 1, 1, 0, 2]]) + + df = DataFrame({'col': range(len(idx))}, index=idx) + assert df.index.is_lexsorted() is False + assert df.index.is_monotonic is False + + result = df.sort_index() + assert result.index.is_lexsorted() is False + assert result.index.is_monotonic is True + + with pytest.raises(UnsortedIndexError): + result.loc[pd.IndexSlice['B':'C', 'a':'c'], :] From f2ddc9c5d2800b6083f9d302d43abecd4fc572f0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 4 Apr 2017 11:32:59 -0400 Subject: [PATCH 06/11] replace _reconstruct with: sort_monotonic, and remove_unused_levels (public) --- doc/source/advanced.rst | 41 +++++----- doc/source/api.rst | 1 + doc/source/whatsnew/v0.20.0.txt | 2 + pandas/core/frame.py | 2 +- 
pandas/core/series.py | 2 +- pandas/indexes/multi.py | 122 ++++++++++++++--------------- pandas/tests/indexes/test_multi.py | 22 ++---- pandas/tests/test_multilevel.py | 2 +- pandas/tests/tools/test_hashing.py | 2 +- 9 files changed, 96 insertions(+), 100 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 04448974e659b..16ded4083b588 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -175,35 +175,40 @@ completely analogous way to selecting a column in a regular DataFrame: See :ref:`Cross-section with hierarchical index ` for how to select on a deeper level. -.. note:: +.. _advanced.shown_levels: + +Defined Levels +~~~~~~~~~~~~~~ + +The repr of a ``MultiIndex`` shows ALL the defined levels of an index, even +if the they are not actually used. When slicing an index, you may notice this. +For example: - The repr of a ``MultiIndex`` shows ALL the defined levels of an index, even - if the they are not actually used. When slicing an index, you may notice this. - For example: +.. ipython:: python - .. ipython:: python + # original multi-index + df.columns - # original multi-index - df.columns + # sliced + df[['foo','qux']].columns - # sliced - df[['foo','qux']].columns +This is done to avoid a recomputation of the levels in order to make slicing +highly performant. If you want to see the actual used levels. - This is done to avoid a recomputation of the levels in order to make slicing - highly performant. If you want to see the actual used levels. +.. ipython:: python - .. ipython:: python + df[['foo','qux']].columns.values - df[['foo','qux']].columns.values + # for a specific level + df[['foo','qux']].columns.get_level_values(0) - # for a specific level - df[['foo','qux']].columns.get_level_values(0) +To reconstruct the multiindex with only the used levels - To reconstruct the multiindex with only the used levels +.. versionadded:: 0.20.0 - .. ipython:: python +.. 
ipython:: python - pd.MultiIndex.from_tuples(df[['foo','qux']].columns.values) + df[['foo','qux']].columns.remove_unused_levels() Data alignment and using ``reindex`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/api.rst b/doc/source/api.rst index 24bad7d515305..336b0b9b14c6c 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1432,6 +1432,7 @@ MultiIndex Components MultiIndex.droplevel MultiIndex.swaplevel MultiIndex.reorder_levels + MultiIndex.remove_unused_levels .. _api.datetimeindex: diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 24fceae5e47f2..c8ac0096a296f 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -366,6 +366,7 @@ Other Enhancements - ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`) - ``pandas.io.json.json_normalize()`` has gained a ``sep`` option that accepts ``str`` to separate joined fields; the default is ".", which is backward compatible. (:issue:`14883`) - ``pd.read_csv()`` will now raise a ``csv.Error`` error whenever an end-of-file character is encountered in the middle of a data row (:issue:`15913`) +- A new function has been added to a ``MultiIndex`` to facilitate :ref:`Removing Unused Levels `. (:issue:`15694`) .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations @@ -778,6 +779,7 @@ New Behavior: df.sort_index().index.is_lexsorted() df.sort_index().index.is_monotonic + .. 
_whatsnew_0200.api_breaking.groupby_describe: Groupby Describe Formatting diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e3379261625f2..08d1df69855ea 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3349,7 +3349,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, # make sure that the axis is lexsorted to start # if not we need to reconstruct to get the correct indexer - labels = labels._reconstruct(sort=True) + labels = labels.sort_monotonic() indexer = lexsort_indexer(labels.labels, orders=ascending, na_position=na_position) diff --git a/pandas/core/series.py b/pandas/core/series.py index 8cc957f4e2f7f..5529e733c0bdd 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1762,7 +1762,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, sort_remaining=sort_remaining) elif isinstance(index, MultiIndex): from pandas.core.sorting import lexsort_indexer - labels = index._reconstruct(sort=True) + labels = index.sort_monotonic() indexer = lexsort_indexer(labels.labels, orders=ascending) else: from pandas.core.sorting import nargsort diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 7165301f9659c..d8d45298523d5 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1173,98 +1173,98 @@ def from_product(cls, iterables, sortorder=None, names=None): labels = cartesian_product(labels) return MultiIndex(levels, labels, sortorder=sortorder, names=names) - def _reconstruct(self, sort=False, remove_unused=False): + def sort_monotonic(self): """ - create a new MultiIndex from the current to provide either: - - monotonically sorted items IN the levels - - removing unused levels (meaning that they are not expressed - in the labels) + create a new MultiIndex from the current to monotonically sorted + items IN the levels The resulting MultiIndex will have the same outward appearance, meaning the same .values and ordering. 
It will also be .equals() to the original. - Parameters - ---------- - sort: boolean, default False - monotonically sort the levels - remove_unused: boolean, default False - remove unsued levels - Returns ------- - new MultiIndex + MultiIndex """ - if sort and remove_unused: - raise ValueError("only support one of sort / remove_unused") - - if not (sort or remove_unused): - raise ValueError("must supply one of sort / remove_unsued") - - levels = self.levels - labels = self.labels + if self.is_lexsorted() and self.is_monotonic: + return self new_levels = [] new_labels = [] - if sort: - - if self.is_lexsorted() and self.is_monotonic: - return self + for lev, lab in zip(self.levels, self.labels): - for lev, lab in zip(levels, labels): + if lev.is_monotonic: + new_levels.append(lev) + new_labels.append(lab) + continue - if lev.is_monotonic: - new_levels.append(lev) - new_labels.append(lab) - continue + # indexer to reorder the levels + indexer = lev.argsort() + lev = lev.take(indexer) - # indexer to reorder the levels - indexer = lev.argsort() - lev = lev.take(indexer) + # indexer to reorder the labels + ri = lib.get_reverse_indexer(indexer, len(indexer)) + lab = algos.take_1d(ri, lab) - # indexer to reorder the labels - ri = lib.get_reverse_indexer(indexer, len(indexer)) - lab = algos.take_1d(ri, lab) + new_levels.append(lev) + new_labels.append(lab) - new_levels.append(lev) - new_labels.append(lab) - - elif remove_unused: + return MultiIndex(new_levels, new_labels, + names=self.names, sortorder=self.sortorder, + verify_integrity=False) - changed = np.zeros(self.nlevels, dtype=bool) - for i, (lev, lab) in enumerate(zip(levels, labels)): + def remove_unused_levels(self): + """ + create a new MultiIndex from the current that removesing + unused levels, meaning that they are not expressed in the labels - uniques = np.sort(algos.unique(lab)) + The resulting MultiIndex will have the same outward + appearance, meaning the same .values and ordering. 
It will also + be .equals() to the original. - # nothing unused - if len(uniques) == len(lev): - new_levels.append(lev) - new_labels.append(lab) - changed[i] = True - continue + Returns + ------- + MultiIndex - unused = list(reversed(sorted(set( - np.arange(len(lev))) - set(uniques)))) + """ - # new levels are simple - lev = lev.take(uniques) + new_levels = [] + new_labels = [] - # new labels, we remove the unsued - # by decrementing the labels for that value - # prob a better way - for u in unused: + changed = np.zeros(self.nlevels, dtype=bool) + for i, (lev, lab) in enumerate(zip(self.levels, self.labels)): - lab = np.where(lab > u, lab - 1, lab) + uniques = np.sort(algos.unique(lab)) + # nothing unused + if len(uniques) == len(lev): new_levels.append(lev) new_labels.append(lab) + changed[i] = True + continue + + unused = list(reversed(sorted(set( + np.arange(len(lev))) - set(uniques)))) + + # new levels are simple + lev = lev.take(uniques) - # nothing changed - if not changed.any(): - return self + # new labels, we remove the unsued + # by decrementing the labels for that value + # prob a better way + for u in unused: + + lab = np.where(lab > u, lab - 1, lab) + + new_levels.append(lev) + new_labels.append(lab) + + # nothing changed + if not changed.any(): + return self return MultiIndex(new_levels, new_labels, names=self.names, sortorder=self.sortorder, diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index af30b295ee9af..be6075945dfc6 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -2411,18 +2411,6 @@ def test_is_monotonic(self): self.assertFalse(i.is_monotonic) - def test_reconstruct_api(self): - - mi = MultiIndex.from_arrays([ - ['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3] - ]) - - with pytest.raises(ValueError): - mi._reconstruct() - - with pytest.raises(ValueError): - mi._reconstruct(sort=True, remove_unused=True) - def test_reconstruct_sort(self): # starts off lexsorted & 
monotonic @@ -2432,7 +2420,7 @@ def test_reconstruct_sort(self): assert mi.is_lexsorted() assert mi.is_monotonic - recons = mi._reconstruct(sort=True) + recons = mi.sort_monotonic() assert recons.is_lexsorted() assert recons.is_monotonic assert mi is recons @@ -2447,7 +2435,7 @@ def test_reconstruct_sort(self): assert not mi.is_lexsorted() assert not mi.is_monotonic - recons = mi._reconstruct(sort=True) + recons = mi.sort_monotonic() assert not recons.is_lexsorted() assert not recons.is_monotonic @@ -2461,7 +2449,7 @@ def test_reconstruct_sort(self): assert not mi.is_lexsorted() assert not mi.is_monotonic - recons = mi._reconstruct(sort=True) + recons = mi.sort_monotonic() assert not recons.is_lexsorted() assert not recons.is_monotonic @@ -2489,11 +2477,11 @@ def test_reconstruct_remove_unused(self): [2, 3]], labels=[[0, 1], [0, 1]], names=['first', 'second']) - result = df2.index._reconstruct(remove_unused=True) + result = df2.index.remove_unused_levels() tm.assert_index_equal(result, expected) # idempotent - result2 = result._reconstruct(remove_unused=True) + result2 = result.remove_unused_levels() tm.assert_index_equal(result2, expected) assert result2 is result diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 70e1daa4d6f55..f09e694fa85eb 100755 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2582,7 +2582,7 @@ def test_sort_index_and_reconstruction_doc_example(self): # reconstruct result = df.sort_index().copy() - result.index = result.index._reconstruct(sort=True) + result.index = result.index.sort_monotonic() assert result.index.is_lexsorted() assert result.index.is_monotonic diff --git a/pandas/tests/tools/test_hashing.py b/pandas/tests/tools/test_hashing.py index 17a1fb1a7d525..6712cb9f50761 100644 --- a/pandas/tests/tools/test_hashing.py +++ b/pandas/tests/tools/test_hashing.py @@ -91,7 +91,7 @@ def test_multiindex_objects(self): mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]], 
labels=[[0, 1, 0, 2], [2, 0, 0, 1]], names=['col1', 'col2']) - recons = mi._reconstruct(sort=True) + recons = mi.sort_monotonic() # these are equal assert mi.equals(recons) From 520c9c1557eec87d1aaac4029f843f6ff6ce54a0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 4 Apr 2017 11:35:16 -0400 Subject: [PATCH 07/11] versionadded tags --- pandas/indexes/multi.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index d8d45298523d5..52b8c958a64d9 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1175,6 +1175,10 @@ def from_product(cls, iterables, sortorder=None, names=None): def sort_monotonic(self): """ + .. versionadded:: 0.20.0 + + This is an *internal* function. + create a new MultiIndex from the current to monotonically sorted items IN the levels @@ -1218,6 +1222,8 @@ def sort_monotonic(self): def remove_unused_levels(self): """ + .. versionadded:: 0.20.0 + create a new MultiIndex from the current that removesing unused levels, meaning that they are not expressed in the labels From 527c3a6b8a2df11cb8400331aeaa571289b4fa02 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 4 Apr 2017 11:47:09 -0400 Subject: [PATCH 08/11] simpler algo for remove_used_levels --- pandas/indexes/multi.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 52b8c958a64d9..27742be04dfc0 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1240,20 +1240,21 @@ def remove_unused_levels(self): new_levels = [] new_labels = [] - changed = np.zeros(self.nlevels, dtype=bool) + changed = np.ones(self.nlevels, dtype=bool) for i, (lev, lab) in enumerate(zip(self.levels, self.labels)): - uniques = np.sort(algos.unique(lab)) + uniques = algos.unique(lab) # nothing unused if len(uniques) == len(lev): new_levels.append(lev) new_labels.append(lab) - changed[i] = True + changed[i] = False continue - unused = 
list(reversed(sorted(set( - np.arange(len(lev))) - set(uniques)))) + # set difference, then reverse sort + diff = Index(np.arange(len(lev))).difference(uniques) + unused = diff.sort_values(ascending=False) # new levels are simple lev = lev.take(uniques) From 48249ab50aa0dcd160293f99874b87e2769f60dd Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 4 Apr 2017 14:24:33 -0400 Subject: [PATCH 09/11] add doc example --- doc/source/whatsnew/v0.20.0.txt | 11 +++++++---- pandas/indexes/multi.py | 17 +++++++++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index c8ac0096a296f..21b259e7663ba 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -367,6 +367,7 @@ Other Enhancements - ``pandas.io.json.json_normalize()`` has gained a ``sep`` option that accepts ``str`` to separate joined fields; the default is ".", which is backward compatible. (:issue:`14883`) - ``pd.read_csv()`` will now raise a ``csv.Error`` error whenever an end-of-file character is encountered in the middle of a data row (:issue:`15913`) - A new function has been added to a ``MultiIndex`` to facilitate :ref:`Removing Unused Levels `. (:issue:`15694`) +- :func:`MultiIndex.remove_unused_levels` has been added to facilitate :ref:`removing unused levels `. (:issue:`15694`) .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations @@ -746,12 +747,14 @@ Sorting works as expected df.sort_index().index.is_lexsorted() df.sort_index().index.is_monotonic -However, this example, which has a monotonic level, doesn't behave as desired. +However, this example, which has a non-monotonic 2nd level, +doesn't behave as desired. .. 
ipython:: python - df = pd.DataFrame({'value': [1, 2, 3, 4]}, - index=pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], - labels=[[0, 0, 1, 1], [0, 1, 0, 1]])) + df = pd.DataFrame( + {'value': [1, 2, 3, 4]}, + index=pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], + labels=[[0, 0, 1, 1], [0, 1, 0, 1]])) Previous Behavior: diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 27742be04dfc0..d7c85169bdd03 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1235,6 +1235,23 @@ def remove_unused_levels(self): ------- MultiIndex + Examples + -------- + >>> i = MultiIndex.from_product([range(2), list('ab')]) + MultiIndex(levels=[[0, 1], ['a', 'b']], + labels=[[0, 0, 1, 1], [0, 1, 0, 1]]) + + + >>> i[2:] + MultiIndex(levels=[[0, 1], ['a', 'b']], + labels=[[1, 1], [0, 1]]) + + # the 0 from the first level is not represented + # and can be removed + >>> i[2:].remove_unused_levels() + MultiIndex(levels=[[1], ['a', 'b']], + labels=[[0, 0], [0, 1]]) + """ new_levels = [] From 31097fcefff26060353b431b130a56e5f7109e9c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 4 Apr 2017 14:31:58 -0400 Subject: [PATCH 10/11] add doc-strings, rename sort_monotonic -> sort_levels_monotonic --- pandas/core/frame.py | 2 +- pandas/core/series.py | 2 +- pandas/indexes/multi.py | 20 +++++++++++++++++--- pandas/tests/indexes/test_multi.py | 6 +++--- pandas/tests/test_multilevel.py | 2 +- pandas/tests/tools/test_hashing.py | 2 +- 6 files changed, 24 insertions(+), 10 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 08d1df69855ea..318021d252a8d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3349,7 +3349,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, # make sure that the axis is lexsorted to start # if not we need to reconstruct to get the correct indexer - labels = labels.sort_monotonic() + labels = labels.sort_levels_monotonic() indexer = lexsort_indexer(labels.labels, orders=ascending, 
na_position=na_position) diff --git a/pandas/core/series.py b/pandas/core/series.py index 5529e733c0bdd..ab8f7261fbc8d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1762,7 +1762,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, sort_remaining=sort_remaining) elif isinstance(index, MultiIndex): from pandas.core.sorting import lexsort_indexer - labels = index.sort_monotonic() + labels = index.sort_levels_monotonic() indexer = lexsort_indexer(labels.labels, orders=ascending) else: from pandas.core.sorting import nargsort diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index d7c85169bdd03..747bfc2f2dbad 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1173,14 +1173,15 @@ def from_product(cls, iterables, sortorder=None, names=None): labels = cartesian_product(labels) return MultiIndex(levels, labels, sortorder=sortorder, names=names) - def sort_monotonic(self): + def sort_levels_monotonic(self): """ .. versionadded:: 0.20.0 This is an *internal* function. create a new MultiIndex from the current to monotonically sorted - items IN the levels + items IN the levels. This does not actually make the entire MultiIndex + monotonic, JUST the levels. The resulting MultiIndex will have the same outward appearance, meaning the same .values and ordering. 
It will also @@ -1190,6 +1191,19 @@ def sort_monotonic(self): ------- MultiIndex + Examples + -------- + + >>> i = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], + labels=[[0, 0, 1, 1], [0, 1, 0, 1]]) + >>> i + MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], + labels=[[0, 0, 1, 1], [0, 1, 0, 1]]) + + >>> i.sort_levels_monotonic() + MultiIndex(levels=[['a', 'b'], ['aa', 'bb']], + labels=[[0, 0, 1, 1], [1, 0, 1, 0]]) + """ if self.is_lexsorted() and self.is_monotonic: @@ -1237,7 +1251,7 @@ def remove_unused_levels(self): Examples -------- - >>> i = MultiIndex.from_product([range(2), list('ab')]) + >>> i = pd.MultiIndex.from_product([range(2), list('ab')]) MultiIndex(levels=[[0, 1], ['a', 'b']], labels=[[0, 0, 1, 1], [0, 1, 0, 1]]) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index be6075945dfc6..3e36937beab11 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -2420,7 +2420,7 @@ def test_reconstruct_sort(self): assert mi.is_lexsorted() assert mi.is_monotonic - recons = mi.sort_monotonic() + recons = mi.sort_levels_monotonic() assert recons.is_lexsorted() assert recons.is_monotonic assert mi is recons @@ -2435,7 +2435,7 @@ def test_reconstruct_sort(self): assert not mi.is_lexsorted() assert not mi.is_monotonic - recons = mi.sort_monotonic() + recons = mi.sort_levels_monotonic() assert not recons.is_lexsorted() assert not recons.is_monotonic @@ -2449,7 +2449,7 @@ def test_reconstruct_sort(self): assert not mi.is_lexsorted() assert not mi.is_monotonic - recons = mi.sort_monotonic() + recons = mi.sort_levels_monotonic() assert not recons.is_lexsorted() assert not recons.is_monotonic diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index f09e694fa85eb..66a24b200878a 100755 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2582,7 +2582,7 @@ def test_sort_index_and_reconstruction_doc_example(self): # reconstruct result = df.sort_index().copy() 
- result.index = result.index.sort_monotonic() + result.index = result.index.sort_levels_monotonic() assert result.index.is_lexsorted() assert result.index.is_monotonic diff --git a/pandas/tests/tools/test_hashing.py b/pandas/tests/tools/test_hashing.py index 6712cb9f50761..1e8c58f5dbaa1 100644 --- a/pandas/tests/tools/test_hashing.py +++ b/pandas/tests/tools/test_hashing.py @@ -91,7 +91,7 @@ def test_multiindex_objects(self): mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]], labels=[[0, 1, 0, 2], [2, 0, 0, 1]], names=['col1', 'col2']) - recons = mi.sort_monotonic() + recons = mi.sort_levels_monotonic() # these are equal assert mi.equals(recons) From bd17d2bbed42a3d626e47fd9ab1092daca8b799a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 6 Apr 2017 20:34:04 -0400 Subject: [PATCH 11/11] rename sort_levels_monotonic -> _sort_levels_monotonic doc fixups --- pandas/core/frame.py | 3 +-- pandas/core/series.py | 2 +- pandas/indexes/multi.py | 8 ++++---- pandas/tests/indexes/test_multi.py | 6 +++--- pandas/tests/test_multilevel.py | 2 +- pandas/tests/tools/test_hashing.py | 2 +- 6 files changed, 11 insertions(+), 12 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 318021d252a8d..c8c21b0c5fd7d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3349,8 +3349,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, # make sure that the axis is lexsorted to start # if not we need to reconstruct to get the correct indexer - labels = labels.sort_levels_monotonic() - + labels = labels._sort_levels_monotonic() indexer = lexsort_indexer(labels.labels, orders=ascending, na_position=na_position) else: diff --git a/pandas/core/series.py b/pandas/core/series.py index ab8f7261fbc8d..760abc20351cf 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1762,7 +1762,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, sort_remaining=sort_remaining) elif isinstance(index, MultiIndex): from 
pandas.core.sorting import lexsort_indexer - labels = index.sort_levels_monotonic() + labels = index._sort_levels_monotonic() indexer = lexsort_indexer(labels.labels, orders=ascending) else: from pandas.core.sorting import nargsort diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 747bfc2f2dbad..96e0effbd7608 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1173,7 +1173,7 @@ def from_product(cls, iterables, sortorder=None, names=None): labels = cartesian_product(labels) return MultiIndex(levels, labels, sortorder=sortorder, names=names) - def sort_levels_monotonic(self): + def _sort_levels_monotonic(self): """ .. versionadded:: 0.20.0 @@ -1236,15 +1236,15 @@ def sort_levels_monotonic(self): def remove_unused_levels(self): """ - .. versionadded:: 0.20.0 - - create a new MultiIndex from the current that removesing + create a new MultiIndex from the current that removes unused levels, meaning that they are not expressed in the labels The resulting MultiIndex will have the same outward appearance, meaning the same .values and ordering. It will also be .equals() to the original. + .. 
versionadded:: 0.20.0 + Returns ------- MultiIndex diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 3e36937beab11..e93319a30d5d8 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -2420,7 +2420,7 @@ def test_reconstruct_sort(self): assert mi.is_lexsorted() assert mi.is_monotonic - recons = mi.sort_levels_monotonic() + recons = mi._sort_levels_monotonic() assert recons.is_lexsorted() assert recons.is_monotonic assert mi is recons @@ -2435,7 +2435,7 @@ def test_reconstruct_sort(self): assert not mi.is_lexsorted() assert not mi.is_monotonic - recons = mi.sort_levels_monotonic() + recons = mi._sort_levels_monotonic() assert not recons.is_lexsorted() assert not recons.is_monotonic @@ -2449,7 +2449,7 @@ def test_reconstruct_sort(self): assert not mi.is_lexsorted() assert not mi.is_monotonic - recons = mi.sort_levels_monotonic() + recons = mi._sort_levels_monotonic() assert not recons.is_lexsorted() assert not recons.is_monotonic diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 66a24b200878a..914d26fcafb4a 100755 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2582,7 +2582,7 @@ def test_sort_index_and_reconstruction_doc_example(self): # reconstruct result = df.sort_index().copy() - result.index = result.index.sort_levels_monotonic() + result.index = result.index._sort_levels_monotonic() assert result.index.is_lexsorted() assert result.index.is_monotonic diff --git a/pandas/tests/tools/test_hashing.py b/pandas/tests/tools/test_hashing.py index 1e8c58f5dbaa1..864b5018abc75 100644 --- a/pandas/tests/tools/test_hashing.py +++ b/pandas/tests/tools/test_hashing.py @@ -91,7 +91,7 @@ def test_multiindex_objects(self): mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]], labels=[[0, 1, 0, 2], [2, 0, 0, 1]], names=['col1', 'col2']) - recons = mi.sort_levels_monotonic() + recons = mi._sort_levels_monotonic() # these are equal assert 
mi.equals(recons)