From f74c06678243a9a0509cd9466981aabfbdfc1355 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 9 Nov 2017 16:34:46 -0800 Subject: [PATCH 1/2] Handle unsortable Periods correctly in set_index, MultiIndex --- pandas/core/sorting.py | 17 ++++++-- pandas/tests/indexes/period/test_period.py | 45 ++++++++++++++++++++++ 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 27252b9616a44..650bc674d4db5 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -431,21 +431,32 @@ def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False): def sort_mixed(values): # order ints before strings, safe in py3 + from pandas import Period + per_pos = np.array([isinstance(x, Period) for x in values], + dtype=bool) str_pos = np.array([isinstance(x, string_types) for x in values], dtype=bool) - nums = np.sort(values[~str_pos]) + nums = np.sort(values[~(str_pos | per_pos)]) strs = np.sort(values[str_pos]) - return np.concatenate([nums, np.asarray(strs, dtype=object)]) + try: + pers = np.sort(values[per_pos]) + except (TypeError, ValueError): + # period.IncompatibleFrequency subclasses ValueError, leads + # to inconsistent behavior in py2/py3 + pers = sorted(values[per_pos], key=lambda x: x.start_time) + pers = np.array(pers, dtype=object) + return np.concatenate([nums, np.asarray(strs, dtype=object), pers]) sorter = None if PY3 and lib.infer_dtype(values) == 'mixed-integer': # unorderable in py3 if mixed str/int ordered = sort_mixed(values) else: + from pandas._libs.period import IncompatibleFrequency try: sorter = values.argsort() ordered = values.take(sorter) - except TypeError: + except (TypeError, IncompatibleFrequency): # try this anyway ordered = sort_mixed(values) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index e5ee078d3558d..4345e199ef9f8 100644 --- a/pandas/tests/indexes/period/test_period.py +++ 
b/pandas/tests/indexes/period/test_period.py @@ -11,6 +11,51 @@ from ..datetimelike import DatetimeLike +class TestPeriodLevelMultiIndex(object): + # TODO: Is there a more appropriate place for these? + def test_set_index(self): + # GH#17112 + index = Index(['PCE'] * 4, name='Variable') + data = [Period('2018Q2'), + Period('2021', freq='5A-Dec'), + Period('2026', freq='10A-Dec'), + Period('2017Q2')] + ser = Series(data, index=index, name='Period') + df = ser.to_frame() + + res = df.set_index('Period', append=True) + # If this doesn't raise then that's a good start + assert res.index.names == ['Variable', 'Period'] + + def test_from_arrays_period_level(self): + # GH#17112 + index = Index(['PCE'] * 4, name='Variable') + data = [Period('2018Q2'), + Period('2021', freq='5A-Dec'), + Period('2026', freq='10A-Dec'), + Period('2017Q2')] + ser = Series(data, index=index, name='Period') + + mi = pd.MultiIndex.from_arrays([ser.index, ser]) + assert mi.names == ['Variable', 'Period'] + assert mi.get_level_values('Variable').equals(index) + + def test_from_arrays_dataframe_level_invalid(self): + # GH#17112 + index = pd.Index(['CPROF', 'HOUSING', 'INDPROD', 'NGDP', 'PGDP'], + name='Variable') + data = [pd.Period('1968Q4')] * 5 + df = pd.DataFrame(data, index=index, columns=['Period']) + with pytest.raises(TypeError): + # user should not pass a DataFrame as an index level. + # In this single-column case the user needs to specifically pass + # df['Period']. 
+ # Check that this raises at construction time instead of later + # when accessing `mi.shape`, which used to raise + # "ValueError: all arrays must be the same length", + pd.MultiIndex.from_arrays([df.index, df]) + + class TestPeriodIndex(DatetimeLike): _holder = PeriodIndex _multiprocess_can_split_ = True From a02200249b77506b23da92eb8d225dbae58f1c52 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 9 Nov 2017 17:33:50 -0800 Subject: [PATCH 2/2] per reviewer suggestion, leave non-(str or int) unsorted --- pandas/core/sorting.py | 20 +++++++------------- pandas/tests/indexes/period/test_period.py | 15 --------------- 2 files changed, 7 insertions(+), 28 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 650bc674d4db5..a0cc196728e4b 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -431,32 +431,26 @@ def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False): def sort_mixed(values): # order ints before strings, safe in py3 - from pandas import Period - per_pos = np.array([isinstance(x, Period) for x in values], + num_pos = np.array([isinstance(x, (float, int, long)) for x in values], dtype=bool) str_pos = np.array([isinstance(x, string_types) for x in values], dtype=bool) - nums = np.sort(values[~(str_pos | per_pos)]) + nums = np.sort(values[num_pos]) strs = np.sort(values[str_pos]) - try: - pers = np.sort(values[per_pos]) - except (TypeError, ValueError): - # period.IncompatibleFrequency subclasses ValueError, leads - # to inconsistent behavior in py2/py3 - pers = sorted(values[per_pos], key=lambda x: x.start_time) - pers = np.array(pers, dtype=object) - return np.concatenate([nums, np.asarray(strs, dtype=object), pers]) + others = values[~(str_pos | num_pos)] # We don't bother sorting these + return np.concatenate([nums, np.asarray(strs, dtype=object), others]) sorter = None if PY3 and lib.infer_dtype(values) == 'mixed-integer': # unorderable in py3 if mixed str/int ordered = sort_mixed(values) 
else: - from pandas._libs.period import IncompatibleFrequency try: sorter = values.argsort() ordered = values.take(sorter) - except (TypeError, IncompatibleFrequency): + except (ValueError, TypeError): + # Period comparison may raise IncompatibleFrequency, which + # subclasses ValueError instead of TypeError # try this anyway ordered = sort_mixed(values) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 4345e199ef9f8..eb7f47736c96f 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -40,21 +40,6 @@ def test_from_arrays_period_level(self): assert mi.names == ['Variable', 'Period'] assert mi.get_level_values('Variable').equals(index) - def test_from_arrays_dataframe_level_invalid(self): - # GH#17112 - index = pd.Index(['CPROF', 'HOUSING', 'INDPROD', 'NGDP', 'PGDP'], - name='Variable') - data = [pd.Period('1968Q4')] * 5 - df = pd.DataFrame(data, index=index, columns=['Period']) - with pytest.raises(TypeError): - # user should not pass a DataFrame as an index level. - # In this single-column case the user needs to specifically pass - # df['Period']. - # Check that this raises at construction time instead of later - # when accessing `mi.shape`, which used to raise - # "ValueError: all arrays must be the same length", - pd.MultiIndex.from_arrays([df.index, df]) - class TestPeriodIndex(DatetimeLike): _holder = PeriodIndex