From acb9b3390c8c7df8275b7a7463f34bc640a52fba Mon Sep 17 00:00:00 2001 From: Mike Kelly Date: Mon, 13 Oct 2014 10:18:15 -0400 Subject: [PATCH] PERF: Slowness in multi-level indexes with datetime levels Special case handling of sliced multi indexes, where the length of the level values may exceed the length of the index. Corrected nan handling issue introduced by the original change. (+1 squashed commit) Squashed commits: [0dae170] PERF: Slowness in multi-level indexes with datetime levels Special case handling of sliced multi indexes, where the length of the level values may exceed the length of the index. --- pandas/core/index.py | 13 +++++++++---- pandas/tests/test_index.py | 12 ++++++++++++ vb_suite/index_object.py | 17 ++++++++++++----- 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index 919018977cb80..c2c7e28a7a7f4 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -2937,11 +2937,16 @@ def values(self): values = [] for lev, lab in zip(self.levels, self.labels): - lev_values = lev.values # Need to box timestamps, etc. - if hasattr(lev, '_box_values'): - lev_values = lev._box_values(lev_values) - taken = com.take_1d(lev_values, lab) + box = hasattr(lev, '_box_values') + # Try to minimize boxing. 
+ if box and len(lev) > len(lab): + taken = lev._box_values(com.take_1d(lev.values, lab)) + elif box: + taken = com.take_1d(lev._box_values(lev.values), lab, + fill_value=_get_na_value(lev.dtype.type)) + else: + taken = com.take_1d(lev.values, lab) values.append(taken) self._tuples = lib.fast_zip(values) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index a8c4548f462ac..3c5f3a8d6b6d3 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -2335,6 +2335,18 @@ def test_from_product_datetimeindex(self): (2, pd.Timestamp('2000-01-02'))]) assert_array_equal(mi.values, etalon) + def test_values_boxed(self): + tuples = [(1, pd.Timestamp('2000-01-01')), + (2, pd.NaT), + (3, pd.Timestamp('2000-01-03')), + (1, pd.Timestamp('2000-01-04')), + (2, pd.Timestamp('2000-01-02')), + (3, pd.Timestamp('2000-01-03'))] + mi = pd.MultiIndex.from_tuples(tuples) + assert_array_equal(mi.values, pd.lib.list_to_object_array(tuples)) + # Check that code branches for boxed values produce identical results + assert_array_equal(mi.values[:4], mi[:4].values) + def test_append(self): result = self.index[:3].append(self.index[3:]) self.assertTrue(result.equals(self.index)) diff --git a/vb_suite/index_object.py b/vb_suite/index_object.py index de60a44e23a52..d54845f4643cd 100644 --- a/vb_suite/index_object.py +++ b/vb_suite/index_object.py @@ -108,7 +108,7 @@ # Constructing MultiIndex from cartesian product of iterables -# +# setup = common_setup + """ iterables = [tm.makeStringIndex(10000), xrange(20)] @@ -123,10 +123,17 @@ setup = common_setup + """ level1 = range(1000) -level2 = date_range(start='1/1/2012', periods=10) +level2 = date_range(start='1/1/2012', periods=100) +mi = MultiIndex.from_product([level1, level2]) """ -multiindex_with_datetime_level = \ - Benchmark("MultiIndex.from_product([level1, level2]).values", setup, - name='multiindex_with_datetime_level', +multiindex_with_datetime_level_full = \ + Benchmark("mi.copy().values", setup, + 
name='multiindex_with_datetime_level_full', + start_date=datetime(2014, 10, 11)) + + +multiindex_with_datetime_level_sliced = \ + Benchmark("mi[:10].values", setup, + name='multiindex_with_datetime_level_sliced', start_date=datetime(2014, 10, 11))