Skip to content

Commit 4c1547f

Browse files
committed
ENH: store tuples in MultiIndex, add legacy pickle testing and other speed tweaks
1 parent 9bb210c commit 4c1547f

File tree

4 files changed: 63 additions and 25 deletions

pandas/core/index.py

Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from pandas.core.common import (adjoin as _adjoin, _stringify,
99
_is_bool_indexer, _asarray_tuplesafe)
1010
from pandas.util.decorators import cache_readonly
11+
import pandas.core.common as com
1112
import pandas._tseries as lib
1213
import pandas._engines as _engines
1314

@@ -868,27 +869,32 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None):
868869

869870
return Index(levels[0], name=name).take(labels[0])
870871

871-
return np.arange(len(labels[0]), dtype=object).view(cls)
872+
levels = [_ensure_index(lev) for lev in levels]
873+
labels = [np.asarray(labs, dtype=np.int32) for labs in labels]
872874

873-
def __init__(self, levels, labels, sortorder=None, names=None,
874-
consistent=None):
875-
self.levels = [_ensure_index(lev) for lev in levels]
876-
self.labels = [np.asarray(labs, dtype=np.int32) for labs in labels]
875+
values = [np.asarray(lev).take(lab)
876+
for lev, lab in zip(levels, labels)]
877+
subarr = lib.fast_zip(values).view(cls)
878+
879+
subarr.levels = levels
880+
subarr.labels = labels
877881

878882
if names is None:
879-
self.names = [None] * self.nlevels
883+
subarr.names = [None] * subarr.nlevels
880884
else:
881-
assert(len(names) == self.nlevels)
882-
self.names = list(names)
885+
assert(len(names) == subarr.nlevels)
886+
subarr.names = list(names)
883887

884888
# set the name
885-
for i, name in enumerate(self.names):
886-
self.levels[i].name = name
889+
for i, name in enumerate(subarr.names):
890+
subarr.levels[i].name = name
887891

888892
if sortorder is not None:
889-
self.sortorder = int(sortorder)
893+
subarr.sortorder = int(sortorder)
890894
else:
891-
self.sortorder = sortorder
895+
subarr.sortorder = sortorder
896+
897+
return subarr
892898

893899
@property
894900
def dtype(self):
@@ -908,9 +914,18 @@ def _get_level_number(self, level):
908914

909915
@property
910916
def values(self):
911-
values = [np.asarray(lev).take(lab)
912-
for lev, lab in zip(self.levels, self.labels)]
913-
return lib.fast_zip(values)
917+
if self._is_legacy_format:
918+
# for legacy MultiIndex
919+
values = [np.asarray(lev).take(lab)
920+
for lev, lab in zip(self.levels, self.labels)]
921+
return lib.fast_zip(values)
922+
else:
923+
return self.view(np.ndarray)
924+
925+
@property
926+
def _is_legacy_format(self):
927+
contents = self.view(np.ndarray)
928+
return len(contents) > 0 and not isinstance(contents[0], tuple)
914929

915930
def get_level_values(self, level):
916931
"""
@@ -1112,7 +1127,7 @@ def append(self, other):
11121127
return MultiIndex.from_tuples(new_tuples, names=self.names)
11131128

11141129
def argsort(self, *args, **kwargs):
1115-
return self.get_tuple_index().argsort()
1130+
return self.values.argsort()
11161131

11171132
def drop(self, labels):
11181133
"""
@@ -1290,10 +1305,12 @@ def get_indexer(self, target, method=None):
12901305
method = self._get_method(method)
12911306

12921307
target_index = target
1293-
if isinstance(target, MultiIndex):
1308+
if isinstance(target, MultiIndex) and target._is_legacy_format:
12941309
target_index = target.get_tuple_index()
12951310

1296-
self_index = self.get_tuple_index()
1311+
self_index = self
1312+
if self._is_legacy_format:
1313+
self_index = self.get_tuple_index()
12971314

12981315
if method == 'pad':
12991316
indexer = self._pad(self_index, target_index, self_index.indexMap,
@@ -1332,7 +1349,7 @@ def get_tuple_index(self):
13321349
-------
13331350
index : Index
13341351
"""
1335-
return Index(list(self))
1352+
return Index(self.values)
13361353

13371354
def slice_locs(self, start=None, end=None, strict=False):
13381355
"""

pandas/src/reduce.pyx

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,18 +89,18 @@ cdef class Reducer:
8989
raise ValueError('function does not reduce')
9090
return result
9191

92-
cdef class Grouper:
92+
cdef class SeriesGrouper:
9393
'''
9494
Performs generic grouping operation while avoiding ndarray construction
9595
overhead
9696
'''
9797
cdef:
98-
Py_ssize_t nresults, ngroups
98+
Py_ssize_t nresults, ngroup
9999
object arr, dummy, f, labels, counts
100100
bint passed_dummy
101101

102-
def __init__(self, object arr, object index, object f,
103-
object labels, ngroups, dummy=None):
102+
def __init__(self, object arr, object f, object labels, ngroups,
103+
dummy=None):
104104
n = len(arr)
105105

106106
assert(arr.ndim == 1)
@@ -111,6 +111,7 @@ cdef class Grouper:
111111
self.labels = labels
112112
self.f = f
113113
self.arr = arr
114+
114115
self.dummy = self._check_dummy(dummy)
115116
self.passed_dummy = dummy is not None
116117

pandas/tests/test_groupby.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -800,8 +800,8 @@ def test_groupby_level(self):
800800
result0 = frame.groupby(level=0).sum()
801801
result1 = frame.groupby(level=1).sum()
802802

803-
expected0 = frame.groupby(deleveled['first']).sum()
804-
expected1 = frame.groupby(deleveled['second']).sum()
803+
expected0 = frame.groupby(deleveled['first'].values).sum()
804+
expected1 = frame.groupby(deleveled['second'].values).sum()
805805

806806
self.assert_(result0.index.name == 'first')
807807
self.assert_(result1.index.name == 'second')

pandas/tests/test_index.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -680,6 +680,26 @@ def test_pickle(self):
680680
unpickled = pickle.loads(pickled)
681681
self.assert_(self.index.equals(unpickled))
682682

683+
def test_legacy_pickle(self):
684+
import os
685+
def curpath():
686+
pth, _ = os.path.split(os.path.abspath(__file__))
687+
return pth
688+
689+
ppath = os.path.join(curpath(), 'data/multiindex_v1.pickle')
690+
obj = pickle.load(open(ppath, 'r'))
691+
692+
self.assert_(obj._is_legacy_format)
693+
694+
obj2 = MultiIndex.from_tuples(obj.values)
695+
self.assert_(obj.equals(obj2))
696+
697+
res = obj.get_indexer(obj2[::-1])
698+
exp = obj.get_indexer(obj[::-1])
699+
exp2 = obj2.get_indexer(obj2[::-1])
700+
assert_almost_equal(res, exp)
701+
assert_almost_equal(exp, exp2)
702+
683703
def test_contains(self):
684704
self.assert_(('foo', 'two') in self.index)
685705
self.assert_(('bar', 'two') not in self.index)

0 commit comments

Comments
 (0)