Skip to content

Commit 99f7cc1

Browse files
committed
PERF: high memory in MI
closes #13904
1 parent c26e5bb commit 99f7cc1

File tree

15 files changed

+340
-85
lines changed

15 files changed

+340
-85
lines changed

asv_bench/benchmarks/indexing.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def setup(self):
8888

8989
def time_getitem_scalar(self):
9090
self.ts[self.dt]
91-
91+
9292

9393
class DataFrameIndexing(object):
9494
goal_time = 0.2
@@ -189,6 +189,7 @@ def setup(self):
189189
self.eps_C = 5
190190
self.eps_D = 5000
191191
self.mdt2 = self.mdt.set_index(['A', 'B', 'C', 'D']).sortlevel()
192+
self.miint = MultiIndex.from_product([np.arange(1000), np.arange(1000)], names=['one','two'])
192193

193194
def time_series_xs_mi_ix(self):
194195
self.s.ix[999]
@@ -199,6 +200,10 @@ def time_frame_xs_mi_ix(self):
199200
def time_multiindex_slicers(self):
200201
self.mdt2.loc[self.idx[(self.test_A - self.eps_A):(self.test_A + self.eps_A), (self.test_B - self.eps_B):(self.test_B + self.eps_B), (self.test_C - self.eps_C):(self.test_C + self.eps_C), (self.test_D - self.eps_D):(self.test_D + self.eps_D)], :]
201202

203+
def time_multiindex_get_indexer(self):
204+
self.miint.get_indexer(np.array([(0, 10), (0, 11), (0, 12), (0, 13), (0, 14),
205+
(0, 15), (0, 16),(0, 17), (0, 18), (0, 19)], dtype=object))
206+
202207

203208
class PanelIndexing(object):
204209
goal_time = 0.2

doc/source/whatsnew/v0.20.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,7 @@ Performance Improvements
406406
- Improved performance of timeseries plotting with an irregular DatetimeIndex
407407
(or with ``compat_x=True``) (:issue:`15073`).
408408
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`)
409-
409+
- Improved performance and reduced memory when indexing with a ``MultiIndex`` (:issue:`15245`)
410410
- When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object.
411411

412412

pandas/core/algorithms.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -1250,7 +1250,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
12501250
indexer = np.arange(arr.shape[axis], dtype=np.int64)
12511251
dtype, fill_value = arr.dtype, arr.dtype.type()
12521252
else:
1253-
indexer = _ensure_int64(indexer)
1253+
indexer = _ensure_int64(indexer, copy=False)
12541254
if not allow_fill:
12551255
dtype, fill_value = arr.dtype, arr.dtype.type()
12561256
mask_info = None, False
@@ -1303,7 +1303,6 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
13031303

13041304
func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis,
13051305
mask_info=mask_info)
1306-
indexer = _ensure_int64(indexer)
13071306
func(arr, indexer, out, fill_value)
13081307

13091308
if flip_order:

pandas/hashtable.pxd

+8
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,14 @@ cdef class PyObjectHashTable(HashTable):
3030
cpdef get_item(self, object val)
3131
cpdef set_item(self, object key, Py_ssize_t val)
3232

33+
cdef class MultiIndexHashTable(HashTable):
34+
cdef:
35+
kh_uint64_t *table
36+
object mi
37+
38+
cpdef get_item(self, object val)
39+
cpdef set_item(self, object key, Py_ssize_t val)
40+
3341
cdef class StringHashTable(HashTable):
3442
cdef kh_str_t *table
3543

pandas/index.pyx

+32-2
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,6 @@ cdef class IndexEngine:
284284
if not self.is_mapping_populated:
285285

286286
values = self._get_index_values()
287-
288287
self.mapping = self._make_hash_table(len(values))
289288
self.mapping.map_locations(values)
290289

@@ -322,7 +321,7 @@ cdef class IndexEngine:
322321
Py_ssize_t i, j, n, n_t, n_alloc
323322

324323
self._ensure_mapping_populated()
325-
values = self._get_index_values()
324+
values = np.array(self._get_index_values(), copy=False)
326325
stargets = set(targets)
327326
n = len(values)
328327
n_t = len(targets)
@@ -554,5 +553,36 @@ cdef inline bint _is_utc(object tz):
554553
return tz is UTC or isinstance(tz, _du_utc)
555554

556555

556+
cdef class MultiIndexEngine(IndexEngine):
557+
558+
def _call_monotonic(self, object mi):
559+
return mi.is_lexsorted(), mi.is_monotonic, mi.is_unique
560+
561+
def get_backfill_indexer(self, other, limit=None):
562+
# we coerce to ndarray-of-tuples
563+
values = np.array(self._get_index_values())
564+
return algos.backfill_object(values, other, limit=limit)
565+
566+
def get_pad_indexer(self, other, limit=None):
567+
# we coerce to ndarray-of-tuples
568+
values = np.array(self._get_index_values())
569+
return algos.pad_object(values, other, limit=limit)
570+
571+
cpdef get_loc(self, object val):
572+
if is_definitely_invalid_key(val):
573+
raise TypeError("'{val}' is an invalid key".format(val=val))
574+
575+
self._ensure_mapping_populated()
576+
if not self.unique:
577+
return self._get_loc_duplicates(val)
578+
579+
try:
580+
return self.mapping.get_item(val)
581+
except TypeError:
582+
raise KeyError(val)
583+
584+
cdef _make_hash_table(self, n):
585+
return _hash.MultiIndexHashTable(n)
586+
557587
# Generated from template.
558588
include "index_class_helper.pxi"

pandas/indexes/base.py

-1
Original file line numberDiff line numberDiff line change
@@ -2412,7 +2412,6 @@ def _get_fill_indexer_searchsorted(self, target, method, limit=None):
24122412
'if index and target are monotonic' % method)
24132413

24142414
side = 'left' if method == 'pad' else 'right'
2415-
target = np.asarray(target)
24162415

24172416
# find exact matches first (this simplifies the algorithm)
24182417
indexer = self.get_indexer(target)

pandas/indexes/multi.py

+121-28
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
from pandas.compat.numpy import function as nv
1515
from pandas import compat
1616

17-
1817
from pandas.types.common import (_ensure_int64,
1918
_ensure_platform_int,
2019
is_object_dtype,
@@ -73,6 +72,7 @@ class MultiIndex(Index):
7372
_levels = FrozenList()
7473
_labels = FrozenList()
7574
_comparables = ['names']
75+
_engine_type = _index.MultiIndexEngine
7676
rename = Index.set_names
7777

7878
def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
@@ -114,7 +114,6 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
114114
result._verify_integrity()
115115
if _set_identity:
116116
result._reset_identity()
117-
118117
return result
119118

120119
def _verify_integrity(self, labels=None, levels=None):
@@ -619,6 +618,10 @@ def _get_level_number(self, level):
619618

620619
_tuples = None
621620

621+
@cache_readonly
622+
def _engine(self):
623+
return self._engine_type(lambda: self, len(self))
624+
622625
@property
623626
def values(self):
624627
if self._tuples is not None:
@@ -655,10 +658,74 @@ def _has_complex_internals(self):
655658
# to disable groupby tricks
656659
return True
657660

661+
@cache_readonly
662+
def is_monotonic(self):
663+
664+
# TODO
665+
# this is unfortunate that we end up tuple-izing
666+
# just to determine monotonicity :<
667+
668+
# fast-path
669+
if not self.levels[0].is_monotonic:
670+
return False
671+
672+
return Index(self.values).is_monotonic
673+
658674
@cache_readonly
659675
def is_unique(self):
660676
return not self.duplicated().any()
661677

678+
@cache_readonly
679+
def _have_mixed_levels(self):
680+
""" return a boolean list indicated if we have mixed levels """
681+
return ['mixed' in l for l in self._inferred_type_levels]
682+
683+
@cache_readonly
684+
def _inferred_type_levels(self):
685+
""" return a list of the inferred types, one for each level """
686+
return [i.inferred_type for i in self.levels]
687+
688+
@cache_readonly
689+
def _hashed_values(self):
690+
""" return a uint64 ndarray of my hashed values """
691+
from pandas.tools.hashing import hash_tuples
692+
return hash_tuples(self)
693+
694+
def _hashed_indexing_key(self, key):
695+
"""
696+
validate and return the hash for the provided key
697+
698+
*this is internal for use for the cython routines*
699+
700+
Parameters
701+
---------
702+
key : string or tuple
703+
704+
Returns
705+
-------
706+
np.uint64
707+
708+
Notes
709+
-----
710+
we need to stringify if we have mixed levels
711+
712+
"""
713+
from pandas.tools.hashing import hash_tuples
714+
715+
if not isinstance(key, tuple):
716+
return hash_tuples(key)
717+
718+
if not len(key) == self.nlevels:
719+
raise KeyError
720+
721+
def f(k, stringify):
722+
if stringify and not isinstance(k, compat.string_types):
723+
k = str(k)
724+
return k
725+
key = tuple([f(k, stringify)
726+
for k, stringify in zip(key, self._have_mixed_levels)])
727+
return hash_tuples(key)
728+
662729
@deprecate_kwarg('take_last', 'keep', mapping={True: 'last',
663730
False: 'first'})
664731
@Appender(base._shared_docs['duplicated'] % ibase._index_doc_kwargs)
@@ -852,7 +919,8 @@ def to_frame(self, index=True):
852919
from pandas import DataFrame
853920
result = DataFrame({(name or level): self.get_level_values(level)
854921
for name, level in
855-
zip(self.names, range(len(self.levels)))})
922+
zip(self.names, range(len(self.levels)))},
923+
copy=False)
856924
if index:
857925
result.index = self
858926
return result
@@ -1478,29 +1546,41 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
14781546
method = missing.clean_reindex_fill_method(method)
14791547
target = _ensure_index(target)
14801548

1481-
target_index = target
1482-
if isinstance(target, MultiIndex):
1483-
target_index = target._tuple_index
1549+
# empty indexer
1550+
if is_list_like(target) and not len(target):
1551+
return _ensure_platform_int(np.array([]))
14841552

1485-
if not is_object_dtype(target_index.dtype):
1486-
return np.ones(len(target_index)) * -1
1553+
if not isinstance(target, MultiIndex):
1554+
try:
1555+
target = MultiIndex.from_tuples(target)
1556+
except (TypeError, ValueError):
1557+
1558+
# let's instead try with a straight Index
1559+
if method is None:
1560+
return Index(self.values).get_indexer(target,
1561+
method=method,
1562+
limit=limit,
1563+
tolerance=tolerance)
14871564

14881565
if not self.is_unique:
14891566
raise Exception('Reindexing only valid with uniquely valued Index '
14901567
'objects')
14911568

1492-
self_index = self._tuple_index
1493-
14941569
if method == 'pad' or method == 'backfill':
14951570
if tolerance is not None:
14961571
raise NotImplementedError("tolerance not implemented yet "
14971572
'for MultiIndex')
1498-
indexer = self_index._get_fill_indexer(target, method, limit)
1573+
indexer = self._get_fill_indexer(target, method, limit)
14991574
elif method == 'nearest':
15001575
raise NotImplementedError("method='nearest' not implemented yet "
15011576
'for MultiIndex; see GitHub issue 9365')
15021577
else:
1503-
indexer = self_index._engine.get_indexer(target._values)
1578+
# we may not compare equally because of hashing if we
1579+
# don't have the same dtypes
1580+
if self._inferred_type_levels != target._inferred_type_levels:
1581+
return Index(self.values).get_indexer(target.values)
1582+
1583+
indexer = self._engine.get_indexer(target)
15041584

15051585
return _ensure_platform_int(indexer)
15061586

@@ -1567,17 +1647,6 @@ def reindex(self, target, method=None, level=None, limit=None,
15671647

15681648
return target, indexer
15691649

1570-
@cache_readonly
1571-
def _tuple_index(self):
1572-
"""
1573-
Convert MultiIndex to an Index of tuples
1574-
1575-
Returns
1576-
-------
1577-
index : Index
1578-
"""
1579-
return Index(self._values)
1580-
15811650
def get_slice_bound(self, label, side, kind):
15821651

15831652
if not isinstance(label, tuple):
@@ -1824,8 +1893,16 @@ def partial_selection(key, indexer=None):
18241893

18251894
key = tuple(self[indexer].tolist()[0])
18261895

1827-
return (self._engine.get_loc(_values_from_object(key)),
1828-
None)
1896+
try:
1897+
return (self._engine.get_loc(
1898+
_values_from_object(key)), None)
1899+
except ValueError:
1900+
# if we have a very odd MultiIndex,
1901+
# e.g. with embedded tuples, this might fail
1902+
# TODO: should prob not allow construction of a MI
1903+
# like this in the first place
1904+
return Index(self.values).get_loc(key)
1905+
18291906
else:
18301907
return partial_selection(key)
18311908
else:
@@ -2098,7 +2175,9 @@ def equals(self, other):
20982175
return True
20992176

21002177
if not isinstance(other, Index):
2101-
return False
2178+
if not isinstance(other, tuple):
2179+
return False
2180+
other = Index([other])
21022181

21032182
if not isinstance(other, MultiIndex):
21042183
return array_equivalent(self._values,
@@ -2111,10 +2190,24 @@ def equals(self, other):
21112190
return False
21122191

21132192
for i in range(self.nlevels):
2193+
slabels = self.labels[i]
2194+
slabels = slabels[slabels != -1]
21142195
svalues = algos.take_nd(np.asarray(self.levels[i]._values),
2115-
self.labels[i], allow_fill=False)
2196+
slabels, allow_fill=False)
2197+
2198+
olabels = other.labels[i]
2199+
olabels = olabels[olabels != -1]
21162200
ovalues = algos.take_nd(np.asarray(other.levels[i]._values),
2117-
other.labels[i], allow_fill=False)
2201+
olabels, allow_fill=False)
2202+
2203+
# since we use NaT both datetime64 and timedelta64
2204+
# we can have a situation where a level is typed say
2205+
# timedelta64 in self (IOW it has other values than NaT)
2206+
# but typed datetime64 in other (where it's all NaT)
2207+
# but these are equivalent
2208+
if len(svalues) == 0 and len(ovalues) == 0:
2209+
continue
2210+
21182211
if not array_equivalent(svalues, ovalues):
21192212
return False
21202213

pandas/io/pytables.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -3789,9 +3789,9 @@ def read(self, where=None, columns=None, **kwargs):
37893789
lp = DataFrame(c.data, index=long_index, columns=c.values)
37903790

37913791
# need a better algorithm
3792-
tuple_index = long_index._tuple_index
3792+
tuple_index = long_index.values
37933793

3794-
unique_tuples = lib.fast_unique(tuple_index.values)
3794+
unique_tuples = lib.fast_unique(tuple_index)
37953795
unique_tuples = _asarray_tuplesafe(unique_tuples)
37963796

37973797
indexer = match(unique_tuples, tuple_index)

0 commit comments

Comments
 (0)