Skip to content

Commit 091754e

Browse files
committed
PERF: high memory in MI
closes #13904 BUG: a qualifier (+) would always display with a MultiIndex, regardless of whether it needed deep introspection for memory usage PERF: rework MultiIndex.is_monotonic as per @ssanderson's idea
1 parent dcb4e47 commit 091754e

20 files changed

+530
-114
lines changed

asv_bench/benchmarks/indexing.py

+28-2
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def setup(self):
8888

8989
def time_getitem_scalar(self):
9090
self.ts[self.dt]
91-
91+
9292

9393
class DataFrameIndexing(object):
9494
goal_time = 0.2
@@ -189,6 +189,15 @@ def setup(self):
189189
self.eps_C = 5
190190
self.eps_D = 5000
191191
self.mdt2 = self.mdt.set_index(['A', 'B', 'C', 'D']).sortlevel()
192+
self.miint = MultiIndex.from_product(
193+
[np.arange(1000),
194+
np.arange(1000)], names=['one', 'two'])
195+
196+
import string
197+
self.mistring = MultiIndex.from_product(
198+
[np.arange(1000),
199+
np.arange(20), list(string.ascii_letters)],
200+
names=['one', 'two', 'three'])
192201

193202
def time_series_xs_mi_ix(self):
194203
self.s.ix[999]
@@ -197,7 +206,24 @@ def time_frame_xs_mi_ix(self):
197206
self.df.ix[999]
198207

199208
def time_multiindex_slicers(self):
200-
self.mdt2.loc[self.idx[(self.test_A - self.eps_A):(self.test_A + self.eps_A), (self.test_B - self.eps_B):(self.test_B + self.eps_B), (self.test_C - self.eps_C):(self.test_C + self.eps_C), (self.test_D - self.eps_D):(self.test_D + self.eps_D)], :]
209+
self.mdt2.loc[self.idx[
210+
(self.test_A - self.eps_A):(self.test_A + self.eps_A),
211+
(self.test_B - self.eps_B):(self.test_B + self.eps_B),
212+
(self.test_C - self.eps_C):(self.test_C + self.eps_C),
213+
(self.test_D - self.eps_D):(self.test_D + self.eps_D)], :]
214+
215+
def time_multiindex_get_indexer(self):
216+
self.miint.get_indexer(
217+
np.array([(0, 10), (0, 11), (0, 12),
218+
(0, 13), (0, 14), (0, 15),
219+
(0, 16), (0, 17), (0, 18),
220+
(0, 19)], dtype=object))
221+
222+
def time_multiindex_string_get_loc(self):
223+
self.mistring.get_loc((999, 19, 'Z'))
224+
225+
def time_is_monotonic(self):
226+
self.miint.is_monotonic
201227

202228

203229
class PanelIndexing(object):

asv_bench/benchmarks/reindex.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ def setup(self):
1616
data=np.random.rand(10000, 30), columns=range(30))
1717

1818
# multi-index
19-
N = 1000
20-
K = 20
19+
N = 5000
20+
K = 200
2121
level1 = tm.makeStringIndex(N).values.repeat(K)
2222
level2 = np.tile(tm.makeStringIndex(K).values, N)
2323
index = MultiIndex.from_arrays([level1, level2])

doc/source/whatsnew/v0.20.0.txt

+3-1
Original file line numberDiff line numberDiff line change
@@ -469,7 +469,7 @@ Performance Improvements
469469
- Improved performance of timeseries plotting with an irregular DatetimeIndex
470470
(or with ``compat_x=True``) (:issue:`15073`).
471471
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`)
472-
472+
- Improved performance and reduced memory when indexing with a ``MultiIndex`` (:issue:`15245`)
473473
- When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object.
474474

475475

@@ -499,6 +499,8 @@ Bug Fixes
499499
- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)
500500

501501

502+
503+
- Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`)
502504
- Bug in ``pd.read_msgpack()`` in which ``Series`` categoricals were being improperly processed (:issue:`14901`)
503505
- Bug in ``Series.ffill()`` with mixed dtypes containing tz-aware datetimes. (:issue:`14956`)
504506

pandas/core/algorithms.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -1250,7 +1250,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
12501250
indexer = np.arange(arr.shape[axis], dtype=np.int64)
12511251
dtype, fill_value = arr.dtype, arr.dtype.type()
12521252
else:
1253-
indexer = _ensure_int64(indexer)
1253+
indexer = _ensure_int64(indexer, copy=False)
12541254
if not allow_fill:
12551255
dtype, fill_value = arr.dtype, arr.dtype.type()
12561256
mask_info = None, False
@@ -1303,7 +1303,6 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
13031303

13041304
func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis,
13051305
mask_info=mask_info)
1306-
indexer = _ensure_int64(indexer)
13071306
func(arr, indexer, out, fill_value)
13081307

13091308
if flip_order:

pandas/core/frame.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1752,7 +1752,8 @@ def _sizeof_fmt(num, size_qualifier):
17521752
# all cases (e.g., it misses categorical data even with object
17531753
# categories)
17541754
deep = False
1755-
if 'object' in counts or is_object_dtype(self.index):
1755+
if ('object' in counts or
1756+
self.index._is_memory_usage_qualified()):
17561757
size_qualifier = '+'
17571758
mem_usage = self.memory_usage(index=True, deep=deep).sum()
17581759
lines.append("memory usage: %s\n" %

pandas/hashtable.pxd

+8
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,14 @@ cdef class PyObjectHashTable(HashTable):
3030
cpdef get_item(self, object val)
3131
cpdef set_item(self, object key, Py_ssize_t val)
3232

33+
cdef class MultiIndexHashTable(HashTable):
34+
cdef:
35+
kh_uint64_t *table
36+
object mi
37+
38+
cpdef get_item(self, object val)
39+
cpdef set_item(self, object key, Py_ssize_t val)
40+
3341
cdef class StringHashTable(HashTable):
3442
cdef kh_str_t *table
3543

pandas/index.pyx

+32-2
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,6 @@ cdef class IndexEngine:
284284
if not self.is_mapping_populated:
285285

286286
values = self._get_index_values()
287-
288287
self.mapping = self._make_hash_table(len(values))
289288
self.mapping.map_locations(values)
290289

@@ -322,7 +321,7 @@ cdef class IndexEngine:
322321
Py_ssize_t i, j, n, n_t, n_alloc
323322

324323
self._ensure_mapping_populated()
325-
values = self._get_index_values()
324+
values = np.array(self._get_index_values(), copy=False)
326325
stargets = set(targets)
327326
n = len(values)
328327
n_t = len(targets)
@@ -554,5 +553,36 @@ cdef inline bint _is_utc(object tz):
554553
return tz is UTC or isinstance(tz, _du_utc)
555554

556555

556+
cdef class MultiIndexEngine(IndexEngine):
557+
558+
def _call_monotonic(self, object mi):
559+
return mi.is_lexsorted(), mi.is_monotonic, mi.is_unique
560+
561+
def get_backfill_indexer(self, other, limit=None):
562+
# we coerce to ndarray-of-tuples
563+
values = np.array(self._get_index_values())
564+
return algos.backfill_object(values, other, limit=limit)
565+
566+
def get_pad_indexer(self, other, limit=None):
567+
# we coerce to ndarray-of-tuples
568+
values = np.array(self._get_index_values())
569+
return algos.pad_object(values, other, limit=limit)
570+
571+
cpdef get_loc(self, object val):
572+
if is_definitely_invalid_key(val):
573+
raise TypeError("'{val}' is an invalid key".format(val=val))
574+
575+
self._ensure_mapping_populated()
576+
if not self.unique:
577+
return self._get_loc_duplicates(val)
578+
579+
try:
580+
return self.mapping.get_item(val)
581+
except TypeError:
582+
raise KeyError(val)
583+
584+
cdef _make_hash_table(self, n):
585+
return _hash.MultiIndexHashTable(n)
586+
557587
# Generated from template.
558588
include "index_class_helper.pxi"

pandas/indexes/base.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1431,6 +1431,10 @@ def inferred_type(self):
14311431
""" return a string of the type inferred from the values """
14321432
return lib.infer_dtype(self)
14331433

1434+
def _is_memory_usage_qualified(self):
1435+
""" return a boolean if we need a qualified .info display """
1436+
return self.is_object()
1437+
14341438
def is_type_compatible(self, kind):
14351439
return kind == self.inferred_type
14361440

@@ -2446,7 +2450,6 @@ def _get_fill_indexer_searchsorted(self, target, method, limit=None):
24462450
'if index and target are monotonic' % method)
24472451

24482452
side = 'left' if method == 'pad' else 'right'
2449-
target = np.asarray(target)
24502453

24512454
# find exact matches first (this simplifies the algorithm)
24522455
indexer = self.get_indexer(target)

0 commit comments

Comments
 (0)