
Commit 0411455

WIP: high memory in MI
xref #13904
1 parent 3853fe6 commit 0411455

15 files changed, +261 -82 lines changed

asv_bench/benchmarks/indexing.py (+6, -1)

@@ -88,7 +88,7 @@ def setup(self):
 
     def time_getitem_scalar(self):
         self.ts[self.dt]
-
+
 
 class DataFrameIndexing(object):
     goal_time = 0.2

@@ -189,6 +189,7 @@ def setup(self):
         self.eps_C = 5
         self.eps_D = 5000
         self.mdt2 = self.mdt.set_index(['A', 'B', 'C', 'D']).sortlevel()
+        self.miint = MultiIndex.from_product([np.arange(1000), np.arange(1000)], names=['one', 'two'])
 
     def time_series_xs_mi_ix(self):
         self.s.ix[999]

@@ -199,6 +200,10 @@ def time_frame_xs_mi_ix(self):
     def time_multiindex_slicers(self):
         self.mdt2.loc[self.idx[(self.test_A - self.eps_A):(self.test_A + self.eps_A), (self.test_B - self.eps_B):(self.test_B + self.eps_B), (self.test_C - self.eps_C):(self.test_C + self.eps_C), (self.test_D - self.eps_D):(self.test_D + self.eps_D)], :]
 
+    def time_multiindex_get_indexer(self):
+        self.miint.get_indexer(np.array([(0, 10), (0, 11), (0, 12), (0, 13), (0, 14),
+                                         (0, 15), (0, 16), (0, 17), (0, 18), (0, 19)], dtype=object))
+
 
 class PanelIndexing(object):
     goal_time = 0.2
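
For orientation (not part of the commit), a standalone version of what the new time_multiindex_get_indexer benchmark exercises, using the same million-row index as setup():

    import numpy as np
    import pandas as pd

    # same shape as the benchmark's setup(): a 1,000,000-row MultiIndex
    mi = pd.MultiIndex.from_product([np.arange(1000), np.arange(1000)],
                                    names=['one', 'two'])

    # get_indexer maps each target tuple to its integer position (-1 for a miss);
    # this is the call the benchmark times
    targets = np.array([(0, i) for i in range(10, 20)], dtype=object)
    print(mi.get_indexer(targets))   # [10 11 12 13 14 15 16 17 18 19]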

doc/source/whatsnew/v0.20.0.txt (+1, -1)

@@ -406,7 +406,7 @@ Performance Improvements
 - Improved performance of timeseries plotting with an irregular DatetimeIndex
   (or with ``compat_x=True``) (:issue:`15073`).
 - Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`)
-
+- Improved performance and reduced memory when indexing with a ``MultiIndex`` (:issue:`15245`)
 - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object.

pandas/core/algorithms.py (+1, -2)

@@ -1250,7 +1250,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
         indexer = np.arange(arr.shape[axis], dtype=np.int64)
         dtype, fill_value = arr.dtype, arr.dtype.type()
     else:
-        indexer = _ensure_int64(indexer)
+        indexer = _ensure_int64(indexer, copy=False)
         if not allow_fill:
             dtype, fill_value = arr.dtype, arr.dtype.type()
             mask_info = None, False

@@ -1303,7 +1303,6 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
 
     func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis,
                                  mask_info=mask_info)
-    indexer = _ensure_int64(indexer)
     func(arr, indexer, out, fill_value)
 
     if flip_order:
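
For context on the take_nd change (a rough illustration; take_nd is an internal helper, so the import path may differ between versions): it gathers values by integer position and writes fill_value wherever the indexer is -1, which is why the indexer must be int64 and why skipping an extra copy via copy=False matters for large indexers.

    import numpy as np
    from pandas.core.algorithms import take_nd  # internal helper

    arr = np.array([10., 20., 30.])
    indexer = np.array([2, 0, -1], dtype=np.int64)   # -1 marks a missing position

    # gathers arr[2], arr[0], and a NaN fill for the -1 slot
    print(take_nd(arr, indexer, fill_value=np.nan))  # [30. 10. nan]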

pandas/hashtable.pxd (+8)

@@ -30,6 +30,14 @@ cdef class PyObjectHashTable(HashTable):
     cpdef get_item(self, object val)
     cpdef set_item(self, object key, Py_ssize_t val)
 
+cdef class MultiIndexHashTable(HashTable):
+    cdef:
+        kh_uint64_t *table
+        object mi
+
+    cpdef get_item(self, object val)
+    cpdef set_item(self, object key, Py_ssize_t val)
+
 cdef class StringHashTable(HashTable):
     cdef kh_str_t *table
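
A rough pure-Python model of what the new MultiIndexHashTable is for (a sketch, not the cython implementation): key the table on the uint64 hash of each row and keep a reference to the MultiIndex itself, rather than materializing an object array of tuples; a lookup re-hashes the key and verifies against the index to guard against collisions.

    import pandas as pd
    from pandas.tools.hashing import hash_tuples  # module location at the time of this commit

    mi = pd.MultiIndex.from_product([['a', 'b'], [1, 2]])
    hashes = hash_tuples(mi)                       # one uint64 per row
    table = {h: i for i, h in enumerate(hashes)}   # hash -> position

    key = ('b', 1)
    loc = table[hash_tuples(pd.MultiIndex.from_tuples([key]))[0]]
    assert mi[loc] == key                          # guard against a hash collision
    print(loc)                                     # 2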

pandas/index.pyx (+32, -2)

@@ -284,7 +284,6 @@ cdef class IndexEngine:
         if not self.is_mapping_populated:
 
             values = self._get_index_values()
-
             self.mapping = self._make_hash_table(len(values))
             self.mapping.map_locations(values)
 

@@ -322,7 +321,7 @@ cdef class IndexEngine:
             Py_ssize_t i, j, n, n_t, n_alloc
 
         self._ensure_mapping_populated()
-        values = self._get_index_values()
+        values = np.array(self._get_index_values(), copy=False)
        stargets = set(targets)
         n = len(values)
         n_t = len(targets)

@@ -554,5 +553,36 @@ cdef inline bint _is_utc(object tz):
     return tz is UTC or isinstance(tz, _du_utc)
 
 
+cdef class MultiIndexEngine(IndexEngine):
+
+    def _call_monotonic(self, object mi):
+        return mi.is_lexsorted(), mi.is_monotonic, mi.is_unique
+
+    def get_backfill_indexer(self, other, limit=None):
+        # we coerce to ndarray-of-tuples
+        values = np.array(self._get_index_values())
+        return algos.backfill_object(values, other, limit=limit)
+
+    def get_pad_indexer(self, other, limit=None):
+        # we coerce to ndarray-of-tuples
+        values = np.array(self._get_index_values())
+        return algos.pad_object(values, other, limit=limit)
+
+    cpdef get_loc(self, object val):
+        if is_definitely_invalid_key(val):
+            raise TypeError("'{val}' is an invalid key".format(val=val))
+
+        self._ensure_mapping_populated()
+        if not self.unique:
+            return self._get_loc_duplicates(val)
+
+        try:
+            return self.mapping.get_item(val)
+        except TypeError:
+            raise KeyError(val)
+
+    cdef _make_hash_table(self, n):
+        return _hash.MultiIndexHashTable(n)
+
 # Generated from template.
 include "index_class_helper.pxi"

pandas/indexes/base.py (-1)

@@ -2412,7 +2412,6 @@ def _get_fill_indexer_searchsorted(self, target, method, limit=None):
                              'if index and target are monotonic' % method)
 
         side = 'left' if method == 'pad' else 'right'
-        target = np.asarray(target)
 
         # find exact matches first (this simplifies the algorithm)
         indexer = self.get_indexer(target)
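
Dropping np.asarray(target) keeps the target as an Index, so MultiIndex.get_indexer can route its pad/backfill case through this searchsorted path (see the indexes/multi.py hunks below). A small example of the behaviour this feeds, using the standard API and assuming a monotonic, unique index:

    import pandas as pd

    mi = pd.MultiIndex.from_product([['a', 'b'], [1, 3]])
    target = pd.MultiIndex.from_tuples([('a', 2), ('b', 3)])

    # 'pad' takes the closest preceding key for inexact matches
    print(mi.get_indexer(target, method='pad'))   # [0 3]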

pandas/indexes/multi.py (+87, -25)

@@ -14,7 +14,6 @@
 from pandas.compat.numpy import function as nv
 from pandas import compat
 
-
 from pandas.types.common import (_ensure_int64,
                                  _ensure_platform_int,
                                  is_object_dtype,

@@ -73,6 +72,7 @@ class MultiIndex(Index):
     _levels = FrozenList()
     _labels = FrozenList()
     _comparables = ['names']
+    _engine_type = _index.MultiIndexEngine
     rename = Index.set_names
 
     def __new__(cls, levels=None, labels=None, sortorder=None, names=None,

@@ -114,7 +114,6 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
         result._verify_integrity()
         if _set_identity:
             result._reset_identity()
-
         return result
 
     def _verify_integrity(self, labels=None, levels=None):

@@ -619,6 +618,10 @@ def _get_level_number(self, level):
 
     _tuples = None
 
+    @cache_readonly
+    def _engine(self):
+        return self._engine_type(lambda: self, len(self))
+
     @property
     def values(self):
         if self._tuples is not None:

@@ -655,10 +658,59 @@ def _has_complex_internals(self):
         # to disable groupby tricks
         return True
 
+    @cache_readonly
+    def is_monotonic(self):
+
+        # TODO
+        # it is unfortunate that we end up tupleizing
+        # just to determine monotonicity :<
+
+        # fast-path
+        if not self.levels[0].is_monotonic:
+            return False
+
+        return Index(self.values).is_monotonic
+
     @cache_readonly
     def is_unique(self):
         return not self.duplicated().any()
 
+    @cache_readonly
+    def _hashed_values(self):
+        """ return a uint64 ndarray of my hashed values """
+        from pandas.tools.hashing import hash_tuples
+        return hash_tuples(self)
+
+    @cache_readonly
+    def _have_mixed_levels(self):
+        """ return a boolean list indicating if we have mixed levels """
+        return ['mixed' in l for l in self._inferred_type_levels]
+
+    @cache_readonly
+    def _inferred_type_levels(self):
+        """ return a list of the inferred types, one for each level """
+        return [i.inferred_type for i in self.levels]
+
+    def _as_valid_indexing_key(self, key):
+        """
+        validate and return our key
+        we need to stringify if we have mixed levels
+
+        this is internal, for use by the cython routines
+        """
+        if not isinstance(key, tuple):
+            return key
+
+        if not len(key) == self.nlevels:
+            raise KeyError
+
+        def f(k, stringify):
+            if stringify and not isinstance(k, compat.string_types):
+                k = str(k)
+            return k
+        return tuple([f(k, stringify)
+                      for k, stringify in zip(key, self._have_mixed_levels)])
+
     @deprecate_kwarg('take_last', 'keep', mapping={True: 'last',
                                                    False: 'first'})
     @Appender(base._shared_docs['duplicated'] % ibase._index_doc_kwargs)

@@ -852,7 +904,8 @@ def to_frame(self, index=True):
         from pandas import DataFrame
         result = DataFrame({(name or level): self.get_level_values(level)
                             for name, level in
-                            zip(self.names, range(len(self.levels)))})
+                            zip(self.names, range(len(self.levels)))},
+                           copy=False)
         if index:
             result.index = self
         return result

@@ -1478,29 +1531,41 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
         method = missing.clean_reindex_fill_method(method)
         target = _ensure_index(target)
 
-        target_index = target
-        if isinstance(target, MultiIndex):
-            target_index = target._tuple_index
+        # empty indexer
+        if is_list_like(target) and not len(target):
+            return _ensure_platform_int(np.array([]))
+
+        if not isinstance(target, MultiIndex):
+            try:
+                target = MultiIndex.from_tuples(target)
+            except (TypeError, ValueError):
 
-        if not is_object_dtype(target_index.dtype):
-            return np.ones(len(target_index)) * -1
+                # let's instead try with a straight Index
+                if method is None:
+                    return Index(self.values).get_indexer(target,
+                                                          method=method,
+                                                          limit=limit,
+                                                          tolerance=tolerance)
 
         if not self.is_unique:
             raise Exception('Reindexing only valid with uniquely valued Index '
                             'objects')
 
-        self_index = self._tuple_index
-
         if method == 'pad' or method == 'backfill':
             if tolerance is not None:
                 raise NotImplementedError("tolerance not implemented yet "
                                           'for MultiIndex')
-            indexer = self_index._get_fill_indexer(target, method, limit)
+            indexer = self._get_fill_indexer(target, method, limit)
         elif method == 'nearest':
             raise NotImplementedError("method='nearest' not implemented yet "
                                       'for MultiIndex; see GitHub issue 9365')
         else:
-            indexer = self_index._engine.get_indexer(target._values)
+            # we may not compare equally because of hashing if we
+            # don't have the same dtypes
+            if self._inferred_type_levels != target._inferred_type_levels:
+                return Index(self.values).get_indexer(target.values)
+
+            indexer = self._engine.get_indexer(target)
 
         return _ensure_platform_int(indexer)
 

@@ -1567,17 +1632,6 @@ def reindex(self, target, method=None, level=None, limit=None,
 
         return target, indexer
 
-    @cache_readonly
-    def _tuple_index(self):
-        """
-        Convert MultiIndex to an Index of tuples
-
-        Returns
-        -------
-        index : Index
-        """
-        return Index(self._values)
-
     def get_slice_bound(self, label, side, kind):
 
         if not isinstance(label, tuple):

@@ -1824,8 +1878,16 @@ def partial_selection(key, indexer=None):
 
                         key = tuple(self[indexer].tolist()[0])
 
-                    return (self._engine.get_loc(_values_from_object(key)),
-                            None)
+                    try:
+                        return (self._engine.get_loc(
+                            _values_from_object(key)), None)
+                    except ValueError:
+                        # if we have a very odd MultiIndex,
+                        # e.g. with embedded tuples, this might fail
+                        # TODO: should probably not allow construction of a MI
+                        # like this in the first place
+                        return Index(self.values).get_loc(key)
+
                 else:
                     return partial_selection(key)
             else:
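
To illustrate the reworked get_indexer (plain pandas API, unique index assumed): exact matches resolve through the engine, misses come back as -1, and the new empty-target fast path returns an empty platform-int indexer.

    import pandas as pd

    mi = pd.MultiIndex.from_product([['a', 'b'], [1, 2]], names=['k1', 'k2'])
    other = pd.MultiIndex.from_tuples([('a', 2), ('b', 9)])

    print(mi.get_indexer(other))   # [ 1 -1] -> hit at position 1, miss as -1
    print(mi.get_indexer([]))      # []      -> the new empty-indexer fast path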

pandas/io/pytables.py (+2, -2)

@@ -3789,9 +3789,9 @@ def read(self, where=None, columns=None, **kwargs):
                 lp = DataFrame(c.data, index=long_index, columns=c.values)
 
                 # need a better algorithm
-                tuple_index = long_index._tuple_index
+                tuple_index = long_index.values
 
-                unique_tuples = lib.fast_unique(tuple_index.values)
+                unique_tuples = lib.fast_unique(tuple_index)
                 unique_tuples = _asarray_tuplesafe(unique_tuples)
 
                 indexer = match(unique_tuples, tuple_index)
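
The pytables change simply swaps the removed _tuple_index property for MultiIndex.values, which is the same object-dtype ndarray of tuples without the extra Index wrapper:

    import pandas as pd

    mi = pd.MultiIndex.from_product([['a', 'b'], [1, 2]])
    print(mi.values)   # [('a', 1) ('a', 2) ('b', 1) ('b', 2)], an object-dtype array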

pandas/src/algos_common_helper.pxi.in (+2, -2)

@@ -567,12 +567,12 @@ def get_dispatch(dtypes):
 
 {{for name, c_type, dtype in get_dispatch(dtypes)}}
 
-cpdef ensure_{{name}}(object arr):
+cpdef ensure_{{name}}(object arr, copy=True):
     if util.is_array(arr):
         if (<ndarray> arr).descr.type_num == NPY_{{c_type}}:
             return arr
         else:
-            return arr.astype(np.{{dtype}})
+            return arr.astype(np.{{dtype}}, copy=copy)
     else:
         return np.array(arr, dtype=np.{{dtype}})
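
The new copy argument just forwards to ndarray.astype, which skips the copy only when no conversion is actually needed; a numpy-only sketch of that semantics:

    import numpy as np

    arr = np.arange(5, dtype=np.int64)
    print(arr.astype(np.int64, copy=False) is arr)   # True  -> no new allocation
    print(arr.astype(np.int64, copy=True) is arr)    # False -> always copies
    print(arr.astype(np.int32, copy=False) is arr)   # False -> a real conversion still copies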
