Skip to content

PERF: high memory in MI #15245

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 28 additions & 2 deletions asv_bench/benchmarks/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def setup(self):

def time_getitem_scalar(self):
    # Benchmark scalar __getitem__ on a time series with a single
    # datetime key; self.ts and self.dt are built in setup().
    self.ts[self.dt]


class DataFrameIndexing(object):
goal_time = 0.2
Expand Down Expand Up @@ -189,6 +189,15 @@ def setup(self):
self.eps_C = 5
self.eps_D = 5000
self.mdt2 = self.mdt.set_index(['A', 'B', 'C', 'D']).sortlevel()
self.miint = MultiIndex.from_product(
[np.arange(1000),
np.arange(1000)], names=['one', 'two'])

import string
self.mistring = MultiIndex.from_product(
[np.arange(1000),
np.arange(20), list(string.ascii_letters)],
names=['one', 'two', 'three'])

def time_series_xs_mi_ix(self):
    # Benchmark .ix label indexing with a scalar key on a Series
    # (self.s is built in setup(); presumably MultiIndexed — the
    # benchmark class groups MultiIndex indexing timings).
    self.s.ix[999]
Expand All @@ -197,7 +206,24 @@ def time_frame_xs_mi_ix(self):
self.df.ix[999]

def time_multiindex_slicers(self):
    # Benchmark .loc slicing on the 4-level MultiIndexed frame built in
    # setup(), using pd.IndexSlice (self.idx). Each level is sliced to a
    # window of +/- eps around the chosen test point.
    #
    # NOTE: the diff hunk retained both the pre- and post-change form of
    # this statement, which would execute the same slice twice; only the
    # wrapped (post-change) statement is kept.
    self.mdt2.loc[self.idx[
        (self.test_A - self.eps_A):(self.test_A + self.eps_A),
        (self.test_B - self.eps_B):(self.test_B + self.eps_B),
        (self.test_C - self.eps_C):(self.test_C + self.eps_C),
        (self.test_D - self.eps_D):(self.test_D + self.eps_D)], :]

def time_multiindex_get_indexer(self):
    # Benchmark MultiIndex.get_indexer with a small object-dtype array
    # of (0, k) label pairs; self.miint is the large integer MultiIndex
    # built in setup().
    targets = np.array([(0, second) for second in range(10, 20)],
                       dtype=object)
    self.miint.get_indexer(targets)

def time_multiindex_string_get_loc(self):
    # Benchmark get_loc on the 3-level (int, int, ascii-letter)
    # MultiIndex built in setup(); (999, 19, 'Z') is a valid label in
    # each of the three levels.
    self.mistring.get_loc((999, 19, 'Z'))

def time_is_monotonic(self):
    # Benchmark the is_monotonic property on the large (1000 x 1000)
    # integer MultiIndex built in setup().
    self.miint.is_monotonic


class PanelIndexing(object):
Expand Down
4 changes: 2 additions & 2 deletions asv_bench/benchmarks/reindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ def setup(self):
data=np.random.rand(10000, 30), columns=range(30))

# multi-index
N = 1000
K = 20
N = 5000
K = 200
level1 = tm.makeStringIndex(N).values.repeat(K)
level2 = np.tile(tm.makeStringIndex(K).values, N)
index = MultiIndex.from_arrays([level1, level2])
Expand Down
4 changes: 3 additions & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,7 @@ Performance Improvements
- Improved performance of timeseries plotting with an irregular DatetimeIndex
(or with ``compat_x=True``) (:issue:`15073`).
- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`)

- Improved performance and reduced memory when indexing with a ``MultiIndex`` (:issue:`15245`)
- When reading a buffer object in ``read_sas()`` without a specified format, the filepath string is inferred rather than the buffer object.


Expand Down Expand Up @@ -500,6 +500,8 @@ Bug Fixes
- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)



- Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`)
- Bug in ``pd.read_msgpack()`` in which ``Series`` categoricals were being improperly processed (:issue:`14901`)
- Bug in ``Series.ffill()`` with mixed dtypes containing tz-aware datetimes. (:issue:`14956`)

Expand Down
3 changes: 1 addition & 2 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1250,7 +1250,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
indexer = np.arange(arr.shape[axis], dtype=np.int64)
dtype, fill_value = arr.dtype, arr.dtype.type()
else:
indexer = _ensure_int64(indexer)
indexer = _ensure_int64(indexer, copy=False)
if not allow_fill:
dtype, fill_value = arr.dtype, arr.dtype.type()
mask_info = None, False
Expand Down Expand Up @@ -1303,7 +1303,6 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,

func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis,
mask_info=mask_info)
indexer = _ensure_int64(indexer)
func(arr, indexer, out, fill_value)

if flip_order:
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1752,7 +1752,8 @@ def _sizeof_fmt(num, size_qualifier):
# all cases (e.g., it misses categorical data even with object
# categories)
deep = False
if 'object' in counts or is_object_dtype(self.index):
if ('object' in counts or
self.index._is_memory_usage_qualified()):
size_qualifier = '+'
mem_usage = self.memory_usage(index=True, deep=deep).sum()
lines.append("memory usage: %s\n" %
Expand Down
8 changes: 8 additions & 0 deletions pandas/hashtable.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,14 @@ cdef class PyObjectHashTable(HashTable):
cpdef get_item(self, object val)
cpdef set_item(self, object key, Py_ssize_t val)

cdef class MultiIndexHashTable(HashTable):
    # Hash table backing MultiIndex lookups. Holds a reference to the
    # MultiIndex (mi) plus a khash uint64 table — presumably mapping
    # hashed index rows to positions; confirm against the .pyx
    # implementation.
    cdef:
        kh_uint64_t *table
        object mi

    cpdef get_item(self, object val)
    cpdef set_item(self, object key, Py_ssize_t val)

cdef class StringHashTable(HashTable):
cdef kh_str_t *table

Expand Down
39 changes: 36 additions & 3 deletions pandas/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ cdef class IndexEngine:
Py_ssize_t i, n
int last_true

values = self._get_index_values()
values = np.array(self._get_index_values(), copy=False)
n = len(values)

result = np.empty(n, dtype=bool)
Expand Down Expand Up @@ -284,7 +284,6 @@ cdef class IndexEngine:
if not self.is_mapping_populated:

values = self._get_index_values()

self.mapping = self._make_hash_table(len(values))
self.mapping.map_locations(values)

Expand Down Expand Up @@ -322,7 +321,7 @@ cdef class IndexEngine:
Py_ssize_t i, j, n, n_t, n_alloc

self._ensure_mapping_populated()
values = self._get_index_values()
values = np.array(self._get_index_values(), copy=False)
stargets = set(targets)
n = len(values)
n_t = len(targets)
Expand Down Expand Up @@ -554,5 +553,39 @@ cdef inline bint _is_utc(object tz):
return tz is UTC or isinstance(tz, _du_utc)


cdef class MultiIndexEngine(IndexEngine):
    # IndexEngine specialization for MultiIndex. Monotonicity and
    # uniqueness checks are delegated to the MultiIndex object, and the
    # hash table is the MultiIndex-specific one (see hashtable.pxd).

    def _call_monotonic(self, object mi):
        # defer these back to the mi itself
        return (mi.is_monotonic_increasing,
                mi.is_monotonic_decreasing,
                mi.is_unique)

    def get_backfill_indexer(self, other, limit=None):
        # we coerce to ndarray-of-tuples
        values = np.array(self._get_index_values())
        return algos.backfill_object(values, other, limit=limit)

    def get_pad_indexer(self, other, limit=None):
        # we coerce to ndarray-of-tuples
        values = np.array(self._get_index_values())
        return algos.pad_object(values, other, limit=limit)

    cpdef get_loc(self, object val):
        # Reject keys that can never match (per is_definitely_invalid_key)
        # before touching the mapping.
        if is_definitely_invalid_key(val):
            raise TypeError("'{val}' is an invalid key".format(val=val))

        self._ensure_mapping_populated()
        if not self.unique:
            return self._get_loc_duplicates(val)

        try:
            return self.mapping.get_item(val)
        except TypeError:
            # e.g. an unhashable or mistyped key — surface as KeyError
            # to match Index lookup semantics.
            raise KeyError(val)

    cdef _make_hash_table(self, n):
        # Use the MultiIndex-specific hash table rather than the generic
        # PyObjectHashTable.
        return _hash.MultiIndexHashTable(n)

# Generated from template.
include "index_class_helper.pxi"
5 changes: 4 additions & 1 deletion pandas/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1431,6 +1431,10 @@ def inferred_type(self):
""" return a string of the type inferred from the values """
return lib.infer_dtype(self)

def _is_memory_usage_qualified(self):
    """ return a boolean if we need a qualified .info display """
    # Object-dtype indexes can hold arbitrary Python objects, so the
    # reported memory usage is only a lower bound; callers (DataFrame
    # .info) append a '+' qualifier when this returns True.
    return self.is_object()

def is_type_compatible(self, kind):
return kind == self.inferred_type

Expand Down Expand Up @@ -2446,7 +2450,6 @@ def _get_fill_indexer_searchsorted(self, target, method, limit=None):
'if index and target are monotonic' % method)

side = 'left' if method == 'pad' else 'right'
target = np.asarray(target)

# find exact matches first (this simplifies the algorithm)
indexer = self.get_indexer(target)
Expand Down
Loading