PERF: set multiindex labels with a coerced dtype (GH8456) #8676


Merged 2 commits on Oct 30, 2014
24 changes: 24 additions & 0 deletions doc/source/whatsnew/v0.15.1.txt
@@ -20,6 +20,30 @@ users upgrade to this version.
API changes
~~~~~~~~~~~

- Represent ``MultiIndex`` labels with a dtype sized to the level: small levels now store their labels in narrower integer types instead of a constant 8 bytes per element. In addition, in prior versions the *reported* memory usage was incorrect, as it did not include the memory occupied by the underlying data arrays. (:issue:`8456`)

.. ipython:: python

dfi = DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A'])

previous behavior:

.. code-block:: python

# this was underreported and actually took (in < 0.15.1) about 24008 bytes
In [1]: dfi.memory_usage(index=True)
Out[1]:
Index 8000
A 8000
dtype: int64


current behavior:

.. ipython:: python

dfi.memory_usage(index=True)

- ``groupby`` with ``as_index=False`` will not add erroneous extra columns to
result (:issue:`8582`):

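To make the change concrete: each level's label array is now stored with the smallest sufficient integer dtype. A short illustrative snippet, written against this era of pandas (where the label arrays are exposed as MultiIndex.labels); the dtypes follow the coercion rule added to pandas/core/common.py below:

    import pandas as pd

    # Small levels now get narrow integer label arrays instead of a fixed 8 bytes/element.
    small = pd.MultiIndex.from_product([['a'], range(40)])
    large = pd.MultiIndex.from_product([['a'], range(40000)])
    print(small.labels[1].dtype)  # int8  -- fewer than np.iinfo('int8').max entries
    print(large.labels[1].dtype)  # int32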
3 changes: 1 addition & 2 deletions pandas/core/algorithms.py
@@ -166,8 +166,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
elif is_timedelta:
uniques = uniques.astype('m8[ns]')
if isinstance(values, Index):
uniques = values._simple_new(uniques, None, freq=getattr(values, 'freq', None),
tz=getattr(values, 'tz', None))
uniques = values._shallow_copy(uniques, name=None)
elif isinstance(values, Series):
uniques = Index(uniques)
return labels, uniques
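For context on the algorithms.py change: when factorize is handed an Index, the uniques are now rebuilt via _shallow_copy rather than _simple_new, so metadata such as freq and tz is carried over uniformly. A small usage sketch; the exact reprs are illustrative for this era of pandas:

    import pandas as pd

    idx = pd.Index(['b', 'a', 'b'])
    labels, uniques = pd.factorize(idx)
    print(labels)   # array([0, 1, 0])  -- integer codes in order of first appearance
    print(uniques)  # Index([u'b', u'a'], dtype='object')  -- still an Index, not a bare ndarray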
2 changes: 0 additions & 2 deletions pandas/core/base.py
@@ -232,7 +232,6 @@ def __repr__(self):
__setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled
pop = append = extend = remove = sort = insert = _disabled


class FrozenNDArray(PandasObject, np.ndarray):

# no __array_finalize__ for now because no metadata
@@ -540,4 +539,3 @@ def duplicated(self, take_last=False):

def _update_inplace(self, result, **kwargs):
raise NotImplementedError

24 changes: 4 additions & 20 deletions pandas/core/categorical.py
@@ -196,7 +196,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa

if fastpath:
# fast path
self._codes = _coerce_codes_dtype(values, categories)
self._codes = com._coerce_indexer_dtype(values, categories)
self.name = name
self.categories = categories
self.ordered = ordered
@@ -289,7 +289,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
self.ordered = False if ordered is None else ordered
self.categories = categories
self.name = name
self._codes = _coerce_codes_dtype(codes, categories)
self._codes = com._coerce_indexer_dtype(codes, categories)

def copy(self):
""" Copy constructor. """
@@ -609,7 +609,7 @@ def add_categories(self, new_categories, inplace=False):
new_categories = self._validate_categories(new_categories)
cat = self if inplace else self.copy()
cat._categories = new_categories
cat._codes = _coerce_codes_dtype(cat._codes, new_categories)
cat._codes = com._coerce_indexer_dtype(cat._codes, new_categories)
if not inplace:
return cat

@@ -1422,22 +1422,6 @@ def _delegate_method(self, name, *args, **kwargs):

##### utility routines #####

_int8_max = np.iinfo(np.int8).max
_int16_max = np.iinfo(np.int16).max
_int32_max = np.iinfo(np.int32).max

def _coerce_codes_dtype(codes, categories):
""" coerce the code input array to an appropriate dtype """
codes = np.array(codes,copy=False)
l = len(categories)
if l < _int8_max:
return codes.astype('int8')
elif l < _int16_max:
return codes.astype('int16')
elif l < _int32_max:
return codes.astype('int32')
return codes.astype('int64')

def _get_codes_for_values(values, categories):
""""
utility routine to turn values into codes given the specified categories
@@ -1450,7 +1434,7 @@ def _get_codes_for_values(values, categories):
(hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
t = hash_klass(len(categories))
t.map_locations(com._values_from_object(categories))
return _coerce_codes_dtype(t.lookup(values), categories)
return com._coerce_indexer_dtype(t.lookup(values), categories)

def _convert_to_list_like(list_like):
if hasattr(list_like, "dtype"):
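The categorical.py change is a pure refactor: the private _coerce_codes_dtype helper moves to pandas/core/common.py as com._coerce_indexer_dtype so MultiIndex can share it. The observable behavior is unchanged, sketched here (Categorical.codes is the public accessor in 0.15.x):

    import pandas as pd

    c = pd.Categorical(['a', 'b', 'a', 'c'])
    print(c.codes)        # array([0, 1, 0, 2], dtype=int8)
    print(c.codes.dtype)  # int8 -- three categories fit well under np.iinfo('int8').max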
16 changes: 15 additions & 1 deletion pandas/core/common.py
@@ -49,7 +49,9 @@ class AmbiguousIndexError(PandasError, KeyError):
_INT64_DTYPE = np.dtype(np.int64)
_DATELIKE_DTYPES = set([np.dtype(t) for t in ['M8[ns]', '<M8[ns]', '>M8[ns]',
'm8[ns]', '<m8[ns]', '>m8[ns]']])

_int8_max = np.iinfo(np.int8).max
_int16_max = np.iinfo(np.int16).max
_int32_max = np.iinfo(np.int32).max

# define abstract base classes to enable isinstance type checking on our
# objects
@@ -723,6 +725,7 @@ def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis=0, mask_info=None):
return func

def func(arr, indexer, out, fill_value=np.nan):
indexer = _ensure_int64(indexer)
_take_nd_generic(arr, indexer, out, axis=axis,
fill_value=fill_value, mask_info=mask_info)
return func
@@ -815,6 +818,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan,
func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype,
axis=axis, mask_info=mask_info)

indexer = _ensure_int64(indexer)
func(arr, indexer, out, fill_value)

if flip_order:
@@ -961,6 +965,16 @@ def diff(arr, n, axis=0):

return out_arr

def _coerce_indexer_dtype(indexer, categories):
""" coerce the indexer input array to the smallest dtype possible """
l = len(categories)
if l < _int8_max:
return _ensure_int8(indexer)
elif l < _int16_max:
return _ensure_int16(indexer)
elif l < _int32_max:
return _ensure_int32(indexer)
return _ensure_int64(indexer)

def _coerce_to_dtypes(result, dtypes):
""" given a dtypes and a result set, coerce the result elements to the
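The selection rule in _coerce_indexer_dtype reads cleanly in isolation; here is a minimal standalone sketch (hypothetical name coerce_indexer_dtype, taking a category count instead of the categories object, and plain astype in place of the _ensure_* helpers):

    import numpy as np

    _int8_max = np.iinfo(np.int8).max    # 127
    _int16_max = np.iinfo(np.int16).max  # 32767
    _int32_max = np.iinfo(np.int32).max  # 2147483647

    def coerce_indexer_dtype(indexer, n_categories):
        # pick the narrowest integer dtype that can index n_categories values
        indexer = np.asarray(indexer)
        if n_categories < _int8_max:
            return indexer.astype('int8')
        elif n_categories < _int16_max:
            return indexer.astype('int16')
        elif n_categories < _int32_max:
            return indexer.astype('int32')
        return indexer.astype('int64')

    print(coerce_indexer_dtype([0, 1, 2], 40).dtype)     # int8
    print(coerce_indexer_dtype([0, 1, 2], 400).dtype)    # int16
    print(coerce_indexer_dtype([0, 1, 2], 40000).dtype)  # int32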
2 changes: 1 addition & 1 deletion pandas/core/frame.py
@@ -1564,7 +1564,7 @@ def memory_usage(self, index=False):
result = Series([ c.values.nbytes for col, c in self.iteritems() ],
index=self.columns)
if index:
result = Series(self.index.values.nbytes,
result = Series(self.index.nbytes,
index=['Index']).append(result)
return result

25 changes: 17 additions & 8 deletions pandas/core/index.py
@@ -7,6 +7,7 @@
from pandas import compat
import numpy as np

from sys import getsizeof
import pandas.tslib as tslib
import pandas.lib as lib
import pandas.algos as _algos
@@ -17,7 +18,7 @@
from pandas.core.common import isnull, array_equivalent
import pandas.core.common as com
from pandas.core.common import (_values_from_object, is_float, is_integer,
ABCSeries, _ensure_object)
ABCSeries, _ensure_object, _ensure_int64)
from pandas.core.config import get_option

# simplify
@@ -2680,13 +2681,13 @@ def _set_labels(self, labels, level=None, copy=False, validate=True,
raise ValueError('Length of labels must match length of levels.')

if level is None:
new_labels = FrozenList(_ensure_frozen(v, copy=copy)._shallow_copy()
for v in labels)
new_labels = FrozenList(_ensure_frozen(lab, lev, copy=copy)._shallow_copy()
for lev, lab in zip(self.levels, labels))
else:
level = [self._get_level_number(l) for l in level]
new_labels = list(self._labels)
for l, v in zip(level, labels):
new_labels[l] = _ensure_frozen(v, copy=copy)._shallow_copy()
for l, lev, lab in zip(level, self.levels, labels):
new_labels[l] = _ensure_frozen(lab, lev, copy=copy)._shallow_copy()
new_labels = FrozenList(new_labels)

self._labels = new_labels
@@ -2824,6 +2825,14 @@ def _array_values(self):
def dtype(self):
return np.dtype('O')

@cache_readonly
def nbytes(self):
""" return the number of bytes in the underlying data """
level_nbytes = sum(( i.nbytes for i in self.levels ))
Member: note: nested ( is also unnecessary here, but harmless :).

Contributor Author: this IS necessary, each level is an array.

Member: Test it out: sum(x for x in [1, 2, 3])

sum(i.nbytes for i in self.levels) is implicitly a generator expression: http://legacy.python.org/dev/peps/pep-0289/

If you have a single argument to a function, the parentheses around a generator expression are optional.

Contributor Author: That would be true, but each element is a level, which is an ndarray itself:

In [1]: i = pd.MultiIndex.from_product([list('ab'),range(3)])

In [2]: i.levels
Out[2]: FrozenList([[u'a', u'b'], [0, 1, 2]])

In [4]: sum([ x.nbytes for x in i.levels ])
Out[4]: 40

In [7]: i.levels[0]
Out[7]: Index([u'a', u'b'], dtype='object')

Member: not sure I understand your point from this example? I get the same thing without the nested brackets:

In [6]: sum(x.nbytes for x in i.levels)
Out[6]: 40

This is really a matter of Python syntax, which is independent of the nature of the arguments.

Contributor Author: I see, yeah... I have always used the [ ]; it doesn't make a difference in this case.

label_nbytes = sum(( i.nbytes for i in self.labels ))
names_nbytes = sum(( getsizeof(i) for i in self.names ))
return level_nbytes + label_nbytes + names_nbytes

def __repr__(self):
encoding = get_option('display.encoding')
attrs = [('levels', default_pprint(self.levels)),
@@ -4361,7 +4370,7 @@ def insert(self, loc, item):
lev_loc = level.get_loc(k)

new_levels.append(level)
new_labels.append(np.insert(labels, loc, lev_loc))
new_labels.append(np.insert(_ensure_int64(labels), loc, lev_loc))

return MultiIndex(levels=new_levels, labels=new_labels,
names=self.names, verify_integrity=False)
@@ -4474,8 +4483,8 @@ def _ensure_index(index_like, copy=False):
return Index(index_like)


def _ensure_frozen(array_like, copy=False):
array_like = np.asanyarray(array_like, dtype=np.int_)
def _ensure_frozen(array_like, categories, copy=False):
array_like = com._coerce_indexer_dtype(array_like, categories)
array_like = array_like.view(FrozenNDArray)
if copy:
array_like = array_like.copy()
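The new MultiIndex.nbytes sums the three underlying stores (level values, coerced label arrays, and names) rather than materializing .values. The same accounting can be reproduced from user code on this branch (levels/labels being the era's accessors):

    from sys import getsizeof
    import pandas as pd

    i = pd.MultiIndex.from_product([['a'], range(1000)])
    level_nbytes = sum(lev.nbytes for lev in i.levels)  # unique values per level
    label_nbytes = sum(lab.nbytes for lab in i.labels)  # narrow integer label arrays
    names_nbytes = sum(getsizeof(n) for n in i.names)   # Python-level name objects
    print(level_nbytes + label_nbytes + names_nbytes == i.nbytes)  # True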
9 changes: 9 additions & 0 deletions pandas/tests/test_frame.py
@@ -6766,6 +6766,15 @@ def test_info_memory_usage(self):
size_df = np.size(df.columns.values) # index=False; default
self.assertEqual(size_df, np.size(df.memory_usage()))

# test for validity
DataFrame(1,index=['a'],columns=['A']).memory_usage(index=True)
DataFrame(1,index=['a'],columns=['A']).index.nbytes
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.nbytes
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.values.nbytes
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).memory_usage(index=True)
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.nbytes
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.values.nbytes

def test_dtypes(self):
self.mixed_frame['bool'] = self.mixed_frame['A'] > 0
result = self.mixed_frame.dtypes
25 changes: 24 additions & 1 deletion pandas/tests/test_index.py
@@ -37,6 +37,7 @@
class Base(object):
""" base class for index sub-class tests """
_holder = None
_compat_props = ['shape', 'ndim', 'size', 'itemsize', 'nbytes']

def verify_pickle(self,index):
unpickled = self.round_trip_pickle(index)
@@ -90,9 +91,12 @@ def test_ndarray_compat_properties(self):
self.assertTrue(idx.transpose().equals(idx))

values = idx.values
for prop in ['shape', 'ndim', 'size', 'itemsize', 'nbytes']:
for prop in self._compat_props:
self.assertEqual(getattr(idx, prop), getattr(values, prop))

# test for validity
idx.nbytes
idx.values.nbytes

class TestIndex(Base, tm.TestCase):
_holder = Index
@@ -1837,6 +1841,7 @@ def test_pickle_compat_construction(self):
class TestMultiIndex(Base, tm.TestCase):
_holder = MultiIndex
_multiprocess_can_split_ = True
_compat_props = ['shape', 'ndim', 'size', 'itemsize']

def setUp(self):
major_axis = Index(['foo', 'bar', 'baz', 'qux'])
@@ -1865,6 +1870,24 @@ def f():
pass
tm.assertRaisesRegexp(ValueError,'The truth value of a',f)

def test_labels_dtypes(self):

# GH 8456
i = MultiIndex.from_tuples([('A', 1), ('A', 2)])
self.assertTrue(i.labels[0].dtype == 'int8')
self.assertTrue(i.labels[1].dtype == 'int8')

i = MultiIndex.from_product([['a'],range(40)])
self.assertTrue(i.labels[1].dtype == 'int8')
i = MultiIndex.from_product([['a'],range(400)])
self.assertTrue(i.labels[1].dtype == 'int16')
i = MultiIndex.from_product([['a'],range(40000)])
self.assertTrue(i.labels[1].dtype == 'int32')

i = pd.MultiIndex.from_product([['a'],range(1000)])
self.assertTrue((i.labels[0]>=0).all())
self.assertTrue((i.labels[1]>=0).all())

def test_hash_error(self):
with tm.assertRaisesRegexp(TypeError,
"unhashable type: %r" %