diff --git a/doc/source/whatsnew/v0.15.1.txt b/doc/source/whatsnew/v0.15.1.txt index c666a19bcd133..d57f4c7e2a9d3 100644 --- a/doc/source/whatsnew/v0.15.1.txt +++ b/doc/source/whatsnew/v0.15.1.txt @@ -20,6 +20,30 @@ users upgrade to this version. API changes ~~~~~~~~~~~ +- Represent ``MultiIndex`` labels with a dtype that utilizes memory based on the level size. In prior versions, the memory usage was a constant 8 bytes per element in each level. In addition, in prior versions, the *reported* memory usage was incorrect as it didn't show the usage for the memory occupied by the underlying data array. (:issue:`8456`) + + .. ipython:: python + + dfi = DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']) + + previous behavior: + + .. code-block:: python + + # this was underreported and actually took (in < 0.15.1) about 24008 bytes + In [1]: dfi.memory_usage(index=True) + Out[1]: + Index 8000 + A 8000 + dtype: int64 + + + current behavior: + + .. ipython:: python + + dfi.memory_usage(index=True) + - ``groupby`` with ``as_index=False`` will not add erroneous extra columns to result (:issue:`8582`): diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 8c4f45fdeb57a..364a3fa13801b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -166,8 +166,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1): elif is_timedelta: uniques = uniques.astype('m8[ns]') if isinstance(values, Index): - uniques = values._simple_new(uniques, None, freq=getattr(values, 'freq', None), - tz=getattr(values, 'tz', None)) + uniques = values._shallow_copy(uniques, name=None) elif isinstance(values, Series): uniques = Index(uniques) return labels, uniques diff --git a/pandas/core/base.py b/pandas/core/base.py index 71a08e0dd553d..fba83be6fcadf 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -232,7 +232,6 @@ def __repr__(self): __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled pop = append = 
extend = remove = sort = insert = _disabled - class FrozenNDArray(PandasObject, np.ndarray): # no __array_finalize__ for now because no metadata @@ -540,4 +539,3 @@ def duplicated(self, take_last=False): def _update_inplace(self, result, **kwargs): raise NotImplementedError - diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index c7cc065a965a0..00128bd977911 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -196,7 +196,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa if fastpath: # fast path - self._codes = _coerce_codes_dtype(values, categories) + self._codes = com._coerce_indexer_dtype(values, categories) self.name = name self.categories = categories self.ordered = ordered @@ -289,7 +289,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa self.ordered = False if ordered is None else ordered self.categories = categories self.name = name - self._codes = _coerce_codes_dtype(codes, categories) + self._codes = com._coerce_indexer_dtype(codes, categories) def copy(self): """ Copy constructor. 
""" @@ -609,7 +609,7 @@ def add_categories(self, new_categories, inplace=False): new_categories = self._validate_categories(new_categories) cat = self if inplace else self.copy() cat._categories = new_categories - cat._codes = _coerce_codes_dtype(cat._codes, new_categories) + cat._codes = com._coerce_indexer_dtype(cat._codes, new_categories) if not inplace: return cat @@ -1422,22 +1422,6 @@ def _delegate_method(self, name, *args, **kwargs): ##### utility routines ##### -_int8_max = np.iinfo(np.int8).max -_int16_max = np.iinfo(np.int16).max -_int32_max = np.iinfo(np.int32).max - -def _coerce_codes_dtype(codes, categories): - """ coerce the code input array to an appropriate dtype """ - codes = np.array(codes,copy=False) - l = len(categories) - if l < _int8_max: - return codes.astype('int8') - elif l < _int16_max: - return codes.astype('int16') - elif l < _int32_max: - return codes.astype('int32') - return codes.astype('int64') - def _get_codes_for_values(values, categories): """" utility routine to turn values into codes given the specified categories @@ -1450,7 +1434,7 @@ def _get_codes_for_values(values, categories): (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables) t = hash_klass(len(categories)) t.map_locations(com._values_from_object(categories)) - return _coerce_codes_dtype(t.lookup(values), categories) + return com._coerce_indexer_dtype(t.lookup(values), categories) def _convert_to_list_like(list_like): if hasattr(list_like, "dtype"): diff --git a/pandas/core/common.py b/pandas/core/common.py index 2839b54b7d71a..51464e1809e75 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -49,7 +49,9 @@ class AmbiguousIndexError(PandasError, KeyError): _INT64_DTYPE = np.dtype(np.int64) _DATELIKE_DTYPES = set([np.dtype(t) for t in ['M8[ns]', 'M8[ns]', 'm8[ns]', 'm8[ns]']]) - +_int8_max = np.iinfo(np.int8).max +_int16_max = np.iinfo(np.int16).max +_int32_max = np.iinfo(np.int32).max # define abstract base classes to enable isinstance type 
checking on our # objects @@ -723,6 +725,7 @@ def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis=0, mask_info=None): return func def func(arr, indexer, out, fill_value=np.nan): + indexer = _ensure_int64(indexer) _take_nd_generic(arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info) return func @@ -815,6 +818,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info) + indexer = _ensure_int64(indexer) func(arr, indexer, out, fill_value) if flip_order: @@ -961,6 +965,16 @@ def diff(arr, n, axis=0): return out_arr +def _coerce_indexer_dtype(indexer, categories): + """ coerce the indexer input array to the smallest dtype possible """ + l = len(categories) + if l < _int8_max: + return _ensure_int8(indexer) + elif l < _int16_max: + return _ensure_int16(indexer) + elif l < _int32_max: + return _ensure_int32(indexer) + return _ensure_int64(indexer) def _coerce_to_dtypes(result, dtypes): """ given a dtypes and a result set, coerce the result elements to the diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 182d9c4c2620b..f998a01a1a165 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1564,7 +1564,7 @@ def memory_usage(self, index=False): result = Series([ c.values.nbytes for col, c in self.iteritems() ], index=self.columns) if index: - result = Series(self.index.values.nbytes, + result = Series(self.index.nbytes, index=['Index']).append(result) return result diff --git a/pandas/core/index.py b/pandas/core/index.py index d56354833012a..4f5a0c1f212c2 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -7,6 +7,7 @@ from pandas import compat import numpy as np +from sys import getsizeof import pandas.tslib as tslib import pandas.lib as lib import pandas.algos as _algos @@ -17,7 +18,7 @@ from pandas.core.common import isnull, array_equivalent import pandas.core.common as com from pandas.core.common import 
(_values_from_object, is_float, is_integer, - ABCSeries, _ensure_object) + ABCSeries, _ensure_object, _ensure_int64) from pandas.core.config import get_option # simplify @@ -2680,13 +2681,13 @@ def _set_labels(self, labels, level=None, copy=False, validate=True, raise ValueError('Length of labels must match length of levels.') if level is None: - new_labels = FrozenList(_ensure_frozen(v, copy=copy)._shallow_copy() - for v in labels) + new_labels = FrozenList(_ensure_frozen(lab, lev, copy=copy)._shallow_copy() + for lev, lab in zip(self.levels, labels)) else: level = [self._get_level_number(l) for l in level] new_labels = list(self._labels) - for l, v in zip(level, labels): - new_labels[l] = _ensure_frozen(v, copy=copy)._shallow_copy() + for l, lev, lab in zip(level, self.levels, labels): + new_labels[l] = _ensure_frozen(lab, lev, copy=copy)._shallow_copy() new_labels = FrozenList(new_labels) self._labels = new_labels @@ -2824,6 +2825,14 @@ def _array_values(self): def dtype(self): return np.dtype('O') + @cache_readonly + def nbytes(self): + """ return the number of bytes in the underlying data """ + level_nbytes = sum(( i.nbytes for i in self.levels )) + label_nbytes = sum(( i.nbytes for i in self.labels )) + names_nbytes = sum(( getsizeof(i) for i in self.names )) + return level_nbytes + label_nbytes + names_nbytes + def __repr__(self): encoding = get_option('display.encoding') attrs = [('levels', default_pprint(self.levels)), @@ -4361,7 +4370,7 @@ def insert(self, loc, item): lev_loc = level.get_loc(k) new_levels.append(level) - new_labels.append(np.insert(labels, loc, lev_loc)) + new_labels.append(np.insert(_ensure_int64(labels), loc, lev_loc)) return MultiIndex(levels=new_levels, labels=new_labels, names=self.names, verify_integrity=False) @@ -4474,8 +4483,8 @@ def _ensure_index(index_like, copy=False): return Index(index_like) -def _ensure_frozen(array_like, copy=False): - array_like = np.asanyarray(array_like, dtype=np.int_) +def _ensure_frozen(array_like, 
categories, copy=False): + array_like = com._coerce_indexer_dtype(array_like, categories) array_like = array_like.view(FrozenNDArray) if copy: array_like = array_like.copy() diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index fb9124bf19958..a19d9d651a656 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -6766,6 +6766,15 @@ def test_info_memory_usage(self): size_df = np.size(df.columns.values) # index=False; default self.assertEqual(size_df, np.size(df.memory_usage())) + # test for validity + DataFrame(1,index=['a'],columns=['A']).memory_usage(index=True) + DataFrame(1,index=['a'],columns=['A']).index.nbytes + DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.nbytes + DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.values.nbytes + DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).memory_usage(index=True) + DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.nbytes + DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.values.nbytes + def test_dtypes(self): self.mixed_frame['bool'] = self.mixed_frame['A'] > 0 result = self.mixed_frame.dtypes diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 75af8f4e26302..fe92cd55f1573 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -37,6 +37,7 @@ class Base(object): """ base class for index sub-class tests """ _holder = None + _compat_props = ['shape', 'ndim', 'size', 'itemsize', 'nbytes'] def verify_pickle(self,index): unpickled = self.round_trip_pickle(index) @@ -90,9 +91,12 @@ def test_ndarray_compat_properties(self): self.assertTrue(idx.transpose().equals(idx)) values = idx.values - for prop in ['shape', 'ndim', 'size', 'itemsize', 'nbytes']: + for prop in self._compat_props: self.assertEqual(getattr(idx, prop), getattr(values, prop)) + # test for validity 
+ idx.nbytes + idx.values.nbytes class TestIndex(Base, tm.TestCase): _holder = Index @@ -1837,6 +1841,7 @@ def test_pickle_compat_construction(self): class TestMultiIndex(Base, tm.TestCase): _holder = MultiIndex _multiprocess_can_split_ = True + _compat_props = ['shape', 'ndim', 'size', 'itemsize'] def setUp(self): major_axis = Index(['foo', 'bar', 'baz', 'qux']) @@ -1865,6 +1870,24 @@ def f(): pass tm.assertRaisesRegexp(ValueError,'The truth value of a',f) + def test_labels_dtypes(self): + + # GH 8456 + i = MultiIndex.from_tuples([('A', 1), ('A', 2)]) + self.assertTrue(i.labels[0].dtype == 'int8') + self.assertTrue(i.labels[1].dtype == 'int8') + + i = MultiIndex.from_product([['a'],range(40)]) + self.assertTrue(i.labels[1].dtype == 'int8') + i = MultiIndex.from_product([['a'],range(400)]) + self.assertTrue(i.labels[1].dtype == 'int16') + i = MultiIndex.from_product([['a'],range(40000)]) + self.assertTrue(i.labels[1].dtype == 'int32') + + i = pd.MultiIndex.from_product([['a'],range(1000)]) + self.assertTrue((i.labels[0]>=0).all()) + self.assertTrue((i.labels[1]>=0).all()) + def test_hash_error(self): with tm.assertRaisesRegexp(TypeError, "unhashable type: %r" %