Skip to content

Commit 5d22bd1

Browse files
committed
Merge pull request #8676 from jreback/algos
PERF: set multiindex labels with a coerced dtype (GH8456)
2 parents 5cf3d85 + c11e75c commit 5d22bd1

File tree

9 files changed

+95
-35
lines changed

9 files changed

+95
-35
lines changed

doc/source/whatsnew/v0.15.1.txt

+24
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,30 @@ users upgrade to this version.
2020
API changes
2121
~~~~~~~~~~~
2222

23+
- Represent ``MultiIndex`` labels with a dtype whose size is chosen based on the size of the corresponding level, so that memory usage scales with the level size. In prior versions, the memory usage was a constant 8 bytes per element in each level. In addition, in prior versions, the *reported* memory usage was incorrect, as it didn't include the memory occupied by the underlying data array. (:issue:`8456`)
24+
25+
.. ipython:: python
26+
27+
dfi = DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A'])
28+
29+
previous behavior:
30+
31+
.. code-block:: python
32+
33+
# the reported value below was an underestimate; in versions < 0.15.1 the index actually took about 24008 bytes
34+
In [1]: dfi.memory_usage(index=True)
35+
Out[1]:
36+
Index 8000
37+
A 8000
38+
dtype: int64
39+
40+
41+
current behavior:
42+
43+
.. ipython:: python
44+
45+
dfi.memory_usage(index=True)
46+
2347
- ``groupby`` with ``as_index=False`` will not add erroneous extra columns to
2448
result (:issue:`8582`):
2549

pandas/core/algorithms.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -166,8 +166,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
166166
elif is_timedelta:
167167
uniques = uniques.astype('m8[ns]')
168168
if isinstance(values, Index):
169-
uniques = values._simple_new(uniques, None, freq=getattr(values, 'freq', None),
170-
tz=getattr(values, 'tz', None))
169+
uniques = values._shallow_copy(uniques, name=None)
171170
elif isinstance(values, Series):
172171
uniques = Index(uniques)
173172
return labels, uniques

pandas/core/base.py

-2
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,6 @@ def __repr__(self):
232232
__setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled
233233
pop = append = extend = remove = sort = insert = _disabled
234234

235-
236235
class FrozenNDArray(PandasObject, np.ndarray):
237236

238237
# no __array_finalize__ for now because no metadata
@@ -540,4 +539,3 @@ def duplicated(self, take_last=False):
540539

541540
def _update_inplace(self, result, **kwargs):
542541
raise NotImplementedError
543-

pandas/core/categorical.py

+4-20
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
196196

197197
if fastpath:
198198
# fast path
199-
self._codes = _coerce_codes_dtype(values, categories)
199+
self._codes = com._coerce_indexer_dtype(values, categories)
200200
self.name = name
201201
self.categories = categories
202202
self.ordered = ordered
@@ -289,7 +289,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
289289
self.ordered = False if ordered is None else ordered
290290
self.categories = categories
291291
self.name = name
292-
self._codes = _coerce_codes_dtype(codes, categories)
292+
self._codes = com._coerce_indexer_dtype(codes, categories)
293293

294294
def copy(self):
295295
""" Copy constructor. """
@@ -609,7 +609,7 @@ def add_categories(self, new_categories, inplace=False):
609609
new_categories = self._validate_categories(new_categories)
610610
cat = self if inplace else self.copy()
611611
cat._categories = new_categories
612-
cat._codes = _coerce_codes_dtype(cat._codes, new_categories)
612+
cat._codes = com._coerce_indexer_dtype(cat._codes, new_categories)
613613
if not inplace:
614614
return cat
615615

@@ -1422,22 +1422,6 @@ def _delegate_method(self, name, *args, **kwargs):
14221422

14231423
##### utility routines #####
14241424

1425-
_int8_max = np.iinfo(np.int8).max
1426-
_int16_max = np.iinfo(np.int16).max
1427-
_int32_max = np.iinfo(np.int32).max
1428-
1429-
def _coerce_codes_dtype(codes, categories):
1430-
""" coerce the code input array to an appropriate dtype """
1431-
codes = np.array(codes,copy=False)
1432-
l = len(categories)
1433-
if l < _int8_max:
1434-
return codes.astype('int8')
1435-
elif l < _int16_max:
1436-
return codes.astype('int16')
1437-
elif l < _int32_max:
1438-
return codes.astype('int32')
1439-
return codes.astype('int64')
1440-
14411425
def _get_codes_for_values(values, categories):
14421426
""""
14431427
utility routine to turn values into codes given the specified categories
@@ -1450,7 +1434,7 @@ def _get_codes_for_values(values, categories):
14501434
(hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
14511435
t = hash_klass(len(categories))
14521436
t.map_locations(com._values_from_object(categories))
1453-
return _coerce_codes_dtype(t.lookup(values), categories)
1437+
return com._coerce_indexer_dtype(t.lookup(values), categories)
14541438

14551439
def _convert_to_list_like(list_like):
14561440
if hasattr(list_like, "dtype"):

pandas/core/common.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,9 @@ class AmbiguousIndexError(PandasError, KeyError):
4949
_INT64_DTYPE = np.dtype(np.int64)
5050
_DATELIKE_DTYPES = set([np.dtype(t) for t in ['M8[ns]', '<M8[ns]', '>M8[ns]',
5151
'm8[ns]', '<m8[ns]', '>m8[ns]']])
52-
52+
_int8_max = np.iinfo(np.int8).max
53+
_int16_max = np.iinfo(np.int16).max
54+
_int32_max = np.iinfo(np.int32).max
5355

5456
# define abstract base classes to enable isinstance type checking on our
5557
# objects
@@ -723,6 +725,7 @@ def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis=0, mask_info=None):
723725
return func
724726

725727
def func(arr, indexer, out, fill_value=np.nan):
728+
indexer = _ensure_int64(indexer)
726729
_take_nd_generic(arr, indexer, out, axis=axis,
727730
fill_value=fill_value, mask_info=mask_info)
728731
return func
@@ -815,6 +818,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan,
815818
func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype,
816819
axis=axis, mask_info=mask_info)
817820

821+
indexer = _ensure_int64(indexer)
818822
func(arr, indexer, out, fill_value)
819823

820824
if flip_order:
@@ -961,6 +965,16 @@ def diff(arr, n, axis=0):
961965

962966
return out_arr
963967

968+
def _coerce_indexer_dtype(indexer, categories):
969+
""" coerce the indexer input array to the smallest dtype possible """
970+
l = len(categories)
971+
if l < _int8_max:
972+
return _ensure_int8(indexer)
973+
elif l < _int16_max:
974+
return _ensure_int16(indexer)
975+
elif l < _int32_max:
976+
return _ensure_int32(indexer)
977+
return _ensure_int64(indexer)
964978

965979
def _coerce_to_dtypes(result, dtypes):
966980
""" given a dtypes and a result set, coerce the result elements to the

pandas/core/frame.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1564,7 +1564,7 @@ def memory_usage(self, index=False):
15641564
result = Series([ c.values.nbytes for col, c in self.iteritems() ],
15651565
index=self.columns)
15661566
if index:
1567-
result = Series(self.index.values.nbytes,
1567+
result = Series(self.index.nbytes,
15681568
index=['Index']).append(result)
15691569
return result
15701570

pandas/core/index.py

+17-8
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from pandas import compat
88
import numpy as np
99

10+
from sys import getsizeof
1011
import pandas.tslib as tslib
1112
import pandas.lib as lib
1213
import pandas.algos as _algos
@@ -17,7 +18,7 @@
1718
from pandas.core.common import isnull, array_equivalent
1819
import pandas.core.common as com
1920
from pandas.core.common import (_values_from_object, is_float, is_integer,
20-
ABCSeries, _ensure_object)
21+
ABCSeries, _ensure_object, _ensure_int64)
2122
from pandas.core.config import get_option
2223

2324
# simplify
@@ -2680,13 +2681,13 @@ def _set_labels(self, labels, level=None, copy=False, validate=True,
26802681
raise ValueError('Length of labels must match length of levels.')
26812682

26822683
if level is None:
2683-
new_labels = FrozenList(_ensure_frozen(v, copy=copy)._shallow_copy()
2684-
for v in labels)
2684+
new_labels = FrozenList(_ensure_frozen(lab, lev, copy=copy)._shallow_copy()
2685+
for lev, lab in zip(self.levels, labels))
26852686
else:
26862687
level = [self._get_level_number(l) for l in level]
26872688
new_labels = list(self._labels)
2688-
for l, v in zip(level, labels):
2689-
new_labels[l] = _ensure_frozen(v, copy=copy)._shallow_copy()
2689+
for l, lev, lab in zip(level, self.levels, labels):
2690+
new_labels[l] = _ensure_frozen(lab, lev, copy=copy)._shallow_copy()
26902691
new_labels = FrozenList(new_labels)
26912692

26922693
self._labels = new_labels
@@ -2824,6 +2825,14 @@ def _array_values(self):
28242825
def dtype(self):
28252826
return np.dtype('O')
28262827

2828+
@cache_readonly
2829+
def nbytes(self):
2830+
""" return the number of bytes in the underlying data """
2831+
level_nbytes = sum(( i.nbytes for i in self.levels ))
2832+
label_nbytes = sum(( i.nbytes for i in self.labels ))
2833+
names_nbytes = sum(( getsizeof(i) for i in self.names ))
2834+
return level_nbytes + label_nbytes + names_nbytes
2835+
28272836
def __repr__(self):
28282837
encoding = get_option('display.encoding')
28292838
attrs = [('levels', default_pprint(self.levels)),
@@ -4361,7 +4370,7 @@ def insert(self, loc, item):
43614370
lev_loc = level.get_loc(k)
43624371

43634372
new_levels.append(level)
4364-
new_labels.append(np.insert(labels, loc, lev_loc))
4373+
new_labels.append(np.insert(_ensure_int64(labels), loc, lev_loc))
43654374

43664375
return MultiIndex(levels=new_levels, labels=new_labels,
43674376
names=self.names, verify_integrity=False)
@@ -4474,8 +4483,8 @@ def _ensure_index(index_like, copy=False):
44744483
return Index(index_like)
44754484

44764485

4477-
def _ensure_frozen(array_like, copy=False):
4478-
array_like = np.asanyarray(array_like, dtype=np.int_)
4486+
def _ensure_frozen(array_like, categories, copy=False):
4487+
array_like = com._coerce_indexer_dtype(array_like, categories)
44794488
array_like = array_like.view(FrozenNDArray)
44804489
if copy:
44814490
array_like = array_like.copy()

pandas/tests/test_frame.py

+9
Original file line numberDiff line numberDiff line change
@@ -6766,6 +6766,15 @@ def test_info_memory_usage(self):
67666766
size_df = np.size(df.columns.values) # index=False; default
67676767
self.assertEqual(size_df, np.size(df.memory_usage()))
67686768

6769+
# test for validity
6770+
DataFrame(1,index=['a'],columns=['A']).memory_usage(index=True)
6771+
DataFrame(1,index=['a'],columns=['A']).index.nbytes
6772+
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.nbytes
6773+
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.values.nbytes
6774+
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).memory_usage(index=True)
6775+
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.nbytes
6776+
DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.values.nbytes
6777+
67696778
def test_dtypes(self):
67706779
self.mixed_frame['bool'] = self.mixed_frame['A'] > 0
67716780
result = self.mixed_frame.dtypes

pandas/tests/test_index.py

+24-1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
class Base(object):
3838
""" base class for index sub-class tests """
3939
_holder = None
40+
_compat_props = ['shape', 'ndim', 'size', 'itemsize', 'nbytes']
4041

4142
def verify_pickle(self,index):
4243
unpickled = self.round_trip_pickle(index)
@@ -90,9 +91,12 @@ def test_ndarray_compat_properties(self):
9091
self.assertTrue(idx.transpose().equals(idx))
9192

9293
values = idx.values
93-
for prop in ['shape', 'ndim', 'size', 'itemsize', 'nbytes']:
94+
for prop in self._compat_props:
9495
self.assertEqual(getattr(idx, prop), getattr(values, prop))
9596

97+
# test for validity
98+
idx.nbytes
99+
idx.values.nbytes
96100

97101
class TestIndex(Base, tm.TestCase):
98102
_holder = Index
@@ -1837,6 +1841,7 @@ def test_pickle_compat_construction(self):
18371841
class TestMultiIndex(Base, tm.TestCase):
18381842
_holder = MultiIndex
18391843
_multiprocess_can_split_ = True
1844+
_compat_props = ['shape', 'ndim', 'size', 'itemsize']
18401845

18411846
def setUp(self):
18421847
major_axis = Index(['foo', 'bar', 'baz', 'qux'])
@@ -1865,6 +1870,24 @@ def f():
18651870
pass
18661871
tm.assertRaisesRegexp(ValueError,'The truth value of a',f)
18671872

1873+
def test_labels_dtypes(self):
1874+
1875+
# GH 8456
1876+
i = MultiIndex.from_tuples([('A', 1), ('A', 2)])
1877+
self.assertTrue(i.labels[0].dtype == 'int8')
1878+
self.assertTrue(i.labels[1].dtype == 'int8')
1879+
1880+
i = MultiIndex.from_product([['a'],range(40)])
1881+
self.assertTrue(i.labels[1].dtype == 'int8')
1882+
i = MultiIndex.from_product([['a'],range(400)])
1883+
self.assertTrue(i.labels[1].dtype == 'int16')
1884+
i = MultiIndex.from_product([['a'],range(40000)])
1885+
self.assertTrue(i.labels[1].dtype == 'int32')
1886+
1887+
i = pd.MultiIndex.from_product([['a'],range(1000)])
1888+
self.assertTrue((i.labels[0]>=0).all())
1889+
self.assertTrue((i.labels[1]>=0).all())
1890+
18681891
def test_hash_error(self):
18691892
with tm.assertRaisesRegexp(TypeError,
18701893
"unhashable type: %r" %

0 commit comments

Comments
 (0)