Skip to content

Commit 0b12eaa

Browse files
committed
PERF: optimize storage type for codes in Categoricals (GH8453)
1 parent f7426d6 commit 0b12eaa

File tree

4 files changed

+79
-11
lines changed

4 files changed

+79
-11
lines changed

doc/source/categorical.rst

+22-1
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ the `categories` array.
4747
The categorical data type is useful in the following cases:
4848

4949
* A string variable consisting of only a few different values. Converting such a string
50-
variable to a categorical variable will save some memory.
50+
variable to a categorical variable will save some memory, see :ref:`here<categorical.memory>`.
5151
* The lexical order of a variable is not the same as the logical order ("one", "two", "three").
5252
By converting to a categorical and specifying an order on the categories, sorting and
5353
min/max will use the logical order instead of the lexical order.
@@ -633,6 +633,27 @@ The following differences to R's factor functions can be observed:
633633
Gotchas
634634
-------
635635

636+
.. _categorical.memory:
637+
638+
Memory Usage
639+
~~~~~~~~~~~~
640+
641+
The memory usage of a ``Categorical`` is proportional to the number of categories plus the length of the data. In contrast,
642+
an ``object`` dtype is a constant times the length of the data.
643+
644+
.. ipython:: python
645+
646+
s = Series(['foo','bar']*1000)
647+
648+
# object dtype
649+
s.nbytes
650+
651+
# category dtype
652+
s.astype('category').nbytes
653+
654+
Note that if the number of categories approaches the length of the data, the ``Categorical`` will use nearly the same or more memory than an
655+
equivalent ``object`` dtype representation.
656+
636657
Old style constructor usage
637658
~~~~~~~~~~~~~~~~~~~~~~~~~~~
638659

doc/source/v0.15.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -563,7 +563,7 @@ Categoricals in Series/DataFrame
563563
:class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new
564564
methods to manipulate. Thanks to Jan Schulz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`,
565565
:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`,
566-
:issue:`8075`, :issue:`8076`, :issue:`8143`).
566+
:issue:`8075`, :issue:`8076`, :issue:`8143`, :issue:`8453`).
567567

568568
For full docs, see the :ref:`categorical introduction <categorical>` and the
569569
:ref:`API documentation <api.categorical>`.

pandas/core/categorical.py

+29-4
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@ def _maybe_to_categorical(array):
6767
return array.values
6868
return array
6969

70-
7170
_codes_doc = """The category codes of this categorical.
7271
7372
Level codes are an array if integer which are the positions of the real
@@ -194,7 +193,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
194193

195194
if fastpath:
196195
# fast path
197-
self._codes = values
196+
self._codes = _coerce_codes_dtype(values, categories)
198197
self.name = name
199198
self.categories = categories
200199
self.ordered = ordered
@@ -285,9 +284,9 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
285284
ordered = True
286285

287286
self.ordered = False if ordered is None else ordered
288-
self._codes = codes
289287
self.categories = categories
290288
self.name = name
289+
self._codes = _coerce_codes_dtype(codes, categories)
291290

292291
def copy(self):
293292
""" Copy constructor. """
@@ -607,6 +606,7 @@ def add_categories(self, new_categories, inplace=False):
607606
new_categories = self._validate_categories(new_categories)
608607
cat = self if inplace else self.copy()
609608
cat._categories = new_categories
609+
cat._codes = _coerce_codes_dtype(cat._codes, new_categories)
610610
if not inplace:
611611
return cat
612612

@@ -1105,6 +1105,12 @@ def __unicode__(self):
11051105

11061106
return result
11071107

1108+
def _maybe_coerce_indexer(self, indexer):
    """
    Return ``indexer`` coerced to the dtype of the codes, when that is safe.

    Only signed-integer ndarrays are candidates for coercion; anything else
    (slices, boolean masks, lists, scalars) is returned untouched.

    Parameters
    ----------
    indexer : object
        The key or value array about to be used with ``self._codes``.

    Returns
    -------
    object
        A copy of ``indexer`` cast to ``self._codes.dtype`` if it is a
        signed-integer ndarray whose values all fit in that dtype,
        otherwise ``indexer`` itself.
    """
    if not (isinstance(indexer, np.ndarray) and indexer.dtype.kind == 'i'):
        return indexer

    target = self._codes.dtype
    if target.kind in 'iu' and indexer.size:
        # Narrowing is only lossless when every value fits in the target
        # dtype.  A blind cast would silently wrap large values (e.g. 200
        # becomes -56 for int8) and end up addressing the wrong element.
        bounds = np.iinfo(target)
        if indexer.min() < bounds.min or indexer.max() > bounds.max:
            return indexer
    return indexer.astype(target)
1113+
11081114
def __getitem__(self, key):
11091115
""" Return an item. """
11101116
if isinstance(key, (int, np.integer)):
@@ -1114,6 +1120,7 @@ def __getitem__(self, key):
11141120
else:
11151121
return self.categories[i]
11161122
else:
1123+
key = self._maybe_coerce_indexer(key)
11171124
return Categorical(values=self._codes[key], categories=self.categories,
11181125
ordered=self.ordered, fastpath=True)
11191126

@@ -1181,6 +1188,8 @@ def __setitem__(self, key, value):
11811188
nan_pos = np.where(com.isnull(self.categories))[0]
11821189
lindexer[lindexer == -1] = nan_pos
11831190

1191+
key = self._maybe_coerce_indexer(key)
1192+
lindexer = self._maybe_coerce_indexer(lindexer)
11841193
self._codes[key] = lindexer
11851194

11861195
#### reduction ops ####
@@ -1395,6 +1404,22 @@ def _delegate_method(self, name, *args, **kwargs):
13951404

13961405
##### utility routines #####
13971406

1407+
# Largest value representable by each signed integer width; used to choose
# the narrowest dtype for the codes given the number of categories.
_int8_max = np.iinfo(np.int8).max
_int16_max = np.iinfo(np.int16).max
_int32_max = np.iinfo(np.int32).max


def _coerce_codes_dtype(codes, categories):
    """
    Coerce the codes to the narrowest integer dtype for ``categories``.

    Parameters
    ----------
    codes : array-like of int
        The category codes.
    categories : sized
        The categories the codes refer to; only ``len(categories)`` is used.

    Returns
    -------
    np.ndarray
        ``codes`` as int8, int16, int32 or int64, chosen by comparing the
        number of categories against the maximum of each integer type.
    """
    # np.asarray converts without forcing a copy.  The former
    # ``np.array(codes, copy=False)`` raises on NumPy >= 2.0 whenever a
    # copy is unavoidable (e.g. when ``codes`` is a plain list).
    codes = np.asarray(codes)
    n_categories = len(categories)
    if n_categories < _int8_max:
        return codes.astype('int8')
    elif n_categories < _int16_max:
        return codes.astype('int16')
    elif n_categories < _int32_max:
        return codes.astype('int32')
    return codes.astype('int64')
1422+
13981423
def _get_codes_for_values(values, categories):
13991424
""""
14001425
utility routine to turn values into codes given the specified categories
@@ -1407,7 +1432,7 @@ def _get_codes_for_values(values, categories):
14071432
(hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
14081433
t = hash_klass(len(categories))
14091434
t.map_locations(com._values_from_object(categories))
1410-
return com._ensure_platform_int(t.lookup(values))
1435+
return _coerce_codes_dtype(t.lookup(values), categories)
14111436

14121437
def _convert_to_list_like(list_like):
14131438
if hasattr(list_like, "dtype"):

pandas/tests/test_categorical.py

+27-5
Original file line numberDiff line numberDiff line change
@@ -714,12 +714,12 @@ def test_codes_immutable(self):
714714

715715
# Codes should be read only
716716
c = Categorical(["a","b","c","a", np.nan])
717-
exp = np.array([0,1,2,0, -1])
717+
exp = np.array([0,1,2,0,-1],dtype='int8')
718718
self.assert_numpy_array_equal(c.codes, exp)
719719

720720
# Assignments to codes should raise
721721
def f():
722-
c.codes = np.array([0,1,2,0,1])
722+
c.codes = np.array([0,1,2,0,1],dtype='int8')
723723
self.assertRaises(ValueError, f)
724724

725725
# changes in the codes array should raise
@@ -731,10 +731,10 @@ def f():
731731

732732
# But even after getting the codes, the original array should still be writeable!
733733
c[4] = "a"
734-
exp = np.array([0,1,2,0, 0])
734+
exp = np.array([0,1,2,0,0],dtype='int8')
735735
self.assert_numpy_array_equal(c.codes, exp)
736736
c._codes[4] = 2
737-
exp = np.array([0,1,2,0, 2])
737+
exp = np.array([0,1,2,0, 2],dtype='int8')
738738
self.assert_numpy_array_equal(c.codes, exp)
739739

740740

@@ -975,6 +975,28 @@ def f():
975975
expected = Series([True,False,False],index=index)
976976
tm.assert_series_equal(result, expected)
977977

978+
def test_codes_dtypes(self):
    """ the codes dtype follows the number of categories (GH 8453) """

    def labels(count):
        # distinct string labels: 'foo00000', 'foo00001', ...
        return ['foo%05d' % i for i in range(count)]

    # dtype chosen at construction time
    small = Categorical(['foo', 'bar', 'baz'])
    self.assertTrue(small.codes.dtype == 'int8')

    medium = Categorical(labels(400))
    self.assertTrue(medium.codes.dtype == 'int16')

    large = Categorical(labels(40000))
    self.assertTrue(large.codes.dtype == 'int32')

    # adding cats
    grown = Categorical(['foo', 'bar', 'baz'])
    self.assertTrue(grown.codes.dtype == 'int8')
    grown = grown.add_categories(labels(400))
    self.assertTrue(grown.codes.dtype == 'int16')

    # removing cats
    shrunk = grown.remove_categories(labels(300))
    self.assertTrue(shrunk.codes.dtype == 'int8')
999+
9781000
def test_basic(self):
9791001

9801002
# test basic creation / coercion of categoricals
@@ -1192,7 +1214,7 @@ def test_series_delegations(self):
11921214
exp_categories = np.array([1,2,3])
11931215
self.assert_numpy_array_equal(s.cat.categories, exp_categories)
11941216

1195-
exp_codes = Series(com._ensure_platform_int([0,1,2,0]))
1217+
exp_codes = Series([0,1,2,0],dtype='int8')
11961218
tm.assert_series_equal(s.cat.codes, exp_codes)
11971219

11981220
self.assertEqual(s.cat.ordered, True)

0 commit comments

Comments
 (0)