Skip to content

PERF: optimize storage type for codes in Categoricals (GH8453) #8455

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 4, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion doc/source/categorical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ the `categories` array.
The categorical data type is useful in the following cases:

* A string variable consisting of only a few different values. Converting such a string
variable to a categorical variable will save some memory.
variable to a categorical variable will save some memory, see :ref:`here<categorical.memory>`.
* The lexical order of a variable is not the same as the logical order ("one", "two", "three").
By converting to a categorical and specifying an order on the categories, sorting and
min/max will use the logical order instead of the lexical order.
Expand Down Expand Up @@ -633,6 +633,27 @@ The following differences to R's factor functions can be observed:
Gotchas
-------

.. _categorical.memory:

Memory Usage
~~~~~~~~~~~~

The memory usage of a ``Categorical`` is proportional to the number of categories plus the length of the data. In contrast,
an ``object`` dtype is a constant times the length of the data.

.. ipython:: python

s = Series(['foo','bar']*1000)

# object dtype
s.nbytes

# category dtype
s.astype('category').nbytes

Note that if the number of categories approaches the length of the data, the ``Categorical`` will use nearly the same or more memory than an
equivalent ``object`` dtype representation.

Old style constructor usage
~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down
2 changes: 1 addition & 1 deletion doc/source/v0.15.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -563,7 +563,7 @@ Categoricals in Series/DataFrame
:class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new
methods to manipulate. Thanks to Jan Schulz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`,
:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`,
:issue:`8075`, :issue:`8076`, :issue:`8143`).
:issue:`8075`, :issue:`8076`, :issue:`8143`, :issue:`8453`).

For full docs, see the :ref:`categorical introduction <categorical>` and the
:ref:`API documentation <api.categorical>`.
Expand Down
33 changes: 29 additions & 4 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ def _maybe_to_categorical(array):
return array.values
return array


_codes_doc = """The category codes of this categorical.

Level codes are an array of integers which are the positions of the real
Expand Down Expand Up @@ -194,7 +193,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa

if fastpath:
# fast path
self._codes = values
self._codes = _coerce_codes_dtype(values, categories)
self.name = name
self.categories = categories
self.ordered = ordered
Expand Down Expand Up @@ -285,9 +284,9 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
ordered = True

self.ordered = False if ordered is None else ordered
self._codes = codes
self.categories = categories
self.name = name
self._codes = _coerce_codes_dtype(codes, categories)

def copy(self):
""" Copy constructor. """
Expand Down Expand Up @@ -607,6 +606,7 @@ def add_categories(self, new_categories, inplace=False):
new_categories = self._validate_categories(new_categories)
cat = self if inplace else self.copy()
cat._categories = new_categories
cat._codes = _coerce_codes_dtype(cat._codes, new_categories)
if not inplace:
return cat

Expand Down Expand Up @@ -1105,6 +1105,12 @@ def __unicode__(self):

return result

def _maybe_coerce_indexer(self, indexer):
    """
    Return an indexer coerced to the codes dtype when that is lossless.

    Parameters
    ----------
    indexer : object
        Anything usable to index ``self._codes``. Only signed-integer
        ``np.ndarray`` inputs are candidates for coercion; every other
        object (slices, boolean masks, scalars, lists) is returned as is.

    Returns
    -------
    object
        ``indexer`` cast to ``self._codes.dtype`` if it is a signed-integer
        ndarray whose values all fit in that dtype, otherwise ``indexer``
        itself, unchanged.
    """
    if not (isinstance(indexer, np.ndarray) and indexer.dtype.kind == 'i'):
        return indexer

    target = self._codes.dtype
    if target.kind in 'iu' and indexer.size:
        bounds = np.iinfo(target)
        if indexer.min() < bounds.min or indexer.max() > bounds.max:
            # astype() would wrap silently (e.g. 200 -> -56 for int8), which
            # would turn a valid position into a different one. Keep the
            # wider dtype instead; it indexes and assigns just as well.
            return indexer
    return indexer.astype(target)

def __getitem__(self, key):
""" Return an item. """
if isinstance(key, (int, np.integer)):
Expand All @@ -1114,6 +1120,7 @@ def __getitem__(self, key):
else:
return self.categories[i]
else:
key = self._maybe_coerce_indexer(key)
return Categorical(values=self._codes[key], categories=self.categories,
ordered=self.ordered, fastpath=True)

Expand Down Expand Up @@ -1181,6 +1188,8 @@ def __setitem__(self, key, value):
nan_pos = np.where(com.isnull(self.categories))[0]
lindexer[lindexer == -1] = nan_pos

key = self._maybe_coerce_indexer(key)
lindexer = self._maybe_coerce_indexer(lindexer)
self._codes[key] = lindexer

#### reduction ops ####
Expand Down Expand Up @@ -1395,6 +1404,22 @@ def _delegate_method(self, name, *args, **kwargs):

##### utility routines #####

# Largest value representable by each signed integer width; used below to
# pick the narrowest dtype for the category codes.
_int8_max = np.iinfo(np.int8).max
_int16_max = np.iinfo(np.int16).max
_int32_max = np.iinfo(np.int32).max


def _coerce_codes_dtype(codes, categories):
    """
    Coerce the code input array to the narrowest suitable integer dtype.

    Parameters
    ----------
    codes : array-like
        The codes to convert.
    categories : sized
        Only ``len(categories)`` is consulted, to choose the dtype.

    Returns
    -------
    np.ndarray
        ``codes`` converted with ``astype`` to int8, int16 or int32 when the
        number of categories is strictly below that type's maximum value,
        and to int64 otherwise.
    """
    # np.asarray rather than np.array(..., copy=False): under NumPy >= 2.0
    # the latter raises ValueError whenever a copy is unavoidable (e.g. for
    # list input), while asarray copies only if needed on every version.
    codes = np.asarray(codes)
    n_categories = len(categories)
    # The comparisons are strict, so a count equal to a type's maximum is
    # promoted to the next wider type.
    if n_categories < _int8_max:
        return codes.astype('int8')
    elif n_categories < _int16_max:
        return codes.astype('int16')
    elif n_categories < _int32_max:
        return codes.astype('int32')
    return codes.astype('int64')

def _get_codes_for_values(values, categories):
""""
utility routine to turn values into codes given the specified categories
Expand All @@ -1407,7 +1432,7 @@ def _get_codes_for_values(values, categories):
(hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
t = hash_klass(len(categories))
t.map_locations(com._values_from_object(categories))
return com._ensure_platform_int(t.lookup(values))
return _coerce_codes_dtype(t.lookup(values), categories)

def _convert_to_list_like(list_like):
if hasattr(list_like, "dtype"):
Expand Down
32 changes: 27 additions & 5 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -714,12 +714,12 @@ def test_codes_immutable(self):

# Codes should be read only
c = Categorical(["a","b","c","a", np.nan])
exp = np.array([0,1,2,0, -1])
exp = np.array([0,1,2,0,-1],dtype='int8')
self.assert_numpy_array_equal(c.codes, exp)

# Assignments to codes should raise
def f():
c.codes = np.array([0,1,2,0,1])
c.codes = np.array([0,1,2,0,1],dtype='int8')
self.assertRaises(ValueError, f)

# changes in the codes array should raise
Expand All @@ -731,10 +731,10 @@ def f():

# But even after getting the codes, the original array should still be writeable!
c[4] = "a"
exp = np.array([0,1,2,0, 0])
exp = np.array([0,1,2,0,0],dtype='int8')
self.assert_numpy_array_equal(c.codes, exp)
c._codes[4] = 2
exp = np.array([0,1,2,0, 2])
exp = np.array([0,1,2,0, 2],dtype='int8')
self.assert_numpy_array_equal(c.codes, exp)


Expand Down Expand Up @@ -975,6 +975,28 @@ def f():
expected = Series([True,False,False],index=index)
tm.assert_series_equal(result, expected)

def test_codes_dtypes(self):
    """Check the codes dtype for various category counts (GH 8453)."""

    def labels(count):
        # distinct labels 'foo00000', 'foo00001', ...
        return ['foo%05d' % i for i in range(count)]

    # dtype picked at construction time
    cat = Categorical(['foo', 'bar', 'baz'])
    self.assertTrue(cat.codes.dtype == 'int8')

    cat = Categorical(labels(400))
    self.assertTrue(cat.codes.dtype == 'int16')

    cat = Categorical(labels(40000))
    self.assertTrue(cat.codes.dtype == 'int32')

    # adding categories
    cat = Categorical(['foo', 'bar', 'baz'])
    self.assertTrue(cat.codes.dtype == 'int8')
    cat = cat.add_categories(labels(400))
    self.assertTrue(cat.codes.dtype == 'int16')

    # removing categories
    cat = cat.remove_categories(labels(300))
    self.assertTrue(cat.codes.dtype == 'int8')

def test_basic(self):

# test basic creation / coercion of categoricals
Expand Down Expand Up @@ -1192,7 +1214,7 @@ def test_series_delegations(self):
exp_categories = np.array([1,2,3])
self.assert_numpy_array_equal(s.cat.categories, exp_categories)

exp_codes = Series(com._ensure_platform_int([0,1,2,0]))
exp_codes = Series([0,1,2,0],dtype='int8')
tm.assert_series_equal(s.cat.codes, exp_codes)

self.assertEqual(s.cat.ordered, True)
Expand Down