pandas-dev · jreback · Oct 4, 2014 · Oct 3, 2014
diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
@@ -47,7 +47,7 @@ the `categories` array.
 The categorical data type is useful in the following cases:
 
 * A string variable consisting of only a few different values. Converting such a string
-  variable to a categorical variable will save some memory.
+  variable to a categorical variable will save some memory, see :ref:`here<categorical.memory>`.
 * The lexical order of a variable is not the same as the logical order ("one", "two", "three").
   By converting to a categorical and specifying an order on the categories, sorting and
   min/max will use the logical order instead of the lexical order.
@@ -633,6 +633,27 @@ The following differences to R's factor functions can be observed:
 Gotchas
 -------
 
+.. _categorical.memory:
+
+Memory Usage
+~~~~~~~~~~~~
+
+The memory usage of a ``Categorical`` is proportional to the number of categories plus the length of the data. In contrast,
+an ``object`` dtype is a constant times the length of the data.
+
+.. ipython:: python
+
+   s = Series(['foo','bar']*1000)
+
+   # object dtype
+   s.nbytes
+
+   # category dtype
+   s.astype('category').nbytes
+
+Note that if the number of categories approaches the length of the data, the ``Categorical`` will use nearly the same or more memory than an
+equivalent ``object`` dtype representation.
+
 Old style constructor usage
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
@@ -563,7 +563,7 @@ Categoricals in Series/DataFrame
 :class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new
 methods to manipulate. Thanks to Jan Schulz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`,
 :issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`,
-:issue:`8075`, :issue:`8076`, :issue:`8143`).
+:issue:`8075`, :issue:`8076`, :issue:`8143`, :issue:`8453`).
 
 For full docs, see the :ref:`categorical introduction <categorical>` and the
 :ref:`API documentation <api.categorical>`.

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -67,7 +67,6 @@ def _maybe_to_categorical(array):
         return array.values
     return array
 
-
 _codes_doc = """The category codes of this categorical.
 
 Level codes are an array if integer which are the positions of the real
@@ -194,7 +193,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
 
         if fastpath:
             # fast path
-            self._codes = values
+            self._codes = _coerce_codes_dtype(values, categories)
             self.name = name
             self.categories = categories
             self.ordered = ordered
@@ -285,9 +284,9 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
                 ordered = True
 
         self.ordered = False if ordered is None else ordered
-        self._codes = codes
         self.categories = categories
         self.name = name
+        self._codes = _coerce_codes_dtype(codes, categories)
 
     def copy(self):
         """ Copy constructor. """
@@ -607,6 +606,7 @@ def add_categories(self, new_categories, inplace=False):
         new_categories = self._validate_categories(new_categories)
         cat = self if inplace else self.copy()
         cat._categories = new_categories
+        cat._codes = _coerce_codes_dtype(cat._codes, new_categories)
         if not inplace:
             return cat
 
@@ -1105,6 +1105,12 @@ def __unicode__(self):
 
         return result
 
+    def _maybe_coerce_indexer(self, indexer):
+        """ cast a signed-integer ndarray indexer to the codes dtype; return any other input unchanged """
+        if isinstance(indexer, np.ndarray) and indexer.dtype.kind == 'i':
+            indexer = indexer.astype(self._codes.dtype)
+        return indexer
+
     def __getitem__(self, key):
         """ Return an item. """
         if isinstance(key, (int, np.integer)):
@@ -1114,6 +1120,7 @@ def __getitem__(self, key):
             else:
                 return self.categories[i]
         else:
+            # key indexes positions in self._codes (it is not code values): keep its dtype, an int8 cast would wrap past 127
             return Categorical(values=self._codes[key], categories=self.categories,
                                ordered=self.ordered, fastpath=True)
 
@@ -1181,6 +1188,8 @@ def __setitem__(self, key, value):
             nan_pos = np.where(com.isnull(self.categories))[0]
             lindexer[lindexer == -1] = nan_pos
 
+        # key indexes positions in self._codes, so its dtype is left alone; only the code values in lindexer are coerced
+        lindexer = self._maybe_coerce_indexer(lindexer)
         self._codes[key] = lindexer
 
     #### reduction ops ####
@@ -1395,6 +1404,22 @@ def _delegate_method(self, name, *args, **kwargs):
 
 ##### utility routines #####
 
+_int8_max = np.iinfo(np.int8).max
+_int16_max = np.iinfo(np.int16).max
+_int32_max = np.iinfo(np.int32).max
+
+def _coerce_codes_dtype(codes, categories):
+    """ return codes as a new int8/int16/int32/int64 array, the width being picked from len(categories) """
+    codes = np.asarray(codes)
+    n_categories = len(categories)
+    if n_categories < _int8_max:
+        return codes.astype('int8')
+    elif n_categories < _int16_max:
+        return codes.astype('int16')
+    elif n_categories < _int32_max:
+        return codes.astype('int32')
+    return codes.astype('int64')
+
 def _get_codes_for_values(values, categories):
     """"
     utility routine to turn values into codes given the specified categories
@@ -1407,7 +1432,7 @@ def _get_codes_for_values(values, categories):
     (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
     t = hash_klass(len(categories))
     t.map_locations(com._values_from_object(categories))
-    return com._ensure_platform_int(t.lookup(values))
+    return _coerce_codes_dtype(t.lookup(values), categories)
 
 def _convert_to_list_like(list_like):
     if hasattr(list_like, "dtype"):

diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -714,12 +714,12 @@ def test_codes_immutable(self):
 
         # Codes should be read only
         c = Categorical(["a","b","c","a", np.nan])
-        exp = np.array([0,1,2,0, -1])
+        exp = np.array([0,1,2,0,-1],dtype='int8')
         self.assert_numpy_array_equal(c.codes, exp)
 
         # Assignments to codes should raise
         def f():
-            c.codes = np.array([0,1,2,0,1])
+            c.codes = np.array([0,1,2,0,1],dtype='int8')
         self.assertRaises(ValueError, f)
 
         # changes in the codes array should raise
@@ -731,10 +731,10 @@ def f():
 
         # But even after getting the codes, the original array should still be writeable!
         c[4] = "a"
-        exp = np.array([0,1,2,0, 0])
+        exp = np.array([0,1,2,0,0],dtype='int8')
         self.assert_numpy_array_equal(c.codes, exp)
         c._codes[4] = 2
-        exp = np.array([0,1,2,0, 2])
+        exp = np.array([0,1,2,0, 2],dtype='int8')
         self.assert_numpy_array_equal(c.codes, exp)
 
 
@@ -975,6 +975,28 @@ def f():
         expected = Series([True,False,False],index=index)
         tm.assert_series_equal(result, expected)
 
+    def test_codes_dtypes(self):
+
+        # GH 8453: codes dtype follows the number of categories (3 -> int8, 400 -> int16, 40000 -> int32)
+        result = Categorical(['foo','bar','baz'])
+        self.assertTrue(result.codes.dtype == 'int8')
+
+        result = Categorical(['foo%05d' % i for i in range(400) ])
+        self.assertTrue(result.codes.dtype == 'int16')
+
+        result = Categorical(['foo%05d' % i for i in range(40000) ])
+        self.assertTrue(result.codes.dtype == 'int32')
+
+        # adding categories (3 -> 403) must widen the codes from int8 to int16
+        result = Categorical(['foo','bar','baz'])
+        self.assertTrue(result.codes.dtype == 'int8')
+        result = result.add_categories(['foo%05d' % i for i in range(400) ])
+        self.assertTrue(result.codes.dtype == 'int16')
+
+        # removing 300 of them (403 -> 103) must narrow the codes back to int8
+        result = result.remove_categories(['foo%05d' % i for i in range(300) ])
+        self.assertTrue(result.codes.dtype == 'int8')
+
     def test_basic(self):
 
         # test basic creation / coercion of categoricals
@@ -1192,7 +1214,7 @@ def test_series_delegations(self):
         exp_categories = np.array([1,2,3])
         self.assert_numpy_array_equal(s.cat.categories, exp_categories)
 
-        exp_codes = Series(com._ensure_platform_int([0,1,2,0]))
+        exp_codes = Series([0,1,2,0],dtype='int8')
         tm.assert_series_equal(s.cat.codes, exp_codes)
 
         self.assertEqual(s.cat.ordered, True)