diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 669a39d437a34..a5f76fb2b5941 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -47,7 +47,7 @@ the `categories` array. The categorical data type is useful in the following cases: * A string variable consisting of only a few different values. Converting such a string - variable to a categorical variable will save some memory. + variable to a categorical variable will save some memory, see :ref:`here <categorical.memory>`. * The lexical order of a variable is not the same as the logical order ("one", "two", "three"). By converting to a categorical and specifying an order on the categories, sorting and min/max will use the logical order instead of the lexical order. @@ -633,6 +633,27 @@ The following differences to R's factor functions can be observed: Gotchas ------- +.. _categorical.memory: + +Memory Usage +~~~~~~~~~~~~ + +The memory usage of a ``Categorical`` is proportional to the number of categories plus the length of the data. In contrast, +an ``object`` dtype is a constant times the length of the data. + +.. ipython:: python + + s = Series(['foo','bar']*1000) + + # object dtype + s.nbytes + + # category dtype + s.astype('category').nbytes + +Note that if the number of categories approaches the length of the data, the ``Categorical`` will use nearly the same or more memory than an +equivalent ``object`` dtype representation. + Old style constructor usage ~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 5f8c2e7dcd30f..4fc808fe0409f 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -563,7 +563,7 @@ Categoricals in Series/DataFrame :class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new methods to manipulate. Thanks to Jan Schulz for much of this API/implementation. 
(:issue:`3943`, :issue:`5313`, :issue:`5314`, :issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`, -:issue:`8075`, :issue:`8076`, :issue:`8143`). +:issue:`8075`, :issue:`8076`, :issue:`8143`, :issue:`8453`). For full docs, see the :ref:`categorical introduction <categorical>` and the :ref:`API documentation <api.categorical>`. diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index d2708890c5ec2..aa5fa29784912 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -67,7 +67,6 @@ def _maybe_to_categorical(array): return array.values return array - _codes_doc = """The category codes of this categorical. Level codes are an array if integer which are the positions of the real @@ -194,7 +193,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa if fastpath: # fast path - self._codes = values + self._codes = _coerce_codes_dtype(values, categories) self.name = name self.categories = categories self.ordered = ordered @@ -285,9 +284,9 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa ordered = True self.ordered = False if ordered is None else ordered - self._codes = codes self.categories = categories self.name = name + self._codes = _coerce_codes_dtype(codes, categories) def copy(self): """ Copy constructor. 
""" @@ -607,6 +606,7 @@ def add_categories(self, new_categories, inplace=False): new_categories = self._validate_categories(new_categories) cat = self if inplace else self.copy() cat._categories = new_categories + cat._codes = _coerce_codes_dtype(cat._codes, new_categories) if not inplace: return cat @@ -1105,6 +1105,12 @@ def __unicode__(self): return result + def _maybe_coerce_indexer(self, indexer): + """ return the indexer cast to the dtype of self._codes; only integer ndarrays are cast, anything else is returned unchanged. NOTE(review): astype can wrap values outside the codes dtype range (e.g. positions above 127 with int8 codes), so casting a positional key looks unsafe -- confirm against callers """ + if isinstance(indexer, np.ndarray) and indexer.dtype.kind == 'i': + indexer = indexer.astype(self._codes.dtype) + return indexer + def __getitem__(self, key): """ Return an item. """ if isinstance(key, (int, np.integer)): @@ -1114,6 +1120,7 @@ def __getitem__(self, key): else: return self.categories[i] else: + key = self._maybe_coerce_indexer(key) return Categorical(values=self._codes[key], categories=self.categories, ordered=self.ordered, fastpath=True) @@ -1181,6 +1188,8 @@ def __setitem__(self, key, value): nan_pos = np.where(com.isnull(self.categories))[0] lindexer[lindexer == -1] = nan_pos + key = self._maybe_coerce_indexer(key) + lindexer = self._maybe_coerce_indexer(lindexer) self._codes[key] = lindexer #### reduction ops #### @@ -1395,6 +1404,22 @@ def _delegate_method(self, name, *args, **kwargs): ##### utility routines ##### +_int8_max = np.iinfo(np.int8).max +_int16_max = np.iinfo(np.int16).max +_int32_max = np.iinfo(np.int32).max + +def _coerce_codes_dtype(codes, categories): + """ return the codes as an ndarray of the first of int8/int16/int32 whose max is strictly greater than len(categories), falling back to int64 """ + codes = np.array(codes,copy=False) + l = len(categories) + if l < _int8_max: + return codes.astype('int8') + elif l < _int16_max: + return codes.astype('int16') + elif l < _int32_max: + return codes.astype('int32') + return codes.astype('int64') + def _get_codes_for_values(values, categories): """" utility routine to turn values into codes given the specified categories @@ -1407,7 +1432,7 @@ def _get_codes_for_values(values, categories): (hash_klass, vec_klass), 
vals = _get_data_algo(values, _hashtables) t = hash_klass(len(categories)) t.map_locations(com._values_from_object(categories)) - return com._ensure_platform_int(t.lookup(values)) + return _coerce_codes_dtype(t.lookup(values), categories) def _convert_to_list_like(list_like): if hasattr(list_like, "dtype"): diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 7681289cf41ac..a2643b38e4133 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -714,12 +714,12 @@ def test_codes_immutable(self): # Codes should be read only c = Categorical(["a","b","c","a", np.nan]) - exp = np.array([0,1,2,0, -1]) + exp = np.array([0,1,2,0,-1],dtype='int8') self.assert_numpy_array_equal(c.codes, exp) # Assignments to codes should raise def f(): - c.codes = np.array([0,1,2,0,1]) + c.codes = np.array([0,1,2,0,1],dtype='int8') self.assertRaises(ValueError, f) # changes in the codes array should raise @@ -731,10 +731,10 @@ def f(): # But even after getting the codes, the original array should still be writeable! 
c[4] = "a" - exp = np.array([0,1,2,0, 0]) + exp = np.array([0,1,2,0,0],dtype='int8') self.assert_numpy_array_equal(c.codes, exp) c._codes[4] = 2 - exp = np.array([0,1,2,0, 2]) + exp = np.array([0,1,2,0, 2],dtype='int8') self.assert_numpy_array_equal(c.codes, exp) @@ -975,6 +975,28 @@ def f(): expected = Series([True,False,False],index=index) tm.assert_series_equal(result, expected) + def test_codes_dtypes(self): + + # GH 8453: the codes dtype grows with the number of categories (3 -> int8, 400 -> int16, 40000 -> int32) + result = Categorical(['foo','bar','baz']) + self.assertTrue(result.codes.dtype == 'int8') + + result = Categorical(['foo%05d' % i for i in range(400) ]) + self.assertTrue(result.codes.dtype == 'int16') + + result = Categorical(['foo%05d' % i for i in range(40000) ]) + self.assertTrue(result.codes.dtype == 'int32') + + # adding categories past the int8 range widens the codes to int16 + result = Categorical(['foo','bar','baz']) + self.assertTrue(result.codes.dtype == 'int8') + result = result.add_categories(['foo%05d' % i for i in range(400) ]) + self.assertTrue(result.codes.dtype == 'int16') + + # removing categories back into the int8 range narrows the codes again + result = result.remove_categories(['foo%05d' % i for i in range(300) ]) + self.assertTrue(result.codes.dtype == 'int8') + def test_basic(self): # test basic creation / coercion of categoricals @@ -1192,7 +1214,7 @@ def test_series_delegations(self): exp_categories = np.array([1,2,3]) self.assert_numpy_array_equal(s.cat.categories, exp_categories) - exp_codes = Series(com._ensure_platform_int([0,1,2,0])) + exp_codes = Series([0,1,2,0],dtype='int8') tm.assert_series_equal(s.cat.codes, exp_codes) self.assertEqual(s.cat.ordered, True)