diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 7a10e8d1073d0..a41b0c9521f99 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -973,6 +973,7 @@ Categorical - Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of ``na_position`` value. (:issue:`22556`). - Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`) - Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`). +- Bug in :meth:`Categorical.take` with a user-provided ``fill_value`` not encoding the ``fill_value``, which could result in a ``ValueError``, incorrect results, or a segmentation fault (:issue:`23296`). Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 8735284617f31..1bc0d18bead83 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1768,8 +1768,10 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): Parameters ---------- - indexer : sequence of integers - allow_fill : bool, default None. + indexer : sequence of int + The indices in `self` to take. The meaning of negative values in + `indexer` depends on the value of `allow_fill`. + allow_fill : bool, default None How to handle negative values in `indexer`. * False: negative values in `indices` indicate positional indices @@ -1786,11 +1788,52 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): default is ``True``. In the future, this will change to ``False``. + fill_value : object + The value to use for `indices` that are missing (-1), when + ``allow_fill=True``. This should be the category, i.e. a value + in ``self.categories``, not a code. + Returns ------- Categorical This Categorical will have the same categories and ordered as `self`. + + See Also + -------- + Series.take : Similar method for Series. + numpy.ndarray.take : Similar method for NumPy arrays. + + Examples + -------- + >>> cat = pd.Categorical(['a', 'a', 'b']) + >>> cat + [a, a, b] + Categories (2, object): [a, b] + + Specify ``allow_fill==False`` to have negative indices mean indexing + from the right. + + >>> cat.take([0, -1, -2], allow_fill=False) + [a, b, a] + Categories (2, object): [a, b] + + With ``allow_fill=True``, indices equal to ``-1`` mean "missing" + values that should be filled with the `fill_value`, which is + ``np.nan`` by default. + + >>> cat.take([0, -1, -1], allow_fill=True) + [a, NaN, NaN] + Categories (2, object): [a, b] + + The fill value can be specified. + + >>> cat.take([0, -1, -1], allow_fill=True, fill_value='a') + [a, a, a] + Categories (3, object): [a, b] + + Specifying a fill value that's not in ``self.categories`` + will raise a ``TypeError``. """ indexer = np.asarray(indexer, dtype=np.intp) if allow_fill is None: @@ -1798,14 +1841,26 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): warn(_take_msg, FutureWarning, stacklevel=2) allow_fill = True + dtype = self.dtype + if isna(fill_value): - # For categorical, any NA value is considered a user-facing - # NA value. Our storage NA value is -1. fill_value = -1 + elif allow_fill: + # convert user-provided `fill_value` to codes + if fill_value in self.categories: + fill_value = self.categories.get_loc(fill_value) + else: + msg = ( + "'fill_value' ('{}') is not in this Categorical's " + "categories." + ) + raise TypeError(msg.format(fill_value)) codes = take(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value) - result = self._constructor(codes, dtype=self.dtype, fastpath=True) + result = type(self).from_codes(codes, + categories=dtype.categories, + ordered=dtype.ordered) return result take = take_nd diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py index dcf2081ae32fe..d4a70e9a1ec2e 100644 --- a/pandas/tests/arrays/categorical/test_algos.py +++ b/pandas/tests/arrays/categorical/test_algos.py @@ -111,3 +111,32 @@ def test_positional_take_unobserved(self, ordered): expected = pd.Categorical(['b', 'a'], categories=cat.categories, ordered=ordered) tm.assert_categorical_equal(result, expected) + + def test_take_allow_fill(self): + # https://github.com/pandas-dev/pandas/issues/23296 + cat = pd.Categorical(['a', 'a', 'b']) + result = cat.take([0, -1, -1], allow_fill=True) + expected = pd.Categorical(['a', np.nan, np.nan], + categories=['a', 'b']) + tm.assert_categorical_equal(result, expected) + + def test_take_fill_with_negative_one(self): + # -1 was a category + cat = pd.Categorical([-1, 0, 1]) + result = cat.take([0, -1, 1], allow_fill=True, fill_value=-1) + expected = pd.Categorical([-1, -1, 0], categories=[-1, 0, 1]) + tm.assert_categorical_equal(result, expected) + + def test_take_fill_value(self): + # https://github.com/pandas-dev/pandas/issues/23296 + cat = pd.Categorical(['a', 'b', 'c']) + result = cat.take([0, 1, -1], fill_value='a', allow_fill=True) + expected = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + def test_take_fill_value_new_raises(self): + # https://github.com/pandas-dev/pandas/issues/23296 + cat = pd.Categorical(['a', 'b', 'c']) + xpr = r"'fill_value' \('d'\) is not in this Categorical's categories." + with tm.assert_raises_regex(TypeError, xpr): + cat.take([0, 1, -1], fill_value='d', allow_fill=True)