From 8ec774639d93ca658da112d211501c3dfe592eb6 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 23 Oct 2018 08:56:26 -0500
Subject: [PATCH 1/3] BUG: Handle fill_value in Categorical.take

Closes https://github.com/pandas-dev/pandas/issues/23296
---
 doc/source/whatsnew/v0.24.0.txt               |  1 +
 pandas/core/arrays/categorical.py             | 75 +++++++++++++++----
 pandas/core/dtypes/dtypes.py                  | 17 +++++
 pandas/tests/arrays/categorical/test_algos.py | 34 +++++++++
 4 files changed, 113 insertions(+), 14 deletions(-)

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 7a10e8d1073d0..a41b0c9521f99 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -973,6 +973,7 @@ Categorical
 - Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of ``na_position`` value. (:issue:`22556`).
 - Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`)
 - Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
+- Bug in :meth:`Categorical.take` with a user-provided ``fill_value`` not encoding the ``fill_value``, which could result in a ``ValueError``, incorrect results, or a segmentation fault (:issue:`23296`).
 
 Datetimelike
 ^^^^^^^^^^^^
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 8735284617f31..6a5ce6472f6a8 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1019,15 +1019,7 @@ def add_categories(self, new_categories, inplace=False):
         set_categories
         """
         inplace = validate_bool_kwarg(inplace, 'inplace')
-        if not is_list_like(new_categories):
-            new_categories = [new_categories]
-        already_included = set(new_categories) & set(self.dtype.categories)
-        if len(already_included) != 0:
-            msg = ("new categories must not include old categories: "
-                   "{already_included!s}")
-            raise ValueError(msg.format(already_included=already_included))
-        new_categories = list(self.dtype.categories) + list(new_categories)
-        new_dtype = CategoricalDtype(new_categories, self.ordered)
+        new_dtype = self.dtype._add_categories(new_categories)
 
         cat = self if inplace else self.copy()
         cat._dtype = new_dtype
@@ -1768,8 +1760,10 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None):
 
         Parameters
         ----------
-        indexer : sequence of integers
-        allow_fill : bool, default None.
+        indexer : sequence of int
+            The indices in `self` to take. The meaning of negative values in
+            `indexer` depends on the value of `allow_fill`.
+        allow_fill : bool, default None
             How to handle negative values in `indexer`.
 
             * False: negative values in `indices` indicate positional indices
@@ -1786,11 +1780,55 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None):
                default is ``True``. In the future, this will change to
                ``False``.
 
+        fill_value : object
+            The value to use for `indices` that are missing (-1), when
+            ``allow_fill=True``. This should be the category, i.e. a value
+            in ``self.categories``, not a code.
+
+            Specifying a `fill_value` that's not in ``self.categories`` is
+            allowed. The new category is added to the end of the existing
+            categories.
+
         Returns
         -------
         Categorical
             This Categorical will have the same categories and ordered as
             `self`.
+
+        See Also
+        --------
+        Series.take : Similar method for Series.
+        numpy.ndarray.take : Similar method for NumPy arrays.
+
+        Examples
+        --------
+        >>> cat = pd.Categorical(['a', 'a', 'b'])
+        >>> cat
+        [a, a, b]
+        Categories (2, object): [a, b]
+
+        Specify ``allow_fill==False`` to have negative indices mean indexing
+        from the right.
+
+        >>> cat.take([0, -1, -2], allow_fill=False)
+        [a, b, a]
+        Categories (2, object): [a, b]
+
+        With ``allow_fill=True``, indices equal to ``-1`` mean "missing"
+        values that should be filled with the `fill_value`, which is
+        ``np.nan`` by default.
+
+        >>> cat.take([0, -1, -1], allow_fill=True)
+        [a, NaN, NaN]
+        Categories (2, object): [a, b]
+
+        The fill value can be specified. Notice that if the `fill_value` was
+        not previously present in ``self.categories``, it is added to the end
+        of the categories in the output Categorical.
+
+        >>> cat.take([0, -1, -1], allow_fill=True, fill_value='c')
+        [a, c, c]
+        Categories (3, object): [a, b, c]
         """
         indexer = np.asarray(indexer, dtype=np.intp)
         if allow_fill is None:
@@ -1798,14 +1836,23 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None):
                 warn(_take_msg, FutureWarning, stacklevel=2)
                 allow_fill = True
 
+        dtype = self.dtype
+
         if isna(fill_value):
-            # For categorical, any NA value is considered a user-facing
-            # NA value. Our storage NA value is -1.
             fill_value = -1
+        elif allow_fill and fill_value is not None:
+            # convert user-provided `fill_value` to codes
+            if fill_value in self.categories:
+                fill_value = self.categories.get_loc(fill_value)
+            else:
+                dtype = self.dtype._add_categories(fill_value)
+                fill_value = dtype.categories.get_loc(fill_value)
 
         codes = take(self._codes, indexer, allow_fill=allow_fill,
                      fill_value=fill_value)
-        result = self._constructor(codes, dtype=self.dtype, fastpath=True)
+        result = type(self).from_codes(codes,
+                                       categories=dtype.categories,
+                                       ordered=dtype.ordered)
         return result
 
     take = take_nd
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index f07fb3cd80eab..3f0e74e8ef8c7 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -469,6 +469,23 @@ def _is_boolean(self):
 
         return is_bool_dtype(self.categories)
 
+    def _add_categories(self, new_categories):
+        """
+        Return a new CategoricalDtype with new categories added at the end.
+
+        """
+        from pandas.core.dtypes.common import is_list_like
+
+        if not is_list_like(new_categories):
+            new_categories = [new_categories]
+        already_included = set(new_categories) & set(self.categories)
+        if len(already_included) != 0:
+            msg = ("new categories must not include old categories: "
+                   "{already_included!s}")
+            raise ValueError(msg.format(already_included=already_included))
+        new_categories = list(self.categories) + list(new_categories)
+        return CategoricalDtype(new_categories, self.ordered)
+
 
 class DatetimeTZDtype(PandasExtensionDtype):
 
diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py
index dcf2081ae32fe..52362e8a98236 100644
--- a/pandas/tests/arrays/categorical/test_algos.py
+++ b/pandas/tests/arrays/categorical/test_algos.py
@@ -111,3 +111,37 @@ def test_positional_take_unobserved(self, ordered):
         expected = pd.Categorical(['b', 'a'], categories=cat.categories,
                                   ordered=ordered)
         tm.assert_categorical_equal(result, expected)
+
+    def test_take_allow_fill(self):
+        cat = pd.Categorical(['a', 'a', 'b'])
+        result = cat.take([0, -1, -1], allow_fill=True)
+        expected = pd.Categorical(['a', np.nan, np.nan],
+                                  categories=['a', 'b'])
+        tm.assert_categorical_equal(result, expected)
+
+    def test_take_fill_with_negative_one(self):
+        # -1 was a category
+        cat = pd.Categorical([-1, 0, 1])
+        result = cat.take([0, -1, 1], allow_fill=True, fill_value=-1)
+        expected = pd.Categorical([-1, -1, 0], categories=[-1, 0, 1])
+        tm.assert_categorical_equal(result, expected)
+
+        # -1 was not a category
+        cat = pd.Categorical([0, 1])
+        result = cat.take([0, -1, 1], allow_fill=True, fill_value=-1)
+        expected = pd.Categorical([0, -1, 1], categories=[0, 1, -1])
+        tm.assert_categorical_equal(result, expected)
+
+    def test_take_fill_value(self):
+        # https://github.com/pandas-dev/pandas/issues/23296
+        cat = pd.Categorical(['a', 'b', 'c'])
+        result = cat.take([0, 1, -1], fill_value='a', allow_fill=True)
+        expected = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c'])
+        tm.assert_categorical_equal(result, expected)
+
+    def test_take_fill_value_adds_categories(self):
+        # https://github.com/pandas-dev/pandas/issues/23296
+        cat = pd.Categorical(['a', 'b', 'c'])
+        result = cat.take([0, 1, -1], fill_value='d', allow_fill=True)
+        expected = pd.Categorical(['a', 'b', 'd'], categories=['a', 'b', 'c', 'd'])
+        tm.assert_categorical_equal(result, expected)

From 9b5ed63834eee5b55d3e0e3ebfed88ef322b24cd Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 23 Oct 2018 09:02:54 -0500
Subject: [PATCH 2/3] no new categories

---
 pandas/core/arrays/categorical.py             | 25 +++++++++----------
 pandas/tests/arrays/categorical/test_algos.py | 15 ++++-------
 2 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 6a5ce6472f6a8..54e5813844851 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1785,10 +1785,6 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None):
             ``allow_fill=True``. This should be the category, i.e. a value
             in ``self.categories``, not a code.
 
-            Specifying a `fill_value` that's not in ``self.categories`` is
-            allowed. The new category is added to the end of the existing
-            categories.
-
         Returns
         -------
         Categorical
@@ -1822,13 +1818,14 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None):
         [a, NaN, NaN]
         Categories (2, object): [a, b]
 
-        The fill value can be specified. Notice that if the `fill_value` was
-        not previously present in ``self.categories``, it is added to the end
-        of the categories in the output Categorical.
+        The fill value can be specified.
 
-        >>> cat.take([0, -1, -1], allow_fill=True, fill_value='c')
-        [a, c, c]
-        Categories (3, object): [a, b, c]
+        >>> cat.take([0, -1, -1], allow_fill=True, fill_value='a')
+        [a, a, a]
+        Categories (3, object): [a, b]
+
+        Specifying a fill value that's not in ``self.categories``
+        will raise a ``TypeError``.
         """
         indexer = np.asarray(indexer, dtype=np.intp)
         if allow_fill is None:
@@ -1840,13 +1837,15 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None):
 
         if isna(fill_value):
             fill_value = -1
-        elif allow_fill and fill_value is not None:
+        elif allow_fill:
             # convert user-provided `fill_value` to codes
             if fill_value in self.categories:
                 fill_value = self.categories.get_loc(fill_value)
             else:
-                dtype = self.dtype._add_categories(fill_value)
-                fill_value = dtype.categories.get_loc(fill_value)
+                msg = (
+                    "'fill_value' ('{}') is not in this Categorical's categories."
+                )
+                raise TypeError(msg.format(fill_value))
 
         codes = take(self._codes, indexer, allow_fill=allow_fill,
                      fill_value=fill_value)
diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py
index 52362e8a98236..d4a70e9a1ec2e 100644
--- a/pandas/tests/arrays/categorical/test_algos.py
+++ b/pandas/tests/arrays/categorical/test_algos.py
@@ -113,6 +113,7 @@ def test_positional_take_unobserved(self, ordered):
         tm.assert_categorical_equal(result, expected)
 
     def test_take_allow_fill(self):
+        # https://github.com/pandas-dev/pandas/issues/23296
         cat = pd.Categorical(['a', 'a', 'b'])
         result = cat.take([0, -1, -1], allow_fill=True)
         expected = pd.Categorical(['a', np.nan, np.nan],
@@ -126,12 +127,6 @@ def test_take_fill_with_negative_one(self):
         expected = pd.Categorical([-1, -1, 0], categories=[-1, 0, 1])
         tm.assert_categorical_equal(result, expected)
 
-        # -1 was not a category
-        cat = pd.Categorical([0, 1])
-        result = cat.take([0, -1, 1], allow_fill=True, fill_value=-1)
-        expected = pd.Categorical([0, -1, 1], categories=[0, 1, -1])
-        tm.assert_categorical_equal(result, expected)
-
     def test_take_fill_value(self):
         # https://github.com/pandas-dev/pandas/issues/23296
         cat = pd.Categorical(['a', 'b', 'c'])
@@ -139,9 +134,9 @@ def test_take_fill_value(self):
         expected = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c'])
         tm.assert_categorical_equal(result, expected)
 
-    def test_take_fill_value_adds_categories(self):
+    def test_take_fill_value_new_raises(self):
         # https://github.com/pandas-dev/pandas/issues/23296
         cat = pd.Categorical(['a', 'b', 'c'])
-        result = cat.take([0, 1, -1], fill_value='d', allow_fill=True)
-        expected = pd.Categorical(['a', 'b', 'd'], categories=['a', 'b', 'c', 'd'])
-        tm.assert_categorical_equal(result, expected)
+        xpr = r"'fill_value' \('d'\) is not in this Categorical's categories."
+        with tm.assert_raises_regex(TypeError, xpr):
+            cat.take([0, 1, -1], fill_value='d', allow_fill=True)

From fb64335479f7a3ec13db7decf55c77448857b2bc Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Tue, 23 Oct 2018 09:06:33 -0500
Subject: [PATCH 3/3] revert add_categories

---
 pandas/core/arrays/categorical.py | 13 +++++++++++--
 pandas/core/dtypes/dtypes.py      | 17 -----------------
 2 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 54e5813844851..1bc0d18bead83 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1019,7 +1019,15 @@ def add_categories(self, new_categories, inplace=False):
         set_categories
         """
         inplace = validate_bool_kwarg(inplace, 'inplace')
-        new_dtype = self.dtype._add_categories(new_categories)
+        if not is_list_like(new_categories):
+            new_categories = [new_categories]
+        already_included = set(new_categories) & set(self.dtype.categories)
+        if len(already_included) != 0:
+            msg = ("new categories must not include old categories: "
+                   "{already_included!s}")
+            raise ValueError(msg.format(already_included=already_included))
+        new_categories = list(self.dtype.categories) + list(new_categories)
+        new_dtype = CategoricalDtype(new_categories, self.ordered)
 
         cat = self if inplace else self.copy()
         cat._dtype = new_dtype
@@ -1843,7 +1851,8 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None):
                 fill_value = self.categories.get_loc(fill_value)
             else:
                 msg = (
-                    "'fill_value' ('{}') is not in this Categorical's categories."
+                    "'fill_value' ('{}') is not in this Categorical's "
+                    "categories."
                 )
                 raise TypeError(msg.format(fill_value))
 
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index 3f0e74e8ef8c7..f07fb3cd80eab 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -469,23 +469,6 @@ def _is_boolean(self):
 
         return is_bool_dtype(self.categories)
 
-    def _add_categories(self, new_categories):
-        """
-        Return a new CategoricalDtype with new categories added at the end.
-
-        """
-        from pandas.core.dtypes.common import is_list_like
-
-        if not is_list_like(new_categories):
-            new_categories = [new_categories]
-        already_included = set(new_categories) & set(self.categories)
-        if len(already_included) != 0:
-            msg = ("new categories must not include old categories: "
-                   "{already_included!s}")
-            raise ValueError(msg.format(already_included=already_included))
-        new_categories = list(self.categories) + list(new_categories)
-        return CategoricalDtype(new_categories, self.ordered)
-
 
 class DatetimeTZDtype(PandasExtensionDtype):