From 8ec774639d93ca658da112d211501c3dfe592eb6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 23 Oct 2018 08:56:26 -0500 Subject: [PATCH 1/3] BUG: Handle fill_value in Categorical.take Closes https://github.com/pandas-dev/pandas/issues/23296 --- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/core/arrays/categorical.py | 75 +++++++++++++++---- pandas/core/dtypes/dtypes.py | 17 +++++ pandas/tests/arrays/categorical/test_algos.py | 34 +++++++++ 4 files changed, 113 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 7a10e8d1073d0..a41b0c9521f99 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -973,6 +973,7 @@ Categorical - Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of ``na_position`` value. (:issue:`22556`). - Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`) - Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`). +- Bug in :meth:`Categorical.take` with a user-provided ``fill_value`` not encoding the ``fill_value``, which could result in a ``ValueError``, incorrect results, or a segmentation fault (:issue:`23296`). Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 8735284617f31..6a5ce6472f6a8 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1019,15 +1019,7 @@ def add_categories(self, new_categories, inplace=False): set_categories """ inplace = validate_bool_kwarg(inplace, 'inplace') - if not is_list_like(new_categories): - new_categories = [new_categories] - already_included = set(new_categories) & set(self.dtype.categories) - if len(already_included) != 0: - msg = ("new categories must not include old categories: " - "{already_included!s}") - raise ValueError(msg.format(already_included=already_included)) - new_categories = list(self.dtype.categories) + list(new_categories) - new_dtype = CategoricalDtype(new_categories, self.ordered) + new_dtype = self.dtype._add_categories(new_categories) cat = self if inplace else self.copy() cat._dtype = new_dtype @@ -1768,8 +1760,10 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): Parameters ---------- - indexer : sequence of integers - allow_fill : bool, default None. + indexer : sequence of int + The indices in `self` to take. The meaning of negative values in + `indexer` depends on the value of `allow_fill`. + allow_fill : bool, default None How to handle negative values in `indexer`. * False: negative values in `indices` indicate positional indices @@ -1786,11 +1780,55 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): default is ``True``. In the future, this will change to ``False``. + fill_value : object + The value to use for `indices` that are missing (-1), when + ``allow_fill=True``. This should be the category, i.e. a value + in ``self.categories``, not a code. + + Specifying a `fill_value` that's not in ``self.categories`` is + allowed. The new category is added to the end of the existing + categories. + Returns ------- Categorical This Categorical will have the same categories and ordered as `self`. + + See Also + -------- + Series.take : Similar method for Series. + numpy.ndarray.take : Similar method for NumPy arrays. + + Examples + -------- + >>> cat = pd.Categorical(['a', 'a', 'b']) + >>> cat + [a, a, b] + Categories (2, object): [a, b] + + Specify ``allow_fill==False`` to have negative indices mean indexing + from the right. + + >>> cat.take([0, -1, -2], allow_fill=False) + [a, b, a] + Categories (2, object): [a, b] + + With ``allow_fill=True``, indices equal to ``-1`` mean "missing" + values that should be filled with the `fill_value`, which is + ``np.nan`` by default. + + >>> cat.take([0, -1, -1], allow_fill=True) + [a, NaN, NaN] + Categories (2, object): [a, b] + + The fill value can be specified. Notice that if the `fill_value` was + not previously present in ``self.categories``, it is added to the end + of the categories in the output Categorical. + + >>> cat.take([0, -1, -1], allow_fill=True, fill_value='c') + [a, c, c] + Categories (3, object): [a, b, c] """ indexer = np.asarray(indexer, dtype=np.intp) if allow_fill is None: @@ -1798,14 +1836,23 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): warn(_take_msg, FutureWarning, stacklevel=2) allow_fill = True + dtype = self.dtype + if isna(fill_value): - # For categorical, any NA value is considered a user-facing - # NA value. Our storage NA value is -1. fill_value = -1 + elif allow_fill and fill_value is not None: + # convert user-provided `fill_value` to codes + if fill_value in self.categories: + fill_value = self.categories.get_loc(fill_value) + else: + dtype = self.dtype._add_categories(fill_value) + fill_value = dtype.categories.get_loc(fill_value) codes = take(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value) - result = self._constructor(codes, dtype=self.dtype, fastpath=True) + result = type(self).from_codes(codes, + categories=dtype.categories, + ordered=dtype.ordered) return result take = take_nd diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index f07fb3cd80eab..3f0e74e8ef8c7 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -469,6 +469,23 @@ def _is_boolean(self): return is_bool_dtype(self.categories) + def _add_categories(self, new_categories): + """ + Return a new CategoricalDtype with new categories added at the end. + + """ + from pandas.core.dtypes.common import is_list_like + + if not is_list_like(new_categories): + new_categories = [new_categories] + already_included = set(new_categories) & set(self.categories) + if len(already_included) != 0: + msg = ("new categories must not include old categories: " + "{already_included!s}") + raise ValueError(msg.format(already_included=already_included)) + new_categories = list(self.categories) + list(new_categories) + return CategoricalDtype(new_categories, self.ordered) + class DatetimeTZDtype(PandasExtensionDtype): diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py index dcf2081ae32fe..52362e8a98236 100644 --- a/pandas/tests/arrays/categorical/test_algos.py +++ b/pandas/tests/arrays/categorical/test_algos.py @@ -111,3 +111,37 @@ def test_positional_take_unobserved(self, ordered): expected = pd.Categorical(['b', 'a'], categories=cat.categories, ordered=ordered) tm.assert_categorical_equal(result, expected) + + def test_take_allow_fill(self): + cat = pd.Categorical(['a', 'a', 'b']) + result = cat.take([0, -1, -1], allow_fill=True) + expected = pd.Categorical(['a', np.nan, np.nan], + categories=['a', 'b']) + tm.assert_categorical_equal(result, expected) + + def test_take_fill_with_negative_one(self): + # -1 was a category + cat = pd.Categorical([-1, 0, 1]) + result = cat.take([0, -1, 1], allow_fill=True, fill_value=-1) + expected = pd.Categorical([-1, -1, 0], categories=[-1, 0, 1]) + tm.assert_categorical_equal(result, expected) + + # -1 was not a category + cat = pd.Categorical([0, 1]) + result = cat.take([0, -1, 1], allow_fill=True, fill_value=-1) + expected = pd.Categorical([0, -1, 1], categories=[0, 1, -1]) + tm.assert_categorical_equal(result, expected) + + def test_take_fill_value(self): + # https://github.com/pandas-dev/pandas/issues/23296 + cat = pd.Categorical(['a', 'b', 'c']) + result = cat.take([0, 1, -1], fill_value='a', allow_fill=True) + expected = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + def test_take_fill_value_adds_categories(self): + # https://github.com/pandas-dev/pandas/issues/23296 + cat = pd.Categorical(['a', 'b', 'c']) + result = cat.take([0, 1, -1], fill_value='d', allow_fill=True) + expected = pd.Categorical(['a', 'b', 'd'], categories=['a', 'b', 'c', 'd']) + tm.assert_categorical_equal(result, expected) From 9b5ed63834eee5b55d3e0e3ebfed88ef322b24cd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 23 Oct 2018 09:02:54 -0500 Subject: [PATCH 2/3] no new categories --- pandas/core/arrays/categorical.py | 25 +++++++++---------- pandas/tests/arrays/categorical/test_algos.py | 15 ++++------- 2 files changed, 17 insertions(+), 23 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6a5ce6472f6a8..54e5813844851 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1785,10 +1785,6 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): ``allow_fill=True``. This should be the category, i.e. a value in ``self.categories``, not a code. - Specifying a `fill_value` that's not in ``self.categories`` is - allowed. The new category is added to the end of the existing - categories. - Returns ------- Categorical @@ -1822,13 +1818,14 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): [a, NaN, NaN] Categories (2, object): [a, b] - The fill value can be specified. Notice that if the `fill_value` was - not previously present in ``self.categories``, it is added to the end - of the categories in the output Categorical. + The fill value can be specified. - >>> cat.take([0, -1, -1], allow_fill=True, fill_value='c') - [a, c, c] - Categories (3, object): [a, b, c] + >>> cat.take([0, -1, -1], allow_fill=True, fill_value='a') + [a, a, a] + Categories (3, object): [a, b] + + Specifying a fill value that's not in ``self.categories`` + will raise a ``TypeError``. """ indexer = np.asarray(indexer, dtype=np.intp) if allow_fill is None: @@ -1840,13 +1837,15 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): if isna(fill_value): fill_value = -1 - elif allow_fill and fill_value is not None: + elif allow_fill: # convert user-provided `fill_value` to codes if fill_value in self.categories: fill_value = self.categories.get_loc(fill_value) else: - dtype = self.dtype._add_categories(fill_value) - fill_value = dtype.categories.get_loc(fill_value) + msg = ( + "'fill_value' ('{}') is not in this Categorical's categories." + ) + raise TypeError(msg.format(fill_value)) codes = take(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value) diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py index 52362e8a98236..d4a70e9a1ec2e 100644 --- a/pandas/tests/arrays/categorical/test_algos.py +++ b/pandas/tests/arrays/categorical/test_algos.py @@ -113,6 +113,7 @@ def test_positional_take_unobserved(self, ordered): tm.assert_categorical_equal(result, expected) def test_take_allow_fill(self): + # https://github.com/pandas-dev/pandas/issues/23296 cat = pd.Categorical(['a', 'a', 'b']) result = cat.take([0, -1, -1], allow_fill=True) expected = pd.Categorical(['a', np.nan, np.nan], @@ -126,12 +127,6 @@ def test_take_fill_with_negative_one(self): expected = pd.Categorical([-1, -1, 0], categories=[-1, 0, 1]) tm.assert_categorical_equal(result, expected) - # -1 was not a category - cat = pd.Categorical([0, 1]) - result = cat.take([0, -1, 1], allow_fill=True, fill_value=-1) - expected = pd.Categorical([0, -1, 1], categories=[0, 1, -1]) - tm.assert_categorical_equal(result, expected) - def test_take_fill_value(self): # https://github.com/pandas-dev/pandas/issues/23296 cat = pd.Categorical(['a', 'b', 'c']) @@ -139,9 +134,9 @@ def test_take_fill_value(self): expected = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c']) tm.assert_categorical_equal(result, expected) - def test_take_fill_value_adds_categories(self): + def test_take_fill_value_new_raises(self): # https://github.com/pandas-dev/pandas/issues/23296 cat = pd.Categorical(['a', 'b', 'c']) - result = cat.take([0, 1, -1], fill_value='d', allow_fill=True) - expected = pd.Categorical(['a', 'b', 'd'], categories=['a', 'b', 'c', 'd']) - tm.assert_categorical_equal(result, expected) + xpr = r"'fill_value' \('d'\) is not in this Categorical's categories." + with tm.assert_raises_regex(TypeError, xpr): + cat.take([0, 1, -1], fill_value='d', allow_fill=True) From fb64335479f7a3ec13db7decf55c77448857b2bc Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 23 Oct 2018 09:06:33 -0500 Subject: [PATCH 3/3] revert add_categories --- pandas/core/arrays/categorical.py | 13 +++++++++++-- pandas/core/dtypes/dtypes.py | 17 ----------------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 54e5813844851..1bc0d18bead83 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1019,7 +1019,15 @@ def add_categories(self, new_categories, inplace=False): set_categories """ inplace = validate_bool_kwarg(inplace, 'inplace') - new_dtype = self.dtype._add_categories(new_categories) + if not is_list_like(new_categories): + new_categories = [new_categories] + already_included = set(new_categories) & set(self.dtype.categories) + if len(already_included) != 0: + msg = ("new categories must not include old categories: " + "{already_included!s}") + raise ValueError(msg.format(already_included=already_included)) + new_categories = list(self.dtype.categories) + list(new_categories) + new_dtype = CategoricalDtype(new_categories, self.ordered) cat = self if inplace else self.copy() cat._dtype = new_dtype @@ -1843,7 +1851,8 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): fill_value = self.categories.get_loc(fill_value) else: msg = ( - "'fill_value' ('{}') is not in this Categorical's categories." + "'fill_value' ('{}') is not in this Categorical's " + "categories." ) raise TypeError(msg.format(fill_value)) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 3f0e74e8ef8c7..f07fb3cd80eab 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -469,23 +469,6 @@ def _is_boolean(self): return is_bool_dtype(self.categories) - def _add_categories(self, new_categories): - """ - Return a new CategoricalDtype with new categories added at the end. - - """ - from pandas.core.dtypes.common import is_list_like - - if not is_list_like(new_categories): - new_categories = [new_categories] - already_included = set(new_categories) & set(self.categories) - if len(already_included) != 0: - msg = ("new categories must not include old categories: " - "{already_included!s}") - raise ValueError(msg.format(already_included=already_included)) - new_categories = list(self.categories) + list(new_categories) - return CategoricalDtype(new_categories, self.ordered) - class DatetimeTZDtype(PandasExtensionDtype):