Skip to content

Commit 4afc756

Browse files
TomAugspurgerjorisvandenbossche
authored andcommitted
BUG/DEPR: deprecate Categorical take default behaviour + fix Series[categorical].take (#20841)
We're changing how Categorical.take handles negative indices to be in line with Series and other EAs.
1 parent fe0d8db commit 4afc756

File tree

8 files changed

+130
-12
lines changed

8 files changed

+130
-12
lines changed

doc/source/whatsnew/v0.23.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -891,6 +891,7 @@ Deprecations
891891
removed in a future version (:issue:`20419`).
892892
- ``DatetimeIndex.offset`` is deprecated. Use ``DatetimeIndex.freq`` instead (:issue:`20716`)
893893
- ``Index.get_duplicates()`` is deprecated and will be removed in a future version (:issue:`20239`)
894+
- The previous default behavior of negative indices in ``Categorical.take`` is deprecated. In a future version it will change from meaning missing values to meaning positional indices from the right. The future behavior is consistent with :meth:`Series.take` (:issue:`20664`).
894895

895896
.. _whatsnew_0230.prior_deprecations:
896897

@@ -1024,6 +1025,7 @@ Categorical
10241025
- Bug in ``Categorical.__iter__`` not converting to Python types (:issue:`19909`)
10251026
- Bug in :func:`pandas.factorize` returning the unique codes for the ``uniques``. This now returns a ``Categorical`` with the same dtype as the input (:issue:`19721`)
10261027
- Bug in :func:`pandas.factorize` including an item for missing values in the ``uniques`` return value (:issue:`19721`)
1028+
- Bug in :meth:`Series.take` with categorical data interpreting ``-1`` in `indices` as missing value markers, rather than the last element of the Series (:issue:`20664`)
10271029

10281030
Datetimelike
10291031
^^^^^^^^^^^^

pandas/core/arrays/categorical.py

+53-9
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import numpy as np
44
from warnings import warn
5+
import textwrap
56
import types
67

78
from pandas import compat
@@ -29,7 +30,7 @@
2930
is_scalar,
3031
is_dict_like)
3132

32-
from pandas.core.algorithms import factorize, take_1d, unique1d
33+
from pandas.core.algorithms import factorize, take_1d, unique1d, take
3334
from pandas.core.accessor import PandasDelegate
3435
from pandas.core.base import (PandasObject,
3536
NoNewAttributesMixin, _shared_docs)
@@ -48,6 +49,17 @@
4849
from .base import ExtensionArray
4950

5051

52+
_take_msg = textwrap.dedent("""\
53+
Interpreting negative values in 'indexer' as missing values.
54+
In the future, this will change to meaning positional indicies
55+
from the right.
56+
57+
Use 'allow_fill=True' to retain the previous behavior and silence this
58+
warning.
59+
60+
Use 'allow_fill=False' to accept the new behavior.""")
61+
62+
5163
def _cat_compare_op(op):
5264
def f(self, other):
5365
# On python2, you can usually compare any type to any type, and
@@ -1732,17 +1744,49 @@ def fillna(self, value=None, method=None, limit=None):
17321744
return self._constructor(values, categories=self.categories,
17331745
ordered=self.ordered, fastpath=True)
17341746

1735-
def take_nd(self, indexer, allow_fill=True, fill_value=None):
1736-
""" Take the codes by the indexer, fill with the fill_value.
1737-
1738-
For internal compatibility with numpy arrays.
1747+
def take_nd(self, indexer, allow_fill=None, fill_value=None):
17391748
"""
1749+
Take elements from the Categorical.
17401750
1741-
# filling must always be None/nan here
1742-
# but is passed thru internally
1743-
assert isna(fill_value)
1751+
Parameters
1752+
----------
1753+
indexer : sequence of integers
1754+
allow_fill : bool, default None.
1755+
How to handle negative values in `indexer`.
17441756
1745-
codes = take_1d(self._codes, indexer, allow_fill=True, fill_value=-1)
1757+
* False: negative values in `indices` indicate positional indices
1758+
from the right. This is similar to
1759+
:func:`numpy.take`.
1760+
1761+
* True: negative values in `indices` indicate missing values
1762+
(the default). These values are set to `fill_value`. Any other
1763+
other negative values raise a ``ValueError``.
1764+
1765+
.. versionchanged:: 0.23.0
1766+
1767+
Deprecated the default value of `allow_fill`. The deprecated
1768+
default is ``True``. In the future, this will change to
1769+
``False``.
1770+
1771+
Returns
1772+
-------
1773+
Categorical
1774+
This Categorical will have the same categories and ordered as
1775+
`self`.
1776+
"""
1777+
indexer = np.asarray(indexer, dtype=np.intp)
1778+
if allow_fill is None:
1779+
if (indexer < 0).any():
1780+
warn(_take_msg, FutureWarning, stacklevel=2)
1781+
allow_fill = True
1782+
1783+
if isna(fill_value):
1784+
# For categorical, any NA value is considered a user-facing
1785+
# NA value. Our storage NA value is -1.
1786+
fill_value = -1
1787+
1788+
codes = take(self._codes, indexer, allow_fill=allow_fill,
1789+
fill_value=fill_value)
17461790
result = self._constructor(codes, categories=self.categories,
17471791
ordered=self.ordered, fastpath=True)
17481792
return result

pandas/core/series.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -3499,7 +3499,14 @@ def _take(self, indices, axis=0, convert=True, is_copy=False):
34993499

35003500
indices = _ensure_platform_int(indices)
35013501
new_index = self.index.take(indices)
3502-
new_values = self._values.take(indices)
3502+
3503+
if is_categorical_dtype(self):
3504+
# https://github.com/pandas-dev/pandas/issues/20664
3505+
# TODO: remove when the default Categorical.take behavior changes
3506+
kwargs = {'allow_fill': False}
3507+
else:
3508+
kwargs = {}
3509+
new_values = self._values.take(indices, **kwargs)
35033510

35043511
result = (self._constructor(new_values, index=new_index,
35053512
fastpath=True).__finalize__(self))

pandas/tests/categorical/conftest.py

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
import pytest
2+
3+
4+
@pytest.fixture(params=[True, False])
5+
def allow_fill(request):
6+
"""Boolean 'allow_fill' parameter for Categorical.take"""
7+
return request.param
8+
9+
10+
@pytest.fixture(params=[True, False])
11+
def ordered(request):
12+
"""Boolean 'ordered' parameter for Categorical."""
13+
return request.param

pandas/tests/categorical/test_algos.py

+42
Original file line numberDiff line numberDiff line change
@@ -69,3 +69,45 @@ def test_isin_empty(empty):
6969

7070
result = s.isin(empty)
7171
tm.assert_numpy_array_equal(expected, result)
72+
73+
74+
class TestTake(object):
75+
# https://github.com/pandas-dev/pandas/issues/20664
76+
77+
def test_take_warns(self):
78+
cat = pd.Categorical(['a', 'b'])
79+
with tm.assert_produces_warning(FutureWarning):
80+
cat.take([0, -1])
81+
82+
def test_take_positive_no_warning(self):
83+
cat = pd.Categorical(['a', 'b'])
84+
with tm.assert_produces_warning(None):
85+
cat.take([0, 0])
86+
87+
def test_take_bounds(self, allow_fill):
88+
# https://github.com/pandas-dev/pandas/issues/20664
89+
cat = pd.Categorical(['a', 'b', 'a'])
90+
with pytest.raises(IndexError):
91+
cat.take([4, 5], allow_fill=allow_fill)
92+
93+
def test_take_empty(self, allow_fill):
94+
# https://github.com/pandas-dev/pandas/issues/20664
95+
cat = pd.Categorical([], categories=['a', 'b'])
96+
with pytest.raises(IndexError):
97+
cat.take([0], allow_fill=allow_fill)
98+
99+
def test_positional_take(self, ordered):
100+
cat = pd.Categorical(['a', 'a', 'b', 'b'], categories=['b', 'a'],
101+
ordered=ordered)
102+
result = cat.take([0, 1, 2], allow_fill=False)
103+
expected = pd.Categorical(['a', 'a', 'b'], categories=cat.categories,
104+
ordered=ordered)
105+
tm.assert_categorical_equal(result, expected)
106+
107+
def test_positional_take_unobserved(self, ordered):
108+
cat = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'],
109+
ordered=ordered)
110+
result = cat.take([1, 0], allow_fill=False)
111+
expected = pd.Categorical(['b', 'a'], categories=cat.categories,
112+
ordered=ordered)
113+
tm.assert_categorical_equal(result, expected)

pandas/tests/extension/category/test_categorical.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ def test_take_non_na_fill_value(self):
115115
def test_take_out_of_bounds_raises(self):
116116
pass
117117

118-
@skip_take
118+
@pytest.mark.skip(reason="GH-20747. Unobserved categories.")
119119
def test_take_series(self):
120120
pass
121121

pandas/tests/extension/json/test_json.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def data():
3232
while len(data[0]) == len(data[1]):
3333
data = make_data()
3434

35-
return JSONArray(make_data())
35+
return JSONArray(data)
3636

3737

3838
@pytest.fixture

pandas/tests/series/indexing/test_indexing.py

+10
Original file line numberDiff line numberDiff line change
@@ -753,6 +753,16 @@ def test_take():
753753
s.take([-1, 3, 4], convert=False)
754754

755755

756+
def test_take_categorical():
757+
# https://github.com/pandas-dev/pandas/issues/20664
758+
s = Series(pd.Categorical(['a', 'b', 'c']))
759+
result = s.take([-2, -2, 0])
760+
expected = Series(pd.Categorical(['b', 'b', 'a'],
761+
categories=['a', 'b', 'c']),
762+
index=[1, 1, 0])
763+
assert_series_equal(result, expected)
764+
765+
756766
def test_head_tail(test_data):
757767
assert_series_equal(test_data.series.head(), test_data.series[:5])
758768
assert_series_equal(test_data.series.head(0), test_data.series[0:0])

0 commit comments

Comments
 (0)