Skip to content

Commit 74b5a8f

Browse files
TomAugspurger authored and tm9k1 committed
Categorical take fill value (pandas-dev#23297)
* BUG: Handle fill_value in Categorical.take (closes pandas-dev#23296)
* no new categories
* revert add_categories
1 parent dee6de7 commit 74b5a8f

File tree

3 files changed

+90
-5
lines changed

3 files changed

+90
-5
lines changed

doc/source/whatsnew/v0.24.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -974,6 +974,7 @@ Categorical
974974
- Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of ``na_position`` value. (:issue:`22556`).
975975
- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`)
976976
- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
977+
- Bug in :meth:`Categorical.take` with a user-provided ``fill_value`` not encoding the ``fill_value``, which could result in a ``ValueError``, incorrect results, or a segmentation fault (:issue:`23296`).
977978

978979
Datetimelike
979980
^^^^^^^^^^^^

pandas/core/arrays/categorical.py

+60-5
Original file line numberDiff line numberDiff line change
@@ -1768,8 +1768,10 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None):
17681768
17691769
Parameters
17701770
----------
1771-
indexer : sequence of integers
1772-
allow_fill : bool, default None.
1771+
indexer : sequence of int
1772+
The indices in `self` to take. The meaning of negative values in
1773+
`indexer` depends on the value of `allow_fill`.
1774+
allow_fill : bool, default None
17731775
How to handle negative values in `indexer`.
17741776
17751777
* False: negative values in `indexer` indicate positional indices
@@ -1786,26 +1788,79 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None):
17861788
default is ``True``. In the future, this will change to
17871789
``False``.
17881790
1791+
fill_value : object
1792+
The value to use for entries of `indexer` that are missing (-1), when
1793+
``allow_fill=True``. This should be the category, i.e. a value
1794+
in ``self.categories``, not a code.
1795+
17891796
Returns
17901797
-------
17911798
Categorical
17921799
This Categorical will have the same categories and ordered as
17931800
`self`.
1801+
1802+
See Also
1803+
--------
1804+
Series.take : Similar method for Series.
1805+
numpy.ndarray.take : Similar method for NumPy arrays.
1806+
1807+
Examples
1808+
--------
1809+
>>> cat = pd.Categorical(['a', 'a', 'b'])
1810+
>>> cat
1811+
[a, a, b]
1812+
Categories (2, object): [a, b]
1813+
1814+
Specify ``allow_fill=False`` to have negative indices mean indexing
1815+
from the right.
1816+
1817+
>>> cat.take([0, -1, -2], allow_fill=False)
1818+
[a, b, a]
1819+
Categories (2, object): [a, b]
1820+
1821+
With ``allow_fill=True``, indices equal to ``-1`` mean "missing"
1822+
values that should be filled with the `fill_value`, which is
1823+
``np.nan`` by default.
1824+
1825+
>>> cat.take([0, -1, -1], allow_fill=True)
1826+
[a, NaN, NaN]
1827+
Categories (2, object): [a, b]
1828+
1829+
The fill value can be specified.
1830+
1831+
>>> cat.take([0, -1, -1], allow_fill=True, fill_value='a')
1832+
[a, a, a]
1833+
Categories (2, object): [a, b]
1834+
1835+
Specifying a fill value that's not in ``self.categories``
1836+
will raise a ``TypeError``.
17941837
"""
17951838
indexer = np.asarray(indexer, dtype=np.intp)
17961839
if allow_fill is None:
17971840
if (indexer < 0).any():
17981841
warn(_take_msg, FutureWarning, stacklevel=2)
17991842
allow_fill = True
18001843

1844+
dtype = self.dtype
1845+
18011846
if isna(fill_value):
1802-
# For categorical, any NA value is considered a user-facing
1803-
# NA value. Our storage NA value is -1.
18041847
fill_value = -1
1848+
elif allow_fill:
1849+
# convert user-provided `fill_value` to codes
1850+
if fill_value in self.categories:
1851+
fill_value = self.categories.get_loc(fill_value)
1852+
else:
1853+
msg = (
1854+
"'fill_value' ('{}') is not in this Categorical's "
1855+
"categories."
1856+
)
1857+
raise TypeError(msg.format(fill_value))
18051858

18061859
codes = take(self._codes, indexer, allow_fill=allow_fill,
18071860
fill_value=fill_value)
1808-
result = self._constructor(codes, dtype=self.dtype, fastpath=True)
1861+
result = type(self).from_codes(codes,
1862+
categories=dtype.categories,
1863+
ordered=dtype.ordered)
18091864
return result
18101865

18111866
take = take_nd

pandas/tests/arrays/categorical/test_algos.py

+29
Original file line numberDiff line numberDiff line change
@@ -111,3 +111,32 @@ def test_positional_take_unobserved(self, ordered):
111111
expected = pd.Categorical(['b', 'a'], categories=cat.categories,
112112
ordered=ordered)
113113
tm.assert_categorical_equal(result, expected)
114+
115+
def test_take_allow_fill(self):
116+
# https://github.com/pandas-dev/pandas/issues/23296
117+
cat = pd.Categorical(['a', 'a', 'b'])
118+
result = cat.take([0, -1, -1], allow_fill=True)
119+
expected = pd.Categorical(['a', np.nan, np.nan],
120+
categories=['a', 'b'])
121+
tm.assert_categorical_equal(result, expected)
122+
123+
def test_take_fill_with_negative_one(self):
124+
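# -1 is itself a category here, so fill_value=-1 must be encoded to its
# category code rather than confused with the -1 "missing" storage code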
125+
cat = pd.Categorical([-1, 0, 1])
126+
result = cat.take([0, -1, 1], allow_fill=True, fill_value=-1)
127+
expected = pd.Categorical([-1, -1, 0], categories=[-1, 0, 1])
128+
tm.assert_categorical_equal(result, expected)
129+
130+
def test_take_fill_value(self):
131+
# https://github.com/pandas-dev/pandas/issues/23296
132+
cat = pd.Categorical(['a', 'b', 'c'])
133+
result = cat.take([0, 1, -1], fill_value='a', allow_fill=True)
134+
expected = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c'])
135+
tm.assert_categorical_equal(result, expected)
136+
137+
def test_take_fill_value_new_raises(self):
138+
# https://github.com/pandas-dev/pandas/issues/23296
139+
cat = pd.Categorical(['a', 'b', 'c'])
140+
xpr = r"'fill_value' \('d'\) is not in this Categorical's categories."
141+
with tm.assert_raises_regex(TypeError, xpr):
142+
cat.take([0, 1, -1], fill_value='d', allow_fill=True)

0 commit comments

Comments
 (0)