Skip to content

Commit e6fee8e

Browse files
batterseapower authored and MeeseeksDev[bot] committed
Backport PR pandas-dev#25368: BUG: Fix potential segfault after pd.Categorical(pd.Series(...), categories=...)
1 parent 03fe61d commit e6fee8e

File tree

3 files changed

+18
-9
lines changed

3 files changed

+18
-9
lines changed

doc/source/whatsnew/v0.24.2.rst

+2
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ Fixed Regressions
3131
- Fixed regression in :class:`TimedeltaIndex` where `np.sum(index)` incorrectly returned a zero-dimensional object instead of a scalar (:issue:`25282`)
3232
- Fixed regression in ``IntervalDtype`` construction where passing an incorrect string with 'Interval' as a prefix could result in a ``RecursionError``. (:issue:`25338`)
3333

34+
- Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from the categories of the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`)
35+
3436
.. _whatsnew_0242.enhancements:
3537

3638
Enhancements

pandas/core/arrays/categorical.py

+4-9
Original file line numberDiff line numberDiff line change
@@ -323,14 +323,6 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
323323
# we may have dtype.categories be None, and we need to
324324
# infer categories in a factorization step further below
325325

326-
if is_categorical(values):
327-
# GH23814, for perf, if values._values already an instance of
328-
# Categorical, set values to codes, and run fastpath
329-
if (isinstance(values, (ABCSeries, ABCIndexClass)) and
330-
isinstance(values._values, type(self))):
331-
values = values._values.codes.copy()
332-
fastpath = True
333-
334326
if fastpath:
335327
self._codes = coerce_indexer_dtype(values, dtype.categories)
336328
self._dtype = self._dtype.update_dtype(dtype)
@@ -382,7 +374,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
382374
dtype = CategoricalDtype(categories, dtype.ordered)
383375

384376
elif is_categorical_dtype(values):
385-
old_codes = (values.cat.codes if isinstance(values, ABCSeries)
377+
old_codes = (values._values.codes if isinstance(values, ABCSeries)
386378
else values.codes)
387379
codes = _recode_for_categories(old_codes, values.dtype.categories,
388380
dtype.categories)
@@ -2627,6 +2619,9 @@ def _recode_for_categories(codes, old_categories, new_categories):
26272619
if len(old_categories) == 0:
26282620
# All null anyway, so just retain the nulls
26292621
return codes.copy()
2622+
elif new_categories.equals(old_categories):
2623+
# Same categories, so no need to actually recode
2624+
return codes.copy()
26302625
indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories),
26312626
new_categories)
26322627
new_codes = take_1d(indexer, codes.copy(), fill_value=-1)

pandas/tests/arrays/categorical/test_constructors.py

+12
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,18 @@ def test_constructor(self):
212212
c = Categorical(np.array([], dtype='int64'), # noqa
213213
categories=[3, 2, 1], ordered=True)
214214

215+
def test_constructor_with_existing_categories(self):
216+
# GH25318: constructing with pd.Series used to bogusly skip recoding
217+
# categories
218+
c0 = Categorical(["a", "b", "c", "a"])
219+
c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"])
220+
221+
c2 = Categorical(c0, categories=c1.categories)
222+
tm.assert_categorical_equal(c1, c2)
223+
224+
c3 = Categorical(Series(c0), categories=c1.categories)
225+
tm.assert_categorical_equal(c1, c3)
226+
215227
def test_constructor_not_sequence(self):
216228
# https://github.com/pandas-dev/pandas/issues/16022
217229
msg = r"^Parameter 'categories' must be list-like, was"

0 commit comments

Comments
 (0)