pandas-dev · jreback · Dec 10, 2018 · Dec 3, 2018 · Dec 5, 2018 · Dec 5, 2018
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -1136,6 +1136,8 @@ Deprecations
 - :func:`pandas.types.is_datetimetz` is deprecated in favor of `pandas.types.is_datetime64tz` (:issue:`23917`)
 - Creating a :class:`TimedeltaIndex` or :class:`DatetimeIndex` by passing range arguments `start`, `end`, and `periods` is deprecated in favor of :func:`timedelta_range` and :func:`date_range` (:issue:`23919`)
 - Passing a string alias like ``'datetime64[ns, UTC]'`` as the `unit` parameter to :class:`DatetimeTZDtype` is deprecated. Use :class:`DatetimeTZDtype.construct_from_string` instead (:issue:`23990`).
+- In :meth:`Series.where` with Categorical data, providing an ``other`` that is not present in the categories is deprecated. Convert the categorical to a different dtype or add the ``other`` to the categories first (:issue:`24077`).
+
 
 .. _whatsnew_0240.deprecations.datetimelike_int_ops:
 
@@ -1244,6 +1246,7 @@ Performance Improvements
 - Improved performance of :meth:`DatetimeIndex.tz_localize` and various ``DatetimeIndex`` attributes with dateutil UTC timezone (:issue:`23772`)
 - Fixed a performance regression on Windows with Python 3.7 of :func:`pd.read_csv` (:issue:`23516`)
 - Improved performance of :class:`Categorical` constructor for `Series` objects (:issue:`23814`)
+- Improved performance of :meth:`~DataFrame.where` for Categorical data (:issue:`24077`)
 
 .. _whatsnew_0240.docs:
 
@@ -1270,6 +1273,7 @@ Categorical
 - In meth:`Series.unstack`, specifying a ``fill_value`` not present in the categories now raises a ``TypeError`` rather than ignoring the ``fill_value`` (:issue:`23284`)
 - Bug when resampling :meth:`Dataframe.resample()` and aggregating on categorical data, the categorical dtype was getting lost. (:issue:`23227`)
 - Bug in many methods of the ``.str``-accessor, which always failed on calling the ``CategoricalIndex.str`` constructor (:issue:`23555`, :issue:`23556`)
+- Bug in :meth:`Series.where` losing the categorical dtype for categorical data (:issue:`24077`)
 
 Datetimelike
 ^^^^^^^^^^^^
@@ -1305,6 +1309,7 @@ Datetimelike
 - Bug in :class:`DatetimeIndex` where calling ``np.array(dtindex, dtype=object)`` would incorrectly return an array of ``long`` objects (:issue:`23524`)
 - Bug in :class:`Index` where passing a timezone-aware :class:`DatetimeIndex` and `dtype=object` would incorrectly raise a ``ValueError`` (:issue:`23524`)
 - Bug in :class:`Index` where calling ``np.array(dtindex, dtype=object)`` on a timezone-naive :class:`DatetimeIndex` would return an array of ``datetime`` objects instead of :class:`Timestamp` objects, potentially losing nanosecond portions of the timestamps (:issue:`23524`)
+- Bug in :class:`Categorical.__setitem__` not allowing setting with another ``Categorical`` when both are undordered and have the same categories, but in a different order (:issue:`24142`)
 
 Timedelta
 ^^^^^^^^^

diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
@@ -116,6 +116,7 @@ def get_range_parameters(data):
     reduce = functools.reduce
     long = int
     unichr = chr
+    import reprlib
 
     # This was introduced in Python 3.3, but we don't support
     # Python 3.x < 3.5, so checking PY3 is safe.
@@ -271,6 +272,7 @@ class to receive bound method
     class_types = type,
     text_type = str
     binary_type = bytes
+    import reprlib
 
     def u(s):
         return s
@@ -323,6 +325,7 @@ def set_function_name(f, name, cls):
     class_types = (type, types.ClassType)
     text_type = unicode
     binary_type = str
+    import repr as reprlib
 
     def u(s):
         return unicode(s, "unicode_escape")

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -220,6 +220,8 @@ def __setitem__(self, key, value):
         #   example, a string like '2018-01-01' is coerced to a datetime
         #   when setting on a datetime64ns array. In general, if the
         #   __init__ method coerces that value, then so should __setitem__
+        # Note, also, that Series/DataFrame.where internally use __setitem__
+        # on a copy of the data.
         raise NotImplementedError(_not_implemented_message.format(
             type(self), '__setitem__')
         )

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2078,11 +2078,21 @@ def __setitem__(self, key, value):
             `Categorical` does not have the same categories
         """
 
+        if isinstance(value, (ABCIndexClass, ABCSeries)):
+            value = value.array
+
         # require identical categories set
         if isinstance(value, Categorical):
-            if not value.categories.equals(self.categories):
+            if not is_dtype_equal(self, value):
                 raise ValueError("Cannot set a Categorical with another, "
                                  "without identical categories")
+            if not self.categories.equals(value.categories):
+                new_codes = _recode_for_categories(
+                    value.codes, value.categories, self.categories
+                )
+                value = Categorical.from_codes(new_codes,
+                                               categories=self.categories,
+                                               ordered=self.ordered)
 
         rvalue = value if is_list_like(value) else [value]
 

diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py
@@ -706,6 +706,8 @@ def __array__(self, dtype=None, copy=True):
 
     def __setitem__(self, key, value):
         # I suppose we could allow setting of non-fill_value elements.
+        # TODO(SparseArray.__setitem__): remove special cases in
+        # ExtensionBlock.where
         msg = "SparseArray does not support item assignment via setitem"
         raise TypeError(msg)
 

diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -501,10 +501,13 @@ def _can_reindex(self, indexer):
 
     @Appender(_index_shared_docs['where'])
     def where(self, cond, other=None):
+        # TODO: Investigate an alternative implementation with
+        # 1. copy the underyling Categorical
+        # 2. setitem with `cond` and `other`
+        # 3. Rebuild CategoricalIndex.
         if other is None:
             other = self._na_value
         values = np.where(cond, self.values, other)
-
         cat = Categorical(values, dtype=self.dtype)
         return self._shallow_copy(cat, **self._get_attributes_dict())
 

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -28,12 +28,13 @@
 from pandas.core.dtypes.dtypes import (
     CategoricalDtype, DatetimeTZDtype, ExtensionDtype, PandasExtensionDtype)
 from pandas.core.dtypes.generic import (
-    ABCDatetimeIndex, ABCExtensionArray, ABCIndexClass, ABCSeries)
+    ABCDataFrame, ABCDatetimeIndex, ABCExtensionArray, ABCIndexClass,
+    ABCSeries)
 from pandas.core.dtypes.missing import (
     _isna_compat, array_equivalent, is_null_datelike_scalar, isna, notna)
 
 import pandas.core.algorithms as algos
-from pandas.core.arrays import Categorical, ExtensionArray
+from pandas.core.arrays import Categorical, ExtensionArray, SparseArray
 from pandas.core.base import PandasObject
 import pandas.core.common as com
 from pandas.core.indexes.datetimes import DatetimeIndex
@@ -1967,6 +1968,57 @@ def shift(self, periods, axis=0):
                                            placement=self.mgr_locs,
                                            ndim=self.ndim)]
 
+    def where(self, other, cond, align=True, errors='raise',
+              try_cast=False, axis=0, transpose=False):
+        if isinstance(other, (ABCIndexClass, ABCSeries)):
+            other = other.array
+
+        if isinstance(cond, ABCDataFrame):
+            assert cond.shape[1] == 1
+            cond = cond.iloc[:, 0].array
+
+        if isinstance(other, ABCDataFrame):
+            assert other.shape[1] == 1
+            other = other.iloc[:, 0].array
+
+        if isinstance(cond, (ABCIndexClass, ABCSeries)):
+            cond = cond.array
+
+        if lib.is_scalar(other) and isna(other):
+            # The default `other` for Series / Frame is np.nan
+            # we want to replace that with the correct NA value
+            # for the type
+            other = self.dtype.na_value
+
+        if is_sparse(self.values):
+            # ugly workaround for ensure that the dtype is OK
+            # after we insert NaNs.
+            if is_sparse(other):
+                otype = other.dtype.subtype
+            else:
+                otype = other
+            dtype = self.dtype.update_dtype(
+                np.result_type(self.values.dtype.subtype, otype)
+            )
+        else:
+            dtype = self.dtype
+
+        # rough heuristic to see if the other array implements setitem
+        if (self._holder.__setitem__ == ExtensionArray.__setitem__
+                or self._holder.__setitem__ == SparseArray.__setitem__):
+            result = self._holder._from_sequence(
+                np.where(cond, self.values, other),
+                dtype=dtype,
+            )
+        else:
+            result = self.values.copy()
+            icond = ~cond
+            if lib.is_scalar(other):
+                result[icond] = other
+            else:
+                result[icond] = other[icond]
+        return self.make_block_same_class(result, placement=self.mgr_locs)
+
     @property
     def _ftype(self):
         return getattr(self.values, '_pandas_ftype', Block._ftype)
@@ -2646,6 +2698,37 @@ def concat_same_type(self, to_concat, placement=None):
             values, placement=placement or slice(0, len(values), 1),
             ndim=self.ndim)
 
+    def where(self, other, cond, align=True, errors='raise',
+              try_cast=False, axis=0, transpose=False):
+        # This can all be deleted in favor of ExtensionBlock.where once
+        # we enforce the deprecation.
+        object_msg = (
+            "Implicitly converting categorical to object-dtype ndarray. "
+            "The values `{}' are not present in this categorical's "
+            "categories. A future version of pandas will raise a ValueError "
+            "when 'other' contains different categories.\n\n"
+            "To preserve the current behavior, add the new categories to "
+            "the categorical before calling 'where', or convert the "
+            "categorical to a different dtype."
+        )
+        try:
+            # Attempt to do preserve categorical dtype.
+            result = super(CategoricalBlock, self).where(
+                other, cond, align, errors, try_cast, axis, transpose
+            )
+        except (TypeError, ValueError):
+            if lib.is_scalar(other):
+                msg = object_msg.format(other)
+            else:
+                msg = compat.reprlib.repr(other)
+
+            warnings.warn(msg, FutureWarning, stacklevel=6)
+            result = self.astype(object).where(other, cond, align=align,
+                                               errors=errors,
+                                               try_cast=try_cast,
+                                               axis=axis, transpose=transpose)
+        return result
+
 
 class DatetimeBlock(DatetimeLikeBlockMixin, Block):
     __slots__ = ()

diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pytest
 
+import pandas as pd
 from pandas import Categorical, CategoricalIndex, Index, PeriodIndex, Series
 import pandas.core.common as com
 from pandas.tests.arrays.categorical.common import TestCategorical
@@ -43,6 +44,45 @@ def test_setitem(self):
 
         tm.assert_categorical_equal(c, expected)
 
+    @pytest.mark.parametrize('other', [
+        pd.Categorical(['b', 'a']),
+        pd.Categorical(['b', 'a'], categories=['b', 'a']),
+    ])
+    def test_setitem_same_but_unordered(self, other):
+        # GH-24142
+        target = pd.Categorical(['a', 'b'], categories=['a', 'b'])
+        mask = np.array([True, False])
+        target[mask] = other[mask]
+        expected = pd.Categorical(['b', 'b'], categories=['a', 'b'])
+        tm.assert_categorical_equal(target, expected)
+
+    @pytest.mark.parametrize('other', [
+        pd.Categorical(['b', 'a'], categories=['b', 'a', 'c']),
+        pd.Categorical(['b', 'a'], categories=['a', 'b', 'c']),
+        pd.Categorical(['a', 'a'], categories=['a']),
+        pd.Categorical(['b', 'b'], categories=['b']),
+    ])
+    def test_setitem_different_unordered_raises(self, other):
+        # GH-24142
+        target = pd.Categorical(['a', 'b'], categories=['a', 'b'])
+        mask = np.array([True, False])
+        with pytest.raises(ValueError):
+            target[mask] = other[mask]
+
+    @pytest.mark.parametrize('other', [
+        pd.Categorical(['b', 'a']),
+        pd.Categorical(['b', 'a'], categories=['b', 'a'], ordered=True),
+        pd.Categorical(['b', 'a'], categories=['a', 'b', 'c'], ordered=True),
+    ])
+    def test_setitem_same_ordered_rasies(self, other):
+        # Gh-24142
+        target = pd.Categorical(['a', 'b'], categories=['a', 'b'],
+                                ordered=True)
+        mask = np.array([True, False])
+
+        with pytest.raises(ValueError):
+            target[mask] = other[mask]
+
 
 class TestCategoricalIndexing(object):
 
@@ -122,6 +162,60 @@ def test_get_indexer_non_unique(self, idx_values, key_values, key_class):
             tm.assert_numpy_array_equal(expected, result)
             tm.assert_numpy_array_equal(exp_miss, res_miss)
 
+    def test_where_unobserved_nan(self):
+        ser = pd.Series(pd.Categorical(['a', 'b']))
+        result = ser.where([True, False])
+        expected = pd.Series(pd.Categorical(['a', None],
+                                            categories=['a', 'b']))
+        tm.assert_series_equal(result, expected)
+
+        # all NA
+        ser = pd.Series(pd.Categorical(['a', 'b']))
+        result = ser.where([False, False])
+        expected = pd.Series(pd.Categorical([None, None],
+                                            categories=['a', 'b']))
+        tm.assert_series_equal(result, expected)
+
+    def test_where_unobserved_categories(self):
+        ser = pd.Series(
+            Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'])
+        )
+        result = ser.where([True, True, False], other='b')
+        expected = pd.Series(
+            Categorical(['a', 'b', 'b'], categories=ser.cat.categories)
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_where_other_categorical(self):
+        ser = pd.Series(
+            Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'])
+        )
+        other = Categorical(['b', 'c', 'a'], categories=['a', 'c', 'b', 'd'])
+        result = ser.where([True, False, True], other)
+        expected = pd.Series(Categorical(['a', 'c', 'c'], dtype=ser.dtype))
+        tm.assert_series_equal(result, expected)
+
+    def test_where_warns(self):
+        ser = pd.Series(Categorical(['a', 'b', 'c']))
+        with tm.assert_produces_warning(FutureWarning):
+            result = ser.where([True, False, True], 'd')
+
+        expected = pd.Series(np.array(['a', 'd', 'c'], dtype='object'))
+        tm.assert_series_equal(result, expected)
+
+    def test_where_ordered_differs_rasies(self):
+        ser = pd.Series(
+            Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'],
+                        ordered=True)
+        )
+        other = Categorical(['b', 'c', 'a'], categories=['a', 'c', 'b', 'd'],
+                            ordered=True)
+        with tm.assert_produces_warning(FutureWarning):
+            result = ser.where([True, False, True], other)
+
+        expected = pd.Series(np.array(['a', 'c', 'c'], dtype=object))
+        tm.assert_series_equal(result, expected)
+
 
 @pytest.mark.parametrize("index", [True, False])
 def test_mask_with_boolean(index):

diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py
@@ -2,7 +2,8 @@
 import numpy as np
 import pytest
 
-from pandas import Index, IntervalIndex, date_range, timedelta_range
+import pandas as pd
+from pandas import Index, Interval, IntervalIndex, date_range, timedelta_range
 from pandas.core.arrays import IntervalArray
 import pandas.util.testing as tm
 
@@ -50,6 +51,17 @@ def test_set_closed(self, closed, new_closed):
         expected = IntervalArray.from_breaks(range(10), closed=new_closed)
         tm.assert_extension_array_equal(result, expected)
 
+    @pytest.mark.parametrize('other', [
+        Interval(0, 1, closed='right'),
+        IntervalArray.from_breaks([1, 2, 3, 4], closed='right'),
+    ])
+    def test_where_raises(self, other):
+        ser = pd.Series(IntervalArray.from_breaks([1, 2, 3, 4],
+                                                  closed='left'))
+        match = "'value.closed' is 'right', expected 'left'."
+        with pytest.raises(ValueError, match=match):
+            ser.where([True, False, True], other=other)
+
 
 class TestSetitem(object):
 

diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py
@@ -357,10 +357,10 @@ def setitem():
         def setslice():
             self.arr[1:5] = 2
 
-        with pytest.raises(TypeError, match="item assignment"):
+        with pytest.raises(TypeError, match="assignment via setitem"):
             setitem()
 
-        with pytest.raises(TypeError, match="item assignment"):
+        with pytest.raises(TypeError, match="assignment via setitem"):
             setslice()
 
     def test_constructor_from_too_large_array(self):