Squashed commit of the following:

TomAugspurger · TomAugspurger · commit abd019af5cc0 · 2018-12-07T07:24:13.000-06:00
commit 9e0d87d Author: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Fri Dec 7 07:18:58 2018 -0600 update docs, cleanup commit 1271d3d Merge: 033ac9c f74fc59 Author: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Fri Dec 7 07:12:49 2018 -0600 Merge remote-tracking branch 'upstream/master' into ea-where commit 033ac9c Author: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Fri Dec 7 06:30:18 2018 -0600 Setitem-based where commit e9665b8 Merge: 5e14414 03134cb Author: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Thu Dec 6 21:38:42 2018 -0600 Merge remote-tracking branch 'upstream/master' into ea-where commit 5e14414 Author: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Thu Dec 6 09:18:54 2018 -0600 where versionadded commit d90f384 Author: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Thu Dec 6 09:17:43 2018 -0600 deprecation note for categorical commit 4715ef6 Merge: edff47e b78aa8d Author: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Thu Dec 6 08:15:26 2018 -0600 Merge remote-tracking branch 'upstream/master' into ea-where commit edff47e Author: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Thu Dec 6 08:15:21 2018 -0600 32-bit compat commit badb5be Author: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Thu Dec 6 06:21:44 2018 -0600 compat, revert commit 911a2da Author: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Wed Dec 5 15:55:24 2018 -0600 debug 32-bit issue commit a69dbb3 Author: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Wed Dec 5 15:49:17 2018 -0600 warn for categorical commit 6f79282 Author: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Wed Dec 5 12:45:54 2018 -0600 32-bit compat commit 56470c3 Author: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Wed Dec 5 11:39:48 2018 -0600 Fixups: * Ensure data generated OK. * Remove erroneous comments about alignment. That was user error. 
commit c4604df Author: Tom Augspurger <tom.w.augspurger@gmail.com> Date: Mon Dec 3 14:23:25 2018 -0600 API: Added ExtensionArray.where We need some way to do `.where` on EA objects for DatetimeArray. Adding it to the interface is, I think, the easiest way. Initially I started to write a version on ExtensionBlock, but it proved to be unwieldy to write a version that performed well for all types. It *may* be possible to do using `_ndarray_values` but we'd need a few more things around that (missing values, converting an arbitrary array to the "same" ndarray_values, error handling, re-constructing). It seemed easier to push this down to the array. The implementation on ExtensionArray is readable, but likely slow since it'll involve a conversion to object-dtype. Closes pandas-dev#24077
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -1137,6 +1137,8 @@ Deprecations
 - :func:`pandas.types.is_datetimetz` is deprecated in favor of `pandas.types.is_datetime64tz` (:issue:`23917`)
 - Creating a :class:`TimedeltaIndex` or :class:`DatetimeIndex` by passing range arguments `start`, `end`, and `periods` is deprecated in favor of :func:`timedelta_range` and :func:`date_range` (:issue:`23919`)
 - Passing a string alias like ``'datetime64[ns, UTC]'`` as the `unit` parameter to :class:`DatetimeTZDtype` is deprecated. Use :class:`DatetimeTZDtype.construct_from_string` instead (:issue:`23990`).
+- In :meth:`Series.where` with Categorical data, providing an ``other`` that is not present in the categories is deprecated. Convert the categorical to a different dtype first, or add the ``other`` to the categories first (:issue:`24077`).
+
 
 .. _whatsnew_0240.deprecations.datetimelike_int_ops:
 
@@ -1308,6 +1310,7 @@ Datetimelike
 - Bug in :class:`DatetimeIndex` where calling ``np.array(dtindex, dtype=object)`` would incorrectly return an array of ``long`` objects (:issue:`23524`)
 - Bug in :class:`Index` where passing a timezone-aware :class:`DatetimeIndex` and `dtype=object` would incorrectly raise a ``ValueError`` (:issue:`23524`)
 - Bug in :class:`Index` where calling ``np.array(dtindex, dtype=object)`` on a timezone-naive :class:`DatetimeIndex` would return an array of ``datetime`` objects instead of :class:`Timestamp` objects, potentially losing nanosecond portions of the timestamps (:issue:`23524`)
+- Bug in :class:`Categorical.__setitem__` not allowing setting with another ``Categorical`` when both are unordered and have the same categories, but in a different order (:issue:`24142`)
 
 Timedelta
 ^^^^^^^^^
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
@@ -116,6 +116,7 @@ def get_range_parameters(data):
     reduce = functools.reduce
     long = int
     unichr = chr
+    import reprlib
 
     # This was introduced in Python 3.3, but we don't support
     # Python 3.x < 3.5, so checking PY3 is safe.
@@ -271,6 +272,7 @@ class to receive bound method
     class_types = type,
     text_type = str
     binary_type = bytes
+    import reprlib
 
     def u(s):
         return s
@@ -323,6 +325,7 @@ def set_function_name(f, name, cls):
     class_types = (type, types.ClassType)
     text_type = unicode
     binary_type = str
+    import repr as reprlib
 
     def u(s):
         return unicode(s, "unicode_escape")
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -221,6 +221,8 @@ def __setitem__(self, key, value):
         #   example, a string like '2018-01-01' is coerced to a datetime
         #   when setting on a datetime64ns array. In general, if the
         #   __init__ method coerces that value, then so should __setitem__
+        # Note, also, that Series/DataFrame.where internally use __setitem__
+        # on a copy of the data.
         raise NotImplementedError(_not_implemented_message.format(
             type(self), '__setitem__')
         )
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2121,11 +2121,21 @@ def __setitem__(self, key, value):
             `Categorical` does not have the same categories
         """
 
+        if isinstance(value, (ABCIndexClass, ABCSeries)):
+            value = value.array
+
         # require identical categories set
         if isinstance(value, Categorical):
-            if not value.categories.equals(self.categories):
+            if not is_dtype_equal(self, value):
                 raise ValueError("Cannot set a Categorical with another, "
                                  "without identical categories")
+            if not self.categories.equals(value.categories):
+                new_codes = _recode_for_categories(
+                    value.codes, value.categories, self.categories
+                )
+                value = Categorical.from_codes(new_codes,
+                                               categories=self.categories,
+                                               ordered=self.ordered)
 
         rvalue = value if is_list_like(value) else [value]
 
diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py
@@ -704,11 +704,6 @@ def __array__(self, dtype=None, copy=True):
         out[self.sp_index.to_int_index().indices] = self.sp_values
         return out
 
-    def __setitem__(self, key, value):
-        # I suppose we could allow setting of non-fill_value elements.
-        msg = "SparseArray does not support item assignment via setitem"
-        raise TypeError(msg)
-
     @classmethod
     def _from_sequence(cls, scalars, dtype=None, copy=False):
         return cls(scalars, dtype=dtype)
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -501,7 +501,14 @@ def _can_reindex(self, indexer):
 
     @Appender(_index_shared_docs['where'])
     def where(self, cond, other=None):
-        cat = self.values.where(cond, other=other)
+        # TODO: Investigate an alternative implementation with
+        # 1. copy the underlying Categorical
+        # 2. setitem with `cond` and `other`
+        # 3. Rebuild CategoricalIndex.
+        if other is None:
+            other = self._na_value
+        values = np.where(cond, self.values, other)
+        cat = Categorical(values, dtype=self.dtype)
         return self._shallow_copy(cat, **self._get_attributes_dict())
 
     def reindex(self, target, method=None, level=None, limit=None,
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -1991,7 +1991,33 @@ def where(self, other, cond, align=True, errors='raise',
             # we want to replace that with the correct NA value
             # for the type
             other = self.dtype.na_value
-        result = self.values.where(cond, other)
+
+        if is_sparse(self.values):
+            # ugly workaround to ensure that the dtype is OK
+            # after we insert NaNs.
+            if is_sparse(other):
+                otype = other.dtype.subtype
+            else:
+                otype = other
+            dtype = self.dtype.update_dtype(
+                np.result_type(self.values.dtype.subtype, otype)
+            )
+        else:
+            dtype = self.dtype
+
+        # rough heuristic to see if the other array implements setitem
+        if self._holder.__setitem__ is ExtensionArray.__setitem__:
+            result = self._holder._from_sequence(
+                np.where(cond, self.values, other),
+                dtype=dtype,
+            )
+        else:
+            result = self.values.copy()
+            icond = ~cond
+            if lib.is_scalar(other):
+                result[icond] = other
+            else:
+                result[icond] = other[icond]
         return self.make_block_same_class(result, placement=self.mgr_locs)
 
     @property
@@ -2701,13 +2727,55 @@ def concat_same_type(self, to_concat, placement=None):
 
     def where(self, other, cond, align=True, errors='raise',
               try_cast=False, axis=0, transpose=False):
-        result = super(CategoricalBlock, self).where(
-            other, cond, align, errors, try_cast, axis, transpose
+        # This can all be deleted in favor of ExtensionBlock.where once
+        # we enforce the deprecation.
+        object_msg = (
+            "Implicitly converting categorical to object-dtype ndarray. "
+            "The values `{}' are not present in this categorical's "
+            "categories. A future version of pandas will raise a ValueError "
+            "when 'other' contains different categories.\n\n"
+            "To preserve the current behavior, add the new categories to "
+            "the categorical before calling 'where', or convert the "
+            "categorical to a different dtype."
         )
-        if result.values.dtype != self.values.dtype:
-            # For backwards compatability, we allow upcasting to object.
-            # This fallback will be removed in the future.
-            result = result.astype(object)
+
+        scalar_other = lib.is_scalar(other)
+        categorical_other = is_categorical_dtype(other)
+        if isinstance(other, ABCDataFrame):
+            # should be 1d
+            assert other.shape[1] == 1
+            other = other.iloc[:, 0]
+
+        if isinstance(other, (ABCSeries, ABCIndexClass)):
+            other = other._values
+
+        do_as_object = (
+            # Two categoricals with different dtype (ignoring order)
+            (categorical_other and not is_dtype_equal(self.values, other)) or
+            # a not-na scalar not present in our categories
+            (scalar_other and (other not in self.values.categories
+                               and notna(other))) or
+            # an array not present in our categories
+            (not scalar_other and
+             (self.values.categories.get_indexer(
+                 other[notna(other)]) < 0).any())
+        )
+
+        if do_as_object:
+            if scalar_other:
+                msg = object_msg.format(other)
+            else:
+                msg = object_msg.format(compat.reprlib.repr(other))
+
+            warnings.warn(msg, FutureWarning, stacklevel=6)
+            result = self.astype(object).where(other, cond, align=align,
+                                               errors=errors,
+                                               try_cast=try_cast,
+                                               axis=axis, transpose=transpose)
+        else:
+            result = super(CategoricalBlock, self).where(
+                other, cond, align, errors, try_cast, axis, transpose
+            )
         return result
 
 
diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pytest
 
+import pandas as pd
 from pandas import Categorical, CategoricalIndex, Index, PeriodIndex, Series
 import pandas.core.common as com
 from pandas.tests.arrays.categorical.common import TestCategorical
@@ -43,6 +44,45 @@ def test_setitem(self):
 
         tm.assert_categorical_equal(c, expected)
 
+    @pytest.mark.parametrize('other', [
+        pd.Categorical(['b', 'a']),
+        pd.Categorical(['b', 'a'], categories=['b', 'a']),
+    ])
+    def test_setitem_same_but_unordered(self, other):
+        # GH-24142
+        target = pd.Categorical(['a', 'b'], categories=['a', 'b'])
+        mask = np.array([True, False])
+        target[mask] = other[mask]
+        expected = pd.Categorical(['b', 'b'], categories=['a', 'b'])
+        tm.assert_categorical_equal(target, expected)
+
+    @pytest.mark.parametrize('other', [
+        pd.Categorical(['b', 'a'], categories=['b', 'a', 'c']),
+        pd.Categorical(['b', 'a'], categories=['a', 'b', 'c']),
+        pd.Categorical(['a', 'a'], categories=['a']),
+        pd.Categorical(['b', 'b'], categories=['b']),
+    ])
+    def test_setitem_different_unordered_raises(self, other):
+        # GH-24142
+        target = pd.Categorical(['a', 'b'], categories=['a', 'b'])
+        mask = np.array([True, False])
+        with pytest.raises(ValueError):
+            target[mask] = other[mask]
+
+    @pytest.mark.parametrize('other', [
+        pd.Categorical(['b', 'a']),
+        pd.Categorical(['b', 'a'], categories=['b', 'a'], ordered=True),
+        pd.Categorical(['b', 'a'], categories=['a', 'b', 'c'], ordered=True),
+    ])
+    def test_setitem_same_ordered_rasies(self, other):
+        # Gh-24142
+        target = pd.Categorical(['a', 'b'], categories=['a', 'b'],
+                                ordered=True)
+        mask = np.array([True, False])
+
+        with pytest.raises(ValueError):
+            target[mask] = other[mask]
+
 
 class TestCategoricalIndexing(object):
 
@@ -122,37 +162,59 @@ def test_get_indexer_non_unique(self, idx_values, key_values, key_class):
             tm.assert_numpy_array_equal(expected, result)
             tm.assert_numpy_array_equal(exp_miss, res_miss)
 
+    def test_where_unobserved_nan(self):
+        ser = pd.Series(pd.Categorical(['a', 'b']))
+        result = ser.where([True, False])
+        expected = pd.Series(pd.Categorical(['a', None],
+                                            categories=['a', 'b']))
+        tm.assert_series_equal(result, expected)
+
+        # all NA
+        ser = pd.Series(pd.Categorical(['a', 'b']))
+        result = ser.where([False, False])
+        expected = pd.Series(pd.Categorical([None, None],
+                                            categories=['a', 'b']))
+        tm.assert_series_equal(result, expected)
+
     def test_where_unobserved_categories(self):
-        arr = Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'])
-        result = arr.where([True, True, False], other='b')
-        expected = Categorical(['a', 'b', 'b'], categories=arr.categories)
-        tm.assert_categorical_equal(result, expected)
+        ser = pd.Series(
+            Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'])
+        )
+        result = ser.where([True, True, False], other='b')
+        expected = pd.Series(
+            Categorical(['a', 'b', 'b'], categories=ser.cat.categories)
+        )
+        tm.assert_series_equal(result, expected)
 
     def test_where_other_categorical(self):
-        arr = Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'])
+        ser = pd.Series(
+            Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'])
+        )
         other = Categorical(['b', 'c', 'a'], categories=['a', 'c', 'b', 'd'])
-        result = arr.where([True, False, True], other)
-        expected = Categorical(['a', 'c', 'c'], dtype=arr.dtype)
-        tm.assert_categorical_equal(result, expected)
+        result = ser.where([True, False, True], other)
+        expected = pd.Series(Categorical(['a', 'c', 'c'], dtype=ser.dtype))
+        tm.assert_series_equal(result, expected)
 
     def test_where_warns(self):
-        arr = Categorical(['a', 'b', 'c'])
+        ser = pd.Series(Categorical(['a', 'b', 'c']))
         with tm.assert_produces_warning(FutureWarning):
-            result = arr.where([True, False, True], 'd')
+            result = ser.where([True, False, True], 'd')
 
-        expected = np.array(['a', 'd', 'c'], dtype='object')
-        tm.assert_numpy_array_equal(result, expected)
+        expected = pd.Series(np.array(['a', 'd', 'c'], dtype='object'))
+        tm.assert_series_equal(result, expected)
 
     def test_where_ordered_differs_rasies(self):
-        arr = Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'],
-                          ordered=True)
+        ser = pd.Series(
+            Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'],
+                        ordered=True)
+        )
         other = Categorical(['b', 'c', 'a'], categories=['a', 'c', 'b', 'd'],
                             ordered=True)
         with tm.assert_produces_warning(FutureWarning):
-            result = arr.where([True, False, True], other)
+            result = ser.where([True, False, True], other)
 
-        expected = np.array(['a', 'c', 'c'], dtype=object)
-        tm.assert_numpy_array_equal(result, expected)
+        expected = pd.Series(np.array(['a', 'c', 'c'], dtype=object))
+        tm.assert_series_equal(result, expected)
 
 
 @pytest.mark.parametrize("index", [True, False])
diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py
@@ -2,6 +2,7 @@
 import numpy as np
 import pytest
 
+import pandas as pd
 from pandas import Index, Interval, IntervalIndex, date_range, timedelta_range
 from pandas.core.arrays import IntervalArray
 import pandas.util.testing as tm
@@ -55,10 +56,11 @@ def test_set_closed(self, closed, new_closed):
         IntervalArray.from_breaks([1, 2, 3, 4], closed='right'),
     ])
     def test_where_raises(self, other):
-        arr = IntervalArray.from_breaks([1, 2, 3, 4], closed='left')
-        match = "'other.closed' is 'right', expected 'left'."
+        ser = pd.Series(IntervalArray.from_breaks([1, 2, 3, 4],
+                                                  closed='left'))
+        match = "'value.closed' is 'right', expected 'left'."
         with pytest.raises(ValueError, match=match):
-            arr.where([True, False, True], other=other)
+            ser.where([True, False, True], other=other)
 
 
 class TestSetitem(object):
diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py
@@ -207,11 +207,11 @@ def test_sub_period():
     period_array(['2000', '2001', '2000'], freq='H')
 ])
 def test_where_different_freq_raises(other):
-    arr = period_array(['2000', '2001', '2002'], freq='D')
+    ser = pd.Series(period_array(['2000', '2001', '2002'], freq='D'))
     cond = np.array([True, False, True])
     with pytest.raises(IncompatibleFrequency,
                        match="Input has different freq=H"):
-        arr.where(cond, other)
+        ser.where(cond, other)
 
 
 # ----------------------------------------------------------------------------
diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py
@@ -12,7 +12,7 @@ def make_data(fill_value):
     if np.isnan(fill_value):
         data = np.random.uniform(size=100).astype('float64')
     else:
-        data = np.random.randint(1, 100, size=100, dtype='int64')
+        data = np.random.randint(1, 100, size=100)
         if data[0] == data[1]:
             data[0] += 1
 
@@ -266,13 +266,13 @@ def test_where_series(self, data, na_value):
 
         cond = np.array([True, True, False, False])
         result = ser.where(cond)
-        # new_dtype is the only difference
+
         new_dtype = SparseDtype('float', 0.0)
         expected = pd.Series(cls._from_sequence([a, a, na_value, na_value],
                                                 dtype=new_dtype))
         self.assert_series_equal(result, expected)
 
-        other = cls._from_sequence([a, b, a, b])
+        other = cls._from_sequence([a, b, a, b], dtype=data.dtype)
         cond = np.array([True, False, True, True])
         result = ser.where(cond, other)
         expected = pd.Series(cls._from_sequence([a, b, b, b],

Original file line number	Diff line number	Diff line change
`@@ -221,6 +221,8 @@ def __setitem__(self, key, value):`
`221`	`221`	`# example, a string like '2018-01-01' is coerced to a datetime`
`222`	`222`	`# when setting on a datetime64ns array. In general, if the`
`223`	`223`	`# __init__ method coerces that value, then so should __setitem__`
	`224`	`+ # Note, also, that Series/DataFrame.where internally use __setitem__`
	`225`	`+ # on a copy of the data.`
`224`	`226`	`raise NotImplementedError(_not_implemented_message.format(`
`225`	`227`	`type(self), '__setitem__')`
`226`	`228`	`)`